# Copyright (C) 2020-2021 all contributors
# License: AGPL-3.0+

# read-only counterpart to MiscIdx
package PublicInbox::MiscSearch;
use strict;
use v5.10.1;
use PublicInbox::Search qw(retry_reopen int_val xap_terms);
my $json;

# Xapian value columns:
our $MODIFIED = 0;
our $UIDVALIDITY = 1; # (created time)

# avoid conflicting with message Search::prob_prefix for UI/UX reasons
my %PROB_PREFIX = (
	description => 'S', # $INBOX_DIR/description
	address => 'A',
	listid => 'XLISTID',
	url => 'XURL',
	infourl => 'XINFOURL',
	name => 'XNAME',
	'' => 'S A XLISTID XNAME XURL XINFOURL'
);

# Constructor: opens the Xapian database at $dir and returns an object
# holding it in {xdb}.  The file-wide JSON codec is set up on first call.
sub new {
	my ($class, $dir) = @_;
	PublicInbox::Search::load_xapian();
	$json //= PublicInbox::Config::json();
	bless {
		xdb => $PublicInbox::Search::X{Database}->new($dir)
	}, $class;
}

# read-only
# Builds a QueryParser bound to $self->{xdb}: AND as the default operator,
# STEM_SOME stemming, wildcard expansion capped at 100, every %PROB_PREFIX
# entry registered as a probabilistic prefix and "type:" as a boolean
# prefix on the `T' term.  Returns the parser.
sub mi_qp_new ($) {
	my ($self) = @_;
	my $xdb = $self->{xdb};
	my $qp = $PublicInbox::Search::X{QueryParser}->new;
	$qp->set_default_op(PublicInbox::Search::OP_AND());
	$qp->set_database($xdb);
	$qp->set_stemmer(PublicInbox::Search::stemmer($self));
	$qp->set_stemming_strategy(PublicInbox::Search::STEM_SOME());

	# the setter is looked up under both names since it differs
	# between Xapian releases
	my $cb = $qp->can('set_max_wildcard_expansion') //
		$qp->can('set_max_expansion'); # Xapian 1.5.0+
	$cb->($qp, 100);

	# iterate with keys() rather than each(): %PROB_PREFIX is shared by
	# every call, and each() would leave its iterator half-advanced if
	# add_prefix ever threw
	for my $name (keys %PROB_PREFIX) {
		$qp->add_prefix($name, $_) for split(/ /, $PROB_PREFIX{$name});
	}
	$qp->add_boolean_prefix('type', 'T');
	$qp;
}

# Runs the parsed query $qr once and returns the resulting MSet.
# Recognized %$opt keys:
#   asc       - true for ascending order (default is descending)
#   relevance - -1: order by docid using BoolWeight
#               true: relevance first, then the $MODIFIED value
#               false/unset: $MODIFIED value first, then relevance
#   offset    - first result to return (default 0)
#   limit     - maximum number of results (0 or unset means 200)
sub misc_enquire_once { # retry_reopen callback
	my ($self, $qr, $opt) = @_;
	my $eq = $PublicInbox::Search::X{Enquire}->new($self->{xdb});
	$eq->set_query($qr);
	my $desc = !$opt->{asc};
	my $rel = $opt->{relevance} // 0;
	if ($rel == -1) { # ORDER BY docid
		$eq->set_docid_order($PublicInbox::Search::ENQ_ASCENDING);
		$eq->set_weighting_scheme(
			$PublicInbox::Search::X{BoolWeight}->new);
	} elsif ($rel) {
		$eq->set_sort_by_relevance_then_value($MODIFIED, $desc);
	} else {
		$eq->set_sort_by_value_then_relevance($MODIFIED, $desc);
	}
	$eq->get_mset($opt->{offset} || 0, $opt->{limit} || 200);
}

# Parses the query string $qs and returns its MSet.  An empty or undefined
# $qs is replaced by 'type:inbox'.  $opt is optional and is passed on to
# misc_enquire_once; {relevance} is set to 1 in it when the caller did not
# supply that key.  The query parser is created lazily and cached in {qp}.
sub mset {
	my ($self, $qs, $opt) = @_;
	$opt ||= {};
	reopen($self);
	my $qp = $self->{qp} //= mi_qp_new($self);
	# "// ''" lets an undefined query behave like an empty one without
	# comparing undef
	$qs = 'type:inbox' if ($qs // '') eq '';
	my $qr = $qp->parse_query($qs, $PublicInbox::Search::QP_FLAGS);
	$opt->{relevance} = 1 unless exists $opt->{relevance};
	retry_reopen($self, \&misc_enquire_once, $qr, $opt);
}

# Pages through every result of $qr in docid order and returns a hashref
# of eidx_key => $ibx for each result whose `Q' term is a key of
# %$by_newsgroup.  Results lacking a `Q' term are reported via warn.
sub ibx_matches_once { # retry_reopen callback
	my ($self, $qr, $by_newsgroup) = @_;
	# Page size is twice the number of known newsgroups.  With no
	# newsgroups that would be zero: misc_enquire_once turns a zero
	# limit into 200, yet "$nr < 0" below could never be true and the
	# loop would never exit, so fall back to 200 here as well.
	my $limit = (scalar(keys %$by_newsgroup) * 2) || 200;
	my $opt = { limit => $limit, offset => 0, relevance => -1 };
	my $ret = {}; # newsgroup => $ibx of matches
	while (1) {
		my $mset = misc_enquire_once($self, $qr, $opt);
		for my $mi ($mset->items) {
			my ($eidx_key) = xap_terms('Q', $mi->get_document);
			if (defined($eidx_key)) {
				if (my $ibx = $by_newsgroup->{$eidx_key}) {
					$ret->{$eidx_key} = $ibx;
				}
			} else {
				# NOTE(review): the start of this message was
				# reconstructed from a damaged copy; confirm
				# the "W: docid=" wording
				warn <<EOF;
W: docid=${\$mi->get_docid} has no `Q' (eidx_key) term
EOF
			}
		}
		# a short page means the results are exhausted
		my $nr = $mset->size;
		return $ret if $nr < $limit;
		$opt->{offset} += $nr;
	}
}

# returns a newsgroup => PublicInbox::Inbox mapping
# $qs is the user query, restricted here to type:inbox; candidates are
# taken from $pi_cfg->{-by_newsgroup}.
sub newsgroup_matches {
	my ($self, $qs, $pi_cfg) = @_;
	my $qp = $self->{qp} //= mi_qp_new($self);
	$qs .= ' type:inbox';
	my $qr = $qp->parse_query($qs, $PublicInbox::Search::QP_FLAGS);
	retry_reopen($self, \&ibx_matches_once, $qr, $pi_cfg->{-by_newsgroup});
}

# Looks up the first document carrying the term 'Q'.$ibx->eidx_key.
# When found, fills $ibx->{uidvalidity} (only if not already defined) and
# $ibx->{-modified} from the document values and returns the document
# data; returns undef when no document has that term.
sub ibx_data_once {
	my ($self, $ibx) = @_;
	my $xdb = $self->{xdb};
	my $term = 'Q'.$ibx->eidx_key; # may be {inboxdir}, so private
	my $head = $xdb->postlist_begin($term);
	my $tail = $xdb->postlist_end($term);
	if ($head != $tail) { # posting list is not empty
		my $doc = $xdb->get_document($head->get_docid);
		$ibx->{uidvalidity} //= int_val($doc, $UIDVALIDITY);
		$ibx->{-modified} = int_val($doc, $MODIFIED);
		$doc->get_data;
	} else {
		undef;
	}
}

# Builds a cache entry { uidvalidity, -modified, description } from a
# Xapian document.  The document data is decoded as JSON and its values
# are scanned for a {description}: the first one ending in " [epoch N]"
# is used with that suffix stripped; if none has the suffix, the last
# description seen is used unchanged (undef if there was none at all).
sub doc2ibx_cache_ent { # @_ == ($self, $doc) OR ($doc)
	my ($doc) = $_[-1];
	my $d;
	my $data = $json->decode($doc->get_data);
	for (values %$data) {
		$d = $_->{description} // next;
		$d =~ s/ \[epoch [0-9]+\]\z// or next;
		last;
	}
	# explicit return so the braces cannot be taken for a bare block
	return {
		uidvalidity => int_val($doc, $UIDVALIDITY),
		-modified => int_val($doc, $MODIFIED),
		# extract description from manifest.js.gz epoch description
		description => $d
	};
}

# Returns the data of the document indexed for $ibx (see ibx_data_once)
# and updates $ibx->{uidvalidity} and $ibx->{-modified} as a side effect.
sub inbox_data {
	my ($self, $ibx) = @_;
	retry_reopen($self, \&ibx_data_once, $ibx);
}

# Stores the cache entry for $doc in %$cache under the document's `Q'
# term; does nothing when the document has no such term.
sub ibx_cache_load {
	my ($doc, $cache) = @_;
	my ($eidx_key) = xap_terms('Q', $doc);
	return unless defined($eidx_key); # expired
	$cache->{$eidx_key} = doc2ibx_cache_ent($doc);
}

# Collects a cache entry for every result of the query
# 'type:newsgroup type:inbox', in docid order, and returns them as a
# hashref keyed by `Q' term.
sub _nntpd_cache_load { # retry_reopen callback
	my ($self) = @_;
	my $opt = {
		limit => $self->{xdb}->get_doccount * 10,
		relevance => -1
	};
	my $mset = mset($self, 'type:newsgroup type:inbox', $opt);
	my $cache = {};
	for my $it ($mset->items) {
		ibx_cache_load($it->get_document, $cache);
	}
	$cache
}

# returns { newsgroup => $cache_entry } mapping, $cache_entry contains
# anything which may trigger seeks at startup, currently: description,
# -modified, and uidvalidity.
sub nntpd_cache_load {
	my ($self) = @_;
	retry_reopen($self, \&_nntpd_cache_load);
}

no warnings 'once';
*reopen = \&PublicInbox::Search::reopen;
1;