about summary refs log tree commit homepage
path: root/lib/PublicInbox/Search.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/PublicInbox/Search.pm')
-rw-r--r--lib/PublicInbox/Search.pm365
1 files changed, 226 insertions, 139 deletions
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 2feb3e13..678c8c5d 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -59,20 +59,46 @@ use PublicInbox::Smsg;
 use PublicInbox::Over;
 our $QP_FLAGS;
 our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem Query);
-our $Xap; # 'Search::Xapian' or 'Xapian'
+our $Xap; # 'Xapian' or 'Search::Xapian'
 our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor')
 
 # ENQ_DESCENDING and ENQ_ASCENDING weren't in SWIG Xapian.pm prior to 1.4.16,
 # let's hope the ABI is stable
 our $ENQ_DESCENDING = 0;
 our $ENQ_ASCENDING = 1;
+our @MAIL_VMAP = (
+        [ YYYYMMDD, 'd:'],
+        [ TS, 'rt:' ],
+        # these are undocumented for WWW, but lei and IMAP use them
+        [ DT, 'dt:' ],
+        [ BYTES, 'z:' ],
+        [ UID, 'uid:' ]
+);
+our @MAIL_NRP;
+
+# Getopt::Long spec, only short options for portability in C++ implementation
+our @XH_SPEC = (
+        'a', # ascending sort
+        'c', # code search
+        'd=s@', # shard dirs
+        'g=s', # git dir (with -c)
+        'k=i', # sort column (like sort(1))
+        'm=i', # maximum number of results
+        'o=i', # offset
+        'p', # show percent
+        'r', # 1=relevance then column
+        't', # collapse threads
+        'A=s@', # prefixes
+        'D', # emit docdata
+        'K=i', # timeout kill after i seconds
+        'O=s', # eidx_key
+        'T=i', # threadid
+);
 
 sub load_xapian () {
         return 1 if defined $Xap;
-        # n.b. PI_XAPIAN is intended for development use only.  We still
-        # favor Search::Xapian since that's what's available in current
-        # Debian stable (10.x) and derived distros.
-        for my $x (($ENV{PI_XAPIAN} // 'Search::Xapian'), 'Xapian') {
+        # n.b. PI_XAPIAN is intended for development use only
+        for my $x (($ENV{PI_XAPIAN} // 'Xapian'), 'Search::Xapian') {
                 eval "require $x";
                 next if $@;
 
@@ -85,8 +111,7 @@ sub load_xapian () {
 
                 # NumberRangeProcessor was added in Xapian 1.3.6,
                 # NumberValueRangeProcessor was removed for 1.5.0+,
-                # favor the older /Value/ variant since that's what our
-                # (currently) preferred Search::Xapian supports
+                # continue with the older /Value/ variant for now...
                 $NVRP = $x.'::'.($x eq 'Xapian' && $xver ge v1.5 ?
                         'NumberRangeProcessor' : 'NumberValueRangeProcessor');
                 $X{$_} = $Xap.'::'.$_ for (keys %X);
@@ -101,6 +126,7 @@ sub load_xapian () {
                 # or make indexlevel=medium as default
                 $QP_FLAGS = FLAG_PHRASE() | FLAG_BOOLEAN() | FLAG_LOVEHATE() |
                                 FLAG_WILDCARD();
+                @MAIL_NRP = map { $NVRP->new(@$_) } @MAIL_VMAP;
                 return 1;
         }
         undef;
@@ -110,43 +136,50 @@ sub load_xapian () {
 # a prefix common in patch emails
 our $LANG = 'english';
 
+our %PATCH_BOOL_COMMON = (
+        dfpre => 'XDFPRE',
+        dfpost => 'XDFPOST',
+        dfblob => 'XDFPRE XDFPOST',
+        patchid => 'XDFID',
+);
+
 # note: the non-X term prefix allocations are shared with
 # Xapian omega, see xapian-applications/omega/docs/termprefixes.rst
 my %bool_pfx_external = (
         mid => 'Q', # Message-ID (full/exact), this is mostly uniQue
         lid => 'G', # newsGroup (or similar entity), just inside <>
-        dfpre => 'XDFPRE',
-        dfpost => 'XDFPOST',
-        dfblob => 'XDFPRE XDFPOST',
-        patchid => 'XDFID',
+        %PATCH_BOOL_COMMON
 );
 
-my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID';
-my %prob_prefix = (
-        # for mairix compatibility
+# for mairix compatibility
+our $NON_QUOTED_BODY = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID';
+our %PATCH_PROB_COMMON = (
         s => 'S',
-        m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
-        l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial
         f => 'A',
-        t => 'XTO',
-        tc => 'XTO XCC',
-        c => 'XCC',
-        tcf => 'XTO XCC A',
-        a => 'XTO XCC A',
-        b => $non_quoted_body . ' XQUOT',
-        bs => $non_quoted_body . ' XQUOT S',
+        b => $NON_QUOTED_BODY . ' XQUOT',
+        bs => $NON_QUOTED_BODY . ' XQUOT S',
         n => 'XFN',
 
         q => 'XQUOT',
-        nq => $non_quoted_body,
+        nq => $NON_QUOTED_BODY,
         dfn => 'XDFN',
         dfa => 'XDFA',
         dfb => 'XDFB',
         dfhh => 'XDFHH',
         dfctx => 'XDFCTX',
+);
 
+my %prob_prefix = (
+        m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
+        l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial
+        t => 'XTO',
+        tc => 'XTO XCC',
+        c => 'XCC',
+        tcf => 'XTO XCC A',
+        a => 'XTO XCC A',
+        %PATCH_PROB_COMMON,
         # default:
-        '' => 'XM S A XQUOT XFN ' . $non_quoted_body,
+        '' => 'XM S A XQUOT XFN ' . $NON_QUOTED_BODY,
 );
 
 # not documenting m: and mid: for now, the using the URLs works w/o Xapian
@@ -190,33 +223,37 @@ sub xdir ($;$) {
         my ($self, $rdonly) = @_;
         if ($rdonly || !defined($self->{shard})) {
                 $self->{xpfx};
-        } else { # v2 + extindex only:
+        } else { # v2, extindex, cindex only:
                 "$self->{xpfx}/$self->{shard}";
         }
 }
 
-# returns all shards as separate Xapian::Database objects w/o combining
-sub xdb_shards_flat ($) {
+# returns shard directories as an array of strings, does not verify existence
+sub shard_dirs ($) {
         my ($self) = @_;
         my $xpfx = $self->{xpfx};
-        my (@xdb, $slow_phrase);
-        load_xapian();
-        $self->{qp_flags} //= $QP_FLAGS;
-        if ($xpfx =~ m!/xapian[0-9]+\z!) {
-                @xdb = ($X{Database}->new($xpfx));
-                $self->{qp_flags} |= FLAG_PHRASE() if !-f "$xpfx/iamchert";
-        } else {
+        if ($xpfx =~ m!/xapian[0-9]+\z!) { # v1 inbox
+                ($xpfx);
+        } else { # v2 inbox, eidx, cidx
                 opendir(my $dh, $xpfx) or return (); # not initialized yet
                 # We need numeric sorting so shard[0] is first for reading
                 # Xapian metadata, if needed
                 my $last = max(grep(/\A[0-9]+\z/, readdir($dh))) // return ();
-                for (0..$last) {
-                        my $shard_dir = "$self->{xpfx}/$_";
-                        push @xdb, $X{Database}->new($shard_dir);
-                        $slow_phrase ||= -f "$shard_dir/iamchert";
-                }
-                $self->{qp_flags} |= FLAG_PHRASE() if !$slow_phrase;
+                map { "$xpfx/$_" } (0..$last);
         }
+}
+
+# returns all shards as separate Xapian::Database objects w/o combining
+sub xdb_shards_flat ($) {
+        my ($self) = @_;
+        load_xapian();
+        $self->{qp_flags} //= $QP_FLAGS;
+        my $slow_phrase;
+        my @xdb = map {
+                $slow_phrase ||= -f "$_/iamchert";
+                $X{Database}->new($_); # raises if missing
+        } shard_dirs($self);
+        $self->{qp_flags} |= FLAG_PHRASE() if !$slow_phrase;
         @xdb;
 }
 
@@ -229,6 +266,12 @@ sub mdocid {
         int(($docid - 1) / $nshard) + 1;
 }
 
+sub docids_to_artnums {
+        my $nshard = shift->{nshard};
+        # XXX does array vs arrayref make a difference in modern Perls?
+        map { int(($_ - 1) / $nshard) + 1 } @_;
+}
+
 sub mset_to_artnums {
         my ($self, $mset) = @_;
         my $nshard = $self->{nshard};
@@ -277,42 +320,18 @@ sub date_parse_prepare {
         my $end = $range =~ s/([\)\s]*)\z// ? $1 : '';
         my @r = split(/\.\./, $range, 2);
 
-        # expand "d:20101002" => "d:20101002..20101003" and like
+        # expand "dt:2010-10-02" => "dt:2010-10-02..2010-10-03" and like
         # n.b. git doesn't do YYYYMMDD w/o '-', it needs YYYY-MM-DD
-        # We upgrade "d:" to "dt:" to iff using approxidate
+        # We upgrade "d:" to "dt:" unconditionally
         if ($pfx eq 'd') {
-                my $fmt = "\0%Y%m%d";
-                if (!defined($r[1])) {
-                        if ($r[0] =~ /\A([0-9]{4})([0-9]{2})([0-9]{2})\z/) {
-                                push @$to_parse, "$1-$2-$3";
-                                # we could've handled as-is, but we need
-                                # to parse anyways for "d+" below
-                        } else {
-                                push @$to_parse, $r[0];
-                                if ($r[0] !~ /\A[0-9]{4}-[0-9]{2}-[0-9]{2}\z/) {
-                                        $pfx = 'dt';
-                                        $fmt = "\0%Y%m%d%H%M%S";
-                                }
-                        }
-                        $r[0] = "$fmt+$#$to_parse\0";
-                        $r[1] = "$fmt+\0";
-                } else {
-                        for my $x (@r) {
-                                next if $x eq '' || $x =~ /\A[0-9]{8}\z/;
-                                push @$to_parse, $x;
-                                if ($x !~ /\A[0-9]{4}-[0-9]{2}-[0-9]{2}\z/) {
-                                        $pfx = 'dt';
-                                }
-                                $x = "$fmt$#$to_parse\0";
-                        }
-                        if ($pfx eq 'dt') {
-                                for (@r) {
-                                        s/\0%Y%m%d/\0%Y%m%d%H%M%S/;
-                                        s/\A([0-9]{8})\z/${1}000000/;
-                                }
-                        }
-                }
-        } elsif ($pfx eq 'dt') {
+                $pfx = 'dt';
+                # upgrade YYYYMMDD to YYYYMMDDHHMMSS
+                $_ .= ' 00:00:00' for (grep(m!\A[0-9]{4}[^[:alnum:]]
+                                        [0-9]{2}[^[:alnum:]]
+                                        [0-9]{2}\z!x, @r));
+                $_ .= '000000' for (grep(m!\A[0-9]{8}\z!, @r));
+        }
+        if ($pfx eq 'dt') {
                 if (!defined($r[1])) { # git needs gaps and not /\d{14}/
                         if ($r[0] =~ /\A([0-9]{4})([0-9]{2})([0-9]{2})
                                         ([0-9]{2})([0-9]{2})([0-9]{2})\z/x) {
@@ -329,7 +348,7 @@ sub date_parse_prepare {
                                 $x = "\0%Y%m%d%H%M%S$#$to_parse\0";
                         }
                 }
-        } else { # "rt", let git interpret "YYYY", deal with Y10K later :P
+        } else { # (rt|ct), let git interpret "YYYY", deal with Y10K later :P
                 for my $x (@r) {
                         next if $x eq '' || $x =~ /\A[0-9]{5,}\z/;
                         push @$to_parse, $x;
@@ -388,13 +407,51 @@ sub query_approxidate {
         date_parse_finalize($git, $to_parse, $_[2]) if $to_parse;
 }
 
-# read-only
+# read-only, for mail only (codesearch has different rules)
 sub mset {
-        my ($self, $query_string, $opts) = @_;
-        $opts ||= {};
+        my ($self, $qry_str, $opt) = @_;
         my $qp = $self->{qp} //= $self->qparse_new;
-        my $query = $qp->parse_query($query_string, $self->{qp_flags});
-        _do_enquire($self, $query, $opts);
+        my $qry = $qp->parse_query($qry_str, $self->{qp_flags});
+        if (defined(my $eidx_key = $opt->{eidx_key})) {
+                $qry = $X{Query}->new(OP_FILTER(), $qry, 'O'.$eidx_key);
+        }
+        if (defined(my $uid_range = $opt->{uid_range})) {
+                my $range = $X{Query}->new(OP_VALUE_RANGE(), UID,
+                                        sortable_serialise($uid_range->[0]),
+                                        sortable_serialise($uid_range->[1]));
+                $qry = $X{Query}->new(OP_FILTER(), $qry, $range);
+        }
+        if (defined(my $tid = $opt->{threadid})) {
+                $tid = sortable_serialise($tid);
+                $qry = $X{Query}->new(OP_FILTER(), $qry,
+                        $X{Query}->new(OP_VALUE_RANGE(), THREADID, $tid, $tid));
+        }
+        do_enquire($self, $qry, $opt, TS);
+}
+
+sub do_enquire { # shared with CodeSearch
+        my ($self, $qry, $opt, $col) = @_;
+        my $enq = $X{Enquire}->new(xdb($self));
+        $enq->set_query($qry);
+        my $rel = $opt->{relevance} // 0;
+        if ($rel == -2) { # ORDER BY docid/UID (highest first)
+                $enq->set_weighting_scheme($X{BoolWeight}->new);
+                $enq->set_docid_order($ENQ_DESCENDING);
+        } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first)
+                $enq->set_weighting_scheme($X{BoolWeight}->new);
+                $enq->set_docid_order($ENQ_ASCENDING);
+        } elsif ($rel == 0) {
+                $enq->set_sort_by_value_then_relevance($col, !$opt->{asc});
+        } else { # rel > 0
+                $enq->set_sort_by_relevance_then_value($col, !$opt->{asc});
+        }
+
+        # `lei q -t / --threads' or JMAP collapseThreads; but don't collapse
+        # on `-tt' ({threads} > 1) which sets the Flagged|Important keyword
+        (($opt->{threads} // 0) == 1 && has_threadid($self)) and
+                $enq->set_collapse_key(THREADID);
+        retry_reopen($self, \&enquire_once, $enq,
+                        $opt->{offset} || 0, $opt->{limit} || 50);
 }
 
 sub retry_reopen {
@@ -421,50 +478,15 @@ sub retry_reopen {
         Carp::croak("Too many Xapian database modifications in progress\n");
 }
 
-sub _do_enquire {
-        my ($self, $query, $opts) = @_;
-        retry_reopen($self, \&_enquire_once, $query, $opts);
-}
-
 # returns true if all docs have the THREADID value
 sub has_threadid ($) {
         my ($self) = @_;
         (xdb($self)->get_metadata('has_threadid') // '') eq '1';
 }
 
-sub _enquire_once { # retry_reopen callback
-        my ($self, $query, $opts) = @_;
-        my $xdb = xdb($self);
-        if (defined(my $eidx_key = $opts->{eidx_key})) {
-                $query = $X{Query}->new(OP_FILTER(), $query, 'O'.$eidx_key);
-        }
-        if (defined(my $uid_range = $opts->{uid_range})) {
-                my $range = $X{Query}->new(OP_VALUE_RANGE(), UID,
-                                        sortable_serialise($uid_range->[0]),
-                                        sortable_serialise($uid_range->[1]));
-                $query = $X{Query}->new(OP_FILTER(), $query, $range);
-        }
-        my $enquire = $X{Enquire}->new($xdb);
-        $enquire->set_query($query);
-        $opts ||= {};
-        my $rel = $opts->{relevance} // 0;
-        if ($rel == -2) { # ORDER BY docid/UID (highest first)
-                $enquire->set_weighting_scheme($X{BoolWeight}->new);
-                $enquire->set_docid_order($ENQ_DESCENDING);
-        } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first)
-                $enquire->set_weighting_scheme($X{BoolWeight}->new);
-                $enquire->set_docid_order($ENQ_ASCENDING);
-        } elsif ($rel == 0) {
-                $enquire->set_sort_by_value_then_relevance(TS, !$opts->{asc});
-        } else { # rel > 0
-                $enquire->set_sort_by_relevance_then_value(TS, !$opts->{asc});
-        }
-
-        # `mairix -t / --threads' or JMAP collapseThreads
-        if ($opts->{threads} && has_threadid($self)) {
-                $enquire->set_collapse_key(THREADID);
-        }
-        $enquire->get_mset($opts->{offset} || 0, $opts->{limit} || 50);
+sub enquire_once { # retry_reopen callback
+        my (undef, $enq, $offset, $limit) = @_;
+        $enq->get_mset($offset, $limit);
 }
 
 sub mset_to_smsg {
@@ -481,29 +503,27 @@ sub mset_to_smsg {
 # read-write
 sub stemmer { $X{Stem}->new($LANG) }
 
-# read-only
-sub qparse_new {
+sub qp_init_common {
         my ($self) = @_;
-
-        my $xdb = xdb($self);
         my $qp = $X{QueryParser}->new;
         $qp->set_default_op(OP_AND());
-        $qp->set_database($xdb);
+        $qp->set_database(xdb($self));
         $qp->set_stemmer(stemmer($self));
         $qp->set_stemming_strategy(STEM_SOME());
         my $cb = $qp->can('set_max_wildcard_expansion') //
                 $qp->can('set_max_expansion'); # Xapian 1.5.0+
         $cb->($qp, 100);
-        $cb = $qp->can('add_valuerangeprocessor') //
-                $qp->can('add_rangeprocessor'); # Xapian 1.5.0+
-        $cb->($qp, $NVRP->new(YYYYMMDD, 'd:'));
-        $cb->($qp, $NVRP->new(DT, 'dt:'));
+        $qp;
+}
 
-        # for IMAP, undocumented for WWW and may be split off go away
-        $cb->($qp, $NVRP->new(BYTES, 'z:'));
-        $cb->($qp, $NVRP->new(TS, 'rt:'));
-        $cb->($qp, $NVRP->new(UID, 'uid:'));
+# read-only
+sub qparse_new {
+        my ($self) = @_;
+        my $qp = qp_init_common($self);
+        my $cb = $qp->can('add_valuerangeprocessor') //
+                $qp->can('add_rangeprocessor'); # Xapian 1.5.0+
 
+        $cb->($qp, $_) for @MAIL_NRP;
         while (my ($name, $prefix) = each %bool_pfx_external) {
                 $qp->add_boolean_prefix($name, $_) foreach split(/ /, $prefix);
         }
@@ -533,6 +553,40 @@ EOF
         $qp;
 }
 
+sub generate_cxx () { # generates snippet for xap_helper.h
+        my $ret = <<EOM;
+# line ${\__LINE__} "${\__FILE__}"
+static NRP *mail_nrp[${\scalar(@MAIL_VMAP)}];
+static void mail_nrp_init(void)
+{
+EOM
+        for (0..$#MAIL_VMAP) {
+                my $x = $MAIL_VMAP[$_];
+                $ret .= qq{\tmail_nrp[$_] = new NRP($x->[0], "$x->[1]");\n}
+        }
+$ret .= <<EOM;
+}
+
+# line ${\__LINE__} "${\__FILE__}"
+static void qp_init_mail_search(Xapian::QueryParser *qp)
+{
+        for (size_t i = 0; i < MY_ARRAY_SIZE(mail_nrp); i++)
+                qp->ADD_RP(mail_nrp[i]);
+EOM
+        for my $name (sort keys %bool_pfx_external) {
+                for (split(/ /, $bool_pfx_external{$name})) {
+                        $ret .= qq{\tqp->add_boolean_prefix("$name", "$_");\n}
+                }
+        }
+        # TODO: altid support
+        for my $name (sort keys %prob_prefix) {
+                for (split(/ /, $prob_prefix{$name})) {
+                        $ret .= qq{\tqp->add_prefix("$name", "$_");\n}
+                }
+        }
+        $ret .= "}\n";
+}
+
 sub help {
         my ($self) = @_;
         $self->{qp} //= $self->qparse_new; # parse altids
@@ -560,24 +614,57 @@ sub get_pct ($) { # mset item
 
 sub xap_terms ($$;@) {
         my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty ()
-        my %ret;
         my $end = $xdb_or_doc->termlist_end(@docid);
         my $cur = $xdb_or_doc->termlist_begin(@docid);
+        $cur->skip_to($pfx);
+        my (@ret, $tn);
+        my $pfxlen = length($pfx);
         for (; $cur != $end; $cur++) {
-                $cur->skip_to($pfx);
-                last if $cur == $end;
-                my $tn = $cur->get_termname;
-                $ret{substr($tn, length($pfx))} = undef if !index($tn, $pfx);
+                $tn = $cur->get_termname;
+                index($tn, $pfx) ? last : push(@ret, substr($tn, $pfxlen));
         }
-        wantarray ? sort(keys(%ret)) : \%ret;
+        wantarray ? @ret : +{ map { $_ => undef } @ret };
 }
 
 # get combined docid from over.num:
-# (not generic Xapian, only works with our sharding scheme)
+# (not generic Xapian, only works with our sharding scheme for mail)
 sub num2docid ($$) {
         my ($self, $num) = @_;
         my $nshard = $self->{nshard};
         ($num - 1) * $nshard + $num % $nshard + 1;
 }
 
+sub all_terms {
+        my ($self, $pfx) = @_;
+        my $cur = xdb($self)->allterms_begin($pfx);
+        my $end = $self->{xdb}->allterms_end($pfx);
+        my $pfxlen = length($pfx);
+        my @ret;
+        for (; $cur != $end; $cur++) {
+                push @ret, substr($cur->get_termname, $pfxlen);
+        }
+        wantarray ? @ret : +{ map { $_ => undef } @ret };
+}
+
+sub xh_args { # prep getopt args to feed to xap_helper.h socket
+        map { ('-d', $_) } shard_dirs($_[0]);
+}
+
+sub docids_by_postlist ($$) {
+        my ($self, $q) = @_;
+        my $cur = $self->xdb->postlist_begin($q);
+        my $end = $self->{xdb}->postlist_end($q);
+        my @ids;
+        for (; $cur != $end; $cur++) { push(@ids, $cur->get_docid) };
+        @ids;
+}
+
+sub get_doc ($$) {
+        my ($self, $docid) = @_;
+        eval { $self->{xdb}->get_document($docid) } // do {
+                die $@ if $@ && ref($@) !~ /\bDocNotFoundError\b/;
+                undef;
+        }
+}
+
 1;