diff options
Diffstat (limited to 'lib/PublicInbox/Search.pm')
-rw-r--r-- | lib/PublicInbox/Search.pm | 445 |
1 files changed, 305 insertions, 140 deletions
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 2feb3e13..25ef49c5 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -11,6 +11,7 @@ our @EXPORT_OK = qw(retry_reopen int_val get_pct xap_terms); use List::Util qw(max); use POSIX qw(strftime); use Carp (); +our $XHC = 0; # defined but false # values for searching, changing the numeric value breaks # compatibility with old indices (so don't change them it) @@ -53,26 +54,54 @@ use constant { # # v1.6.0 adds BYTES, UID and THREADID values SCHEMA_VERSION => 15, + + # we may have up to 8 FDs per shard (depends on Xapian *shrug*) + SHARD_COST => 8, }; use PublicInbox::Smsg; -use PublicInbox::Over; +eval { require PublicInbox::Over }; our $QP_FLAGS; our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem Query); -our $Xap; # 'Search::Xapian' or 'Xapian' +our $Xap; # 'Xapian' or 'Search::Xapian' our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') # ENQ_DESCENDING and ENQ_ASCENDING weren't in SWIG Xapian.pm prior to 1.4.16, # let's hope the ABI is stable our $ENQ_DESCENDING = 0; our $ENQ_ASCENDING = 1; +our @MAIL_VMAP = ( + [ YYYYMMDD, 'd:'], + [ TS, 'rt:' ], + # these are undocumented for WWW, but lei and IMAP use them + [ DT, 'dt:' ], + [ BYTES, 'z:' ], + [ UID, 'uid:' ] +); +our @MAIL_NRP; + +# Getopt::Long spec, only short options for portability in C++ implementation +our @XH_SPEC = ( + 'a', # ascending sort + 'c', # code search + 'd=s@', # shard dirs + 'g=s', # git dir (with -c) + 'k=i', # sort column (like sort(1)) + 'm=i', # maximum number of results + 'o=i', # offset + 'r', # 1=relevance then column + 't', # collapse threads + 'A=s@', # prefixes + 'K=i', # timeout kill after i seconds + 'O=s', # eidx_key + 'T=i', # threadid + 'Q=s@', # query prefixes "$user_prefix[:=]$XPREFIX" +); sub load_xapian () { return 1 if defined $Xap; - # n.b. PI_XAPIAN is intended for development use only. We still - # favor Search::Xapian since that's what's available in current - # Debian stable (10.x) and derived distros. - for my $x (($ENV{PI_XAPIAN} // 'Search::Xapian'), 'Xapian') { + # n.b. PI_XAPIAN is intended for development use only + for my $x (($ENV{PI_XAPIAN} // 'Xapian'), 'Search::Xapian') { eval "require $x"; next if $@; @@ -85,8 +114,7 @@ sub load_xapian () { # NumberRangeProcessor was added in Xapian 1.3.6, # NumberValueRangeProcessor was removed for 1.5.0+, - # favor the older /Value/ variant since that's what our - # (currently) preferred Search::Xapian supports + # continue with the older /Value/ variant for now... $NVRP = $x.'::'.($x eq 'Xapian' && $xver ge v1.5 ? 'NumberRangeProcessor' : 'NumberValueRangeProcessor'); $X{$_} = $Xap.'::'.$_ for (keys %X); @@ -101,6 +129,7 @@ sub load_xapian () { # or make indexlevel=medium as default $QP_FLAGS = FLAG_PHRASE() | FLAG_BOOLEAN() | FLAG_LOVEHATE() | FLAG_WILDCARD(); + @MAIL_NRP = map { $NVRP->new(@$_) } @MAIL_VMAP; return 1; } undef; @@ -110,43 +139,50 @@ sub load_xapian () { # a prefix common in patch emails our $LANG = 'english'; +our %PATCH_BOOL_COMMON = ( + dfpre => 'XDFPRE', + dfpost => 'XDFPOST', + dfblob => 'XDFPRE XDFPOST', + patchid => 'XDFID', +); + # note: the non-X term prefix allocations are shared with # Xapian omega, see xapian-applications/omega/docs/termprefixes.rst my %bool_pfx_external = ( mid => 'Q', # Message-ID (full/exact), this is mostly uniQue lid => 'G', # newsGroup (or similar entity), just inside <> - dfpre => 'XDFPRE', - dfpost => 'XDFPOST', - dfblob => 'XDFPRE XDFPOST', - patchid => 'XDFID', + %PATCH_BOOL_COMMON ); -my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID'; -my %prob_prefix = ( - # for mairix compatibility +# for mairix compatibility +our $NON_QUOTED_BODY = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID'; +our %PATCH_PROB_COMMON = ( s => 'S', - m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial - l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial f => 'A', - t => 'XTO', - tc => 'XTO XCC', - c => 'XCC', - tcf => 'XTO XCC A', - a => 'XTO XCC A', - b => $non_quoted_body . ' XQUOT', - bs => $non_quoted_body . ' XQUOT S', + b => $NON_QUOTED_BODY . ' XQUOT', + bs => $NON_QUOTED_BODY . ' XQUOT S', n => 'XFN', q => 'XQUOT', - nq => $non_quoted_body, + nq => $NON_QUOTED_BODY, dfn => 'XDFN', dfa => 'XDFA', dfb => 'XDFB', dfhh => 'XDFHH', dfctx => 'XDFCTX', +); +my %prob_prefix = ( + m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial + l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial + t => 'XTO', + tc => 'XTO XCC', + c => 'XCC', + tcf => 'XTO XCC A', + a => 'XTO XCC A', + %PATCH_PROB_COMMON, # default: - '' => 'XM S A XQUOT XFN ' . $non_quoted_body, + '' => 'XM S A XQUOT XFN ' . $NON_QUOTED_BODY, ); # not documenting m: and mid: for now, the using the URLs works w/o Xapian @@ -190,33 +226,37 @@ sub xdir ($;$) { my ($self, $rdonly) = @_; if ($rdonly || !defined($self->{shard})) { $self->{xpfx}; - } else { # v2 + extindex only: + } else { # v2, extindex, cindex only: "$self->{xpfx}/$self->{shard}"; } } -# returns all shards as separate Xapian::Database objects w/o combining -sub xdb_shards_flat ($) { +# returns shard directories as an array of strings, does not verify existence +sub shard_dirs ($) { my ($self) = @_; my $xpfx = $self->{xpfx}; - my (@xdb, $slow_phrase); - load_xapian(); - $self->{qp_flags} //= $QP_FLAGS; - if ($xpfx =~ m!/xapian[0-9]+\z!) { - @xdb = ($X{Database}->new($xpfx)); - $self->{qp_flags} |= FLAG_PHRASE() if !-f "$xpfx/iamchert"; - } else { + if ($xpfx =~ m!/xapian[0-9]+\z!) { # v1 inbox + ($xpfx); + } else { # v2 inbox, eidx, cidx opendir(my $dh, $xpfx) or return (); # not initialized yet # We need numeric sorting so shard[0] is first for reading # Xapian metadata, if needed my $last = max(grep(/\A[0-9]+\z/, readdir($dh))) // return (); - for (0..$last) { - my $shard_dir = "$self->{xpfx}/$_"; - push @xdb, $X{Database}->new($shard_dir); - $slow_phrase ||= -f "$shard_dir/iamchert"; - } - $self->{qp_flags} |= FLAG_PHRASE() if !$slow_phrase; + map { "$xpfx/$_" } (0..$last); } +} + +# returns all shards as separate Xapian::Database objects w/o combining +sub xdb_shards_flat ($) { + my ($self) = @_; + load_xapian(); + $self->{qp_flags} //= $QP_FLAGS; + my $slow_phrase; + my @xdb = map { + $slow_phrase ||= -f "$_/iamchert"; + $X{Database}->new($_); # raises if missing + } shard_dirs($self); + $self->{qp_flags} |= FLAG_PHRASE() if !$slow_phrase; @xdb; } @@ -229,6 +269,12 @@ sub mdocid { int(($docid - 1) / $nshard) + 1; } +sub docids_to_artnums { + my $nshard = shift->{nshard}; + # XXX does array vs arrayref make a difference in modern Perls? + map { int(($_ - 1) / $nshard) + 1 } @_; +} + sub mset_to_artnums { my ($self, $mset) = @_; my $nshard = $self->{nshard}; @@ -277,42 +323,18 @@ sub date_parse_prepare { my $end = $range =~ s/([\)\s]*)\z// ? $1 : ''; my @r = split(/\.\./, $range, 2); - # expand "d:20101002" => "d:20101002..20101003" and like + # expand "dt:2010-10-02" => "dt:2010-10-02..2010-10-03" and like # n.b. git doesn't do YYYYMMDD w/o '-', it needs YYYY-MM-DD - # We upgrade "d:" to "dt:" to iff using approxidate + # We upgrade "d:" to "dt:" unconditionally if ($pfx eq 'd') { - my $fmt = "\0%Y%m%d"; - if (!defined($r[1])) { - if ($r[0] =~ /\A([0-9]{4})([0-9]{2})([0-9]{2})\z/) { - push @$to_parse, "$1-$2-$3"; - # we could've handled as-is, but we need - # to parse anyways for "d+" below - } else { - push @$to_parse, $r[0]; - if ($r[0] !~ /\A[0-9]{4}-[0-9]{2}-[0-9]{2}\z/) { - $pfx = 'dt'; - $fmt = "\0%Y%m%d%H%M%S"; - } - } - $r[0] = "$fmt+$#$to_parse\0"; - $r[1] = "$fmt+\0"; - } else { - for my $x (@r) { - next if $x eq '' || $x =~ /\A[0-9]{8}\z/; - push @$to_parse, $x; - if ($x !~ /\A[0-9]{4}-[0-9]{2}-[0-9]{2}\z/) { - $pfx = 'dt'; - } - $x = "$fmt$#$to_parse\0"; - } - if ($pfx eq 'dt') { - for (@r) { - s/\0%Y%m%d/\0%Y%m%d%H%M%S/; - s/\A([0-9]{8})\z/${1}000000/; - } - } - } - } elsif ($pfx eq 'dt') { + $pfx = 'dt'; + # upgrade YYYYMMDD to YYYYMMDDHHMMSS + $_ .= ' 00:00:00' for (grep(m!\A[0-9]{4}[^[:alnum:]] + [0-9]{2}[^[:alnum:]] + [0-9]{2}\z!x, @r)); + $_ .= '000000' for (grep(m!\A[0-9]{8}\z!, @r)); + } + if ($pfx eq 'dt') { if (!defined($r[1])) { # git needs gaps and not /\d{14}/ if ($r[0] =~ /\A([0-9]{4})([0-9]{2})([0-9]{2}) ([0-9]{2})([0-9]{2})([0-9]{2})\z/x) { @@ -329,7 +351,7 @@ sub date_parse_prepare { $x = "\0%Y%m%d%H%M%S$#$to_parse\0"; } } - } else { # "rt", let git interpret "YYYY", deal with Y10K later :P + } else { # (rt|ct), let git interpret "YYYY", deal with Y10K later :P for my $x (@r) { next if $x eq '' || $x =~ /\A[0-9]{5,}\z/; push @$to_parse, $x; @@ -388,13 +410,113 @@ sub query_approxidate { date_parse_finalize($git, $to_parse, $_[2]) if $to_parse; } -# read-only +# read-only, for mail only (codesearch has different rules) sub mset { - my ($self, $query_string, $opts) = @_; - $opts ||= {}; + my ($self, $qry_str, $opt) = @_; my $qp = $self->{qp} //= $self->qparse_new; - my $query = $qp->parse_query($query_string, $self->{qp_flags}); - _do_enquire($self, $query, $opts); + my $qry = $qp->parse_query($qry_str, $self->{qp_flags}); + if (defined(my $eidx_key = $opt->{eidx_key})) { + $qry = $X{Query}->new(OP_FILTER(), $qry, 'O'.$eidx_key); + } + if (defined(my $uid_range = $opt->{uid_range})) { + my $range = $X{Query}->new(OP_VALUE_RANGE(), UID, + sortable_serialise($uid_range->[0]), + sortable_serialise($uid_range->[1])); + $qry = $X{Query}->new(OP_FILTER(), $qry, $range); + } + if (defined(my $tid = $opt->{threadid})) { + $tid = sortable_serialise($tid); + $qry = $X{Query}->new(OP_FILTER(), $qry, + $X{Query}->new(OP_VALUE_RANGE(), THREADID, $tid, $tid)); + } + do_enquire($self, $qry, $opt, TS); +} + +sub xhc_start_maybe (@) { + require PublicInbox::XapClient; + my $xhc = PublicInbox::XapClient::start_helper(@_); + require PublicInbox::XhcMset if $xhc; + $xhc; +} + +sub xh_opt ($$) { + my ($self, $opt) = @_; + my $lim = $opt->{limit} || 50; + my @ret; + push @ret, '-o', $opt->{offset} if $opt->{offset}; + push @ret, '-m', $lim; + my $rel = $opt->{relevance} // 0; + if ($rel == -2) { # ORDER BY docid/UID (highest first) + push @ret, '-k', '-1'; + } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first) + push @ret, '-k', '-1'; + push @ret, '-a'; + } elsif ($rel == 0) { + push @ret, '-k', $opt->{sort_col} // TS; + push @ret, '-a' if $opt->{asc}; + } else { # rel > 0 + push @ret, '-r'; + push @ret, '-k', $opt->{sort_col} // TS; + push @ret, '-a' if $opt->{asc}; + } + push @ret, '-t' if $opt->{threads}; + push @ret, '-T', $opt->{threadid} if defined $opt->{threadid}; + push @ret, '-O', $opt->{eidx_key} if defined $opt->{eidx_key}; + my $apfx = $self->{-alt_pfx} //= do { + my @tmp; + for (grep /\Aserial:/, @{$self->{altid} // []}) { + my (undef, $pfx) = split /:/, $_; + push @tmp, '-Q', "$pfx=X\U$pfx"; + } + # TODO: arbitrary header indexing goes here + \@tmp; + }; + (@ret, @$apfx); +} + +# returns a true value if actually handled asynchronously, +# and a falsy value if handled synchronously +sub async_mset { + my ($self, $qry_str, $opt, $cb, @args) = @_; + if ($XHC) { # unconditionally retrieving pct + rank for now + xdb($self); # populate {nshards} + my @margs = ($self->xh_args, xh_opt($self, $opt)); + my $ret = eval { + my $rd = $XHC->mkreq(undef, 'mset', @margs, $qry_str); + PublicInbox::XhcMset->maybe_new($rd, $self, $cb, @args); + }; + $cb->(@args, undef, $@) if $@; + $ret; + } else { # synchronous + my $mset = $self->mset($qry_str, $opt); + $cb->(@args, $mset); + undef; + } +} + +sub do_enquire { # shared with CodeSearch + my ($self, $qry, $opt, $col) = @_; + my $enq = $X{Enquire}->new(xdb($self)); + $enq->set_query($qry); + my $rel = $opt->{relevance} // 0; + if ($rel == -2) { # ORDER BY docid/UID (highest first) + $enq->set_weighting_scheme($X{BoolWeight}->new); + $enq->set_docid_order($ENQ_DESCENDING); + } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first) + $enq->set_weighting_scheme($X{BoolWeight}->new); + $enq->set_docid_order($ENQ_ASCENDING); + } elsif ($rel == 0) { + $enq->set_sort_by_value_then_relevance($col, !$opt->{asc}); + } else { # rel > 0 + $enq->set_sort_by_relevance_then_value($col, !$opt->{asc}); + } + + # `lei q -t / --threads' or JMAP collapseThreads; but don't collapse + # on `-tt' ({threads} > 1) which sets the Flagged|Important keyword + (($opt->{threads} // 0) == 1 && has_threadid($self)) and + $enq->set_collapse_key(THREADID); + retry_reopen($self, \&enquire_once, $enq, + $opt->{offset} || 0, $opt->{limit} || 50); } sub retry_reopen { @@ -421,50 +543,15 @@ sub retry_reopen { Carp::croak("Too many Xapian database modifications in progress\n"); } -sub _do_enquire { - my ($self, $query, $opts) = @_; - retry_reopen($self, \&_enquire_once, $query, $opts); -} - # returns true if all docs have the THREADID value sub has_threadid ($) { my ($self) = @_; (xdb($self)->get_metadata('has_threadid') // '') eq '1'; } -sub _enquire_once { # retry_reopen callback - my ($self, $query, $opts) = @_; - my $xdb = xdb($self); - if (defined(my $eidx_key = $opts->{eidx_key})) { - $query = $X{Query}->new(OP_FILTER(), $query, 'O'.$eidx_key); - } - if (defined(my $uid_range = $opts->{uid_range})) { - my $range = $X{Query}->new(OP_VALUE_RANGE(), UID, - sortable_serialise($uid_range->[0]), - sortable_serialise($uid_range->[1])); - $query = $X{Query}->new(OP_FILTER(), $query, $range); - } - my $enquire = $X{Enquire}->new($xdb); - $enquire->set_query($query); - $opts ||= {}; - my $rel = $opts->{relevance} // 0; - if ($rel == -2) { # ORDER BY docid/UID (highest first) - $enquire->set_weighting_scheme($X{BoolWeight}->new); - $enquire->set_docid_order($ENQ_DESCENDING); - } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first) - $enquire->set_weighting_scheme($X{BoolWeight}->new); - $enquire->set_docid_order($ENQ_ASCENDING); - } elsif ($rel == 0) { - $enquire->set_sort_by_value_then_relevance(TS, !$opts->{asc}); - } else { # rel > 0 - $enquire->set_sort_by_relevance_then_value(TS, !$opts->{asc}); - } - - # `mairix -t / --threads' or JMAP collapseThreads - if ($opts->{threads} && has_threadid($self)) { - $enquire->set_collapse_key(THREADID); - } - $enquire->get_mset($opts->{offset} || 0, $opts->{limit} || 50); +sub enquire_once { # retry_reopen callback + my (undef, $enq, $offset, $limit) = @_; + $enq->get_mset($offset, $limit); } sub mset_to_smsg { @@ -481,29 +568,27 @@ sub mset_to_smsg { # read-write sub stemmer { $X{Stem}->new($LANG) } -# read-only -sub qparse_new { +sub qp_init_common { my ($self) = @_; - - my $xdb = xdb($self); my $qp = $X{QueryParser}->new; $qp->set_default_op(OP_AND()); - $qp->set_database($xdb); + $qp->set_database(xdb($self)); $qp->set_stemmer(stemmer($self)); $qp->set_stemming_strategy(STEM_SOME()); my $cb = $qp->can('set_max_wildcard_expansion') // $qp->can('set_max_expansion'); # Xapian 1.5.0+ $cb->($qp, 100); - $cb = $qp->can('add_valuerangeprocessor') // - $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ - $cb->($qp, $NVRP->new(YYYYMMDD, 'd:')); - $cb->($qp, $NVRP->new(DT, 'dt:')); + $qp; +} - # for IMAP, undocumented for WWW and may be split off go away - $cb->($qp, $NVRP->new(BYTES, 'z:')); - $cb->($qp, $NVRP->new(TS, 'rt:')); - $cb->($qp, $NVRP->new(UID, 'uid:')); +# read-only +sub qparse_new { + my ($self) = @_; + my $qp = qp_init_common($self); + my $cb = $qp->can('add_valuerangeprocessor') // + $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ + $cb->($qp, $_) for @MAIL_NRP; while (my ($name, $prefix) = each %bool_pfx_external) { $qp->add_boolean_prefix($name, $_) foreach split(/ /, $prefix); } @@ -533,6 +618,40 @@ EOF $qp; } +sub generate_cxx () { # generates snippet for xap_helper.h + my $ret = <<EOM; +# line ${\__LINE__} "${\__FILE__}" +static NRP *mail_nrp[${\scalar(@MAIL_VMAP)}]; +static void mail_nrp_init(void) +{ +EOM + for (0..$#MAIL_VMAP) { + my $x = $MAIL_VMAP[$_]; + $ret .= qq{\tmail_nrp[$_] = new NRP($x->[0], "$x->[1]");\n} + } +$ret .= <<EOM; +} + +# line ${\__LINE__} "${\__FILE__}" +static void qp_init_mail_search(Xapian::QueryParser *qp) +{ + for (size_t i = 0; i < MY_ARRAY_SIZE(mail_nrp); i++) + qp->ADD_RP(mail_nrp[i]); +EOM + for my $name (sort keys %bool_pfx_external) { + for (split(/ /, $bool_pfx_external{$name})) { + $ret .= qq{\tqp->add_boolean_prefix("$name", "$_");\n} + } + } + # altid support is handled in xh_opt and srch_init_extra in XH + for my $name (sort keys %prob_prefix) { + for (split(/ /, $prob_prefix{$name})) { + $ret .= qq{\tqp->add_prefix("$name", "$_");\n} + } + } + $ret .= "}\n"; +} + sub help { my ($self) = @_; $self->{qp} //= $self->qparse_new; # parse altids @@ -560,24 +679,70 @@ sub get_pct ($) { # mset item sub xap_terms ($$;@) { my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty () - my %ret; my $end = $xdb_or_doc->termlist_end(@docid); my $cur = $xdb_or_doc->termlist_begin(@docid); + $cur->skip_to($pfx); + my (@ret, $tn); + my $pfxlen = length($pfx); for (; $cur != $end; $cur++) { - $cur->skip_to($pfx); - last if $cur == $end; - my $tn = $cur->get_termname; - $ret{substr($tn, length($pfx))} = undef if !index($tn, $pfx); + $tn = $cur->get_termname; + index($tn, $pfx) ? last : push(@ret, substr($tn, $pfxlen)); } - wantarray ? sort(keys(%ret)) : \%ret; + wantarray ? @ret : +{ map { $_ => undef } @ret }; } # get combined docid from over.num: -# (not generic Xapian, only works with our sharding scheme) +# (not generic Xapian, only works with our sharding scheme for mail) sub num2docid ($$) { my ($self, $num) = @_; my $nshard = $self->{nshard}; ($num - 1) * $nshard + $num % $nshard + 1; } +sub all_terms { + my ($self, $pfx) = @_; + my $cur = xdb($self)->allterms_begin($pfx); + my $end = $self->{xdb}->allterms_end($pfx); + my $pfxlen = length($pfx); + my @ret; + for (; $cur != $end; $cur++) { + push @ret, substr($cur->get_termname, $pfxlen); + } + wantarray ? @ret : +{ map { $_ => undef } @ret }; +} + +sub xh_args { # prep getopt args to feed to xap_helper.h socket + map { ('-d', $_) } shard_dirs($_[0]); +} + +sub docids_by_postlist ($$) { + my ($self, $q) = @_; + my $cur = $self->xdb->postlist_begin($q); + my $end = $self->{xdb}->postlist_end($q); + my @ids; + for (; $cur != $end; $cur++) { push(@ids, $cur->get_docid) }; + @ids; +} + +sub get_doc ($$) { + my ($self, $docid) = @_; + eval { $self->{xdb}->get_document($docid) } // do { + die $@ if $@ && ref($@) !~ /\bDocNotFoundError\b/; + undef; + } +} + +# not sure where best to put this... +sub ulimit_n () { + my $n; + if (eval { require BSD::Resource; 1 }) { + my $NOFILE = BSD::Resource::RLIMIT_NOFILE(); + ($n, undef) = BSD::Resource::getrlimit($NOFILE); + } else { + require PublicInbox::Spawn; + $n = PublicInbox::Spawn::run_qx([qw(/bin/sh -c), 'ulimit -n']); + } + $n; +} + 1; |