diff options
Diffstat (limited to 'lib/PublicInbox/Search.pm')
-rw-r--r-- | lib/PublicInbox/Search.pm | 465 |
1 files changed, 300 insertions, 165 deletions
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 17e202e1..fbdb48a3 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -1,21 +1,23 @@ -# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # based on notmuch, but with no concept of folders, files or flags # # Read-only search interface for use by the web and NNTP interfaces package PublicInbox::Search; use strict; +use v5.10.1; use parent qw(Exporter); our @EXPORT_OK = qw(retry_reopen int_val get_pct xap_terms); use List::Util qw(max); use POSIX qw(strftime); use Carp (); +our $XHC = 0; # defined but false # values for searching, changing the numeric value breaks # compatibility with old indices (so don't change them it) use constant { TS => 0, # Received: in Unix time (IMAP INTERNALDATE, JMAP receivedAt) - YYYYMMDD => 1, # Date: header for searching in the WWW UI + YYYYMMDD => 1, # redundant with DT below DT => 2, # Date: YYYYMMDDHHMMSS (IMAP SENT*, JMAP sentAt) # added for public-inbox 1.6.0+ @@ -55,23 +57,47 @@ use constant { }; use PublicInbox::Smsg; -use PublicInbox::Over; +eval { require PublicInbox::Over }; our $QP_FLAGS; our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem Query); -our $Xap; # 'Search::Xapian' or 'Xapian' +our $Xap; # 'Xapian' or 'Search::Xapian' our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') # ENQ_DESCENDING and ENQ_ASCENDING weren't in SWIG Xapian.pm prior to 1.4.16, # let's hope the ABI is stable our $ENQ_DESCENDING = 0; our $ENQ_ASCENDING = 1; +our @MAIL_VMAP = ( + [ YYYYMMDD, 'd:'], + [ TS, 'rt:' ], + # these are undocumented for WWW, but lei and IMAP use them + [ DT, 'dt:' ], + [ BYTES, 'z:' ], + [ UID, 'uid:' ] +); +our @MAIL_NRP; + +# Getopt::Long spec, only short options for portability in C++ implementation +our @XH_SPEC = ( + 'a', # ascending sort + 'c', # code search + 'd=s@', # shard dirs + 'g=s', # git dir (with -c) + 'k=i', # sort column (like sort(1)) + 'm=i', # maximum number of results + 'o=i', # offset + 'r', # 1=relevance then column + 't', # collapse threads + 'A=s@', # prefixes + 'K=i', # timeout kill after i seconds + 'O=s', # eidx_key + 'T=i', # threadid +); sub load_xapian () { return 1 if defined $Xap; - # n.b. PI_XAPIAN is intended for development use only. We still - # favor Search::Xapian since that's what's available in current - # Debian stable (10.x) and derived distros. - for my $x (($ENV{PI_XAPIAN} // 'Search::Xapian'), 'Xapian') { + # n.b. PI_XAPIAN is intended for development use only + for my $x (($ENV{PI_XAPIAN} // 'Xapian'), 'Search::Xapian') { eval "require $x"; next if $@; @@ -84,8 +110,7 @@ sub load_xapian () { # NumberRangeProcessor was added in Xapian 1.3.6, # NumberValueRangeProcessor was removed for 1.5.0+, - # favor the older /Value/ variant since that's what our - # (currently) preferred Search::Xapian supports + # continue with the older /Value/ variant for now... $NVRP = $x.'::'.($x eq 'Xapian' && $xver ge v1.5 ? 'NumberRangeProcessor' : 'NumberValueRangeProcessor'); $X{$_} = $Xap.'::'.$_ for (keys %X); @@ -100,6 +125,7 @@ sub load_xapian () { # or make indexlevel=medium as default $QP_FLAGS = FLAG_PHRASE() | FLAG_BOOLEAN() | FLAG_LOVEHATE() | FLAG_WILDCARD(); + @MAIL_NRP = map { $NVRP->new(@$_) } @MAIL_VMAP; return 1; } undef; @@ -109,42 +135,50 @@ sub load_xapian () { # a prefix common in patch emails our $LANG = 'english'; +our %PATCH_BOOL_COMMON = ( + dfpre => 'XDFPRE', + dfpost => 'XDFPOST', + dfblob => 'XDFPRE XDFPOST', + patchid => 'XDFID', +); + # note: the non-X term prefix allocations are shared with # Xapian omega, see xapian-applications/omega/docs/termprefixes.rst my %bool_pfx_external = ( mid => 'Q', # Message-ID (full/exact), this is mostly uniQue lid => 'G', # newsGroup (or similar entity), just inside <> - dfpre => 'XDFPRE', - dfpost => 'XDFPOST', - dfblob => 'XDFPRE XDFPOST', + %PATCH_BOOL_COMMON ); -my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST'; -my %prob_prefix = ( - # for mairix compatibility +# for mairix compatibility +our $NON_QUOTED_BODY = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID'; +our %PATCH_PROB_COMMON = ( s => 'S', - m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial - l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial f => 'A', - t => 'XTO', - tc => 'XTO XCC', - c => 'XCC', - tcf => 'XTO XCC A', - a => 'XTO XCC A', - b => $non_quoted_body . ' XQUOT', - bs => $non_quoted_body . ' XQUOT S', + b => $NON_QUOTED_BODY . ' XQUOT', + bs => $NON_QUOTED_BODY . ' XQUOT S', n => 'XFN', q => 'XQUOT', - nq => $non_quoted_body, + nq => $NON_QUOTED_BODY, dfn => 'XDFN', dfa => 'XDFA', dfb => 'XDFB', dfhh => 'XDFHH', dfctx => 'XDFCTX', +); +my %prob_prefix = ( + m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial + l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial + t => 'XTO', + tc => 'XTO XCC', + c => 'XCC', + tcf => 'XTO XCC A', + a => 'XTO XCC A', + %PATCH_PROB_COMMON, # default: - '' => 'XM S A XQUOT XFN ' . $non_quoted_body, + '' => 'XM S A XQUOT XFN ' . $NON_QUOTED_BODY, ); # not documenting m: and mid: for now, the using the URLs works w/o Xapian @@ -154,12 +188,9 @@ my %prob_prefix = ( our @HELP = ( 's:' => 'match within Subject e.g. s:"a quick brown fox"', 'd:' => <<EOF, -date range as YYYYMMDD e.g. d:19931002..20101002 -Open-ended ranges such as d:19931002.. and d:..20101002 -are also supported -EOF - 'dt:' => <<EOF, -date-time range as YYYYMMDDhhmmss (e.g. dt:19931002011000..19931002011200) +match date-time range, git "approxidate" formats supported +Open-ended ranges such as `d:last.week..' and +`d:..2.days.ago' are supported EOF 'b:' => 'match within message body, including text attachments', 'nq:' => 'match non-quoted text within message body', @@ -180,6 +211,10 @@ EOF 'dfpre:' => 'match pre-image git blob ID', 'dfpost:' => 'match post-image git blob ID', 'dfblob:' => 'match either pre or post-image git blob ID', + 'patchid:' => "match `git patch-id --stable' output", + 'rt:' => <<EOF, +match received time, like `d:' if sender's clock was correct +EOF ); chomp @HELP; @@ -187,33 +222,37 @@ sub xdir ($;$) { my ($self, $rdonly) = @_; if ($rdonly || !defined($self->{shard})) { $self->{xpfx}; - } else { # v2 + extindex only: + } else { # v2, extindex, cindex only: "$self->{xpfx}/$self->{shard}"; } } -# returns all shards as separate Xapian::Database objects w/o combining -sub xdb_shards_flat ($) { +# returns shard directories as an array of strings, does not verify existence +sub shard_dirs ($) { my ($self) = @_; my $xpfx = $self->{xpfx}; - my (@xdb, $slow_phrase); - load_xapian(); - $self->{qp_flags} //= $QP_FLAGS; - if ($xpfx =~ m!/xapian[0-9]+\z!) { - @xdb = ($X{Database}->new($xpfx)); - $self->{qp_flags} |= FLAG_PHRASE() if !-f "$xpfx/iamchert"; - } else { + if ($xpfx =~ m!/xapian[0-9]+\z!) { # v1 inbox + ($xpfx); + } else { # v2 inbox, eidx, cidx opendir(my $dh, $xpfx) or return (); # not initialized yet # We need numeric sorting so shard[0] is first for reading # Xapian metadata, if needed my $last = max(grep(/\A[0-9]+\z/, readdir($dh))) // return (); - for (0..$last) { - my $shard_dir = "$self->{xpfx}/$_"; - push @xdb, $X{Database}->new($shard_dir); - $slow_phrase ||= -f "$shard_dir/iamchert"; - } - $self->{qp_flags} |= FLAG_PHRASE() if !$slow_phrase; + map { "$xpfx/$_" } (0..$last); } +} + +# returns all shards as separate Xapian::Database objects w/o combining +sub xdb_shards_flat ($) { + my ($self) = @_; + load_xapian(); + $self->{qp_flags} //= $QP_FLAGS; + my $slow_phrase; + my @xdb = map { + $slow_phrase ||= -f "$_/iamchert"; + $X{Database}->new($_); # raises if missing + } shard_dirs($self); + $self->{qp_flags} |= FLAG_PHRASE() if !$slow_phrase; @xdb; } @@ -226,6 +265,12 @@ sub mdocid { int(($docid - 1) / $nshard) + 1; } +sub docids_to_artnums { + my $nshard = shift->{nshard}; + # XXX does array vs arrayref make a difference in modern Perls? + map { int(($_ - 1) / $nshard) + 1 } @_; +} + sub mset_to_artnums { my ($self, $mset) = @_; my $nshard = $self->{nshard}; @@ -243,20 +288,6 @@ sub xdb ($) { }; } -# returns true if a future rescan is desired -sub cleanup_shards { - my ($self) = @_; - return unless exists($self->{xdb}); - my $xpfx = $self->{xpfx}; - return reopen($self) if $xpfx =~ m!/xapian[0-9]+\z!; # true - opendir(my $dh, $xpfx) or return warn("$xpfx gone: $!\n"); # true - my $nr = grep(/\A[0-9]+\z/, readdir($dh)) or - return warn("$xpfx has no shards\n"); # true - return reopen($self) if $nr == ($self->{nshard} // -1); - delete($self->{xdb}); - undef; -} - sub new { my ($class, $ibx) = @_; ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx"; @@ -288,42 +319,18 @@ sub date_parse_prepare { my $end = $range =~ s/([\)\s]*)\z// ? $1 : ''; my @r = split(/\.\./, $range, 2); - # expand "d:20101002" => "d:20101002..20101003" and like + # expand "dt:2010-10-02" => "dt:2010-10-02..2010-10-03" and like # n.b. git doesn't do YYYYMMDD w/o '-', it needs YYYY-MM-DD - # We upgrade "d:" to "dt:" to iff using approxidate + # We upgrade "d:" to "dt:" unconditionally if ($pfx eq 'd') { - my $fmt = "\0%Y%m%d"; - if (!defined($r[1])) { - if ($r[0] =~ /\A([0-9]{4})([0-9]{2})([0-9]{2})\z/) { - push @$to_parse, "$1-$2-$3"; - # we could've handled as-is, but we need - # to parse anyways for "d+" below - } else { - push @$to_parse, $r[0]; - if ($r[0] !~ /\A[0-9]{4}-[0-9]{2}-[0-9]{2}\z/) { - $pfx = 'dt'; - $fmt = "\0%Y%m%d%H%M%S"; - } - } - $r[0] = "$fmt+$#$to_parse\0"; - $r[1] = "$fmt+\0"; - } else { - for my $x (@r) { - next if $x eq '' || $x =~ /\A[0-9]{8}\z/; - push @$to_parse, $x; - if ($x !~ /\A[0-9]{4}-[0-9]{2}-[0-9]{2}\z/) { - $pfx = 'dt'; - } - $x = "$fmt$#$to_parse\0"; - } - if ($pfx eq 'dt') { - for (@r) { - s/\0%Y%m%d/\0%Y%m%d%H%M%S/; - s/\A([0-9]{8})\z/${1}000000/; - } - } - } - } elsif ($pfx eq 'dt') { + $pfx = 'dt'; + # upgrade YYYYMMDD to YYYYMMDDHHMMSS + $_ .= ' 00:00:00' for (grep(m!\A[0-9]{4}[^[:alnum:]] + [0-9]{2}[^[:alnum:]] + [0-9]{2}\z!x, @r)); + $_ .= '000000' for (grep(m!\A[0-9]{8}\z!, @r)); + } + if ($pfx eq 'dt') { if (!defined($r[1])) { # git needs gaps and not /\d{14}/ if ($r[0] =~ /\A([0-9]{4})([0-9]{2})([0-9]{2}) ([0-9]{2})([0-9]{2})([0-9]{2})\z/x) { @@ -340,7 +347,7 @@ sub date_parse_prepare { $x = "\0%Y%m%d%H%M%S$#$to_parse\0"; } } - } else { # "rt", let git interpret "YYYY", deal with Y10K later :P + } else { # (rt|ct), let git interpret "YYYY", deal with Y10K later :P for my $x (@r) { next if $x eq '' || $x =~ /\A[0-9]{5,}\z/; push @$to_parse, $x; @@ -399,25 +406,114 @@ sub query_approxidate { date_parse_finalize($git, $to_parse, $_[2]) if $to_parse; } -# read-only +# read-only, for mail only (codesearch has different rules) sub mset { - my ($self, $query_string, $opts) = @_; - $opts ||= {}; + my ($self, $qry_str, $opt) = @_; my $qp = $self->{qp} //= $self->qparse_new; - my $query = $qp->parse_query($query_string, $self->{qp_flags}); - _do_enquire($self, $query, $opts); + my $qry = $qp->parse_query($qry_str, $self->{qp_flags}); + if (defined(my $eidx_key = $opt->{eidx_key})) { + $qry = $X{Query}->new(OP_FILTER(), $qry, 'O'.$eidx_key); + } + if (defined(my $uid_range = $opt->{uid_range})) { + my $range = $X{Query}->new(OP_VALUE_RANGE(), UID, + sortable_serialise($uid_range->[0]), + sortable_serialise($uid_range->[1])); + $qry = $X{Query}->new(OP_FILTER(), $qry, $range); + } + if (defined(my $tid = $opt->{threadid})) { + $tid = sortable_serialise($tid); + $qry = $X{Query}->new(OP_FILTER(), $qry, + $X{Query}->new(OP_VALUE_RANGE(), THREADID, $tid, $tid)); + } + do_enquire($self, $qry, $opt, TS); +} + +sub xhc_start_maybe (@) { + require PublicInbox::XapClient; + my $xhc = PublicInbox::XapClient::start_helper(@_); + require PublicInbox::XhcMset if $xhc; + $xhc; +} + +sub xh_opt ($) { + my ($opt) = @_; + my $lim = $opt->{limit} || 50; + my @ret; + push @ret, '-o', $opt->{offset} if $opt->{offset}; + push @ret, '-m', $lim; + my $rel = $opt->{relevance} // 0; + if ($rel == -2) { # ORDER BY docid/UID (highest first) + push @ret, '-k', '-1'; + } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first) + push @ret, '-k', '-1'; + push @ret, '-a'; + } elsif ($rel == 0) { + push @ret, '-k', $opt->{sort_col} // TS; + push @ret, '-a' if $opt->{asc}; + } else { # rel > 0 + push @ret, '-r'; + push @ret, '-k', $opt->{sort_col} // TS; + push @ret, '-a' if $opt->{asc}; + } + push @ret, '-t' if $opt->{threads}; + push @ret, '-T', $opt->{threadid} if defined $opt->{threadid}; + push @ret, '-O', $opt->{eidx_key} if defined $opt->{eidx_key}; + @ret; +} + +# returns a true value if actually handled asynchronously, +# and a falsy value if handled synchronously +sub async_mset { + my ($self, $qry_str, $opt, $cb, @args) = @_; + if ($XHC) { # unconditionally retrieving pct + rank for now + xdb($self); # populate {nshards} + my @margs = ($self->xh_args, xh_opt($opt)); + my $ret = eval { + my $rd = $XHC->mkreq(undef, 'mset', @margs, $qry_str); + PublicInbox::XhcMset->maybe_new($rd, $self, $cb, @args); + }; + $cb->(@args, undef, $@) if $@; + $ret; + } else { # synchronous + my $mset = $self->mset($qry_str, $opt); + $cb->(@args, $mset); + undef; + } +} + +sub do_enquire { # shared with CodeSearch + my ($self, $qry, $opt, $col) = @_; + my $enq = $X{Enquire}->new(xdb($self)); + $enq->set_query($qry); + my $rel = $opt->{relevance} // 0; + if ($rel == -2) { # ORDER BY docid/UID (highest first) + $enq->set_weighting_scheme($X{BoolWeight}->new); + $enq->set_docid_order($ENQ_DESCENDING); + } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first) + $enq->set_weighting_scheme($X{BoolWeight}->new); + $enq->set_docid_order($ENQ_ASCENDING); + } elsif ($rel == 0) { + $enq->set_sort_by_value_then_relevance($col, !$opt->{asc}); + } else { # rel > 0 + $enq->set_sort_by_relevance_then_value($col, !$opt->{asc}); + } + + # `lei q -t / --threads' or JMAP collapseThreads; but don't collapse + # on `-tt' ({threads} > 1) which sets the Flagged|Important keyword + (($opt->{threads} // 0) == 1 && has_threadid($self)) and + $enq->set_collapse_key(THREADID); + retry_reopen($self, \&enquire_once, $enq, + $opt->{offset} || 0, $opt->{limit} || 50); } sub retry_reopen { my ($self, $cb, @arg) = @_; for my $i (1..10) { if (wantarray) { - my @ret; - eval { @ret = $cb->($self, @arg) }; + my @ret = eval { $cb->($self, @arg) }; return @ret unless $@; } else { - my $ret; - eval { $ret = $cb->($self, @arg) }; + my $ret = eval { $cb->($self, @arg) }; return $ret unless $@; } # Exception: The revision being read has been discarded - @@ -434,50 +530,15 @@ sub retry_reopen { Carp::croak("Too many Xapian database modifications in progress\n"); } -sub _do_enquire { - my ($self, $query, $opts) = @_; - retry_reopen($self, \&_enquire_once, $query, $opts); -} - # returns true if all docs have the THREADID value sub has_threadid ($) { my ($self) = @_; (xdb($self)->get_metadata('has_threadid') // '') eq '1'; } -sub _enquire_once { # retry_reopen callback - my ($self, $query, $opts) = @_; - my $xdb = xdb($self); - if (defined(my $eidx_key = $opts->{eidx_key})) { - $query = $X{Query}->new(OP_FILTER(), $query, 'O'.$eidx_key); - } - if (defined(my $uid_range = $opts->{uid_range})) { - my $range = $X{Query}->new(OP_VALUE_RANGE(), UID, - sortable_serialise($uid_range->[0]), - sortable_serialise($uid_range->[1])); - $query = $X{Query}->new(OP_FILTER(), $query, $range); - } - my $enquire = $X{Enquire}->new($xdb); - $enquire->set_query($query); - $opts ||= {}; - my $rel = $opts->{relevance} // 0; - if ($rel == -2) { # ORDER BY docid/UID (highest first) - $enquire->set_weighting_scheme($X{BoolWeight}->new); - $enquire->set_docid_order($ENQ_DESCENDING); - } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first) - $enquire->set_weighting_scheme($X{BoolWeight}->new); - $enquire->set_docid_order($ENQ_ASCENDING); - } elsif ($rel == 0) { - $enquire->set_sort_by_value_then_relevance(TS, !$opts->{asc}); - } else { # rel > 0 - $enquire->set_sort_by_relevance_then_value(TS, !$opts->{asc}); - } - - # `mairix -t / --threads' or JMAP collapseThreads - if ($opts->{threads} && has_threadid($self)) { - $enquire->set_collapse_key(THREADID); - } - $enquire->get_mset($opts->{offset} || 0, $opts->{limit} || 50); +sub enquire_once { # retry_reopen callback + my (undef, $enq, $offset, $limit) = @_; + $enq->get_mset($offset, $limit); } sub mset_to_smsg { @@ -494,29 +555,27 @@ sub mset_to_smsg { # read-write sub stemmer { $X{Stem}->new($LANG) } -# read-only -sub qparse_new { +sub qp_init_common { my ($self) = @_; - - my $xdb = xdb($self); my $qp = $X{QueryParser}->new; $qp->set_default_op(OP_AND()); - $qp->set_database($xdb); + $qp->set_database(xdb($self)); $qp->set_stemmer(stemmer($self)); $qp->set_stemming_strategy(STEM_SOME()); my $cb = $qp->can('set_max_wildcard_expansion') // $qp->can('set_max_expansion'); # Xapian 1.5.0+ $cb->($qp, 100); - $cb = $qp->can('add_valuerangeprocessor') // - $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ - $cb->($qp, $NVRP->new(YYYYMMDD, 'd:')); - $cb->($qp, $NVRP->new(DT, 'dt:')); + $qp; +} - # for IMAP, undocumented for WWW and may be split off go away - $cb->($qp, $NVRP->new(BYTES, 'z:')); - $cb->($qp, $NVRP->new(TS, 'rt:')); - $cb->($qp, $NVRP->new(UID, 'uid:')); +# read-only +sub qparse_new { + my ($self) = @_; + my $qp = qp_init_common($self); + my $cb = $qp->can('add_valuerangeprocessor') // + $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ + $cb->($qp, $_) for @MAIL_NRP; while (my ($name, $prefix) = each %bool_pfx_external) { $qp->add_boolean_prefix($name, $_) foreach split(/ /, $prefix); } @@ -546,6 +605,40 @@ EOF $qp; } +sub generate_cxx () { # generates snippet for xap_helper.h + my $ret = <<EOM; +# line ${\__LINE__} "${\__FILE__}" +static NRP *mail_nrp[${\scalar(@MAIL_VMAP)}]; +static void mail_nrp_init(void) +{ +EOM + for (0..$#MAIL_VMAP) { + my $x = $MAIL_VMAP[$_]; + $ret .= qq{\tmail_nrp[$_] = new NRP($x->[0], "$x->[1]");\n} + } +$ret .= <<EOM; +} + +# line ${\__LINE__} "${\__FILE__}" +static void qp_init_mail_search(Xapian::QueryParser *qp) +{ + for (size_t i = 0; i < MY_ARRAY_SIZE(mail_nrp); i++) + qp->ADD_RP(mail_nrp[i]); +EOM + for my $name (sort keys %bool_pfx_external) { + for (split(/ /, $bool_pfx_external{$name})) { + $ret .= qq{\tqp->add_boolean_prefix("$name", "$_");\n} + } + } + # TODO: altid support + for my $name (sort keys %prob_prefix) { + for (split(/ /, $prob_prefix{$name})) { + $ret .= qq{\tqp->add_prefix("$name", "$_");\n} + } + } + $ret .= "}\n"; +} + sub help { my ($self) = @_; $self->{qp} //= $self->qparse_new; # parse altids @@ -556,9 +649,10 @@ sub help { \@ret; } +# always returns a scalar value sub int_val ($$) { my ($doc, $col) = @_; - my $val = $doc->get_value($col) or return; # undefined is '' in Xapian + my $val = $doc->get_value($col) or return undef; # undef is '' in Xapian sortable_unserialise($val) + 0; # PV => IV conversion } @@ -572,16 +666,57 @@ sub get_pct ($) { # mset item sub xap_terms ($$;@) { my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty () - my %ret; my $end = $xdb_or_doc->termlist_end(@docid); my $cur = $xdb_or_doc->termlist_begin(@docid); + $cur->skip_to($pfx); + my (@ret, $tn); + my $pfxlen = length($pfx); + for (; $cur != $end; $cur++) { + $tn = $cur->get_termname; + index($tn, $pfx) ? last : push(@ret, substr($tn, $pfxlen)); + } + wantarray ? @ret : +{ map { $_ => undef } @ret }; +} + +# get combined docid from over.num: +# (not generic Xapian, only works with our sharding scheme for mail) +sub num2docid ($$) { + my ($self, $num) = @_; + my $nshard = $self->{nshard}; + ($num - 1) * $nshard + $num % $nshard + 1; +} + +sub all_terms { + my ($self, $pfx) = @_; + my $cur = xdb($self)->allterms_begin($pfx); + my $end = $self->{xdb}->allterms_end($pfx); + my $pfxlen = length($pfx); + my @ret; for (; $cur != $end; $cur++) { - $cur->skip_to($pfx); - last if $cur == $end; - my $tn = $cur->get_termname; - $ret{substr($tn, length($pfx))} = undef if !index($tn, $pfx); + push @ret, substr($cur->get_termname, $pfxlen); + } + wantarray ? @ret : +{ map { $_ => undef } @ret }; +} + +sub xh_args { # prep getopt args to feed to xap_helper.h socket + map { ('-d', $_) } shard_dirs($_[0]); +} + +sub docids_by_postlist ($$) { + my ($self, $q) = @_; + my $cur = $self->xdb->postlist_begin($q); + my $end = $self->{xdb}->postlist_end($q); + my @ids; + for (; $cur != $end; $cur++) { push(@ids, $cur->get_docid) }; + @ids; +} + +sub get_doc ($$) { + my ($self, $docid) = @_; + eval { $self->{xdb}->get_document($docid) } // do { + die $@ if $@ && ref($@) !~ /\bDocNotFoundError\b/; + undef; } - wantarray ? sort(keys(%ret)) : \%ret; } 1; |