about summary refs log tree commit homepage
path: root/lib/PublicInbox/Search.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/PublicInbox/Search.pm')
-rw-r--r-- lib/PublicInbox/Search.pm 721
1 files changed, 548 insertions, 173 deletions
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 7f901125..fbdb48a3 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -1,44 +1,122 @@
-# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 # based on notmuch, but with no concept of folders, files or flags
 #
 # Read-only search interface for use by the web and NNTP interfaces
 package PublicInbox::Search;
 use strict;
-use warnings;
+use v5.10.1;
+use parent qw(Exporter);
+our @EXPORT_OK = qw(retry_reopen int_val get_pct xap_terms);
+use List::Util qw(max);
+use POSIX qw(strftime);
+use Carp ();
+our $XHC = 0; # defined but false
+
+# values for searching, changing the numeric value breaks
+# compatibility with old indices (so don't change them)
+use constant {
+        TS => 0, # Received: in Unix time (IMAP INTERNALDATE, JMAP receivedAt)
+        YYYYMMDD => 1, # redundant with DT below
+        DT => 2, # Date: YYYYMMDDHHMMSS (IMAP SENT*, JMAP sentAt)
+
+        # added for public-inbox 1.6.0+
+        BYTES => 3, # IMAP RFC822.SIZE
+        UID => 4, # IMAP UID == NNTP article number == Xapian docid
+        THREADID => 5, # RFC 8474, RFC 8621
+
+        # TODO
+        # REPLYCNT => ?, # IMAP ANSWERED
 
-# values for searching
-use constant TS => 0;  # Received: header in Unix time
-use constant YYYYMMDD => 1; # Date: header for searching in the WWW UI
-use constant DT => 2; # Date: YYYYMMDDHHMMSS
+        # SCHEMA_VERSION history
+        # 0 - initial
+        # 1 - subject_path is lower-cased
+        # 2 - subject_path is id_compress in the index, only
+        # 3 - message-ID is compressed if it includes '%' (hack!)
+        # 4 - change "Re: " normalization, avoid circular Reference ghosts
+        # 5 - subject_path drops trailing '.'
+        # 6 - preserve References: order in document data
+        # 7 - remove references and inreplyto terms
+        # 8 - remove redundant/unneeded document data
+        # 9 - disable Message-ID compression (SHA-1)
+        # 10 - optimize doc for NNTP overviews
+        # 11 - merge threads when vivifying ghosts
+        # 12 - change YYYYMMDD value column to numeric
+        # 13 - fix threading for empty References/In-Reply-To
+        #      (commit 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0)
+        # 14 - fix ghost root vivification
+        # 15 - see public-inbox-v2-format(5)
+        #      further bumps likely unnecessary, we'll suggest in-place
+        #      "--reindex" use for further fixes and tweaks:
+        #
+        #      public-inbox v1.5.0 adds (still SCHEMA_VERSION=15):
+        #      * "lid:" and "l:" for List-Id searches
+        #
+        #      v1.6.0 adds BYTES, UID and THREADID values
+        SCHEMA_VERSION => 15,
+};
 
-use PublicInbox::SearchMsg;
-use PublicInbox::Over;
-my $QP_FLAGS;
-our %X = map { $_ => 0 } qw(BoolWeight Database Enquire
-                        NumberValueRangeProcessor QueryParser Stem);
-our $Xap; # 'Search::Xapian' or 'Xapian'
-my $ENQ_ASCENDING;
+use PublicInbox::Smsg;
+eval { require PublicInbox::Over };
+our $QP_FLAGS;
+our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem Query);
+our $Xap; # 'Xapian' or 'Search::Xapian'
+our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor')
+
+# ENQ_DESCENDING and ENQ_ASCENDING weren't in SWIG Xapian.pm prior to 1.4.16,
+# let's hope the ABI is stable
+our $ENQ_DESCENDING = 0;
+our $ENQ_ASCENDING = 1;
+our @MAIL_VMAP = (
+        [ YYYYMMDD, 'd:'],
+        [ TS, 'rt:' ],
+        # these are undocumented for WWW, but lei and IMAP use them
+        [ DT, 'dt:' ],
+        [ BYTES, 'z:' ],
+        [ UID, 'uid:' ]
+);
+our @MAIL_NRP;
+
+# Getopt::Long spec, only short options for portability in C++ implementation
+our @XH_SPEC = (
+        'a', # ascending sort
+        'c', # code search
+        'd=s@', # shard dirs
+        'g=s', # git dir (with -c)
+        'k=i', # sort column (like sort(1))
+        'm=i', # maximum number of results
+        'o=i', # offset
+        'r', # 1=relevance then column
+        't', # collapse threads
+        'A=s@', # prefixes
+        'K=i', # timeout kill after i seconds
+        'O=s', # eidx_key
+        'T=i', # threadid
+);
 
 sub load_xapian () {
         return 1 if defined $Xap;
-        for my $x (qw(Search::Xapian Xapian)) {
+        # n.b. PI_XAPIAN is intended for development use only
+        for my $x (($ENV{PI_XAPIAN} // 'Xapian'), 'Search::Xapian') {
                 eval "require $x";
                 next if $@;
 
                 $x->import(qw(:standard));
                 $Xap = $x;
-                $X{$_} = $Xap.'::'.$_ for (keys %X);
 
-                # ENQ_ASCENDING doesn't seem exported by SWIG Xapian.pm,
-                # so lets hope this part of the ABI is stable because it's
-                # just an integer:
-                $ENQ_ASCENDING = $x eq 'Xapian' ?
-                                1 : Search::Xapian::ENQ_ASCENDING();
+                # `version_string' was added in Xapian 1.1
+                my $xver = eval('v'.eval($x.'::version_string()')) //
+                                eval('v'.eval($x.'::xapian_version_string()'));
 
-                # for SearchMsg:
-                *PublicInbox::SearchMsg::sortable_unserialise =
-                                                $Xap.'::sortable_unserialise';
+                # NumberRangeProcessor was added in Xapian 1.3.6,
+                # NumberValueRangeProcessor was removed for 1.5.0+,
+                # continue with the older /Value/ variant for now...
+                $NVRP = $x.'::'.($x eq 'Xapian' && $xver ge v1.5 ?
+                        'NumberRangeProcessor' : 'NumberValueRangeProcessor');
+                $X{$_} = $Xap.'::'.$_ for (keys %X);
+
+                *sortable_serialise = $x.'::sortable_serialise';
+                *sortable_unserialise = $x.'::sortable_unserialise';
                 # n.b. FLAG_PURE_NOT is expensive not suitable for a public
                 # website as it could become a denial-of-service vector
                 # FLAG_PHRASE also seems to cause performance problems chert
@@ -47,6 +125,7 @@ sub load_xapian () {
                 # or make indexlevel=medium as default
                 $QP_FLAGS = FLAG_PHRASE() | FLAG_BOOLEAN() | FLAG_LOVEHATE() |
                                 FLAG_WILDCARD();
+                @MAIL_NRP = map { $NVRP->new(@$_) } @MAIL_VMAP;
                 return 1;
         }
         undef;
@@ -56,74 +135,62 @@ sub load_xapian () {
 # a prefix common in patch emails
 our $LANG = 'english';
 
-use constant {
-        # SCHEMA_VERSION history
-        # 0 - initial
-        # 1 - subject_path is lower-cased
-        # 2 - subject_path is id_compress in the index, only
-        # 3 - message-ID is compressed if it includes '%' (hack!)
-        # 4 - change "Re: " normalization, avoid circular Reference ghosts
-        # 5 - subject_path drops trailing '.'
-        # 6 - preserve References: order in document data
-        # 7 - remove references and inreplyto terms
-        # 8 - remove redundant/unneeded document data
-        # 9 - disable Message-ID compression (SHA-1)
-        # 10 - optimize doc for NNTP overviews
-        # 11 - merge threads when vivifying ghosts
-        # 12 - change YYYYMMDD value column to numeric
-        # 13 - fix threading for empty References/In-Reply-To
-        #      (commit 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0)
-        # 14 - fix ghost root vivification
-        # 15 - see public-inbox-v2-format(5)
-        #      further bumps likely unnecessary, we'll suggest in-place
-        #      "--reindex" use for further fixes and tweaks
-        SCHEMA_VERSION => 15,
-};
-
-my %bool_pfx_external = (
-        mid => 'Q', # Message-ID (full/exact), this is mostly uniQue
+our %PATCH_BOOL_COMMON = (
         dfpre => 'XDFPRE',
         dfpost => 'XDFPOST',
         dfblob => 'XDFPRE XDFPOST',
+        patchid => 'XDFID',
 );
 
-my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST';
-my %prob_prefix = (
-        # for mairix compatibility
+# note: the non-X term prefix allocations are shared with
+# Xapian omega, see xapian-applications/omega/docs/termprefixes.rst
+my %bool_pfx_external = (
+        mid => 'Q', # Message-ID (full/exact), this is mostly uniQue
+        lid => 'G', # newsGroup (or similar entity), just inside <>
+        %PATCH_BOOL_COMMON
+);
+
+# for mairix compatibility
+our $NON_QUOTED_BODY = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID';
+our %PATCH_PROB_COMMON = (
         s => 'S',
-        m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
         f => 'A',
-        t => 'XTO',
-        tc => 'XTO XCC',
-        c => 'XCC',
-        tcf => 'XTO XCC A',
-        a => 'XTO XCC A',
-        b => $non_quoted_body . ' XQUOT',
-        bs => $non_quoted_body . ' XQUOT S',
+        b => $NON_QUOTED_BODY . ' XQUOT',
+        bs => $NON_QUOTED_BODY . ' XQUOT S',
         n => 'XFN',
 
         q => 'XQUOT',
-        nq => $non_quoted_body,
+        nq => $NON_QUOTED_BODY,
         dfn => 'XDFN',
         dfa => 'XDFA',
         dfb => 'XDFB',
         dfhh => 'XDFHH',
         dfctx => 'XDFCTX',
+);
 
+my %prob_prefix = (
+        m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
+        l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial
+        t => 'XTO',
+        tc => 'XTO XCC',
+        c => 'XCC',
+        tcf => 'XTO XCC A',
+        a => 'XTO XCC A',
+        %PATCH_PROB_COMMON,
         # default:
-        '' => 'XM S A XQUOT XFN ' . $non_quoted_body,
+        '' => 'XM S A XQUOT XFN ' . $NON_QUOTED_BODY,
 );
 
 # not documenting m: and mid: for now, the using the URLs works w/o Xapian
+# not documenting lid: for now, either, it is probably redundant with l:,
+# especially since we don't offer boolean searches for To/Cc/From
+# headers, either
 our @HELP = (
         's:' => 'match within Subject  e.g. s:"a quick brown fox"',
         'd:' => <<EOF,
-date range as YYYYMMDD  e.g. d:19931002..20101002
-Open-ended ranges such as d:19931002.. and d:..20101002
-are also supported
-EOF
-        'dt:' => <<EOF,
-date-time range as YYYYMMDDhhmmss (e.g. dt:19931002011000..19931002011200)
+match date-time range, git "approxidate" formats supported
+Open-ended ranges such as `d:last.week..' and
+`d:..2.days.ago' are supported
 EOF
         'b:' => 'match within message body, including text attachments',
         'nq:' => 'match non-quoted text within message body',
@@ -134,6 +201,7 @@ EOF
         'f:' => 'match within the From header',
         'a:' => 'match within the To, Cc, and From headers',
         'tc:' => 'match within the To and Cc headers',
+        'l:' => 'match contents of the List-Id header',
         'bs:' => 'match within the Subject and body',
         'dfn:' => 'match filename from diff',
         'dfa:' => 'match diff removed (-) lines',
@@ -143,65 +211,90 @@ EOF
         'dfpre:' => 'match pre-image git blob ID',
         'dfpost:' => 'match post-image git blob ID',
         'dfblob:' => 'match either pre or post-image git blob ID',
+        'patchid:' => "match `git patch-id --stable' output",
+        'rt:' => <<EOF,
+match received time, like `d:' if sender's clock was correct
+EOF
 );
 chomp @HELP;
 
 sub xdir ($;$) {
         my ($self, $rdonly) = @_;
-        if ($self->{ibx_ver} == 1) {
-                "$self->{inboxdir}/public-inbox/xapian" . SCHEMA_VERSION;
-        } else {
-                my $dir = "$self->{inboxdir}/xap" . SCHEMA_VERSION;
-                return $dir if $rdonly;
-
-                my $shard = $self->{shard};
-                defined $shard or die "shard not given";
-                $dir .= "/$shard";
+        if ($rdonly || !defined($self->{shard})) {
+                $self->{xpfx};
+        } else { # v2, extindex, cindex only:
+                "$self->{xpfx}/$self->{shard}";
         }
 }
 
-sub _xdb ($) {
+# returns shard directories as an array of strings, does not verify existence
+sub shard_dirs ($) {
         my ($self) = @_;
-        my $dir = xdir($self, 1);
-        my ($xdb, $slow_phrase);
-        my $qpf = \($self->{qp_flags} ||= $QP_FLAGS);
-        if ($self->{ibx_ver} >= 2) {
-                foreach my $shard (<$dir/*>) {
-                        -d $shard && $shard =~ m!/[0-9]+\z! or next;
-                        my $sub = $X{Database}->new($shard);
-                        if ($xdb) {
-                                $xdb->add_database($sub);
-                        } else {
-                                $xdb = $sub;
-                        }
-                        $slow_phrase ||= -f "$shard/iamchert";
-                }
-        } else {
-                $slow_phrase = -f "$dir/iamchert";
-                $xdb = $X{Database}->new($dir);
+        my $xpfx = $self->{xpfx};
+        if ($xpfx =~ m!/xapian[0-9]+\z!) { # v1 inbox
+                ($xpfx);
+        } else { # v2 inbox, eidx, cidx
+                opendir(my $dh, $xpfx) or return (); # not initialized yet
+                # We need numeric sorting so shard[0] is first for reading
+                # Xapian metadata, if needed
+                my $last = max(grep(/\A[0-9]+\z/, readdir($dh))) // return ();
+                map { "$xpfx/$_" } (0..$last);
         }
-        $$qpf |= FLAG_PHRASE() unless $slow_phrase;
-        $xdb;
+}
+
+# returns all shards as separate Xapian::Database objects w/o combining
+sub xdb_shards_flat ($) {
+        my ($self) = @_;
+        load_xapian();
+        $self->{qp_flags} //= $QP_FLAGS;
+        my $slow_phrase;
+        my @xdb = map {
+                $slow_phrase ||= -f "$_/iamchert";
+                $X{Database}->new($_); # raises if missing
+        } shard_dirs($self);
+        $self->{qp_flags} |= FLAG_PHRASE() if !$slow_phrase;
+        @xdb;
+}
+
+# v2 Xapian docids don't conflict, so they're identical to
+# NNTP article numbers and IMAP UIDs.
+# https://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID
+sub mdocid {
+        my ($nshard, $mitem) = @_;
+        my $docid = $mitem->get_docid;
+        int(($docid - 1) / $nshard) + 1;
+}
+
+sub docids_to_artnums {
+        my $nshard = shift->{nshard};
+        # XXX does array vs arrayref make a difference in modern Perls?
+        map { int(($_ - 1) / $nshard) + 1 } @_;
+}
+
+sub mset_to_artnums {
+        my ($self, $mset) = @_;
+        my $nshard = $self->{nshard};
+        [ map { mdocid($nshard, $_) } $mset->items ];
 }
 
 sub xdb ($) {
         my ($self) = @_;
-        $self->{xdb} ||= do {
-                load_xapian();
-                _xdb($self);
+        $self->{xdb} // do {
+                my @xdb = $self->xdb_shards_flat or return;
+                $self->{nshard} = scalar(@xdb);
+                my $xdb = shift @xdb;
+                $xdb->add_database($_) for @xdb;
+                $self->{xdb} = $xdb;
         };
 }
 
 sub new {
         my ($class, $ibx) = @_;
         ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx";
-        my $self = bless {
-                inboxdir => $ibx->{inboxdir},
-                altid => $ibx->{altid},
-                ibx_ver => $ibx->version,
-        }, $class;
-        my $dir = xdir($self, 1);
-        $self->{over_ro} = PublicInbox::Over->new("$dir/over.sqlite3");
+        my $xap = $ibx->version > 1 ? 'xap' : 'public-inbox/xapian';
+        my $xpfx = "$ibx->{inboxdir}/$xap".SCHEMA_VERSION;
+        my $self = bless { xpfx => $xpfx }, $class;
+        $self->{altid} = $ibx->{altid} if defined($ibx->{altid});
         $self;
 }
 
@@ -213,99 +306,276 @@ sub reopen {
         $self; # make chaining easier
 }
 
-# read-only
-sub query {
-        my ($self, $query_string, $opts) = @_;
-        $opts ||= {};
-        if ($query_string eq '' && !$opts->{mset}) {
-                $self->{over_ro}->recent($opts);
-        } else {
-                my $qp = qp($self);
-                my $qp_flags = $self->{qp_flags};
-                my $query = $qp->parse_query($query_string, $qp_flags);
-                $opts->{relevance} = 1 unless exists $opts->{relevance};
-                _do_enquire($self, $query, $opts);
+# Convert git "approxidate" ranges to something usable with our
+# Xapian indices.  At the moment, Xapian only offers a C++-only API
+# and neither the SWIG nor XS bindings allow us to use custom code
+# to parse dates (and libgit2 doesn't expose git__date_parse, either,
+# so we're running git-rev-parse(1)).
+# This replaces things we need to send to $git->date_parse with
+# "\0".$strftime_format.['+'|$idx]."\0" placeholders
+sub date_parse_prepare {
+        my ($to_parse, $pfx, $range) = @_;
+        # are we inside a parenthesized statement?
+        my $end = $range =~ s/([\)\s]*)\z// ? $1 : '';
+        my @r = split(/\.\./, $range, 2);
+
+        # expand "dt:2010-10-02" => "dt:2010-10-02..2010-10-03" and the like
+        # n.b. git doesn't do YYYYMMDD w/o '-', it needs YYYY-MM-DD
+        # We upgrade "d:" to "dt:" unconditionally
+        if ($pfx eq 'd') {
+                $pfx = 'dt';
+                # upgrade YYYYMMDD to YYYYMMDDHHMMSS
+                $_ .= ' 00:00:00' for (grep(m!\A[0-9]{4}[^[:alnum:]]
+                                        [0-9]{2}[^[:alnum:]]
+                                        [0-9]{2}\z!x, @r));
+                $_ .= '000000' for (grep(m!\A[0-9]{8}\z!, @r));
+        }
+        if ($pfx eq 'dt') {
+                if (!defined($r[1])) { # git needs gaps and not /\d{14}/
+                        if ($r[0] =~ /\A([0-9]{4})([0-9]{2})([0-9]{2})
+                                        ([0-9]{2})([0-9]{2})([0-9]{2})\z/x) {
+                                push @$to_parse, "$1-$2-$3 $4:$5:$6";
+                        } else {
+                                push @$to_parse, $r[0];
+                        }
+                        $r[0] = "\0%Y%m%d%H%M%S$#$to_parse\0";
+                        $r[1] = "\0%Y%m%d%H%M%S+\0";
+                } else {
+                        for my $x (@r) {
+                                next if $x eq '' || $x =~ /\A[0-9]{14}\z/;
+                                push @$to_parse, $x;
+                                $x = "\0%Y%m%d%H%M%S$#$to_parse\0";
+                        }
+                }
+        } else { # (rt|ct), let git interpret "YYYY", deal with Y10K later :P
+                for my $x (@r) {
+                        next if $x eq '' || $x =~ /\A[0-9]{5,}\z/;
+                        push @$to_parse, $x;
+                        $x = "\0%s$#$to_parse\0";
+                }
+                $r[1] //= "\0%s+\0"; # add 1 day
+        }
+        "$pfx:".join('..', @r).$end;
+}
+
+sub date_parse_finalize {
+        my ($git, $to_parse) = @_;
+        # git-rev-parse can handle any number of args up to system
+        # limits (around (4096*32) bytes on Linux).
+        my @r = $git->date_parse(@$to_parse);
+        # n.b. git respects TZ, times stored in SQLite/Xapian are always UTC,
+        # and gmtime doesn't seem to do the right thing when TZ!=UTC
+        my ($i, $t);
+        $_[2] =~ s/\0(%[%YmdHMSs]+)([0-9\+]+)\0/
+                $t = $2 eq '+' ? ($r[$i]+86400) : $r[$i=$2+0];
+                $1 eq '%s' ? $t : strftime($1, gmtime($t))/sge;
+}
+
+# n.b. argv never has NUL, though we'll need to filter it out
+# if this $argv isn't from a command execution
+sub query_argv_to_string {
+        my (undef, $git, $argv) = @_;
+        my $to_parse;
+        my $tmp = join(' ', map {;
+                if (s!\b(d|rt|dt):(\S+)\z!date_parse_prepare(
+                                                $to_parse //= [], $1, $2)!sge) {
+                        $_;
+                } elsif (/\s/) {
+                        s/(.*?)\b(\w+:)// ? qq{$1$2"$_"} : qq{"$_"};
+                } else {
+                        $_
+                }
+        } @$argv);
+        date_parse_finalize($git, $to_parse, $tmp) if $to_parse;
+        $tmp
+}
+
+# this is for the WWW "q=" query parameter and "lei q --stdin"
+# it can't do d:"5 days ago", but it will do d:5.days.ago
+sub query_approxidate {
+        my (undef, $git) = @_; # $_[2] = $query_string (modified in-place)
+        my $DQ = qq<"\x{201c}\x{201d}>; # Xapian can use curly quotes
+        $_[2] =~ tr/\x00/ /; # Xapian doesn't do NUL, we use it as a placeholder
+        my ($terms, $phrase, $to_parse);
+        $_[2] =~ s{([^$DQ]*)([$DQ][^$DQ]*[$DQ])?}{
+                ($terms, $phrase) = ($1, $2);
+                $terms =~ s!\b(d|rt|dt):(\S+)!
+                        date_parse_prepare($to_parse //= [], $1, $2)!sge;
+                $terms.($phrase // '');
+                }sge;
+        date_parse_finalize($git, $to_parse, $_[2]) if $to_parse;
+}
+
+# read-only, for mail only (codesearch has different rules)
+sub mset {
+        my ($self, $qry_str, $opt) = @_;
+        my $qp = $self->{qp} //= $self->qparse_new;
+        my $qry = $qp->parse_query($qry_str, $self->{qp_flags});
+        if (defined(my $eidx_key = $opt->{eidx_key})) {
+                $qry = $X{Query}->new(OP_FILTER(), $qry, 'O'.$eidx_key);
+        }
+        if (defined(my $uid_range = $opt->{uid_range})) {
+                my $range = $X{Query}->new(OP_VALUE_RANGE(), UID,
+                                        sortable_serialise($uid_range->[0]),
+                                        sortable_serialise($uid_range->[1]));
+                $qry = $X{Query}->new(OP_FILTER(), $qry, $range);
+        }
+        if (defined(my $tid = $opt->{threadid})) {
+                $tid = sortable_serialise($tid);
+                $qry = $X{Query}->new(OP_FILTER(), $qry,
+                        $X{Query}->new(OP_VALUE_RANGE(), THREADID, $tid, $tid));
+        }
+        do_enquire($self, $qry, $opt, TS);
+}
+
+sub xhc_start_maybe (@) {
+        require PublicInbox::XapClient;
+        my $xhc = PublicInbox::XapClient::start_helper(@_);
+        require PublicInbox::XhcMset if $xhc;
+        $xhc;
+}
+
+sub xh_opt ($) {
+        my ($opt) = @_;
+        my $lim = $opt->{limit} || 50;
+        my @ret;
+        push @ret, '-o', $opt->{offset} if $opt->{offset};
+        push @ret, '-m', $lim;
+        my $rel = $opt->{relevance} // 0;
+        if ($rel == -2) { # ORDER BY docid/UID (highest first)
+                push @ret, '-k', '-1';
+        } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first)
+                push @ret, '-k', '-1';
+                push @ret, '-a';
+        } elsif ($rel == 0) {
+                push @ret, '-k', $opt->{sort_col} // TS;
+                push @ret, '-a' if $opt->{asc};
+        } else { # rel > 0
+                push @ret, '-r';
+                push @ret, '-k', $opt->{sort_col} // TS;
+                push @ret, '-a' if $opt->{asc};
+        }
+        push @ret, '-t' if $opt->{threads};
+        push @ret, '-T', $opt->{threadid} if defined $opt->{threadid};
+        push @ret, '-O', $opt->{eidx_key} if defined $opt->{eidx_key};
+        @ret;
+}
+
+# returns a true value if actually handled asynchronously,
+# and a falsy value if handled synchronously
+sub async_mset {
+        my ($self, $qry_str, $opt, $cb, @args) = @_;
+        if ($XHC) { # unconditionally retrieving pct + rank for now
+                xdb($self); # populate {nshard}
+                my @margs = ($self->xh_args, xh_opt($opt));
+                my $ret = eval {
+                        my $rd = $XHC->mkreq(undef, 'mset', @margs, $qry_str);
+                        PublicInbox::XhcMset->maybe_new($rd, $self, $cb, @args);
+                };
+                $cb->(@args, undef, $@) if $@;
+                $ret;
+        } else { # synchronous
+                my $mset = $self->mset($qry_str, $opt);
+                $cb->(@args, $mset);
+                undef;
         }
 }
 
+sub do_enquire { # shared with CodeSearch
+        my ($self, $qry, $opt, $col) = @_;
+        my $enq = $X{Enquire}->new(xdb($self));
+        $enq->set_query($qry);
+        my $rel = $opt->{relevance} // 0;
+        if ($rel == -2) { # ORDER BY docid/UID (highest first)
+                $enq->set_weighting_scheme($X{BoolWeight}->new);
+                $enq->set_docid_order($ENQ_DESCENDING);
+        } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first)
+                $enq->set_weighting_scheme($X{BoolWeight}->new);
+                $enq->set_docid_order($ENQ_ASCENDING);
+        } elsif ($rel == 0) {
+                $enq->set_sort_by_value_then_relevance($col, !$opt->{asc});
+        } else { # rel > 0
+                $enq->set_sort_by_relevance_then_value($col, !$opt->{asc});
+        }
+
+        # `lei q -t / --threads' or JMAP collapseThreads; but don't collapse
+        # on `-tt' ({threads} > 1) which sets the Flagged|Important keyword
+        (($opt->{threads} // 0) == 1 && has_threadid($self)) and
+                $enq->set_collapse_key(THREADID);
+        retry_reopen($self, \&enquire_once, $enq,
+                        $opt->{offset} || 0, $opt->{limit} || 50);
+}
+
 sub retry_reopen {
-        my ($self, $cb, $arg) = @_;
+        my ($self, $cb, @arg) = @_;
         for my $i (1..10) {
                 if (wantarray) {
-                        my @ret;
-                        eval { @ret = $cb->($arg) };
+                        my @ret = eval { $cb->($self, @arg) };
                         return @ret unless $@;
                 } else {
-                        my $ret;
-                        eval { $ret = $cb->($arg) };
+                        my $ret = eval { $cb->($self, @arg) };
                         return $ret unless $@;
                 }
                 # Exception: The revision being read has been discarded -
                 # you should call Xapian::Database::reopen()
                 if (ref($@) =~ /\bDatabaseModifiedError\b/) {
-                        warn "reopen try #$i on $@\n";
                         reopen($self);
                 } else {
                         # let caller decide how to spew, because ExtMsg queries
                         # get wonky and trigger:
                         # "something terrible happened at .../Xapian/Enquire.pm"
-                        die;
+                        Carp::croak($@);
                 }
         }
-        die "Too many Xapian database modifications in progress\n";
-}
-
-sub _do_enquire {
-        my ($self, $query, $opts) = @_;
-        retry_reopen($self, \&_enquire_once, [ $self, $query, $opts ]);
-}
-
-sub _enquire_once { # retry_reopen callback
-        my ($self, $query, $opts) = @{$_[0]};
-        my $xdb = xdb($self);
-        my $enquire = $X{Enquire}->new($xdb);
-        $enquire->set_query($query);
-        $opts ||= {};
-        my $desc = !$opts->{asc};
-        if (($opts->{mset} || 0) == 2) {
-                $enquire->set_docid_order($ENQ_ASCENDING);
-                $enquire->set_weighting_scheme($X{BoolWeight}->new);
-        } elsif ($opts->{relevance}) {
-                $enquire->set_sort_by_relevance_then_value(TS, $desc);
-        } else {
-                $enquire->set_sort_by_value_then_relevance(TS, $desc);
-        }
-        my $offset = $opts->{offset} || 0;
-        my $limit = $opts->{limit} || 50;
-        my $mset = $enquire->get_mset($offset, $limit);
-        return $mset if $opts->{mset};
-        my @msgs = map { PublicInbox::SearchMsg::from_mitem($_) } $mset->items;
-        return \@msgs unless wantarray;
+        Carp::croak("Too many Xapian database modifications in progress\n");
+}
+
+# returns true if all docs have the THREADID value
+sub has_threadid ($) {
+        my ($self) = @_;
+        (xdb($self)->get_metadata('has_threadid') // '') eq '1';
+}
 
-        ($mset->get_matches_estimated, \@msgs)
+sub enquire_once { # retry_reopen callback
+        my (undef, $enq, $offset, $limit) = @_;
+        $enq->get_mset($offset, $limit);
+}
+
+sub mset_to_smsg {
+        my ($self, $ibx, $mset) = @_;
+        my $nshard = $self->{nshard};
+        my $i = 0;
+        my %order = map { mdocid($nshard, $_) => ++$i } $mset->items;
+        my @msgs = sort {
+                $order{$a->{num}} <=> $order{$b->{num}}
+        } @{$ibx->over->get_all(keys %order)};
+        wantarray ? ($mset->get_matches_estimated, \@msgs) : \@msgs;
 }
 
 # read-write
 sub stemmer { $X{Stem}->new($LANG) }
 
-# read-only
-sub qp {
+sub qp_init_common {
         my ($self) = @_;
-
-        my $qp = $self->{query_parser};
-        return $qp if $qp;
-        my $xdb = xdb($self);
-        # new parser
-        $qp = $X{QueryParser}->new;
+        my $qp = $X{QueryParser}->new;
         $qp->set_default_op(OP_AND());
-        $qp->set_database($xdb);
-        $qp->set_stemmer($self->stemmer);
+        $qp->set_database(xdb($self));
+        $qp->set_stemmer(stemmer($self));
         $qp->set_stemming_strategy(STEM_SOME());
-        $qp->set_max_wildcard_expansion(100);
-        my $nvrp = $X{NumberValueRangeProcessor};
-        $qp->add_valuerangeprocessor($nvrp->new(YYYYMMDD, 'd:'));
-        $qp->add_valuerangeprocessor($nvrp->new(DT, 'dt:'));
+        my $cb = $qp->can('set_max_wildcard_expansion') //
+                $qp->can('set_max_expansion'); # Xapian 1.5.0+
+        $cb->($qp, 100);
+        $qp;
+}
 
+# read-only
+sub qparse_new {
+        my ($self) = @_;
+        my $qp = qp_init_common($self);
+        my $cb = $qp->can('add_valuerangeprocessor') //
+                $qp->can('add_rangeprocessor'); # Xapian 1.5.0+
+
+        $cb->($qp, $_) for @MAIL_NRP;
         while (my ($name, $prefix) = each %bool_pfx_external) {
                 $qp->add_boolean_prefix($name, $_) foreach split(/ /, $prefix);
         }
@@ -313,9 +583,11 @@ sub qp {
         # we do not actually create AltId objects,
         # just parse the spec to avoid the extra DB handles for now.
         if (my $altid = $self->{altid}) {
-                my $user_pfx = $self->{-user_pfx} ||= [];
+                my $user_pfx = $self->{-user_pfx} = [];
                 for (@$altid) {
                         # $_ = 'serial:gmane:/path/to/gmane.msgmap.sqlite3'
+                        # note: Xapian supports multibyte UTF-8, /^[0-9]+$/,
+                        # and '_' with prefixes matching \w+
                         /\Aserial:(\w+):/ or next;
                         my $pfx = $1;
                         push @$user_pfx, "$pfx:", <<EOF;
@@ -330,13 +602,46 @@ EOF
         while (my ($name, $prefix) = each %prob_prefix) {
                 $qp->add_prefix($name, $_) foreach split(/ /, $prefix);
         }
+        $qp;
+}
+
+sub generate_cxx () { # generates snippet for xap_helper.h
+        my $ret = <<EOM;
+# line ${\__LINE__} "${\__FILE__}"
+static NRP *mail_nrp[${\scalar(@MAIL_VMAP)}];
+static void mail_nrp_init(void)
+{
+EOM
+        for (0..$#MAIL_VMAP) {
+                my $x = $MAIL_VMAP[$_];
+                $ret .= qq{\tmail_nrp[$_] = new NRP($x->[0], "$x->[1]");\n}
+        }
+$ret .= <<EOM;
+}
 
-        $self->{query_parser} = $qp;
+# line ${\__LINE__} "${\__FILE__}"
+static void qp_init_mail_search(Xapian::QueryParser *qp)
+{
+        for (size_t i = 0; i < MY_ARRAY_SIZE(mail_nrp); i++)
+                qp->ADD_RP(mail_nrp[i]);
+EOM
+        for my $name (sort keys %bool_pfx_external) {
+                for (split(/ /, $bool_pfx_external{$name})) {
+                        $ret .= qq{\tqp->add_boolean_prefix("$name", "$_");\n}
+                }
+        }
+        # TODO: altid support
+        for my $name (sort keys %prob_prefix) {
+                for (split(/ /, $prob_prefix{$name})) {
+                        $ret .= qq{\tqp->add_prefix("$name", "$_");\n}
+                }
+        }
+        $ret .= "}\n";
 }
 
 sub help {
         my ($self) = @_;
-        $self->qp; # parse altids
+        $self->{qp} //= $self->qparse_new; # parse altids
         my @ret = @HELP;
         if (my $user_pfx = $self->{-user_pfx}) {
                 push @ret, @$user_pfx;
@@ -344,4 +649,74 @@ sub help {
         \@ret;
 }
 
+# always returns a scalar value
+sub int_val ($$) {
+        my ($doc, $col) = @_;
+        my $val = $doc->get_value($col) or return undef; # undef is '' in Xapian
+        sortable_unserialise($val) + 0; # PV => IV conversion
+}
+
+sub get_pct ($) { # mset item
+        # Capped at "99%" since "100%" takes an extra column in the
+        # thread skeleton view.  <xapian/mset.h> says the value isn't
+        # very meaningful, anyways.
+        my $n = $_[0]->get_percent;
+        $n > 99 ? 99 : $n;
+}
+
+sub xap_terms ($$;@) {
+        my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty ()
+        my $end = $xdb_or_doc->termlist_end(@docid);
+        my $cur = $xdb_or_doc->termlist_begin(@docid);
+        $cur->skip_to($pfx);
+        my (@ret, $tn);
+        my $pfxlen = length($pfx);
+        for (; $cur != $end; $cur++) {
+                $tn = $cur->get_termname;
+                index($tn, $pfx) ? last : push(@ret, substr($tn, $pfxlen));
+        }
+        wantarray ? @ret : +{ map { $_ => undef } @ret };
+}
+
+# get combined docid from over.num:
+# (not generic Xapian, only works with our sharding scheme for mail)
+sub num2docid ($$) {
+        my ($self, $num) = @_;
+        my $nshard = $self->{nshard};
+        ($num - 1) * $nshard + $num % $nshard + 1;
+}
+
+sub all_terms {
+        my ($self, $pfx) = @_;
+        my $cur = xdb($self)->allterms_begin($pfx);
+        my $end = $self->{xdb}->allterms_end($pfx);
+        my $pfxlen = length($pfx);
+        my @ret;
+        for (; $cur != $end; $cur++) {
+                push @ret, substr($cur->get_termname, $pfxlen);
+        }
+        wantarray ? @ret : +{ map { $_ => undef } @ret };
+}
+
+sub xh_args { # prep getopt args to feed to xap_helper.h socket
+        map { ('-d', $_) } shard_dirs($_[0]);
+}
+
+sub docids_by_postlist ($$) {
+        my ($self, $q) = @_;
+        my $cur = $self->xdb->postlist_begin($q);
+        my $end = $self->{xdb}->postlist_end($q);
+        my @ids;
+        for (; $cur != $end; $cur++) { push(@ids, $cur->get_docid) };
+        @ids;
+}
+
+sub get_doc ($$) {
+        my ($self, $docid) = @_;
+        eval { $self->{xdb}->get_document($docid) } // do {
+                die $@ if $@ && ref($@) !~ /\bDocNotFoundError\b/;
+                undef;
+        }
+}
+
 1;