about summary refs log tree commit homepage
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2020-12-05 10:11:38 +0000
committer Eric Wong <e@80x24.org> 2020-12-05 21:41:52 +0000
commit 525555d14118f92f86be54c683f797089c52a78d (patch)
tree 59baf4193882b020ffe67da2e6e96156e86e8b41
parent 89990b0cfc8bbdabc8d650325d9e1bc585df8a0c (diff)
download public-inbox-525555d14118f92f86be54c683f797089c52a78d.tar.gz
isearch: emulate per-inbox search with ->ALL
Using "eidx_key:" boolean prefix to limit results to a given
inbox, we can use ->ALL to emulate and replace per-Inbox
xap15/[0-9] search indices.

With this change, the presence of "extindex.all.topdir" in the
$PI_CONFIG will cause the WWW code to use that extindex and
ignore per-inbox Xapian DBs in xap15/[0-9].

Unfortunately IMAP search still requires old per-inbox indices,
for now.  Mapping extindex Xapian docids to per-Inbox UIDs and
vice-versa is proving tricky.  Fortunately, IMAP search is
rarely used and optional.  The RFCs don't specify expensive
phrase search, either, so `indexlevel=medium' can be used in
per-inbox Xapian indices to save space.

For primarily WWW (and future JMAP) users, this should result in
significant disk space, FD, and page cache footprint savings for
large instances with many inboxes and many cross-posted
messages.
-rw-r--r-- MANIFEST 1
-rw-r--r-- lib/PublicInbox/Config.pm 4
-rw-r--r-- lib/PublicInbox/DummyInbox.pm 2
-rw-r--r-- lib/PublicInbox/ExtMsg.pm 2
-rw-r--r-- lib/PublicInbox/ExtSearch.pm 1
-rw-r--r-- lib/PublicInbox/Inbox.pm 4
-rw-r--r-- lib/PublicInbox/Isearch.pm 87
-rw-r--r-- lib/PublicInbox/Mbox.pm 6
-rw-r--r-- lib/PublicInbox/Search.pm 5
-rw-r--r-- lib/PublicInbox/SearchView.pm 10
-rw-r--r-- lib/PublicInbox/SolverGit.pm 2
-rw-r--r-- lib/PublicInbox/WWW.pm 2
-rw-r--r-- lib/PublicInbox/WwwStream.pm 2
-rw-r--r-- lib/PublicInbox/WwwText.pm 2
-rw-r--r-- t/extsearch.t 25
15 files changed, 139 insertions, 16 deletions
diff --git a/MANIFEST b/MANIFEST
index 946e4b8a..b39f63db 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -156,6 +156,7 @@ lib/PublicInbox/In2Tie.pm
 lib/PublicInbox/Inbox.pm
 lib/PublicInbox/InboxIdle.pm
 lib/PublicInbox/InboxWritable.pm
+lib/PublicInbox/Isearch.pm
 lib/PublicInbox/KQNotify.pm
 lib/PublicInbox/Linkify.pm
 lib/PublicInbox/Listener.pm
diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index ba0ead6e..1844f8b2 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -477,6 +477,10 @@ EOF
                         push @$repo_objs, $repo if $repo;
                 }
         }
+        if (my $es = ALL($self)) {
+                require PublicInbox::Isearch;
+                $ibx->{isrch} = PublicInbox::Isearch->new($ibx, $es);
+        }
         $self->{-by_eidx_key}->{$ibx->eidx_key} = $ibx;
 }
 
diff --git a/lib/PublicInbox/DummyInbox.pm b/lib/PublicInbox/DummyInbox.pm
index 02426f13..981043ce 100644
--- a/lib/PublicInbox/DummyInbox.pm
+++ b/lib/PublicInbox/DummyInbox.pm
@@ -16,7 +16,7 @@ no warnings 'once';
 *max = \&uidvalidity;
 *query_xover = \&uid_range;
 *over = \&mm;
-*search = *unsubscribe_unlock =
+*isrch = *search = *unsubscribe_unlock =
         *get_art = *description = *base_url = \&subscribe_unlock;
 
 1;
diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm
index 2a0a3e46..2a579c1b 100644
--- a/lib/PublicInbox/ExtMsg.pm
+++ b/lib/PublicInbox/ExtMsg.pm
@@ -32,7 +32,7 @@ sub PARTIAL_MAX () { 100 }
 sub search_partial ($$) {
         my ($ibx, $mid) = @_;
         return if length($mid) < $MIN_PARTIAL_LEN;
-        my $srch = $ibx->search or return;
+        my $srch = $ibx->search or return; # NOT ->isrch, we already try ->ALL
         my $opt = { limit => PARTIAL_MAX, mset => 2 };
         my @try = ("m:$mid*");
         my $chop = $mid;
diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm
index 80455d8d..2a560935 100644
--- a/lib/PublicInbox/ExtSearch.pm
+++ b/lib/PublicInbox/ExtSearch.pm
@@ -128,5 +128,6 @@ no warnings 'once';
 *recent = \&PublicInbox::Inbox::recent;
 
 *max_git_epoch = *nntp_usable = *msg_by_path = \&mm; # undef
+*isrch = *search;
 
 1;
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index 58651687..52aece7c 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -201,6 +201,10 @@ sub search {
         (eval { $srch->xdb }) ? $srch : undef;
 }
 
+# isrch is preferred for read-only interfaces if available since it
+# reduces kernel cache and FD overhead
+sub isrch { $_[0]->{isrch} // search($_[0]) }
+
 sub over {
         $_[0]->{over} //= eval {
                 my $srch = $_[0]->{search} //= eval {
diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm
new file mode 100644
index 00000000..0ab3b19a
--- /dev/null
+++ b/lib/PublicInbox/Isearch.pm
@@ -0,0 +1,87 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# Provides everything the PublicInbox::Search object does;
+# but uses global ExtSearch (->ALL) with an eidx_key query to
+# emulate per-Inbox search using ->ALL.
+package PublicInbox::Isearch;
+use strict;
+use v5.10.1;
+use PublicInbox::ExtSearch;
+use PublicInbox::Search;
+
+sub new {
+        my (undef, $ibx, $es) = @_;
+        bless { es => $es, eidx_key => $ibx->eidx_key }, __PACKAGE__;
+}
+
+sub mset {
+        my ($self, $str, $opt) = @_;
+        $self->{es}->mset($str, { $opt ? %$opt : (),
+                                eidx_key => $self->{eidx_key} });
+}
+
+sub _ibx_id ($) {
+        my ($self) = @_;
+        my $sth = $self->{es}->over->dbh->prepare_cached(<<'', undef, 1);
+SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1
+
+        $sth->execute($self->{eidx_key});
+        $sth->fetchrow_array //
+                die "E: `$self->{eidx_key}' not in $self->{es}->{topdir}\n";
+}
+
+sub mset_to_artnums {
+        my ($self, $mset) = @_;
+        my $docids = PublicInbox::Search::mset_to_artnums($self->{es}, $mset);
+        my $ibx_id = $self->{-ibx_id} //= _ibx_id($self);
+        my $qmarks = join(',', map { '?' } @$docids);
+        my $rows = $self->{es}->over->dbh->
+                        selectall_arrayref(<<"", undef, $ibx_id, @$docids);
+SELECT docid,xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks)
+
+        my $i = -1;
+        my %order = map { $_ => ++$i } @$docids;
+        my @xnums;
+        for my $row (@$rows) { # @row = ($docid, $xnum)
+                my $idx = delete($order{$row->[0]}) // next;
+                $xnums[$idx] = $row->[1];
+        }
+        if (scalar keys %order) {
+                warn "W: $self->{es}->{topdir} #",
+                        join(', #', sort keys %order),
+                        " not mapped to `$self->{eidx_key}'\n";
+                warn "W: $self->{es}->{topdir} may need to be reindexed\n";
+                @xnums = grep { defined } @xnums;
+        }
+        \@xnums;
+}
+
+sub mset_to_smsg {
+        my ($self, $ibx, $mset) = @_; # $ibx is a real inbox, not eidx
+        my $xnums = mset_to_artnums($self, $mset);
+        my $i = -1;
+        my %order = map { $_ => ++$i } @$xnums;
+        my $unordered = $ibx->over->get_all(@$xnums);
+        my @msgs;
+        for my $smsg (@$unordered) {
+                my $idx = delete($order{$smsg->{num}}) // do {
+                        warn "W: $ibx->{inboxdir} #$smsg->{num}\n";
+                        next;
+                };
+                $msgs[$idx] = $smsg;
+        }
+        if (scalar keys %order) {
+                warn "W: $ibx->{inboxdir} #",
+                        join(', #', sort keys %order),
+                        " no longer valid\n";
+                warn "W: $self->{es}->{topdir} may need to be reindexed\n";
+        }
+        wantarray ? ($mset->get_matches_estimated, \@msgs) : \@msgs;
+}
+
+sub has_threadid { 1 }
+
+sub help { $_[0]->{es}->help }
+
+1;
diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index 22516998..19459150 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -218,7 +218,7 @@ sub results_cb {
                         return $smsg;
                 }
                 # refill result set
-                my $srch = $ctx->{-inbox}->search or return gone($ctx,'search');
+                my $srch = $ctx->{-inbox}->isrch or return gone($ctx, 'search');
                 my $mset = $srch->mset($ctx->{query}, $ctx->{qopts});
                 my $size = $mset->size or return;
                 $ctx->{qopts}->{offset} += $size;
@@ -240,7 +240,7 @@ sub results_thread_cb {
                 next if $over->expand_thread($ctx);
 
                 # refill result set
-                my $srch = $ctx->{-inbox}->search or return gone($ctx,'search');
+                my $srch = $ctx->{-inbox}->isrch or return gone($ctx, 'search');
                 my $mset = $srch->mset($ctx->{query}, $ctx->{qopts});
                 my $size = $mset->size or return;
                 $ctx->{qopts}->{offset} += $size;
@@ -253,7 +253,7 @@ sub mbox_all {
         my ($ctx, $q) = @_;
         my $q_string = $q->{'q'};
         return mbox_all_ids($ctx) if $q_string !~ /\S/;
-        my $srch = $ctx->{-inbox}->search or
+        my $srch = $ctx->{-inbox}->isrch or
                 return PublicInbox::WWW::need($ctx, 'Search');
         my $over = $ctx->{-inbox}->over or
                 return PublicInbox::WWW::need($ctx, 'Overview');
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 7e72913f..ba239255 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -55,7 +55,7 @@ use constant {
 use PublicInbox::Smsg;
 use PublicInbox::Over;
 our $QP_FLAGS;
-our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem);
+our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem Query);
 our $Xap; # 'Search::Xapian' or 'Xapian'
 our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor')
 our $ENQ_ASCENDING;
@@ -331,6 +331,9 @@ sub has_threadid ($) {
 sub _enquire_once { # retry_reopen callback
         my ($self, $query, $opts) = @_;
         my $xdb = xdb($self);
+        if (defined(my $eidx_key = $opts->{eidx_key})) {
+                $query = $X{Query}->new(OP_FILTER(), $query, 'O'.$eidx_key);
+        }
         my $enquire = $X{Enquire}->new($xdb);
         $enquire->set_query($query);
         $opts ||= {};
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 26426c01..f3c96126 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -30,7 +30,7 @@ sub mbox_results {
 
 sub sres_top_html {
         my ($ctx) = @_;
-        my $srch = $ctx->{-inbox}->search or
+        my $srch = $ctx->{-inbox}->isrch or
                 return PublicInbox::WWW::need($ctx, 'Search');
         my $q = PublicInbox::SearchQuery->new($ctx->{qp});
         my $x = $q->{x};
@@ -95,7 +95,7 @@ sub mset_summary {
         my $res = \($ctx->{-html_tip});
         my $ibx = $ctx->{-inbox};
         my $obfs_ibx = $ibx->{obfuscate} ? $ibx : undef;
-        my @nums = @{$ibx->search->mset_to_artnums($mset)};
+        my @nums = @{$ibx->isrch->mset_to_artnums($mset)};
         my %num2msg = map { $_->{num} => $_ } @{$ibx->over->get_all(@nums)};
         my ($min, $max);
 
@@ -201,7 +201,7 @@ sub search_nav_top {
         }
         my $A = $q->qs_html(x => 'A', r => undef);
         $rv .= qq{|<a\nhref="?$A">Atom feed</a>]};
-        if ($ctx->{-inbox}->search->has_threadid) {
+        if ($ctx->{-inbox}->isrch->has_threadid) {
                 $rv .= qq{\n\t\t\tdownload mbox.gz: } .
                         # we set name=z w/o using it since it seems required for
                         # lynx (but works fine for w3m).
@@ -288,7 +288,7 @@ sub mset_thread {
         my ($ctx, $mset, $q) = @_;
         my $ibx = $ctx->{-inbox};
         my @pct = map { get_pct($_) } $mset->items;
-        my $msgs = $ibx->search->mset_to_smsg($ibx, $mset);
+        my $msgs = $ibx->isrch->mset_to_smsg($ibx, $mset);
         my $i = 0;
         $_->{pct} = $pct[$i++] for @$msgs;
         my $r = $q->{r};
@@ -353,7 +353,7 @@ sub ctx_prepare {
 
 sub adump {
         my ($cb, $mset, $q, $ctx) = @_;
-        $ctx->{ids} = $ctx->{-inbox}->search->mset_to_artnums($mset);
+        $ctx->{ids} = $ctx->{-inbox}->isrch->mset_to_artnums($mset);
         $ctx->{search_query} = $q; # used by WwwAtomStream::atom_header
         PublicInbox::WwwAtomStream->response($ctx, 200, \&adump_i);
 }
diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm
index 83f7a4ee..a53f28b1 100644
--- a/lib/PublicInbox/SolverGit.pm
+++ b/lib/PublicInbox/SolverGit.pm
@@ -216,7 +216,7 @@ sub filename_query ($) {
 
 sub find_smsgs ($$$) {
         my ($self, $ibx, $want) = @_;
-        my $srch = $ibx->search or return;
+        my $srch = $ibx->isrch or return;
 
         my $post = $want->{oid_b} or die 'BUG: no {oid_b}';
         $post =~ /\A[a-f0-9]+\z/ or die "BUG: oid_b not hex: $post";
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index cdbcff1e..fc208816 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -462,7 +462,7 @@ sub serve_git {
 sub mbox_results {
         my ($ctx) = @_;
         if ($ctx->{env}->{QUERY_STRING} =~ /(?:\A|[&;])q=/) {
-                $ctx->{-inbox}->search or return need($ctx, 'search');
+                $ctx->{-inbox}->isrch or return need($ctx, 'search');
                 require PublicInbox::SearchView;
                 return PublicInbox::SearchView::mbox_results($ctx);
         }
diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm
index 638f4e27..2527b8ed 100644
--- a/lib/PublicInbox/WwwStream.pm
+++ b/lib/PublicInbox/WwwStream.pm
@@ -54,7 +54,7 @@ sub html_top ($) {
                         qq(<a\nhref="$color">color</a> / ).
                         qq(<a\nhref=#mirror>mirror</a> / ).
                         qq(<a\nhref="$atom">Atom feed</a>);
-        if ($ibx->search) {
+        if ($ibx->isrch) {
                 my $q_val = delete($ctx->{-q_value_html}) // '';
                 $q_val = qq(\nvalue="$q_val") if $q_val ne '';
                 # XXX gross, for SearchView.pm
diff --git a/lib/PublicInbox/WwwText.pm b/lib/PublicInbox/WwwText.pm
index 04c9b1c4..8cc818df 100644
--- a/lib/PublicInbox/WwwText.pm
+++ b/lib/PublicInbox/WwwText.pm
@@ -250,7 +250,7 @@ EOF
 
         # n.b. we use the Xapian DB for any regeneratable,
         # order-of-arrival-independent data.
-        my $srch = $ibx->search;
+        my $srch = $ibx->isrch;
         if ($srch) {
                 $$txt .= <<EOF;
 search
diff --git a/t/extsearch.t b/t/extsearch.t
index 2b8b88ea..97786b21 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -130,9 +130,32 @@ my $es = PublicInbox::ExtSearch->new("$home/extindex");
         is($mset->size, 1, 'new message found');
         $mset = $es->mset('b:"test message"');
         is($mset->size, 1, 'old message found');
-
         delete @$es{qw(git over xdb)}; # fork preparation
 
+        my $pi_cfg = PublicInbox::Config->new;
+        $pi_cfg->fill_all;
+        is(scalar($pi_cfg->ALL->mset('s:Testing')->items), 2,
+                '2 results in ->ALL');
+        my $res = {};
+        my $nr = 0;
+        $pi_cfg->each_inbox(sub {
+                $nr++;
+                my ($ibx) = @_;
+                local $SIG{__WARN__} = sub {}; # FIXME support --reindex
+                my $mset = $ibx->isrch->mset('s:Testing');
+                $res->{$ibx->eidx_key} = $ibx->isrch->mset_to_smsg($ibx, $mset);
+        });
+        is($nr, 2, 'two inboxes');
+        my $exp = {};
+        for my $v (qw(v1 v2)) {
+                my $ibx = $pi_cfg->lookup_newsgroup("$v.example");
+                my $smsg = $ibx->over->get_art(1);
+                $smsg->psgi_cull;
+                $exp->{"$v.example"} = [ $smsg ];
+        }
+        is_deeply($res, $exp, 'isearch limited results');
+        $pi_cfg = $res = $exp = undef;
+
         open my $rmfh, '+>', undef or BAIL_OUT $!;
         $rmfh->autoflush(1);
         print $rmfh $eml2->as_string or BAIL_OUT $!;