about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2020-12-27 20:02:51 +0000
committer Eric Wong <e@80x24.org> 2020-12-31 13:20:54 +0000
commit 08de05443804120a2663aa3611c47c84a18e0c35 (patch)
tree abfd80cfba197d6a38b0cfa52c17ca3adaf22ed9 /lib
parent 12583f45f29f3acd6cd704df9a7e5aaff5acc3f7 (diff)
download public-inbox-08de05443804120a2663aa3611c47c84a18e0c35.tar.gz
While a single extindex combines multiple inboxes into a single
search index, extindex still requires up-front indexing on items
which can be searched.  XSearch has no on-disk footprint itself
and uses Xapian DBs of existing publicinbox and extindex
("extinbox") exclusively.

XSearch still suffers from the multi-shard Xapian scalability
problems which led to the creation of extindex, but I expect the
number of shards to remain relatively low.

I envision users hosting public-inbox instances on their
workstations will only have two extindexes combined by this: one
read-only extindex for serving public archives, and one
read-write extindex managed by LeiStore for private mail.
Diffstat (limited to 'lib')
-rw-r--r-- lib/PublicInbox/LeiSearch.pm 14
-rw-r--r-- lib/PublicInbox/LeiXSearch.pm 72
-rw-r--r-- lib/PublicInbox/Search.pm 19
3 files changed, 85 insertions, 20 deletions
diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm
index 66c16e04..0b962b11 100644
--- a/lib/PublicInbox/LeiSearch.pm
+++ b/lib/PublicInbox/LeiSearch.pm
@@ -7,20 +7,18 @@ use v5.10.1;
 use parent qw(PublicInbox::ExtSearch);
 use PublicInbox::Search;
 
-sub combined_docid ($$) {
+# get combined docid from over.num:
+# (not generic Xapian, only works with our sharding scheme)
+sub num2docid ($$) {
         my ($self, $num) = @_;
-        ($num - 1) * $self->{nshard} + 1;
+        my $nshard = $self->{nshard};
+        ($num - 1) * $nshard + $num % $nshard + 1;
 }
 
 sub msg_keywords {
         my ($self, $num) = @_; # num_or_mitem
         my $xdb = $self->xdb; # set {nshard};
-        my $docid = ref($num) ? $num->get_docid : do {
-                # get combined docid from over.num:
-                # (not generic Xapian, only works with our sharding scheme)
-                my $nshard = $self->{nshard};
-                ($num - 1) * $nshard + $num % $nshard + 1;
-        };
+        my $docid = ref($num) ? $num->get_docid : num2docid($self, $num);
         my %kw;
         eval {
                 my $end = $xdb->termlist_end($docid);
diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm
new file mode 100644
index 00000000..1a81b14a
--- /dev/null
+++ b/lib/PublicInbox/LeiXSearch.pm
@@ -0,0 +1,72 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# Combine any combination of PublicInbox::Search,
+# PublicInbox::ExtSearch, and PublicInbox::LeiSearch objects
+# into one Xapian DB
+package PublicInbox::LeiXSearch;
+use strict;
+use v5.10.1;
+use parent qw(PublicInbox::LeiSearch);
+
+sub new {
+        my ($class) = @_;
+        PublicInbox::Search::load_xapian();
+        bless {
+                qp_flags => $PublicInbox::Search::QP_FLAGS |
+                                PublicInbox::Search::FLAG_PURE_NOT(),
+        }, $class
+}
+
+sub attach_extinbox {
+        my ($self, $ibxish) = @_; # ibxish = ExtSearch or Inbox
+        if (!$ibxish->can('over')) {
+                push @{$self->{remotes}}, $ibxish
+        }
+        if (delete $self->{xdb}) { # XXX: do we need this?
+                # clobber existing {xdb} if amending
+                my $expect = delete $self->{nshard};
+                my $shards = delete $self->{shards_flat};
+                scalar(@$shards) == $expect or die
+                        "BUG: {nshard}$expect != shards=".scalar(@$shards);
+
+                my $prev = {};
+                for my $old_ibxish (@{$self->{shard2ibx}}) {
+                        next if $prev == $old_ibxish;
+                        $prev = $old_ibxish;
+                        my @shards = $old_ibxish->search->xdb_shards_flat;
+                        push @{$self->{shards_flat}}, @shards;
+                }
+                my $nr = scalar(@{$self->{shards_flat}});
+                $nr == $expect or die
+                        "BUG: reloaded $nr shards, expected $expect"
+        }
+        my @shards = $ibxish->search->xdb_shards_flat;
+        push @{$self->{shards_flat}}, @shards;
+        push(@{$self->{shard2ibx}}, $ibxish) for (@shards);
+}
+
+# called by PublicInbox::Search::xdb
+sub xdb_shards_flat { @{$_[0]->{shards_flat}} }
+
+# like over->get_art
+sub smsg_for {
+        my ($self, $mitem) = @_;
+        # cf. https://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID
+        my $nshard = $self->{nshard};
+        my $docid = $mitem->get_docid;
+        my $shard = ($docid - 1) % $nshard;
+        my $num = int(($docid - 1) / $nshard) + 1;
+        my $smsg = $self->{shard2ibx}->[$shard]->over->get_art($num);
+        $smsg->{docid} = $docid;
+        $smsg;
+}
+
+sub recent {
+        my ($self, $qstr, $opt) = @_;
+        $opt //= {};
+        $opt->{relevance} //= -2;
+        $self->mset($qstr //= 'bytes:1..', $opt);
+}
+
+1;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index bbc5e32f..bca2036c 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -196,6 +196,7 @@ sub xdb_shards_flat ($) {
         my ($self) = @_;
         my $xpfx = $self->{xpfx};
         my (@xdb, $slow_phrase);
+        load_xapian();
         if ($xpfx =~ m/xapian${\SCHEMA_VERSION}\z/) {
                 @xdb = ($X{Database}->new($xpfx));
                 $self->{qp_flags} |= FLAG_PHRASE() if !-f "$xpfx/iamchert";
@@ -214,16 +215,6 @@ sub xdb_shards_flat ($) {
         @xdb;
 }
 
-sub _xdb {
-        my ($self) = @_;
-        $self->{qp_flags} //= $QP_FLAGS;
-        my @xdb = xdb_shards_flat($self) or return;
-        $self->{nshard} = scalar(@xdb);
-        my $xdb = shift @xdb;
-        $xdb->add_database($_) for @xdb;
-        $xdb;
-}
-
 # v2 Xapian docids don't conflict, so they're identical to
 # NNTP article numbers and IMAP UIDs.
 # https://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID
@@ -242,8 +233,12 @@ sub mset_to_artnums {
 sub xdb ($) {
         my ($self) = @_;
         $self->{xdb} //= do {
-                load_xapian();
-                $self->_xdb;
+                $self->{qp_flags} //= $QP_FLAGS;
+                my @xdb = $self->xdb_shards_flat or return;
+                $self->{nshard} = scalar(@xdb);
+                my $xdb = shift @xdb;
+                $xdb->add_database($_) for @xdb;
+                $xdb;
         };
 }