about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2020-11-23 07:05:51 +0000
committer Eric Wong <e@80x24.org> 2020-11-24 05:03:55 +0000
commit 58e764d179131b0fba8590915e2528248be26329 (patch)
tree 18a24739cb8bcc1ec4704772475e8e2dcb225453 /lib
parent d792a6e8029e4fe56977c5c5d76bae8fe8836cc4 (diff)
download public-inbox-58e764d179131b0fba8590915e2528248be26329.tar.gz
This will be used to index and search Inbox objects and perhaps
individual git repositories/epochs for grokmirror manifest.js.gz
generation.  There is no sharding planned for this at the moment
since inbox count should remain low (~100K to 1M) compared to
message count.

Folding this into the existing sharded DBs could be possible;
but would likely increase query and maintenance costs, as well
as development complexity.  So we'll use a few more inodes and
FDs at runtime, instead.
Diffstat (limited to 'lib')
-rw-r--r-- lib/PublicInbox/ExtSearch.pm 6
-rw-r--r-- lib/PublicInbox/ExtSearchIdx.pm 11
-rw-r--r-- lib/PublicInbox/MiscIdx.pm 107
-rw-r--r-- lib/PublicInbox/MiscSearch.pm 79
-rw-r--r-- lib/PublicInbox/Search.pm 8
-rw-r--r-- lib/PublicInbox/SearchIdx.pm 7
-rw-r--r-- lib/PublicInbox/V2Writable.pm 5
7 files changed, 215 insertions, 8 deletions
diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm
index eb665027..c41ae443 100644
--- a/lib/PublicInbox/ExtSearch.pm
+++ b/lib/PublicInbox/ExtSearch.pm
@@ -10,6 +10,7 @@ use v5.10.1;
 use PublicInbox::Over;
 use PublicInbox::Inbox;
 use File::Spec ();
+use PublicInbox::MiscSearch;
 
 # for ->reopen, ->mset, ->mset_to_artnums
 use parent qw(PublicInbox::Search);
@@ -24,6 +25,11 @@ sub new {
         }, __PACKAGE__;
 }
 
+sub misc { # lazily opens (and caches) the read-only misc index under xpfx
+        my $self = shift;
+        $self->{misc} //= PublicInbox::MiscSearch->new($self->{xpfx}.'/misc');
+}
+
 sub search { $_[0] } # self
 
 # overrides PublicInbox::Search::_xdb
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 91434b26..708f8a3e 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -21,6 +21,7 @@ use Carp qw(croak carp);
 use PublicInbox::Search;
 use PublicInbox::SearchIdx qw(crlf_adjust prepare_stack is_ancestor);
 use PublicInbox::OverIdx;
+use PublicInbox::MiscIdx;
 use PublicInbox::MID qw(mids);
 use PublicInbox::V2Writable;
 use PublicInbox::InboxWritable;
@@ -309,6 +310,7 @@ sub _sync_inbox ($$$) {
                 return;
         }
         index_todo($self, $sync, $_) for @{delete($sync->{todo}) // []};
+        $self->{midx}->index_ibx($ibx);
 }
 
 sub eidx_sync { # main entry point
@@ -374,6 +376,12 @@ sub update_last_commit { # overrides V2Writable
         $self->{oidx}->eidx_meta($meta_key, $latest_cmt);
 }
 
+sub _idx_init { # with_umask callback
+        my ($self, $opt) = @_;
+        PublicInbox::V2Writable::_idx_init($self, $opt); # V2Writable setup first
+        $self->{midx} = PublicInbox::MiscIdx->new($self); # then the misc indexer
+}
+
 sub idx_init { # similar to V2Writable
         my ($self, $opt) = @_;
         return if $self->{idx_shards};
@@ -406,9 +414,10 @@ sub idx_init { # similar to V2Writable
         }
         $self->parallel_init($self->{indexlevel});
         $self->umask_prepare;
-        $self->with_umask(\&PublicInbox::V2Writable::_idx_init, $self, $opt);
+        $self->with_umask(\&_idx_init, $self, $opt);
         $self->{oidx}->begin_lazy;
         $self->{oidx}->eidx_prep;
+        $self->{midx}->begin_txn;
 }
 
 no warnings 'once';
diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm
new file mode 100644
index 00000000..edc70f9b
--- /dev/null
+++ b/lib/PublicInbox/MiscIdx.pm
@@ -0,0 +1,107 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# like PublicInbox::SearchIdx, but indexes things other than mail messages.
+# Things indexed include:
+# * inboxes themselves
+# * epoch information
+# * (maybe) git code repository information
+# Expect ~100K-1M documents with no parallelism opportunities,
+# so no sharding, here.
+#
+# See MiscSearch for read-only counterpart
+package PublicInbox::MiscIdx;
+use strict;
+use v5.10.1;
+use PublicInbox::InboxWritable;
+use PublicInbox::Search; # for SWIG Xapian and Search::Xapian compat
+use PublicInbox::SearchIdx qw(index_text term_generator add_val);
+use PublicInbox::Spawn qw(nodatacow_dir);
+use Carp qw(croak);
+use File::Path ();
+use PublicInbox::MiscSearch;
+
+sub new { # ($class, $eidx) - prepares "$eidx->{xpfx}/misc", opens nothing yet
+        my ($class, $eidx) = @_;
+        PublicInbox::SearchIdx::load_xapian_writable();
+        my $self = bless { indexlevel => 'full' }, $class; # no medium: small DB
+        my $dir = $self->{mi_dir} = "$eidx->{xpfx}/misc";
+        File::Path::mkpath($dir);
+        nodatacow_dir($dir);
+        # the flags are only consumed later, by begin_txn
+        $self->{flags} = $PublicInbox::SearchIdx::DB_CREATE_OR_OPEN;
+        if ($eidx->{-no_fsync}) { # caller asked to skip fsync
+                $self->{flags} |= $PublicInbox::SearchIdx::DB_NO_SYNC;
+        }
+        $self;
+}
+
+sub begin_txn { # open the writable DB in mi_dir, then start a transaction
+        my $self = shift;
+        $self->{xdb} and croak 'BUG: already in txn'; # XXX make lazy?
+        my ($dir, $flags) = @$self{qw(mi_dir flags)};
+        my $cls = $PublicInbox::Search::X{WritableDatabase};
+        my $xdb = eval { $cls->new($dir, $flags) };
+        croak "Failed opening $dir: $@" if $@;
+        ($self->{xdb} = $xdb)->begin_transaction;
+}
+
+sub commit_txn { # commit, dropping our reference to the writable DB
+        my $self = shift;
+        $self->{xdb} or croak 'BUG: not in txn'; # XXX make lazy?
+        delete($self->{xdb})->commit_transaction;
+}
+
+sub index_ibx { # (re)index one inbox as a single document keyed by eidx_key
+        my ($self, $ibx) = @_;
+        my $eidx_key = $ibx->eidx_key;
+        my $xdb = $self->{xdb}; # only set between begin_txn and commit_txn
+        # Q = uniQue in Xapian terminology
+        my $head = $xdb->postlist_begin('Q'.$eidx_key);
+        my $tail = $xdb->postlist_end('Q'.$eidx_key);
+        my ($docid, @drop);
+        for (; $head != $tail; $head++) { # every document carrying this Q term
+                if (defined $docid) { # a second (or later) match: duplicate
+                        my $i = $head->get_docid;
+                        push @drop, $i;
+                        warn <<EOF;
+W: multiple inboxes keyed to `$eidx_key', deleting #$i
+EOF
+                } else {
+                        $docid = $head->get_docid; # first match is replaced below
+                }
+        }
+        $xdb->delete_document($_) for @drop; # just in case
+
+        my $doc = $PublicInbox::Search::X{Document}->new;
+
+        # allow sorting by modified
+        add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified);
+
+        $doc->add_boolean_term('Q'.$eidx_key);
+        $doc->add_boolean_term('T'.'inbox'); # matched by "type:inbox" in MiscSearch
+        term_generator($self)->set_document($doc);
+
+        # description = S/Subject (or title)
+        # address = A/Author
+        index_text($self, $ibx->description, 1, 'S');
+        my %map = ( # $ibx field name => term prefix
+                address => 'A',
+                listid => 'XLISTID',
+                infourl => 'XINFOURL',
+                url => 'XURL'
+        );
+        while (my ($f, $pfx) = each %map) {
+                for my $v (@{$ibx->{$f} // []}) { # missing field: nothing indexed
+                        index_text($self, $v, 1, $pfx);
+                }
+        }
+        index_text($self, $ibx->{name}, 1, 'XNAME');
+        if (defined $docid) { # already indexed: overwrite under the same docid
+                $xdb->replace_document($docid, $doc);
+        } else {
+                $xdb->add_document($doc);
+        }
+}
+
+1;
diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm
new file mode 100644
index 00000000..8beb8349
--- /dev/null
+++ b/lib/PublicInbox/MiscSearch.pm
@@ -0,0 +1,79 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# read-only counterpart to MiscIdx
+package PublicInbox::MiscSearch;
+use strict;
+use v5.10.1;
+use PublicInbox::Search qw(retry_reopen);
+
+# Xapian value columns:
+our $MODIFIED = 0; # set by MiscIdx::index_ibx, sorted on by misc_enquire_once
+
+# avoid conflicting with message Search::prob_prefix for UI/UX reasons
+my %PROB_PREFIX = (
+        description => 'S', # $INBOX_DIR/description
+        address => 'A',
+        listid => 'XLISTID',
+        url => 'XURL',
+        infourl => 'XINFOURL',
+        name => 'XNAME',
+        '' => 'S A XLISTID XNAME XURL XINFOURL' # unprefixed terms: all of the above
+);
+
+sub new { # ($class, $dir): opens the read-only Xapian DB at $dir
+        my ($class, $dir) = @_;
+        # $X{Database} is 0 until Xapian is loaded; no-op if already loaded
+        PublicInbox::Search::load_xapian();
+        bless { xdb => $PublicInbox::Search::X{Database}->new($dir) }, $class;
+}
+
+# read-only
+sub mi_qp_new ($) { # builds the QueryParser for the misc index
+        my ($self) = @_;
+        my $xdb = $self->{xdb};
+        my $qp = $PublicInbox::Search::X{QueryParser}->new;
+        $qp->set_default_op(PublicInbox::Search::OP_AND());
+        $qp->set_database($xdb);
+        $qp->set_stemmer(PublicInbox::Search::stemmer($self));
+        $qp->set_stemming_strategy(PublicInbox::Search::STEM_SOME());
+        my $cb = $qp->can('set_max_wildcard_expansion') //
+                $qp->can('set_max_expansion'); # Xapian 1.5.0+
+        $cb->($qp, 100);
+        # no range processor is registered: the only value column,
+        # $MODIFIED, is used for sorting and has no query prefix
+        for my $name (keys %PROB_PREFIX) { # keys: no shared `each' iterator
+                $qp->add_prefix($name, $_) for split(/ /, $PROB_PREFIX{$name});
+        }
+        $qp->add_boolean_prefix('type', 'T');
+        $qp;
+}
+
+sub misc_enquire_once { # retry_reopen callback
+        my ($self, $qr, $opt) = @{$_[0]}; # arrayref assembled by mset
+        my $eq = $PublicInbox::Search::X{Enquire}->new($self->{xdb});
+        $eq->set_query($qr);
+        my $desc = !$opt->{asc}; # descending unless {asc} is true
+        my $rel = $opt->{relevance} // 0;
+        if ($rel == -1) { # ORDER BY docid/UID
+                $eq->set_docid_order($PublicInbox::Search::ENQ_ASCENDING);
+                $eq->set_weighting_scheme($PublicInbox::Search::X{BoolWeight}->new);
+        } elsif ($rel) { # relevance first, $MODIFIED breaks ties
+                $eq->set_sort_by_relevance_then_value($MODIFIED, $desc);
+        } else { # $MODIFIED first, relevance breaks ties
+                $eq->set_sort_by_value_then_relevance($MODIFIED, $desc);
+        }
+        $eq->get_mset($opt->{offset} || 0, $opt->{limit} || 200); # default: first 200
+}
+
+sub mset { # ($self, $qs, $opt); an empty $qs is treated as "type:inbox"
+        my ($self, $qs, $opt) = @_;
+        $opt = {} unless $opt;
+        $self->{qp} //= mi_qp_new($self);
+        $qs eq '' and $qs = 'type:inbox';
+        my $qr = $self->{qp}->parse_query($qs, $PublicInbox::Search::QP_FLAGS);
+        exists($opt->{relevance}) or $opt->{relevance} = 1;
+        retry_reopen($self, \&misc_enquire_once, [ $self, $qr, $opt ]);
+}
+
+1;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 71417d5e..05d5a133 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -6,7 +6,7 @@
 package PublicInbox::Search;
 use strict;
 use parent qw(Exporter);
-our @EXPORT_OK = qw(mdocid);
+our @EXPORT_OK = qw(mdocid retry_reopen);
 use List::Util qw(max);
 
 # values for searching, changing the numeric value breaks
@@ -54,11 +54,11 @@ use constant {
 
 use PublicInbox::Smsg;
 use PublicInbox::Over;
-my $QP_FLAGS;
+our $QP_FLAGS;
 our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem);
 our $Xap; # 'Search::Xapian' or 'Xapian'
-my $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor')
-my $ENQ_ASCENDING;
+our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor')
+our $ENQ_ASCENDING;
 
 sub load_xapian () {
         return 1 if defined $Xap;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 6ff2cf94..18390602 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -22,9 +22,10 @@ use PublicInbox::OverIdx;
 use PublicInbox::Spawn qw(spawn nodatacow_dir);
 use PublicInbox::Git qw(git_unquote);
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
-our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack);
+our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack
+        index_text term_generator add_val);
 my $X = \%PublicInbox::Search::X;
-my ($DB_CREATE_OR_OPEN, $DB_OPEN);
+our ($DB_CREATE_OR_OPEN, $DB_OPEN);
 our $DB_NO_SYNC = 0;
 our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff : 1_000_000;
 use constant DEBUG => !!$ENV{DEBUG};
@@ -154,7 +155,7 @@ sub term_generator ($) { # write-only
 
         $self->{term_generator} //= do {
                 my $tg = $X->{TermGenerator}->new;
-                $tg->set_stemmer($self->stemmer);
+                $tg->set_stemmer(PublicInbox::Search::stemmer($self));
                 $tg;
         }
 }
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index ba7cef13..afba0220 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -631,6 +631,9 @@ sub checkpoint ($;$) {
                         $_->shard_commit for @$shards;
                 }
 
+                my $midx = $self->{midx}; # misc index
+                $midx->commit_txn if $midx;
+
                 # last_commit is special, don't commit these until
                 # Xapian shards are done:
                 $dbh->begin_work if $dbh;
@@ -639,6 +642,7 @@ sub checkpoint ($;$) {
                         $dbh->commit;
                         $dbh->begin_work;
                 }
+                $midx->begin_txn if $midx;
         }
         $self->{total_bytes} += $self->{transact_bytes};
         $self->{transact_bytes} = 0;
@@ -678,6 +682,7 @@ sub done {
         }
         eval { $self->{oidx}->dbh_close };
         $err .= "over close: $@\n" if $@;
+        delete $self->{midx};
         delete $self->{bnote};
         my $nbytes = $self->{total_bytes};
         $self->{total_bytes} = 0;