about summary refs log tree commit homepage
path: root/lib/PublicInbox/SearchIdxPart.pm
diff options
context:
space:
mode:
author Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-02-22 01:49:08 +0000
committer Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-02-22 18:33:46 +0000
commit 9ecbfc09928dada28094fd3fc79e91a5472b27ea (patch)
tree a829ab7765f45e139e8a9d5de1c3784fc26bbf69 /lib/PublicInbox/SearchIdxPart.pm
parent a81ad9c4b1b5d8c2ae8444b6dcb8710bd361f628 (diff)
download public-inbox-9ecbfc09928dada28094fd3fc79e91a5472b27ea.tar.gz
The parallelization requires splitting Msgmap, text+term
indexing, and thread-linking out into separate processes.

git-fast-import is fast, so we don't bother parallelizing it.

Msgmap (SQLite) and thread-linking (Xapian) must be serialized
because they rely on monotonically increasing numbers (NNTP
article number and internal thread_id, respectively).

We handle msgmap in the main process which drives fast-import.
When the article number is retrieved/generated, we write the
entire message to per-partition subprocesses via pipes for
expensive text+term indexing.

When these per-partition subprocesses are done with the
expensive text+term indexing, they write SearchMsg (small data)
to a shared pipe (inherited from the main V2Writable process)
back to the threader, which runs its own subprocess.

The number of text+term Xapian partitions is chosen at import
and can be made equal to the number of cores in a machine.

V2Writable --> Import -> git-fast-import
           \-> SearchIdxThread -> Msgmap (synchronous)
           \-> SearchIdxPart[n] -> SearchIdx[*]
           \-> SearchIdxThread -> SearchIdx ("threader", a subprocess)

[* ] each subprocess writes to threader
Diffstat (limited to 'lib/PublicInbox/SearchIdxPart.pm')
-rw-r--r-- lib/PublicInbox/SearchIdxPart.pm 70
1 files changed, 70 insertions, 0 deletions
diff --git a/lib/PublicInbox/SearchIdxPart.pm b/lib/PublicInbox/SearchIdxPart.pm
new file mode 100644
index 00000000..d5a3fd17
--- /dev/null
+++ b/lib/PublicInbox/SearchIdxPart.pm
@@ -0,0 +1,70 @@
+# Copyright (C) 2018 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+package PublicInbox::SearchIdxPart;
+use strict;
+use warnings;
+use base qw(PublicInbox::SearchIdx);
+
+# Constructor: builds the SearchIdx object for partition $part and forks
+# a worker process for it.
+#
+# Arguments:
+#   $v2writable - the owning writer; its {-inbox} is handed to the parent
+#                 class constructor and its {idx_parts} lists the
+#                 partitions created so far
+#   $part       - partition identifier, passed through to SUPER::new
+#   $threader   - stored in $self->{threader}
+#
+# In the parent, returns $self with {pid} (worker PID) and {w} (write
+# end of the pipe to the worker) set.  The child never returns: it runs
+# partition_worker_loop on the read end of the pipe and then exits.
+# Dies if pipe() or fork() fails.
+sub new {
+        my ($class, $v2writable, $part, $threader) = @_;
+        my $self = $class->SUPER::new($v2writable->{-inbox}, 1, $part);
+        $self->{threader} = $threader;
+        pipe(my $r, my $w) or die "pipe failed: $!\n";
+        my $pid = fork;
+        die "fork failed: $!\n" unless defined $pid;
+
+        if ($pid) {
+                # parent: keep only the write end and remember the worker
+                $self->{pid} = $pid;
+                $self->{w} = $w;
+                close $r;
+                return $self;
+        }
+
+        # child: drop the inherited write ends of sibling partitions so
+        # this process does not hold their pipes open
+        foreach my $sibling (@{$v2writable->{idx_parts}}) {
+                my $sibling_w = $sibling->{w};
+                next unless $sibling_w;
+                close $sibling_w or die "close other failed: $!\n";
+        }
+        undef $v2writable;
+        close $w;
+        eval { partition_worker_loop($self, $r) };
+        $@ and die "worker $part died: $@\n";
+        $self->{mm} and die "unexpected MM $self->{mm}";
+        exit;
+}
+
+# Main loop of a partition worker: consumes commands from the pipe $r
+# until EOF.
+#
+# Each line read from $r is one of:
+#   "commit\n"               - commit the transaction if one is open
+#   "close\n"                - release the DB handle via _xdb_release
+#   "LEN ARTNUM OBJECT_ID\n" - header followed by exactly LEN bytes of
+#                              raw message (the format index_raw writes)
+#
+# The DB handle and the transaction are re-acquired lazily when a message
+# arrives after a "close" or "commit".  Dies if the message body cannot be
+# read in full.  Warns at EOF if a transaction is still open or the DB
+# handle was never released.
+#
+# No prototype is declared: `new' calls this sub before its definition
+# is compiled, so a prototype could not be checked at that call site and
+# would only trigger a "called too early to check prototype" warning.
+sub partition_worker_loop {
+        my ($self, $r) = @_;
+        my $xdb = $self->_xdb_acquire;
+        $xdb->begin_transaction;
+        my $txn = 1;
+        while (my $line = $r->getline) {
+                if ($line eq "commit\n") {
+                        $xdb->commit_transaction if $txn;
+                        $txn = undef;
+                } elsif ($line eq "close\n") {
+                        $self->_xdb_release;
+                        $xdb = $txn = undef;
+                } else {
+                        # strip the line terminator so the last field
+                        # ($object_id) does not carry a trailing "\n"
+                        chomp $line;
+                        my ($len, $artnum, $object_id) = split(/ /, $line);
+                        $xdb ||= $self->_xdb_acquire;
+                        if (!$txn) {
+                                $xdb->begin_transaction;
+                                $txn = 1;
+                        }
+                        my $n = read($r, my $msg, $len) or die "read: $!\n";
+                        $n == $len or die "short read: $n != $len\n";
+                        my $mime = PublicInbox::MIME->new(\$msg);
+                        $self->index_blob($mime, $len, $artnum, $object_id);
+                }
+        }
+        warn "$$ still in transaction\n" if $txn;
+        warn "$$ xdb active\n" if $xdb;
+}
+
+# called by V2Writable
+# Sends one raw message to this partition's worker over $self->{w}.
+# Writes a "LEN ARTNUM OBJECT_ID\n" header line followed by the message
+# bytes referenced by $msgref; dies if the write fails.
+sub index_raw {
+        my ($self, $len, $msgref, $artnum, $object_id) = @_;
+        my $pipe = $self->{w};
+        my $header = "$len $artnum $object_id\n";
+        print {$pipe} $header, $$msgref
+                or die "failed to write partition $!\n";
+}
+
+1;