From 9ecbfc09928dada28094fd3fc79e91a5472b27ea Mon Sep 17 00:00:00 2001
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
Date: Thu, 22 Feb 2018 01:49:08 +0000
Subject: v2: parallelize Xapian indexing

The parallelization requires splitting Msgmap, text+term
indexing, and thread-linking out into separate processes.

git-fast-import is fast, so we don't bother parallelizing it.

Msgmap (SQLite) and thread-linking (Xapian) must be serialized
because they rely on monotonically increasing numbers (NNTP
article number and internal thread_id, respectively).

We handle msgmap in the main process which drives fast-import.
When the article number is retrieved/generated, we write the
entire message to per-partition subprocesses via pipes for
expensive text+term indexing.

When these per-partition subprocesses are done with the
expensive text+term indexing, they write SearchMsg (small data)
to a shared pipe (inherited from the main V2Writable process)
back to the threader, which runs its own subprocess.

The number of text+term Xapian partitions is chosen at import
and can be made equal to the number of cores in a machine.

V2Writable --> Import -> git-fast-import
           \-> SearchIdxThread -> Msgmap (synchronous)
           \-> SearchIdxPart[n] -> SearchIdx[*]
           \-> SearchIdxThread -> SearchIdx ("threader", a subprocess)

[*] each per-partition subprocess writes to the threader
---
 lib/PublicInbox/SearchMsg.pm | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'lib/PublicInbox/SearchMsg.pm')

diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 25c1abb8..941bfd24 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -29,19 +29,24 @@ sub get_val ($$) {
 	Search::Xapian::sortable_unserialise($doc->get_value($col));
 }
 
-sub load_expand {
-	my ($self) = @_;
-	my $doc = $self->{doc};
-	my $data = $doc->get_data or return;
-	$self->{ts} = get_val($doc, &PublicInbox::Search::TS);
-	utf8::decode($data);
-	my ($subj, $from, $refs, $to, $cc, $blob) = split(/\n/, $data);
+sub load_from_data ($$) {
+	my ($self) = $_[0]; # data = $_[1]
+	my ($subj, $from, $refs, $to, $cc, $blob) = split(/\n/, $_[1]);
 	$self->{subject} = $subj;
 	$self->{from} = $from;
 	$self->{references} = $refs;
 	$self->{to} = $to;
 	$self->{cc} = $cc;
 	$self->{blob} = $blob;
+}
+
+sub load_expand {
+	my ($self) = @_;
+	my $doc = $self->{doc};
+	my $data = $doc->get_data or return;
+	$self->{ts} = get_val($doc, &PublicInbox::Search::TS);
+	utf8::decode($data);
+	load_from_data($self, $data);
 	$self;
 }
 
@@ -50,17 +55,9 @@ sub load_doc {
 	my $data = $doc->get_data or return;
 	my $ts = get_val($doc, &PublicInbox::Search::TS);
 	utf8::decode($data);
-	my ($subj, $from, $refs, $to, $cc, $blob) = split(/\n/, $data);
-	bless {
-		doc => $doc,
-		subject => $subj,
-		ts => $ts,
-		from => $from,
-		references => $refs,
-		to => $to,
-		cc => $cc,
-		blob => $blob,
-	}, $class;
+	my $self = bless { doc => $doc, ts => $ts }, $class;
+	load_from_data($self, $data);
+	$self
 }
 
 # :bytes and :lines metadata in RFC 3977
-- 
cgit v1.2.3-24-ge0c7