From 9ecbfc09928dada28094fd3fc79e91a5472b27ea Mon Sep 17 00:00:00 2001
From: "Eric Wong (Contractor, The Linux Foundation)"
Date: Thu, 22 Feb 2018 01:49:08 +0000
Subject: v2: parallelize Xapian indexing

The parallelization requires splitting Msgmap, text+term indexing,
and thread-linking out into separate processes.

git-fast-import is fast, so we don't bother parallelizing it.

Msgmap (SQLite) and thread-linking (Xapian) must be serialized
because they rely on monotonically increasing numbers (NNTP
article number and internal thread_id, respectively).

We handle msgmap in the main process which drives fast-import.
When the article number is retrieved/generated, we write the
entire message to per-partition subprocesses via pipes for
expensive text+term indexing.

When these per-partition subprocesses are done with the
expensive text+term indexing, they write SearchMsg (small data)
to a shared pipe (inherited from the main V2Writable process)
back to the threader, which runs its own subprocess.

The number of text+term Xapian partitions is chosen at import
and can be made equal to the number of cores in a machine.

V2Writable --> Import -> git-fast-import
           \-> SearchIdxThread -> Msgmap (synchronous)
           \-> SearchIdxPart[n] -> SearchIdx[*]
           \-> SearchIdxThread -> SearchIdx ("threader", a subprocess)

[*] each subprocess writes to threader
---
 lib/PublicInbox/SearchMsg.pm | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'lib/PublicInbox/SearchMsg.pm')

diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 25c1abb8..941bfd24 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -29,19 +29,24 @@ sub get_val ($$) {
 	Search::Xapian::sortable_unserialise($doc->get_value($col));
 }
 
-sub load_expand {
-	my ($self) = @_;
-	my $doc = $self->{doc};
-	my $data = $doc->get_data or return;
-	$self->{ts} = get_val($doc, &PublicInbox::Search::TS);
-	utf8::decode($data);
-	my ($subj, $from, $refs, $to, $cc, $blob) = split(/\n/, $data);
+sub load_from_data ($$) {
+	my ($self) = $_[0]; # data = $_[1]
+	my ($subj, $from, $refs, $to, $cc, $blob) = split(/\n/, $_[1]);
 	$self->{subject} = $subj;
 	$self->{from} = $from;
 	$self->{references} = $refs;
 	$self->{to} = $to;
 	$self->{cc} = $cc;
 	$self->{blob} = $blob;
+}
+
+sub load_expand {
+	my ($self) = @_;
+	my $doc = $self->{doc};
+	my $data = $doc->get_data or return;
+	$self->{ts} = get_val($doc, &PublicInbox::Search::TS);
+	utf8::decode($data);
+	load_from_data($self, $data);
 	$self;
 }
 
@@ -50,17 +55,9 @@ sub load_doc {
 	my $data = $doc->get_data or return;
 	my $ts = get_val($doc, &PublicInbox::Search::TS);
 	utf8::decode($data);
-	my ($subj, $from, $refs, $to, $cc, $blob) = split(/\n/, $data);
-	bless {
-		doc => $doc,
-		subject => $subj,
-		ts => $ts,
-		from => $from,
-		references => $refs,
-		to => $to,
-		cc => $cc,
-		blob => $blob,
-	}, $class;
+	my $self = bless { doc => $doc, ts => $ts }, $class;
+	load_from_data($self, $data);
+	$self
 }
 
 # :bytes and :lines metadata in RFC 3977
-- 
cgit v1.2.3-24-ge0c7