diff options
author | Eric Wong <e@80x24.org> | 2020-12-15 02:02:22 +0000 |
---|---|---|
committer | Eric Wong <e@80x24.org> | 2020-12-17 19:13:14 +0000 |
commit | 75ffc6a266699e465471adf5992d36a1db8dc1ae (patch) | |
tree | 874275015e175ebf12c9fad5858228021bba4ce5 /lib/PublicInbox/SearchIdx.pm | |
parent | c014cd93de1f2c73348db0e6531f93cf0f1be60f (diff) | |
download | public-inbox-75ffc6a266699e465471adf5992d36a1db8dc1ae.tar.gz |
Since we're inside a Xapian transaction, calling ->index_raw followed by ->shard_add_eidx_info on the same docid doesn't seem to hurt indexing performance. It definitely reduces FS read traffic and IPC from git, at the cost of some more IPC between the parent and workers. Nevertheless, the code and FD reductions seem worth it.
Diffstat (limited to 'lib/PublicInbox/SearchIdx.pm')
-rw-r--r-- | lib/PublicInbox/SearchIdx.pm | 64 |
1 file changed, 0 insertions, 64 deletions
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index cd8f4dd7..c6d2a0e8 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -1008,68 +1008,4 @@ SELECT COUNT(*) FROM over WHERE num = ? } } -sub reindex_xap { # git->cat_async callback - my ($bref, $oid, $type, $size, $ary) = @_; - my ($ibx_id, $oidhex, $req, $more) = @$ary; - my $self = $req->{self} // die 'BUG: {self} missing'; - my $eidx = $self->{eidx} // die 'BUG: {eidx} missing'; - my $eidx_key = $self->{-eidx_key_for}->{$ibx_id} // - die "BUG: bad ibx_id=$ibx_id ($oid)"; - - my $docid = $req->{docid}; - local $eidx->{current_info} = "#$docid $oid"; - return if is_bad_blob($oid, $type, $size, $oidhex); - if (my $doc = $req->{doc}) { # modify existing doc - $req->{tg_isset} //= do { # for existing documents in {xdb} - term_generator($self)->set_document($doc); - 1; - }; - $doc->add_boolean_term('O'.$eidx_key); - index_list_id($self, $doc, PublicInbox::Eml->new($bref)); - } else { # first time seeing this doc - my $smsg = $self->{eidx}->over->get_art($docid) // - die "BUG: #$docid ($oid) not in over"; - $smsg->{bytes} = $size + crlf_adjust($$bref); - $smsg->{eidx_key} = $eidx_key; - my $eml = PublicInbox::Eml->new($bref); - $req->{doc} = eml2doc($self, $eml, $smsg); - $req->{tg_isset} = 1; # eml2doc calls $tg->set_document - } - return if $more; - my $doc = delete($req->{doc}) or return; # all bad blobs! 
- $eidx->{transact_bytes} += $size; - $self->{xdb}->replace_document($req->{docid}, $doc); -} - -sub reindex_docid { - my ($self, $docid) = @_; - my $eidx = $self->{eidx} // die 'BUG: {eidx} missing'; - my $eidx_key_for = $self->{-eidx_key_for} //= do { - my %eidx_key_for = map { - $_->[0] => $_->[1]; - } @{$eidx->over->dbh->selectall_arrayref(<<'')}; -SELECT ibx_id,eidx_key FROM inboxes - - \%eidx_key_for; - }; - - begin_txn_lazy($self); - my $doc = eval { $self->{xdb}->get_document($docid) }; - my $req = { doc => $doc, self => $self, docid => $docid }; - my $sth = $eidx->over->dbh->prepare_cached(<<'', undef, 1); -SELECT ibx_id,oidbin FROM xref3 WHERE docid = ? ORDER BY ibx_id ASC - - $sth->execute($docid); - my $rows = $sth->fetchall_arrayref; - while (my $row = shift(@$rows)) { - my ($ibx_id, $oidbin) = @$row; - my $oidhex = unpack('H*', $oidbin); - $eidx->git->cat_async($oidhex, \&reindex_xap, - [ $ibx_id, $oidhex, $req, scalar(@$rows) ]); - } - if ($eidx->{transact_bytes} >= $eidx->{batch_bytes}) { - commit_txn_lazy($self); - } -} - 1; |