From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, LOTS_OF_MONEY shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 5B5AA1FBF5 for ; Wed, 10 Jun 2020 07:07:31 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 64/82] searchidx: v1 (re)-index uses git asynchronously Date: Wed, 10 Jun 2020 07:05:01 +0000 Message-Id: <20200610070519.18252-65-e@yhbt.net> In-Reply-To: <20200610070519.18252-1-e@yhbt.net> References: <20200610070519.18252-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We can clean up some of our v1 code slightly and let git do I/O+decoding in parallel. This gives a slight 2-4% re-indexing performance boost even on an SSD. --- lib/PublicInbox/SearchIdx.pm | 76 +++++++++++++++++------------------- 1 file changed, 35 insertions(+), 41 deletions(-) diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 3df7970ebf9..a790ac4076a 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -378,7 +378,7 @@ sub _msgmap_init ($) { sub add_message { # mime = PublicInbox::Eml or Email::MIME object - my ($self, $mime, $smsg) = @_; + my ($self, $mime, $smsg, $sync) = @_; my $hdr = $mime->header_obj; my $mids = mids_for_index($hdr); $smsg //= bless { blob => '' }, 'PublicInbox::Smsg'; # test-only compat @@ -389,7 +389,7 @@ sub add_message { }; # v1 and tests only: - $smsg->populate($hdr, $self); + $smsg->populate($hdr, $sync); $smsg->{bytes} //= length($mime->as_string); eval { @@ -549,24 +549,24 @@ sub unindex_mm { $self->{mm}->mid_delete(mid_mime($mime)); } -sub index_both { - my ($self, $mime, $smsg) = @_; - my $num = index_mm($self, $mime); +sub index_both { # git->cat_async callback + my ($bref, $oid, $type, $size, 
$sync) = @_; + my ($nr, $max) = @$sync{qw(nr max)}; + ++$$nr; + $$max -= $size; + my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg'; + my $self = $sync->{sidx}; + my $eml = PublicInbox::Eml->new($bref); + my $num = index_mm($self, $eml); $smsg->{num} = $num; - add_message($self, $mime, $smsg); + add_message($self, $eml, $smsg, $sync); } -sub unindex_both { - my ($self, $mime) = @_; - unindex_blob($self, $mime); - unindex_mm($self, $mime); -} - -sub do_cat_mail { - my ($git, $blob, $sizeref) = @_; - my $str = $git->cat_file($blob, $sizeref) or - die "BUG: $blob not found in $git->{git_dir}"; - PublicInbox::Eml->new($str); +sub unindex_both { # git->cat_async callback + my ($bref, $oid, $type, $size, $self) = @_; + my $eml = PublicInbox::Eml->new($bref); + unindex_blob($self, $eml); + unindex_mm($self, $eml); } # called by public-inbox-index @@ -576,15 +576,6 @@ sub index_sync { $self->{-inbox}->with_umask(sub { $self->_index_sync($opts) }) } -sub batch_adjust ($$$$$) { - my ($max, $bytes, $batch_cb, $latest, $nr) = @_; - $$max -= $bytes; - if ($$max <= 0) { - $$max = $BATCH_BYTES; - $batch_cb->($nr, $latest); - } -} - sub too_big ($$$) { my ($self, $git, $oid) = @_; my $max_size = $self->{index_max_size} or return; @@ -597,24 +588,28 @@ sub too_big ($$$) { # only for v1 sub read_log { - my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_; + my ($self, $log, $batch_cb) = @_; my $hex = '[a-f0-9]'; my $h40 = $hex .'{40}'; my $addmsg = qr!^:000000 100644 \S+ ($h40) A\t${hex}{2}/${hex}{38}$!; my $delmsg = qr!^:100644 000000 ($h40) \S+ D\t${hex}{2}/${hex}{38}$!; my $git = $self->{git}; my $latest; - my $bytes; my $max = $BATCH_BYTES; local $/ = "\n"; my %D; my $line; my $newest; my $nr = 0; + my $sync = { sidx => $self, nr => \$nr, max => \$max }; while (defined($line = <$log>)) { if ($line =~ /$addmsg/o) { my $blob = $1; if (delete $D{$blob}) { + # make sure pending index writes are done + # before writing to ->mm + $git->cat_async_wait; + if (defined 
$self->{regen_down}) { my $num = $self->{regen_down}--; $self->{mm}->num_highwater($num); @@ -622,12 +617,12 @@ sub read_log { next; } next if too_big($self, $git, $blob); - my $mime = do_cat_mail($git, $blob, \$bytes); - my $smsg = bless {}, 'PublicInbox::Smsg'; - batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr); - $smsg->{blob} = $blob; - $smsg->{bytes} = $bytes; - $add_cb->($self, $mime, $smsg); + $git->cat_async($blob, \&index_both, { %$sync }); + if ($max <= 0) { + $git->cat_async_wait; + $max = $BATCH_BYTES; + $batch_cb->($nr, $latest); + } } elsif ($line =~ /$delmsg/o) { my $blob = $1; $D{$blob} = 1 unless too_big($self, $git, $blob); @@ -635,18 +630,17 @@ sub read_log { $latest = $1; $newest ||= $latest; } elsif ($line =~ /^author .*? ([0-9]+) [\-\+][0-9]+$/) { - $self->{autime} = $1; + $sync->{autime} = $1; } elsif ($line =~ /^committer .*? ([0-9]+) [\-\+][0-9]+$/) { - $self->{cotime} = $1; + $sync->{cotime} = $1; } } close($log) or die "git log failed: \$?=$?"; # get the leftovers foreach my $blob (keys %D) { - my $mime = do_cat_mail($git, $blob, \$bytes); - $del_cb->($self, $mime); + $git->cat_async($blob, \&unindex_both, $self); } - delete @$self{qw(autime cotime)}; + $git->cat_async_wait; $batch_cb->($nr, $latest, $newest); } @@ -774,7 +768,7 @@ sub _index_sync { } while (_last_x_commit($self, $mm) ne $last_commit); my $dbh = $mm->{dbh} if $mm; - my $cb = sub { + my $batch_cb = sub { my ($nr, $commit, $newest) = @_; if ($dbh) { if ($newest) { @@ -803,7 +797,7 @@ sub _index_sync { }; $dbh->begin_work; - read_log($self, $xlog, *index_both, *unindex_both, $cb); + read_log($self, $xlog, $batch_cb); } sub DESTROY {