From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id AF01F1F66E for ; Mon, 10 Aug 2020 02:12:05 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 02/14] index: --sequential-shard works incrementally Date: Mon, 10 Aug 2020 02:11:53 +0000 Message-Id: <20200810021205.18909-3-e@yhbt.net> In-Reply-To: <20200810021205.18909-1-e@yhbt.net> References: <20200810021205.18909-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We should never reindex all data in Xapian unless --reindex is specified on the command line. This means users who put publicInbox.indexSequentialShard in their config file won't have to put up with a full reindex at every invocation, only when they specify --reindex. We'll also clean up the progress output to not emit nonsensical ranges where the starting number is higher than the end. --- lib/PublicInbox/V2Writable.pm | 36 ++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index f7a318e5..0b527f18 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -1198,20 +1198,20 @@ sub index_xap_only { # git->cat_async callback sub index_xap_step ($$$;$) { my ($self, $sync, $beg, $step) = @_; - my $ibx = $self->{ibx}; - my $all = $ibx->git; - my $over = $ibx->over; - my $batch_bytes = batch_bytes($self); - $step //= $self->{shards}; my $end = $sync->{art_end}; + return if $beg > $end; # nothing to do + + $step //= $self->{shards}; + my $ibx = $self->{ibx}; if (my $pr = $sync->{-opt}->{-progress}) { $pr->("Xapian indexlevel=$ibx->{indexlevel} ". 
"$beg..$end (% $step)\n"); } + my $batch_bytes = batch_bytes($self); for (my $num = $beg; $num <= $end; $num += $step) { - my $smsg = $over->get_art($num) or next; + my $smsg = $ibx->over->get_art($num) or next; $smsg->{v2w} = $self; - $all->cat_async($smsg->{blob}, \&index_xap_only, $smsg); + $ibx->git->cat_async($smsg->{blob}, \&index_xap_only, $smsg); if ($self->{transact_bytes} >= $batch_bytes) { ${$sync->{nr}} = $num; reindex_checkpoint($self, $sync); @@ -1253,8 +1253,9 @@ sub index_epoch ($$$) { } sub xapian_only { - my ($self, $opt, $sync) = @_; + my ($self, $opt, $sync, $art_beg) = @_; my $seq = $opt->{sequentialshard}; + $art_beg //= 0; local $self->{parallel} = 0 if $seq; $self->idx_init($opt); # acquire lock if (my $art_end = $self->{ibx}->mm->max) { @@ -1268,9 +1269,11 @@ sub xapian_only { $sync->{art_end} = $art_end; if ($seq || !$self->{parallel}) { my $shard_end = $self->{shards} - 1; - index_xap_step($self, $sync, $_) for (0..$shard_end); + for (0..$shard_end) { + index_xap_step($self, $sync, $art_beg + $_) + } } else { # parallel (maybe) - index_xap_step($self, $sync, 0, 1); + index_xap_step($self, $sync, $art_beg, 1); } } $self->{ibx}->git->cat_async_wait; @@ -1289,6 +1292,7 @@ sub index_sync { return unless defined $latest; my $seq = $opt->{sequentialshard}; + my $art_beg; # the NNTP article number we start xapian_only at my $idxlevel = $self->{ibx}->{indexlevel}; local $self->{ibx}->{indexlevel} = 'basic' if $seq; @@ -1312,6 +1316,12 @@ sub index_sync { $self->{mm}->{dbh}->begin_work; $sync->{mm_tmp} = $self->{mm}->tmp_clone($self->{ibx}->{inboxdir}); + + # xapian_only works incrementally w/o --reindex + if ($seq && !$opt->{reindex}) { + $art_beg = $sync->{mm_tmp}->max; + $art_beg++ if defined($art_beg); + } } if ($sync->{index_max_size} = $self->{ibx}->{index_max_size}) { $sync->{index_oid} = \&index_oid; @@ -1326,10 +1336,10 @@ sub index_sync { $pr->('all.git '.sprintf($sync->{-regen_fmt}, $$nr)) if $pr; } - if ($seq) { # deal with Xapian 
shards sequentially + # deal with Xapian shards sequentially + if ($seq && delete($sync->{mm_tmp})) { $self->{ibx}->{indexlevel} = $idxlevel; - delete $sync->{mm_tmp}; - xapian_only($self, $opt, $sync); + xapian_only($self, $opt, $sync, $art_beg); } # reindex does not pick up new changes, so we rerun w/o it: