From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 963C02065F for ; Thu, 23 May 2019 09:37:12 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 20/26] xcpdb: show re-indexing progress Date: Thu, 23 May 2019 09:36:58 +0000 Message-Id: <20190523093704.18367-21-e@80x24.org> In-Reply-To: <20190523093704.18367-1-e@80x24.org> References: <20190523093704.18367-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Emit information about reindexing git revision ranges when used with xcpdb. Additionally, distinguish Xapian copy output from v2 git epoch counting by increasing directory context info. For now, v1 batches are emitted. v2 indexing is still missing progress reporting for batches, as the data structures for reindexing would benefit from a refactoring, first. This does not currently affect the use of public-inbox-index, but may in the future. 
--- lib/PublicInbox/SearchIdx.pm | 24 ++++++++++++++++-------- lib/PublicInbox/V2Writable.pm | 10 +++++++++- lib/PublicInbox/Xapcmd.pm | 8 ++++---- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 0aeeb6b..9c29106 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -549,12 +549,12 @@ sub index_sync { $self->{-inbox}->with_umask(sub { $self->_index_sync($opts) }) } -sub batch_adjust ($$$$) { - my ($max, $bytes, $batch_cb, $latest) = @_; +sub batch_adjust ($$$$$) { + my ($max, $bytes, $batch_cb, $latest, $nr) = @_; $$max -= $bytes; if ($$max <= 0) { $$max = BATCH_BYTES; - $batch_cb->($latest); + $batch_cb->($nr, $latest); } } @@ -573,6 +573,7 @@ sub read_log { my %D; my $line; my $newest; + my $nr = 0; while (defined($line = <$log>)) { if ($line =~ /$addmsg/o) { my $blob = $1; @@ -584,7 +585,7 @@ sub read_log { next; } my $mime = do_cat_mail($git, $blob, \$bytes) or next; - batch_adjust(\$max, $bytes, $batch_cb, $latest); + batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr); $add_cb->($self, $mime, $bytes, $blob); } elsif ($line =~ /$delmsg/o) { my $blob = $1; @@ -599,7 +600,7 @@ sub read_log { my $mime = do_cat_mail($git, $blob, \$bytes) or next; $del_cb->($self, $mime); } - $batch_cb->($latest, $newest); + $batch_cb->($nr, $latest, $newest); } sub _msgmap_init { @@ -612,7 +613,7 @@ sub _msgmap_init { } sub _git_log { - my ($self, $range) = @_; + my ($self, $opts, $range) = @_; my $git = $self->{git}; if (index($range, '..') < 0) { @@ -629,12 +630,17 @@ sub _git_log { # Count the new files so they can be added newest to oldest # and still have numbers increasing from oldest to newest my $fcount = 0; + my $pr = $opts->{-progress}; + $pr->("counting changes\n\t$range ... 
") if $pr; # can't use 'rev-list --count' if we use --diff-filter my $fh = $git->popen(qw(log --pretty=tformat:%h --no-notes --no-color --no-renames --diff-filter=AM), $range); ++$fcount while <$fh>; + close $fh; my $high = $self->{mm}->num_highwater; + $pr->("$fcount\n") if $pr; # continue previous line + $self->{ntodo} = $fcount; if (index($range, '..') < 0) { if ($high && $high == $fcount) { @@ -707,6 +713,7 @@ sub _index_sync { my ($last_commit, $lx, $xlog); my $git = $self->{git}; $git->batch_prepare; + my $pr = $opts->{-progress}; my $xdb = $self->begin_txn_lazy; my $mm = _msgmap_init($self); @@ -724,14 +731,14 @@ sub _index_sync { # ensure we leak no FDs to "git log" with Xapian <= 1.2 my $range = $lx eq '' ? $tip : "$lx..$tip"; - $xlog = _git_log($self, $range); + $xlog = _git_log($self, $opts, $range); $xdb = $self->begin_txn_lazy; } while (_last_x_commit($self, $mm) ne $last_commit); my $dbh = $mm->{dbh} if $mm; my $cb = sub { - my ($commit, $newest) = @_; + my ($nr, $commit, $newest) = @_; if ($dbh) { if ($newest) { my $cur = $mm->last_commit || ''; @@ -751,6 +758,7 @@ sub _index_sync { $git->cleanup; $xdb = _xdb_release($self); # let another process do some work... < + $pr->("indexed $nr/$self->{ntodo}\n") if $pr && $nr; if (!$newest) { $xdb = $self->begin_txn_lazy; $dbh->begin_work if $dbh; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 1ee19b2..1170f32 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -777,6 +777,9 @@ sub reindex_oid { $git->cleanup; $mm_tmp->atfork_prepare; $self->done; # release lock + + # TODO: print progress info, here + # allow -watch or -mda to write... 
$self->idx_init; # reacquire lock $mm_tmp->atfork_parent; @@ -844,6 +847,7 @@ $range sub index_prepare { my ($self, $opts, $epoch_max, $ranges) = @_; + my $pr = $opts->{-progress}; my $regen_max = 0; my $head = $self->{-inbox}->{ref_head} || 'refs/heads/master'; for (my $i = $epoch_max; $i >= 0; $i--) { @@ -858,10 +862,14 @@ sub index_prepare { $ranges->[$i] = $range; # can't use 'rev-list --count' if we use --diff-filter + $pr->("$i.git counting changes\n\t$range ... ") if $pr; + my $n = 0; my $fh = $git->popen(qw(log --pretty=tformat:%H --no-notes --no-color --no-renames --diff-filter=AM), $range, '--', 'm'); - ++$regen_max while <$fh>; + ++$n while <$fh>; + $pr->("$n\n") if $pr; + $regen_max += $n; } \$regen_max; } diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm index aa3e4c0..0e44804 100644 --- a/lib/PublicInbox/Xapcmd.pm +++ b/lib/PublicInbox/Xapcmd.pm @@ -222,10 +222,11 @@ sub cpdb { $it = $src->postlist_begin(''); $end = $src->postlist_end(''); - $pfx = (split('/', $old))[-1].':'; if ($pr) { $nr = 0; $tot = $src->get_doccount; + my @p = split('/', $old); + $pfx = "$p[-2]/$p[-1]:"; $fmt = "$pfx % ".length($tot)."u/$tot\n"; $pr->("$pfx copying $tot documents\n"); } @@ -255,7 +256,6 @@ sub cpdb { return unless $opt->{compact}; $src = $dst = undef; # flushes and closes - $pfx = undef unless $fmt; $pr->("$pfx compacting...\n") if $pr; # this is probably the best place to do xapian-compact @@ -268,11 +268,11 @@ sub cpdb { } my ($r, $w); - if ($pfx && pipe($r, $w)) { + if ($pr && pipe($r, $w)) { $rdr->{1} = fileno($w); } my $pid = spawn($cmd, $env, $rdr); - if ($pfx) { + if ($pr) { close $w or die "close: \$w: $!"; foreach (<$r>) { s/\r/\r$pfx /g; -- EW