user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 20/26] xcpdb: show re-indexing progress
Date: Thu, 23 May 2019 09:36:58 +0000	[thread overview]
Message-ID: <20190523093704.18367-21-e@80x24.org> (raw)
In-Reply-To: <20190523093704.18367-1-e@80x24.org>

Emit information about reindexing git revision ranges when used
with xcpdb.  Additionally, distinguish Xapian copy output from
v2 git epoch counting by increasing directory context info.

For now, v1 batches are emitted.  v2 indexing is still
missing progress reporting for batches, as the data structures
for reindexing would benefit from a refactoring, first.

This does not currently affect the use of public-inbox-index,
but may in the future.
---
 lib/PublicInbox/SearchIdx.pm  | 24 ++++++++++++++++--------
 lib/PublicInbox/V2Writable.pm | 10 +++++++++-
 lib/PublicInbox/Xapcmd.pm     |  8 ++++----
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 0aeeb6b..9c29106 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -549,12 +549,12 @@ sub index_sync {
 	$self->{-inbox}->with_umask(sub { $self->_index_sync($opts) })
 }
 
-sub batch_adjust ($$$$) {
-	my ($max, $bytes, $batch_cb, $latest) = @_;
+sub batch_adjust ($$$$$) {
+	my ($max, $bytes, $batch_cb, $latest, $nr) = @_;
 	$$max -= $bytes;
 	if ($$max <= 0) {
 		$$max = BATCH_BYTES;
-		$batch_cb->($latest);
+		$batch_cb->($nr, $latest);
 	}
 }
 
@@ -573,6 +573,7 @@ sub read_log {
 	my %D;
 	my $line;
 	my $newest;
+	my $nr = 0;
 	while (defined($line = <$log>)) {
 		if ($line =~ /$addmsg/o) {
 			my $blob = $1;
@@ -584,7 +585,7 @@ sub read_log {
 				next;
 			}
 			my $mime = do_cat_mail($git, $blob, \$bytes) or next;
-			batch_adjust(\$max, $bytes, $batch_cb, $latest);
+			batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr);
 			$add_cb->($self, $mime, $bytes, $blob);
 		} elsif ($line =~ /$delmsg/o) {
 			my $blob = $1;
@@ -599,7 +600,7 @@ sub read_log {
 		my $mime = do_cat_mail($git, $blob, \$bytes) or next;
 		$del_cb->($self, $mime);
 	}
-	$batch_cb->($latest, $newest);
+	$batch_cb->($nr, $latest, $newest);
 }
 
 sub _msgmap_init {
@@ -612,7 +613,7 @@ sub _msgmap_init {
 }
 
 sub _git_log {
-	my ($self, $range) = @_;
+	my ($self, $opts, $range) = @_;
 	my $git = $self->{git};
 
 	if (index($range, '..') < 0) {
@@ -629,12 +630,17 @@ sub _git_log {
 	# Count the new files so they can be added newest to oldest
 	# and still have numbers increasing from oldest to newest
 	my $fcount = 0;
+	my $pr = $opts->{-progress};
+	$pr->("counting changes\n\t$range ... ") if $pr;
 	# can't use 'rev-list --count' if we use --diff-filter
 	my $fh = $git->popen(qw(log --pretty=tformat:%h
 			     --no-notes --no-color --no-renames
 			     --diff-filter=AM), $range);
 	++$fcount while <$fh>;
+	close $fh;
 	my $high = $self->{mm}->num_highwater;
+	$pr->("$fcount\n") if $pr; # continue previous line
+	$self->{ntodo} = $fcount;
 
 	if (index($range, '..') < 0) {
 		if ($high && $high == $fcount) {
@@ -707,6 +713,7 @@ sub _index_sync {
 	my ($last_commit, $lx, $xlog);
 	my $git = $self->{git};
 	$git->batch_prepare;
+	my $pr = $opts->{-progress};
 
 	my $xdb = $self->begin_txn_lazy;
 	my $mm = _msgmap_init($self);
@@ -724,14 +731,14 @@ sub _index_sync {
 
 		# ensure we leak no FDs to "git log" with Xapian <= 1.2
 		my $range = $lx eq '' ? $tip : "$lx..$tip";
-		$xlog = _git_log($self, $range);
+		$xlog = _git_log($self, $opts, $range);
 
 		$xdb = $self->begin_txn_lazy;
 	} while (_last_x_commit($self, $mm) ne $last_commit);
 
 	my $dbh = $mm->{dbh} if $mm;
 	my $cb = sub {
-		my ($commit, $newest) = @_;
+		my ($nr, $commit, $newest) = @_;
 		if ($dbh) {
 			if ($newest) {
 				my $cur = $mm->last_commit || '';
@@ -751,6 +758,7 @@ sub _index_sync {
 		$git->cleanup;
 		$xdb = _xdb_release($self);
 		# let another process do some work... <
+		$pr->("indexed $nr/$self->{ntodo}\n") if $pr && $nr;
 		if (!$newest) {
 			$xdb = $self->begin_txn_lazy;
 			$dbh->begin_work if $dbh;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 1ee19b2..1170f32 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -777,6 +777,9 @@ sub reindex_oid {
 		$git->cleanup;
 		$mm_tmp->atfork_prepare;
 		$self->done; # release lock
+
+		# TODO: print progress info, here
+
 		# allow -watch or -mda to write...
 		$self->idx_init; # reacquire lock
 		$mm_tmp->atfork_parent;
@@ -844,6 +847,7 @@ $range
 
 sub index_prepare {
 	my ($self, $opts, $epoch_max, $ranges) = @_;
+	my $pr = $opts->{-progress};
 	my $regen_max = 0;
 	my $head = $self->{-inbox}->{ref_head} || 'refs/heads/master';
 	for (my $i = $epoch_max; $i >= 0; $i--) {
@@ -858,10 +862,14 @@ sub index_prepare {
 		$ranges->[$i] = $range;
 
 		# can't use 'rev-list --count' if we use --diff-filter
+		$pr->("$i.git counting changes\n\t$range ... ") if $pr;
+		my $n = 0;
 		my $fh = $git->popen(qw(log --pretty=tformat:%H
 				--no-notes --no-color --no-renames
 				--diff-filter=AM), $range, '--', 'm');
-		++$regen_max while <$fh>;
+		++$n while <$fh>;
+		$pr->("$n\n") if $pr;
+		$regen_max += $n;
 	}
 	\$regen_max;
 }
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index aa3e4c0..0e44804 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -222,10 +222,11 @@ sub cpdb {
 
 			$it = $src->postlist_begin('');
 			$end = $src->postlist_end('');
-			$pfx = (split('/', $old))[-1].':';
 			if ($pr) {
 				$nr = 0;
 				$tot = $src->get_doccount;
+				my @p = split('/', $old);
+				$pfx = "$p[-2]/$p[-1]:";
 				$fmt = "$pfx % ".length($tot)."u/$tot\n";
 				$pr->("$pfx copying $tot documents\n");
 			}
@@ -255,7 +256,6 @@ sub cpdb {
 	return unless $opt->{compact};
 
 	$src = $dst = undef; # flushes and closes
-	$pfx = undef unless $fmt;
 
 	$pr->("$pfx compacting...\n") if $pr;
 	# this is probably the best place to do xapian-compact
@@ -268,11 +268,11 @@ sub cpdb {
 	}
 
 	my ($r, $w);
-	if ($pfx && pipe($r, $w)) {
+	if ($pr && pipe($r, $w)) {
 		$rdr->{1} = fileno($w);
 	}
 	my $pid = spawn($cmd, $env, $rdr);
-	if ($pfx) {
+	if ($pr) {
 		close $w or die "close: \$w: $!";
 		foreach (<$r>) {
 			s/\r/\r$pfx /g;
-- 
EW


  parent reply	other threads:[~2019-05-23  9:37 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-23  9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
2019-05-23  9:36 ` [PATCH 01/26] t/convert-compact: skip on missing xapian-compact(1) Eric Wong
2019-05-23  9:36 ` [PATCH 02/26] v1writable: retire in favor of InboxWritable Eric Wong
2019-05-23  9:36 ` [PATCH 03/26] doc: document the reason for --no-renumber Eric Wong
2019-05-23  9:36 ` [PATCH 04/26] search: reenable phrase search on non-chert Xapian Eric Wong
2019-05-23  9:36 ` [PATCH 05/26] xapcmd: new module for wrapping Xapian commands Eric Wong
2019-05-23  9:36 ` [PATCH 06/26] admin: hoist out resolve_inboxes for -compact and -index Eric Wong
2019-05-23  9:36 ` [PATCH 07/26] xapcmd: support spawn options Eric Wong
2019-05-23  9:36 ` [PATCH 08/26] xcpdb: new tool which wraps Xapian's copydatabase(1) Eric Wong
2019-05-23  9:36 ` [PATCH 09/26] xapcmd: do not cleanup on errors Eric Wong
2019-05-23  9:36 ` [PATCH 10/26] admin: move index_inbox over Eric Wong
2019-05-23  9:36 ` [PATCH 11/26] xcpdb: implement using Perl bindings Eric Wong
2019-05-23  9:36 ` [PATCH 12/26] xapcmd: xcpdb supports compaction Eric Wong
2019-05-23  9:36 ` [PATCH 13/26] v2writable: hoist out log_range sub for readability Eric Wong
2019-05-23  9:36 ` [PATCH 14/26] xcpdb: use fine-grained locking Eric Wong
2019-05-23  9:36 ` [PATCH 15/26] xcpdb: implement progress reporting Eric Wong
2019-05-23  9:36 ` [PATCH 16/26] xcpdb: cleanup error handling and diagnosis Eric Wong
2019-05-23  9:36 ` [PATCH 17/26] xapcmd: avoid EXDEV when finalizing changes Eric Wong
2019-05-23  9:36 ` [PATCH 18/26] doc: xcpdb: update to reflect the current state Eric Wong
2019-05-23  9:36 ` [PATCH 19/26] xapcmd: use "print STDERR" for progress reporting Eric Wong
2019-05-23  9:36 ` Eric Wong [this message]
2019-05-23  9:36 ` [PATCH 21/26] xcpdb: remove temporary directories on aborts Eric Wong
2019-05-23  9:37 ` [PATCH 22/26] compact: reuse infrastructure from xcpdb Eric Wong
2019-05-23  9:37 ` [PATCH 23/26] xcpdb|compact: support some xapian-compact switches Eric Wong
2019-05-23  9:37 ` [PATCH 24/26] xapcmd: cleanup on interrupted xcpdb "--compact" Eric Wong
2019-05-23  9:37 ` [PATCH 25/26] xcpdb|compact: support --jobs/-j flag like gmake(1) Eric Wong
2019-05-23  9:37 ` [PATCH 26/26] xapcmd: do not reset %SIG until last Xtmpdir is done Eric Wong
2019-05-23 10:37 ` [PATCH 27/26] doc: various updates to reflect current state Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190523093704.18367-21-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).