user/dev discussion of public-inbox itself
 help / color / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 15/26] xcpdb: implement progress reporting
Date: Thu, 23 May 2019 09:36:53 +0000
Message-ID: <20190523093704.18367-16-e@80x24.org> (raw)
In-Reply-To: <20190523093704.18367-1-e@80x24.org>

Copying an entire Xapian DB is horribly slow whether it's done
via Perl or copydatabase(1).  So displaying some progress
indication is good for user experience.

While we're at it, prefix xapian-compact output, too, since
parallel processes end up clobbering each other's output.
---
 lib/PublicInbox/Xapcmd.pm | 47 +++++++++++++++++++++++++++++++++++----
 script/public-inbox-xcpdb |  5 +----
 t/indexlevels-mirror.t    |  4 ++--
 3 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 4555340..99f0e7c 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -82,9 +82,21 @@ sub prepare_reindex ($$) {
 	}
 }
 
+sub progress_prepare ($) {
+	my ($opt) = @_;
+	if ($opt->{quiet}) {
+		open my $null, '>', '/dev/null' or
+			die "failed to open /dev/null: $!\n";
+		$opt->{1} = fileno($null);
+		$opt->{-dev_null} = $null;
+	} else {
+		$opt->{-progress} = 1;
+	}
+}
+
 sub run {
 	my ($ibx, $cmd, $env, $opt) = @_;
-	$opt ||= {};
+	progress_prepare($opt ||= {});
 	my $dir = $ibx->{mainrepo} or die "no mainrepo in inbox\n";
 	my $exe = $cmd->[0];
 	my $pfx = $exe;
@@ -161,6 +173,8 @@ sub cpdb_retryable ($$) {
 	0;
 }
 
+# Like copydatabase(1), this is horribly slow; and it doesn't seem due
+# to the overhead of Perl.
 sub cpdb {
 	my ($args, $env, $opt) = @_;
 	my ($old, $new) = @$args;
@@ -172,6 +186,7 @@ sub cpdb {
 	my $creat = Search::Xapian::DB_CREATE();
 	my $dst = Search::Xapian::WritableDatabase->new($tmp, $creat);
 	my ($it, $end);
+	my ($pfx, $nr, $tot, $fmt); # progress output
 
 	do {
 		eval {
@@ -181,6 +196,13 @@ sub cpdb {
 
 			$it = $src->postlist_begin('');
 			$end = $src->postlist_end('');
+			if ($opt->{-progress}) {
+				$nr = 0;
+				$pfx = (split('/', $old))[-1].':';
+				$tot = $src->get_doccount;
+				$fmt = "$pfx % ".length($tot)."u/$tot\n";
+				warn "$pfx copying $tot documents\n";
+			}
 		};
 	} while (cpdb_retryable($src, $@));
 
@@ -191,6 +213,9 @@ sub cpdb {
 				my $doc = $src->get_document($docid);
 				$dst->replace_document($docid, $doc);
 				$it->inc;
+				if ($fmt && !(++$nr & 1023)) {
+					warn(sprintf($fmt, $nr));
+				}
 			}
 
 			# unlike copydatabase(1), we don't copy spelling
@@ -200,10 +225,12 @@ sub cpdb {
 		};
 	} while (cpdb_retryable($src, $@));
 
+	warn(sprintf($fmt, $nr)) if $fmt;
 	return unless $opt->{compact};
 
 	$src = $dst = undef; # flushes and closes
 
+	warn "$pfx compacting...\n" if $pfx;
 	# this is probably the best place to do xapian-compact
 	# since $dst isn't readable by HTTP or NNTP clients, yet:
 	my $cmd = [ $XAPIAN_COMPACT, '--no-renumber', $tmp, $new ];
@@ -212,10 +239,22 @@ sub cpdb {
 		defined(my $dst = $opt->{$fd}) or next;
 		$rdr->{$fd} = $dst;
 	}
+
+	my ($r, $w);
+	if ($pfx && pipe($r, $w)) {
+		$rdr->{1} = fileno($w);
+	}
 	my $pid = spawn($cmd, $env, $rdr);
-	my $r = waitpid($pid, 0);
-	if ($? || $r != $pid) {
-		die join(' ', @$cmd)." failed: $? (pid=$pid, reaped=$r)\n";
+	if ($pfx) {
+		close $w or die "close: \$w: $!";
+		foreach (<$r>) {
+			s/\r/\r$pfx /g;
+			warn "$pfx $_";
+		}
+	}
+	my $rp = waitpid($pid, 0);
+	if ($? || $rp != $pid) {
+		die join(' ', @$cmd)." failed: $? (pid=$pid, reaped=$rp)\n";
 	}
 	remove_tree($tmp) or die "failed to remove $tmp: $!\n";
 }
diff --git a/script/public-inbox-xcpdb b/script/public-inbox-xcpdb
index 78d37da..5b66337 100755
--- a/script/public-inbox-xcpdb
+++ b/script/public-inbox-xcpdb
@@ -9,12 +9,9 @@ use PublicInbox::Admin;
 PublicInbox::Admin::require_or_die('-search');
 my $usage = "Usage: public-inbox-xcpdb INBOX_DIR\n";
 my $opt = {};
-GetOptions($opt, qw(compact)) or die "bad command-line args\n$usage";
+GetOptions($opt, qw(compact quiet|q)) or die "bad command-line args\n$usage";
 my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV) or die $usage;
-
 my $cmd = [ \&PublicInbox::Xapcmd::cpdb ];
-open my $null, '>', '/dev/null' or die "failed to open /dev/null: $!\n";
-$opt->{1} = fileno($null);
 foreach (@ibxs) {
 	my $ibx = PublicInbox::InboxWritable->new($_);
 	# we rely on --no-renumber to keep docids synched to NNTP
diff --git a/t/indexlevels-mirror.t b/t/indexlevels-mirror.t
index 61053b6..57a776f 100644
--- a/t/indexlevels-mirror.t
+++ b/t/indexlevels-mirror.t
@@ -18,7 +18,7 @@ foreach my $mod (qw(DBD::SQLite)) {
 
 my $path = 'blib/script';
 my $index = "$path/public-inbox-index";
-my $xcpdb = "$path/public-inbox-xcpdb";
+my @xcpdb = ("$path/public-inbox-xcpdb", '-q');
 
 my $mime = PublicInbox::MIME->create(
 	header => [
@@ -110,7 +110,7 @@ sub import_index_incremental {
 	$im->done;
 
 	if ($level ne 'basic') {
-		is(system($xcpdb, $mirror), 0, "v$v xcpdb OK");
+		is(system(@xcpdb, $mirror), 0, "v$v xcpdb OK");
 		delete $ro_mirror->{$_} for (qw(over search));
 		($nr, $msgs) = $ro_mirror->search->query('m:m@2');
 		is($nr, 1, "v$v found m\@2 via Xapian on $level");
-- 
EW


  parent reply index

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-23  9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
2019-05-23  9:36 ` [PATCH 01/26] t/convert-compact: skip on missing xapian-compact(1) Eric Wong
2019-05-23  9:36 ` [PATCH 02/26] v1writable: retire in favor of InboxWritable Eric Wong
2019-05-23  9:36 ` [PATCH 03/26] doc: document the reason for --no-renumber Eric Wong
2019-05-23  9:36 ` [PATCH 04/26] search: reenable phrase search on non-chert Xapian Eric Wong
2019-05-23  9:36 ` [PATCH 05/26] xapcmd: new module for wrapping Xapian commands Eric Wong
2019-05-23  9:36 ` [PATCH 06/26] admin: hoist out resolve_inboxes for -compact and -index Eric Wong
2019-05-23  9:36 ` [PATCH 07/26] xapcmd: support spawn options Eric Wong
2019-05-23  9:36 ` [PATCH 08/26] xcpdb: new tool which wraps Xapian's copydatabase(1) Eric Wong
2019-05-23  9:36 ` [PATCH 09/26] xapcmd: do not cleanup on errors Eric Wong
2019-05-23  9:36 ` [PATCH 10/26] admin: move index_inbox over Eric Wong
2019-05-23  9:36 ` [PATCH 11/26] xcpdb: implement using Perl bindings Eric Wong
2019-05-23  9:36 ` [PATCH 12/26] xapcmd: xcpdb supports compaction Eric Wong
2019-05-23  9:36 ` [PATCH 13/26] v2writable: hoist out log_range sub for readability Eric Wong
2019-05-23  9:36 ` [PATCH 14/26] xcpdb: use fine-grained locking Eric Wong
2019-05-23  9:36 ` Eric Wong [this message]
2019-05-23  9:36 ` [PATCH 16/26] xcpdb: cleanup error handling and diagnosis Eric Wong
2019-05-23  9:36 ` [PATCH 17/26] xapcmd: avoid EXDEV when finalizing changes Eric Wong
2019-05-23  9:36 ` [PATCH 18/26] doc: xcpdb: update to reflect the current state Eric Wong
2019-05-23  9:36 ` [PATCH 19/26] xapcmd: use "print STDERR" for progress reporting Eric Wong
2019-05-23  9:36 ` [PATCH 20/26] xcpdb: show re-indexing progress Eric Wong
2019-05-23  9:36 ` [PATCH 21/26] xcpdb: remove temporary directories on aborts Eric Wong
2019-05-23  9:37 ` [PATCH 22/26] compact: reuse infrastructure from xcpdb Eric Wong
2019-05-23  9:37 ` [PATCH 23/26] xcpdb|compact: support some xapian-compact switches Eric Wong
2019-05-23  9:37 ` [PATCH 24/26] xapcmd: cleanup on interrupted xcpdb "--compact" Eric Wong
2019-05-23  9:37 ` [PATCH 25/26] xcpdb|compact: support --jobs/-j flag like gmake(1) Eric Wong
2019-05-23  9:37 ` [PATCH 26/26] xapcmd: do not reset %SIG until last Xtmpdir is done Eric Wong
2019-05-23 10:37 ` [PATCH 27/26] doc: various updates to reflect current state Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190523093704.18367-16-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror http://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Example config snippet for mirrors

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.org/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git