user/dev discussion of public-inbox itself
 help / color / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 14/26] xcpdb: use fine-grained locking
Date: Thu, 23 May 2019 09:36:52 +0000
Message-ID: <20190523093704.18367-15-e@80x24.org> (raw)
In-Reply-To: <20190523093704.18367-1-e@80x24.org>

Copying an entire Xapian DB takes a long time, so update our
reindexing code to support partial reindexing.  We now snapshot
the pre-copydatabase git revisions, perform the lengthy copy,
and do a partial reindex when the copy + renames are done.
---
 lib/PublicInbox/Admin.pm      |  2 +-
 lib/PublicInbox/SearchIdx.pm  | 10 +++++-
 lib/PublicInbox/V2Writable.pm | 21 ++++++++++---
 lib/PublicInbox/Xapcmd.pm     | 58 ++++++++++++++++++++++++++++++++---
 4 files changed, 80 insertions(+), 11 deletions(-)

diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index 94f47ab..34aa312 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -141,7 +141,7 @@ sub index_inbox {
 	if (ref($ibx) && ($ibx->{version} || 1) == 2) {
 		eval { require PublicInbox::V2Writable };
 		die "v2 requirements not met: $@\n" if $@;
-		my $v2w = eval {
+		my $v2w = eval { $ibx->importer(0) } || eval {
 			PublicInbox::V2Writable->new($ibx, {nproc=>$jobs});
 		};
 		if (defined $jobs) {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 114420e..0aeeb6b 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -542,8 +542,10 @@ sub do_cat_mail {
 	$@ ? undef : $mime;
 }
 
+# called by public-inbox-index
 sub index_sync {
 	my ($self, $opts) = @_;
+	delete $self->{lock_path} if $opts->{-skip_lock};
 	$self->{-inbox}->with_umask(sub { $self->_index_sync($opts) })
 }
 
@@ -692,6 +694,12 @@ sub _last_x_commit {
 	$lx;
 }
 
+sub reindex_from ($$) {
+	my ($reindex, $last_commit) = @_;
+	return $last_commit unless $reindex;
+	ref($reindex) eq 'HASH' ? $reindex->{from} : '';
+}
+
 # indexes all unindexed messages (v1 only)
 sub _index_sync {
 	my ($self, $opts) = @_;
@@ -705,7 +713,7 @@ sub _index_sync {
 	do {
 		$xlog = undef;
 		$last_commit = _last_x_commit($self, $mm);
-		$lx = $opts->{reindex} ? '' : $last_commit;
+		$lx = reindex_from($opts->{reindex}, $last_commit);
 
 		$self->{over}->rollback_lazy;
 		$self->{over}->disconnect;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 3dd606e..1ee19b2 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -238,7 +238,7 @@ sub idx_part {
 
 # idempotent
 sub idx_init {
-	my ($self) = @_;
+	my ($self, $opt) = @_;
 	return if $self->{idx_parts};
 	my $ibx = $self->{-inbox};
 
@@ -264,7 +264,7 @@ sub idx_init {
 	my $over = $self->{over};
 	$ibx->umask_prepare;
 	$ibx->with_umask(sub {
-		$self->lock_acquire;
+		$self->lock_acquire unless ($opt && $opt->{-skip_lock});
 		$over->create;
 
 		# -compact can change partition count while -watch is idle
@@ -924,6 +924,19 @@ sub unindex {
 		qw(-c gc.reflogExpire=now gc --prune=all)]);
 }
 
+sub index_ranges ($$$) {
+	my ($self, $reindex, $epoch_max) = @_;
+	return last_commits($self, $epoch_max) unless $reindex;
+
+	return [] if ref($reindex) ne 'HASH';
+
+	my $ranges = $reindex->{from}; # arrayref
+	if (ref($ranges) ne 'ARRAY') {
+		die 'BUG: $reindex->{from} not an ARRAY';
+	}
+	$ranges;
+}
+
 # called for public-inbox-index
 sub index_sync {
 	my ($self, $opts) = @_;
@@ -931,10 +944,10 @@ sub index_sync {
 	my $epoch_max;
 	my $latest = git_dir_latest($self, \$epoch_max);
 	return unless defined $latest;
-	$self->idx_init; # acquire lock
+	$self->idx_init($opts); # acquire lock
 	my $mm_tmp = $self->{mm}->tmp_clone;
 	my $reindex = $opts->{reindex};
-	my $ranges = $reindex ? [] : $self->last_commits($epoch_max);
+	my $ranges = index_ranges($self, $reindex, $epoch_max);
 
 	my $high = $self->{mm}->num_highwater();
 	my $regen = $self->index_prepare($opts, $epoch_max, $ranges);
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index d2de874..4555340 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -5,6 +5,7 @@ use strict;
 use warnings;
 use PublicInbox::Spawn qw(which spawn);
 use PublicInbox::Over;
+use PublicInbox::Search;
 use File::Temp qw(tempdir);
 use File::Path qw(remove_tree);
 
@@ -12,20 +13,33 @@ use File::Path qw(remove_tree);
 # commands with a version number suffix (e.g. "xapian-compact-1.5")
 our $XAPIAN_COMPACT = $ENV{XAPIAN_COMPACT} || 'xapian-compact';
 
-sub commit_changes ($$$) {
-	my ($im, $old, $new) = @_;
+sub commit_changes ($$$$) {
+	my ($ibx, $old, $new, $opt) = @_;
+
+	my $reindex = $opt->{reindex};
+	my $im = $ibx->importer(0);
+	$im->lock_acquire if $reindex;
+
 	my @st = stat($old) or die "failed to stat($old): $!\n";
 
 	my $over = "$old/over.sqlite3";
 	if (-f $over) {
 		$over = PublicInbox::Over->new($over);
 		$over->connect->sqlite_backup_to_file("$new/over.sqlite3");
+		$over = undef;
 	}
 	rename($old, "$new/old") or die "rename $old => $new/old: $!\n";
 	chmod($st[2] & 07777, $new) or die "chmod $old: $!\n";
 	rename($new, $old) or die "rename $new => $old: $!\n";
-	$im->lock_release;
 	remove_tree("$old/old") or die "failed to remove $old/old: $!\n";
+
+	if ($reindex) {
+		$opt->{-skip_lock} = 1;
+		PublicInbox::Admin::index_inbox($ibx, $opt);
+		# implicit lock_release
+	} else {
+		$im->lock_release;
+	}
 }
 
 sub xspawn {
@@ -47,6 +61,27 @@ sub runnable_or_die ($) {
 	which($exe) or die "$exe not found in PATH\n";
 }
 
+sub prepare_reindex ($$) {
+	my ($ibx, $reindex) = @_;
+	if ($ibx->{version} == 1) {
+		my $dir = $ibx->search->xdir(1);
+		my $xdb = Search::Xapian::Database->new($dir);
+		if (my $lc = $xdb->get_metadata('last_commit')) {
+			$reindex->{from} = $lc;
+		}
+	} else { # v2
+		my $v2w = $ibx->importer(0);
+		my $max;
+		$v2w->git_dir_latest(\$max) or return;
+		my $from = $reindex->{from};
+		my $mm = $ibx->mm;
+		my $v = PublicInbox::Search::SCHEMA_VERSION();
+		foreach my $i (0..$max) {
+			$from->[$i] = $mm->last_commit_xap($v, $i);
+		}
+	}
+}
+
 sub run {
 	my ($ibx, $cmd, $env, $opt) = @_;
 	$opt ||= {};
@@ -54,8 +89,14 @@ sub run {
 	my $exe = $cmd->[0];
 	my $pfx = $exe;
 	runnable_or_die($XAPIAN_COMPACT) if $opt->{compact};
+
+	my $reindex; # v1:{ from => $x40 }, v2:{ from => [ $x40, $x40, .. ] }
+	my $from; # per-epoch ranges
+
 	if (ref($exe) eq 'CODE') {
 		$pfx = 'CODE';
+		$reindex = $opt->{reindex} = {};
+		$from = $reindex->{from} = [];
 		require Search::Xapian::WritableDatabase;
 	} else {
 		runnable_or_die($exe);
@@ -64,7 +105,7 @@ sub run {
 	my $old = $ibx->search->xdir(1);
 	-d $old or die "$old does not exist\n";
 	my $new = tempdir("$pfx-XXXXXXXX", DIR => $dir);
-	my $v = $ibx->{version} || 1;
+	my $v = $ibx->{version} ||= 1;
 	my @cmds;
 	if ($v == 1) {
 		push @cmds, [@$cmd, $old, $new];
@@ -85,6 +126,13 @@ sub run {
 	my $max = $opt->{jobs} || scalar(@cmds);
 	$ibx->with_umask(sub {
 		$im->lock_acquire;
+
+		# fine-grained locking if we prepare for reindex
+		if ($reindex) {
+			prepare_reindex($ibx, $reindex);
+			$im->lock_release;
+		}
+		delete($ibx->{$_}) for (qw(mm over search)); # cleanup
 		my %pids;
 		while (@cmds) {
 			while (scalar(keys(%pids)) < $max && scalar(@cmds)) {
@@ -98,7 +146,7 @@ sub run {
 				die join(' ', @$x)." failed: $?\n" if $?;
 			}
 		}
-		commit_changes($im, $old, $new);
+		commit_changes($ibx, $old, $new, $opt);
 	});
 }
 
-- 
EW


  parent reply index

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-23  9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
2019-05-23  9:36 ` [PATCH 01/26] t/convert-compact: skip on missing xapian-compact(1) Eric Wong
2019-05-23  9:36 ` [PATCH 02/26] v1writable: retire in favor of InboxWritable Eric Wong
2019-05-23  9:36 ` [PATCH 03/26] doc: document the reason for --no-renumber Eric Wong
2019-05-23  9:36 ` [PATCH 04/26] search: reenable phrase search on non-chert Xapian Eric Wong
2019-05-23  9:36 ` [PATCH 05/26] xapcmd: new module for wrapping Xapian commands Eric Wong
2019-05-23  9:36 ` [PATCH 06/26] admin: hoist out resolve_inboxes for -compact and -index Eric Wong
2019-05-23  9:36 ` [PATCH 07/26] xapcmd: support spawn options Eric Wong
2019-05-23  9:36 ` [PATCH 08/26] xcpdb: new tool which wraps Xapian's copydatabase(1) Eric Wong
2019-05-23  9:36 ` [PATCH 09/26] xapcmd: do not cleanup on errors Eric Wong
2019-05-23  9:36 ` [PATCH 10/26] admin: move index_inbox over Eric Wong
2019-05-23  9:36 ` [PATCH 11/26] xcpdb: implement using Perl bindings Eric Wong
2019-05-23  9:36 ` [PATCH 12/26] xapcmd: xcpdb supports compaction Eric Wong
2019-05-23  9:36 ` [PATCH 13/26] v2writable: hoist out log_range sub for readability Eric Wong
2019-05-23  9:36 ` Eric Wong [this message]
2019-05-23  9:36 ` [PATCH 15/26] xcpdb: implement progress reporting Eric Wong
2019-05-23  9:36 ` [PATCH 16/26] xcpdb: cleanup error handling and diagnosis Eric Wong
2019-05-23  9:36 ` [PATCH 17/26] xapcmd: avoid EXDEV when finalizing changes Eric Wong
2019-05-23  9:36 ` [PATCH 18/26] doc: xcpdb: update to reflect the current state Eric Wong
2019-05-23  9:36 ` [PATCH 19/26] xapcmd: use "print STDERR" for progress reporting Eric Wong
2019-05-23  9:36 ` [PATCH 20/26] xcpdb: show re-indexing progress Eric Wong
2019-05-23  9:36 ` [PATCH 21/26] xcpdb: remove temporary directories on aborts Eric Wong
2019-05-23  9:37 ` [PATCH 22/26] compact: reuse infrastructure from xcpdb Eric Wong
2019-05-23  9:37 ` [PATCH 23/26] xcpdb|compact: support some xapian-compact switches Eric Wong
2019-05-23  9:37 ` [PATCH 24/26] xapcmd: cleanup on interrupted xcpdb "--compact" Eric Wong
2019-05-23  9:37 ` [PATCH 25/26] xcpdb|compact: support --jobs/-j flag like gmake(1) Eric Wong
2019-05-23  9:37 ` [PATCH 26/26] xapcmd: do not reset %SIG until last Xtmpdir is done Eric Wong
2019-05-23 10:37 ` [PATCH 27/26] doc: various updates to reflect current state Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190523093704.18367-15-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Example config snippet for mirrors

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git