user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 6/7] index: support --xapian-only switch
Date: Fri,  7 Aug 2020 01:14:05 +0000	[thread overview]
Message-ID: <20200807011406.12285-7-e@yhbt.net> (raw)
In-Reply-To: <20200807011406.12285-1-e@yhbt.net>

This is useful for speeding up indexing runs when only Xapian
rules change but SQLite indexing doesn't change.  This mostly
implies `--reindex', but does NOT pick up new messages (because
SQLite indexing needs to occur for that).

I'm leaving this undocumented in the manpage for now since it's
mainly to speed up development and testing.  Users upgrading to
1.6.0 will be advised to `--reindex --rethread' anyway, due to
the threading improvements since 1.1.0-pre1.

It may make sense to document this for 1.7+ when there are
Xapian-only indexing changes, though.
---
 lib/PublicInbox/SearchIdxShard.pm | 10 ++++--
 lib/PublicInbox/V2Writable.pm     | 51 +++++++++++++++++++++++--------
 script/public-inbox-index         |  5 +--
 3 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index cb79f3dc..59b36087 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -89,16 +89,20 @@ sub shard_worker_loop ($$$$$) {
 
 # called by V2Writable
 sub index_raw {
-	my ($self, $msgref, $mime, $smsg) = @_;
+	my ($self, $msgref, $eml, $smsg) = @_;
 	if (my $w = $self->{w}) {
 		# mid must be last, it can contain spaces (but not LF)
 		print $w join(' ', @$smsg{qw(raw_bytes bytes
 						num blob ds ts mid)}),
 			"\n", $$msgref or die "failed to write shard $!\n";
 	} else {
-		$$msgref = undef;
+		if ($eml) {
+			$$msgref = undef;
+		} else { # --xapian-only + --sequential-shard:
+			$eml = PublicInbox::Eml->new($msgref);
+		}
 		$self->begin_txn_lazy;
-		$self->add_message($mime, $smsg);
+		$self->add_message($eml, $smsg);
 	}
 }
 
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 7bc24592..6b1effe5 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -1185,22 +1185,24 @@ sub index_xap_only { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $smsg) = @_;
 	my $self = $smsg->{v2w};
 	my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
-	$idx->begin_txn_lazy;
-	$idx->add_message(PublicInbox::Eml->new($bref), $smsg);
+	$smsg->{raw_bytes} = $size;
+	$idx->index_raw($bref, undef, $smsg);
 	$self->{transact_bytes} += $size;
 }
 
-sub index_seq_shard ($$$) {
-	my ($self, $sync, $off) = @_;
+sub index_xap_step ($$$;$) {
+	my ($self, $sync, $beg, $step) = @_;
 	my $ibx = $self->{ibx};
-	my $max = $ibx->mm->max or return;
 	my $all = $ibx->git;
 	my $over = $ibx->over;
 	my $batch_bytes = $PublicInbox::SearchIdx::BATCH_BYTES;
+	$step //= $self->{shards};
+	my $end = $sync->{art_end};
 	if (my $pr = $sync->{-opt}->{-progress}) {
-		$pr->("Xapian indexlevel=$ibx->{indexlevel} % $off\n");
+		$pr->("Xapian indexlevel=$ibx->{indexlevel} ".
+			"$beg..$end (% $step)\n");
 	}
-	for (my $num = $off; $num <= $max; $num += $self->{shards}) {
+	for (my $num = $beg; $num <= $end; $num += $step) {
 		my $smsg = $over->get_art($num) or next;
 		$smsg->{v2w} = $self;
 		$all->cat_async($smsg->{blob}, \&index_xap_only, $smsg);
@@ -1244,10 +1246,37 @@ sub index_epoch ($$$) {
 	update_last_commit($self, $git, $i, $stk->{latest_cmt});
 }
 
+sub xapian_only {
+	my ($self, $opt, $sync) = @_;
+	my $seq = $opt->{sequentialshard};
+	local $self->{parallel} = 0 if $seq;
+	$self->idx_init($opt); # acquire lock
+	if (my $art_end = $self->{ibx}->mm->max) {
+		$sync //= {
+			need_checkpoint => \(my $bool = 0),
+			-opt => $opt,
+			v2w => $self,
+			nr => \(my $nr = 0),
+			-regen_fmt => "%u/?\n",
+		};
+		$sync->{art_end} = $art_end;
+		if ($seq || !$self->{parallel}) {
+			my $shard_end = $self->{shards} - 1;
+			index_xap_step($self, $sync, $_) for (0..$shard_end);
+		} else { # parallel (maybe)
+			index_xap_step($self, $sync, 0, 1);
+		}
+	}
+	$self->{ibx}->git->cat_async_wait;
+	$self->done;
+}
+
 # public, called by public-inbox-index
 sub index_sync {
 	my ($self, $opt) = @_;
-	$opt ||= {};
+	$opt //= $_[1] //= {};
+	goto \&xapian_only if $opt->{xapianonly};
+
 	my $pr = $opt->{-progress};
 	my $epoch_max;
 	my $latest = git_dir_latest($self, \$epoch_max);
@@ -1292,13 +1321,9 @@ sub index_sync {
 	}
 
 	if ($seq) { # deal with Xapian shards sequentially
-		my $end = $self->{shards} - 1;
 		$self->{ibx}->{indexlevel} = $idxlevel;
 		delete $sync->{mm_tmp};
-		$self->idx_init($opt); # re-acquire lock
-		index_seq_shard($self, $sync, $_) for (0..$end);
-		$self->{ibx}->git->cat_async_wait;
-		$self->done;
+		xapian_only($self, $opt, $sync);
 	}
 
 	# reindex does not pick up new changes, so we rerun w/o it:
diff --git a/script/public-inbox-index b/script/public-inbox-index
index be518134..a52fb1bf 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -16,6 +16,7 @@ use PublicInbox::Xapcmd;
 my $compact_opt;
 my $opt = { quiet => -1, compact => 0, maxsize => undef, sync => 1 };
 GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune sync!
+		xapianonly|xapian-only
 		indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s
 		sequentialshard|seq-shard|sequential-shard))
 	or die "bad command-line args\n$usage";
@@ -59,8 +60,8 @@ if (defined $s) {
 my $mods = {};
 foreach my $ibx (@ibxs) {
 	# XXX: users can shoot themselves in the foot, with opt->{indexlevel}
-	$ibx->{indexlevel} //= $opt->{indexlevel} //
-			PublicInbox::Admin::detect_indexlevel($ibx);
+	$ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapianonly} ?
+			'full' : PublicInbox::Admin::detect_indexlevel($ibx));
 	$ibx->{index_max_size} = $max_size;
 	PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
 }

  parent reply	other threads:[~2020-08-07  1:14 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-08-07  1:13 [PATCH 0/7] index: --sequential-shard and other stuff Eric Wong
2020-08-07  1:14 ` [PATCH 1/7] xapcmd: quietly no-op on indexlevel=basic Eric Wong
2020-08-07  1:14 ` [PATCH 2/7] xapcmd: remove redundant searchidx require Eric Wong
2020-08-07  1:14 ` [PATCH 3/7] xapcmd: drop outdated comment Eric Wong
2020-08-07  1:14 ` [PATCH 4/7] v2writable: fix rethread cleanup Eric Wong
2020-08-07  1:14 ` [PATCH 5/7] index: v2: --sequential-shard option Eric Wong
2020-08-07  1:14 ` Eric Wong [this message]
2020-08-07  1:14 ` [PATCH 7/7] index+xcpdb: rename `--no-sync' to `--no-fsync' Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200807011406.12285-7-e@yhbt.net \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).