user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 6/7] index: support --xapian-only switch
  2020-08-07  1:13  7% [PATCH 0/7] index: --sequential-shard and other stuff Eric Wong
@ 2020-08-07  1:14  6% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-08-07  1:14 UTC (permalink / raw)
  To: meta

This is useful for speeding up indexing runs when only the Xapian
indexing rules change and SQLite indexing does not.  This mostly
implies `--reindex', but does NOT pick up new messages (because
SQLite indexing needs to occur for that).

I'm leaving this undocumented in the manpage for now since it's
mainly to speed up development and testing.  Users upgrading to
1.6.0 will be advised to `--reindex --rethread' anyway, due to
the threading improvements since 1.1.0-pre1.

It may make sense to document this for 1.7+ when there are Xapian-only
indexing changes, though.
---
 lib/PublicInbox/SearchIdxShard.pm | 10 ++++--
 lib/PublicInbox/V2Writable.pm     | 51 +++++++++++++++++++++++--------
 script/public-inbox-index         |  5 +--
 3 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index cb79f3dc..59b36087 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -89,16 +89,20 @@ sub shard_worker_loop ($$$$$) {
 
 # called by V2Writable
 sub index_raw {
-	my ($self, $msgref, $mime, $smsg) = @_;
+	my ($self, $msgref, $eml, $smsg) = @_;
 	if (my $w = $self->{w}) {
 		# mid must be last, it can contain spaces (but not LF)
 		print $w join(' ', @$smsg{qw(raw_bytes bytes
 						num blob ds ts mid)}),
 			"\n", $$msgref or die "failed to write shard $!\n";
 	} else {
-		$$msgref = undef;
+		if ($eml) {
+			$$msgref = undef;
+		} else { # --xapian-only + --sequential-shard:
+			$eml = PublicInbox::Eml->new($msgref);
+		}
 		$self->begin_txn_lazy;
-		$self->add_message($mime, $smsg);
+		$self->add_message($eml, $smsg);
 	}
 }
 
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 7bc24592..6b1effe5 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -1185,22 +1185,24 @@ sub index_xap_only { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $smsg) = @_;
 	my $self = $smsg->{v2w};
 	my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
-	$idx->begin_txn_lazy;
-	$idx->add_message(PublicInbox::Eml->new($bref), $smsg);
+	$smsg->{raw_bytes} = $size;
+	$idx->index_raw($bref, undef, $smsg);
 	$self->{transact_bytes} += $size;
 }
 
-sub index_seq_shard ($$$) {
-	my ($self, $sync, $off) = @_;
+sub index_xap_step ($$$;$) {
+	my ($self, $sync, $beg, $step) = @_;
 	my $ibx = $self->{ibx};
-	my $max = $ibx->mm->max or return;
 	my $all = $ibx->git;
 	my $over = $ibx->over;
 	my $batch_bytes = $PublicInbox::SearchIdx::BATCH_BYTES;
+	$step //= $self->{shards};
+	my $end = $sync->{art_end};
 	if (my $pr = $sync->{-opt}->{-progress}) {
-		$pr->("Xapian indexlevel=$ibx->{indexlevel} % $off\n");
+		$pr->("Xapian indexlevel=$ibx->{indexlevel} ".
+			"$beg..$end (% $step)\n");
 	}
-	for (my $num = $off; $num <= $max; $num += $self->{shards}) {
+	for (my $num = $beg; $num <= $end; $num += $step) {
 		my $smsg = $over->get_art($num) or next;
 		$smsg->{v2w} = $self;
 		$all->cat_async($smsg->{blob}, \&index_xap_only, $smsg);
@@ -1244,10 +1246,37 @@ sub index_epoch ($$$) {
 	update_last_commit($self, $git, $i, $stk->{latest_cmt});
 }
 
+sub xapian_only {
+	my ($self, $opt, $sync) = @_;
+	my $seq = $opt->{sequentialshard};
+	local $self->{parallel} = 0 if $seq;
+	$self->idx_init($opt); # acquire lock
+	if (my $art_end = $self->{ibx}->mm->max) {
+		$sync //= {
+			need_checkpoint => \(my $bool = 0),
+			-opt => $opt,
+			v2w => $self,
+			nr => \(my $nr = 0),
+			-regen_fmt => "%u/?\n",
+		};
+		$sync->{art_end} = $art_end;
+		if ($seq || !$self->{parallel}) {
+			my $shard_end = $self->{shards} - 1;
+			index_xap_step($self, $sync, $_) for (0..$shard_end);
+		} else { # parallel (maybe)
+			index_xap_step($self, $sync, 0, 1);
+		}
+	}
+	$self->{ibx}->git->cat_async_wait;
+	$self->done;
+}
+
 # public, called by public-inbox-index
 sub index_sync {
 	my ($self, $opt) = @_;
-	$opt ||= {};
+	$opt //= $_[1] //= {};
+	goto \&xapian_only if $opt->{xapianonly};
+
 	my $pr = $opt->{-progress};
 	my $epoch_max;
 	my $latest = git_dir_latest($self, \$epoch_max);
@@ -1292,13 +1321,9 @@ sub index_sync {
 	}
 
 	if ($seq) { # deal with Xapian shards sequentially
-		my $end = $self->{shards} - 1;
 		$self->{ibx}->{indexlevel} = $idxlevel;
 		delete $sync->{mm_tmp};
-		$self->idx_init($opt); # re-acquire lock
-		index_seq_shard($self, $sync, $_) for (0..$end);
-		$self->{ibx}->git->cat_async_wait;
-		$self->done;
+		xapian_only($self, $opt, $sync);
 	}
 
 	# reindex does not pick up new changes, so we rerun w/o it:
diff --git a/script/public-inbox-index b/script/public-inbox-index
index be518134..a52fb1bf 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -16,6 +16,7 @@ use PublicInbox::Xapcmd;
 my $compact_opt;
 my $opt = { quiet => -1, compact => 0, maxsize => undef, sync => 1 };
 GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune sync!
+		xapianonly|xapian-only
 		indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s
 		sequentialshard|seq-shard|sequential-shard))
 	or die "bad command-line args\n$usage";
@@ -59,8 +60,8 @@ if (defined $s) {
 my $mods = {};
 foreach my $ibx (@ibxs) {
 	# XXX: users can shoot themselves in the foot, with opt->{indexlevel}
-	$ibx->{indexlevel} //= $opt->{indexlevel} //
-			PublicInbox::Admin::detect_indexlevel($ibx);
+	$ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapianonly} ?
+			'full' : PublicInbox::Admin::detect_indexlevel($ibx));
 	$ibx->{index_max_size} = $max_size;
 	PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
 }

^ permalink raw reply related	[relevance 6%]

* [PATCH 0/7] index: --sequential-shard and other stuff
@ 2020-08-07  1:13  7% Eric Wong
  2020-08-07  1:14  6% ` [PATCH 6/7] index: support --xapian-only switch Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-08-07  1:13 UTC (permalink / raw)
  To: meta

1/7 is a minor usability fix (more on the way)
5/7 is a major improvement for HDDs
6/7 is useful to developers, and may be useful to users
    a few months down the line

And the rest are minor fixes related to indexing...

Eric Wong (7):
  xapcmd: quietly no-op on indexlevel=basic
  xapcmd: remove redundant searchidx require
  xapcmd: drop outdated comment
  v2writable: fix rethread cleanup
  index: v2: indexSequentialShard / --sequential-shard option
  index: support --xapian-only switch
  index+xcpdb: rename `--no-sync' to `--no-fsync'

 Documentation/public-inbox-config.pod    |  6 ++
 Documentation/public-inbox-index.pod     | 55 ++++++++++++++-
 Documentation/public-inbox-v2-format.pod | 11 ++-
 Documentation/public-inbox-xcpdb.pod     |  2 +-
 lib/PublicInbox/Config.pm                |  9 +--
 lib/PublicInbox/OverIdx.pm               |  2 +-
 lib/PublicInbox/SearchIdx.pm             |  6 +-
 lib/PublicInbox/SearchIdxShard.pm        | 10 ++-
 lib/PublicInbox/V2Writable.pm            | 88 ++++++++++++++++++++++--
 lib/PublicInbox/WatchMaildir.pm          |  2 +-
 lib/PublicInbox/Xapcmd.pm                | 18 ++---
 script/public-inbox-index                | 33 +++++++--
 t/config.t                               |  6 +-
 t/indexlevels-mirror.t                   | 24 +++++--
 t/v2mirror.t                             | 14 ++++
 15 files changed, 235 insertions(+), 51 deletions(-)

^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-08-07  1:13  7% [PATCH 0/7] index: --sequential-shard and other stuff Eric Wong
2020-08-07  1:14  6% ` [PATCH 6/7] index: support --xapian-only switch Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).