user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 0/7] index + extindex interaction improvements
@ 2020-12-25 10:21  7% Eric Wong
  2020-12-25 10:21  6% ` [PATCH 4/7] index: do not attach inbox to extindex unless updated Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-12-25 10:21 UTC (permalink / raw)
  To: meta

Some things which make -index less painful when auto-updating
external indices.

"public-inbox-extindex --all" itself is still painfully slow
with 50K inboxes, but I think it only needs to be run once for
initialization; after that, -index can be relied on for all
incremental updates.


Eric Wong (7):
  index: disable --fast-noop on --reindex
  extsearchidx: delay SQLite availability checks
  extsearchidx: close DB handles after use if FD constrained
  index: do not attach inbox to extindex unless updated
  index: fix --no-fsync flag propagation to extindex
  v2writable: don't verify tip if reindexing
  index: filter out indexlevel=basic from extindex

 lib/PublicInbox/Admin.pm        |  1 +
 lib/PublicInbox/ExtSearchIdx.pm | 96 +++++++++++++++++++++------------
 lib/PublicInbox/SearchIdx.pm    |  2 +
 lib/PublicInbox/V2Writable.pm   | 36 +++++++++----
 script/public-inbox-index       | 27 ++++++----
 5 files changed, 109 insertions(+), 53 deletions(-)


^ permalink raw reply	[relevance 7%]

* [PATCH 4/7] index: do not attach inbox to extindex unless updated
  2020-12-25 10:21  7% [PATCH 0/7] index + extindex interaction improvements Eric Wong
@ 2020-12-25 10:21  6% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-12-25 10:21 UTC (permalink / raw)
  To: meta

We'll count the number of log changes (regardless of index or
unindex) and only attach inboxes to ExtSearchIdx objects when
they get new work.  We'll also reduce lock bouncing and only
update external indices after all per-inbox indexing is done.

This also updates existing v2 indexing/unindexing callers
to be more consistent and ensures unindex log entries update
per-inbox last commit information.
---
 lib/PublicInbox/Admin.pm      |  1 +
 lib/PublicInbox/SearchIdx.pm  |  2 ++
 lib/PublicInbox/V2Writable.pm | 26 +++++++++++++++++++-------
 script/public-inbox-index     | 23 ++++++++++++++---------
 4 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index 9a86d206..b468108e 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -271,6 +271,7 @@ EOM
 		$idx = PublicInbox::SearchIdx->new($ibx, 1);
 	}
 	$idx->index_sync($opt);
+	$idx->{nidx} // 0; # returns number processed
 }
 
 sub progress_prepare ($) {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index c8e309fc..b3361e05 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -615,6 +615,7 @@ sub index_both { # git->cat_async callback
 	$smsg->{num} = index_mm($self, $eml, $oid, $sync) or
 		die "E: could not generate NNTP article number for $oid";
 	add_message($self, $eml, $smsg, $sync);
+	++$self->{nidx};
 	my $cur_cmt = $sync->{cur_cmt} // die 'BUG: {cur_cmt} missing';
 	${$sync->{latest_cmt}} = $cur_cmt;
 }
@@ -629,6 +630,7 @@ sub unindex_both { # git->cat_async callback
 	if (defined(my $cur_cmt = $sync->{cur_cmt})) {
 		${$sync->{latest_cmt}} = $cur_cmt;
 	}
+	++$self->{nidx};
 }
 
 sub with_umask {
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 2b849ddf..ca52874b 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -891,12 +891,22 @@ sub reindex_checkpoint ($$) {
 	$mm_tmp->atfork_parent if $mm_tmp;
 }
 
+sub index_finalize ($$) {
+	my ($arg, $index) = @_;
+	++$arg->{self}->{nidx};
+	if (defined(my $cur = $arg->{cur_cmt})) {
+		${$arg->{latest_cmt}} = $cur;
+	} elsif ($index) {
+		die 'BUG: {cur_cmt} missing';
+	} # else { unindexing @leftovers doesn't set {cur_cmt}
+}
+
 sub index_oid { # cat_async callback
 	my ($bref, $oid, $type, $size, $arg) = @_;
-	return if is_bad_blob($oid, $type, $size, $arg->{oid});
+	is_bad_blob($oid, $type, $size, $arg->{oid}) and
+		return index_finalize($arg, 1); # size == 0 purged returns here
 	my $self = $arg->{self};
 	local $self->{current_info} = "$self->{current_info} $oid";
-	return if $size == 0; # purged
 	my ($num, $mid0);
 	my $eml = PublicInbox::Eml->new($$bref);
 	my $mids = mids($eml);
@@ -967,7 +977,7 @@ sub index_oid { # cat_async callback
 	if (do_idx($self, $bref, $eml, $smsg)) {
 		${$arg->{need_checkpoint}} = 1;
 	}
-	${$arg->{latest_cmt}} = $arg->{cur_cmt} // die 'BUG: {cur_cmt} missing';
+	index_finalize($arg, 1);
 }
 
 # only update last_commit for $i on reindex iff newer than current
@@ -1157,11 +1167,12 @@ sub unindex_oid_aux ($$$) {
 }
 
 sub unindex_oid ($$;$) { # git->cat_async callback
-	my ($bref, $oid, $type, $size, $sync) = @_;
-	return if is_bad_blob($oid, $type, $size, $sync->{oid});
-	my $self = $sync->{self};
+	my ($bref, $oid, $type, $size, $arg) = @_;
+	is_bad_blob($oid, $type, $size, $arg->{oid}) and
+		return index_finalize($arg, 0);
+	my $self = $arg->{self};
 	local $self->{current_info} = "$self->{current_info} $oid";
-	my $unindexed = $sync->{in_unindex} ? $sync->{unindexed} : undef;
+	my $unindexed = $arg->{in_unindex} ? $arg->{unindexed} : undef;
 	my $mm = $self->{mm};
 	my $mids = mids(PublicInbox::Eml->new($bref));
 	undef $$bref;
@@ -1186,6 +1197,7 @@ sub unindex_oid ($$;$) { # git->cat_async callback
 		}
 		unindex_oid_aux($self, $oid, $mid);
 	}
+	index_finalize($arg, 0);
 }
 
 sub git { $_[0]->{ibx}->git }
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 87893ef1..a17bf615 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -63,7 +63,7 @@ my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
 PublicInbox::Admin::require_or_die('-index');
 unless (@ibxs) { print STDERR $help; exit 1 }
 
-my (@eidx_dir, %eidx_seen);
+my (@eidx, %eidx_seen);
 my $update_extindex = $opt->{'update-extindex'};
 if (!scalar(@$update_extindex) && (my $ALL = $cfg->ALL)) {
 	# extindex and normal inboxes may have different owners
@@ -84,7 +84,8 @@ for my $ei_name (@$update_extindex) {
 	} else {
 		die "extindex `$ei_name' not configured or found\n";
 	}
-	$eidx_seen{$topdir} //= push(@eidx_dir, $topdir);
+	$eidx_seen{$topdir} //=
+		push(@eidx, PublicInbox::ExtSearchIdx->new($topdir));
 }
 my $mods = {};
 my @eidx_unconfigured;
@@ -95,7 +96,7 @@ foreach my $ibx (@ibxs) {
 	$ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ?
 			'full' : $detected);
 	PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
-	if (@eidx_dir && $ibx->{-unconfigured}) {
+	if (@eidx && $ibx->{-unconfigured}) {
 		push @eidx_unconfigured, "  $ibx->{inboxdir}\n";
 	}
 }
@@ -128,18 +129,22 @@ publicInbox.$ibx->{name}.indexSequentialShard not boolean
 EOL
 		$ibx_opt = { %$opt, sequential_shard => $v };
 	}
-	PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
+	my $nidx = PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
 	last if $ibx_opt->{quit};
 	if (my $copt = $opt->{compact_opt}) {
 		local $copt->{jobs} = 0 if $ibx_opt->{sequential_shard};
 		PublicInbox::Xapcmd::run($ibx, 'compact', $copt);
 	}
-	next if $ibx->{-unconfigured};
 	last if $ibx_opt->{quit};
-	for my $dir (@eidx_dir) {
-		my $eidx = PublicInbox::ExtSearchIdx->new($dir);
+	next if $ibx->{-unconfigured} || !$nidx;
+	for my $eidx (@eidx) {
 		$eidx->attach_inbox($ibx);
-		$eidx->eidx_sync($ibx_opt);
-		last if $ibx_opt->{quit};
 	}
 }
+$opt->{-no_fsync} = 1 if !$opt->{fsync};
+my $pr = $opt->{-progress};
+for my $eidx (@eidx) {
+	$pr->("indexing $eidx->{topdir} ...\n") if $pr;
+	$eidx->eidx_sync($opt);
+	last if $opt->{quit};
+}

^ permalink raw reply related	[relevance 6%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-12-25 10:21  7% [PATCH 0/7] index + extindex interaction improvements Eric Wong
2020-12-25 10:21  6% ` [PATCH 4/7] index: do not attach inbox to extindex unless updated Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).