user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 0/6] extindex: --reindex --fast gets faster
@ 2021-10-11  8:06  7% Eric Wong
  2021-10-11  8:06  4% ` [PATCH 6/6] extindex: avoid invalid blobs after unref Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2021-10-11  8:06 UTC (permalink / raw)
  To: meta

-extindex --reindex --fast --all performance is nearly doubled.
There's also a bunch of cleanups for more consistent handling
of various forms of message removal from an extindex.

Eric Wong (6):
  extindex: speed up --reindex --fast
  sqlite: PRAGMA optimize on close
  extindex: rename var: active => active_shards
  extindex: share unref logic in more places
  extindex: more consistent doc removal
  extindex: avoid invalid blobs after unref

 lib/PublicInbox/ExtSearchIdx.pm | 303 ++++++++++++++++----------------
 lib/PublicInbox/LeiMailSync.pm  |   3 +-
 lib/PublicInbox/Over.pm         |   4 +-
 lib/PublicInbox/OverIdx.pm      |  54 +-----
 lib/PublicInbox/V2Writable.pm   |  11 ++
 t/extsearch.t                   |   2 +-
 t/over.t                        |   5 +-
 7 files changed, 173 insertions(+), 209 deletions(-)

^ permalink raw reply	[relevance 7%]

* [PATCH 6/6] extindex: avoid invalid blobs after unref
  2021-10-11  8:06  7% [PATCH 0/6] extindex: --reindex --fast gets faster Eric Wong
@ 2021-10-11  8:06  4% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2021-10-11  8:06 UTC (permalink / raw)
  To: meta

When unref-ing a blob from xref3, make sure the "preferred"
smsg->{blob} doesn't point to the blob we just unrefed.  This
is necessary because we periodically checkpoint our extindex
process to allow -watch and -mda processes to run.

This also gets rid of a lot of redundant code for ->remove_xref3,
since it's all handled in ExtSearchIdx now.
---
 lib/PublicInbox/ExtSearchIdx.pm | 70 ++++++++++++++++++---------------
 lib/PublicInbox/OverIdx.pm      | 44 ---------------------
 t/extsearch.t                   |  2 +-
 t/over.t                        |  5 ++-
 4 files changed, 43 insertions(+), 78 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index ce9cea25da5e..c2ab0447e176 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -138,21 +138,42 @@ sub remove_doc ($$) {
 
 sub _unref_doc ($$$$$;$) {
 	my ($sync, $docid, $ibx, $xnum, $oidbin, $eml) = @_;
-	my $s = 'DELETE FROM xref3 WHERE ibx_id = ? AND oidbin = ?';
+	my $smsg;
+	if (ref($docid)) {
+		$smsg = $docid;
+		$docid = $smsg->{num};
+	}
+	my $s = 'DELETE FROM xref3 WHERE oidbin = ?';
+	$s .= ' AND ibx_id = ?' if defined($ibx);
 	$s .= ' AND xnum = ?' if defined($xnum);
 	my $del = $sync->{self}->{oidx}->dbh->prepare_cached($s);
-	$del->bind_param(1, $ibx->{-ibx_id});
-	$del->bind_param(2, $oidbin, SQL_BLOB);
-	$del->bind_param(3, $xnum) if defined($xnum);
+	my $col = 0;
+	$del->bind_param(++$col, $oidbin, SQL_BLOB);
+	$del->bind_param(++$col, $ibx->{-ibx_id}) if $ibx;
+	$del->bind_param(++$col, $xnum) if defined($xnum);
 	$del->execute;
-	my $xr3 = $sync->{self}->{oidx}->get_xref3($docid, 1);
+	my $xr3 = $sync->{self}->{oidx}->get_xref3($docid);
 	if (scalar(@$xr3) == 0) { # all gone
 		remove_doc($sync->{self}, $docid);
 	} else { # enqueue for reindex of remaining messages
-		my $ekey = $ibx->{-gc_eidx_key} // $ibx->eidx_key;
-		my $idx = $sync->{self}->idx_shard($docid);
-		$idx->ipc_do('remove_eidx_info', $docid, $ekey, $eml);
-		$sync->{self}->{oidx}->eidxq_add($docid); # yes, add
+		if ($ibx) {
+			my $ekey = $ibx->{-gc_eidx_key} // $ibx->eidx_key;
+			my $idx = $sync->{self}->idx_shard($docid);
+			$idx->ipc_do('remove_eidx_info', $docid, $ekey, $eml);
+		} # else: we can't remove_eidx_info in reindex-only path
+
+		# replace invalidated blob ASAP with something which should be
+		# readable since we may commit the transaction on checkpoint.
+		# eidxq processing will re-apply boost
+		$smsg //= $sync->{self}->{oidx}->get_art($docid);
+		my $hex = unpack('H*', $oidbin);
+		if ($smsg && $smsg->{blob} eq $hex) {
+			$xr3->[0] =~ /:([a-f0-9]{40,}+)\z/ or
+				die "BUG: xref $xr3->[0] has no OID";
+			$sync->{self}->{oidx}->update_blob($smsg, $1);
+		}
+		# yes, add, we'll need to re-apply boost
+		$sync->{self}->{oidx}->eidxq_add($docid);
 	}
 	@$xr3
 }
@@ -235,24 +256,12 @@ sub do_step ($) { # main iterator for adding messages to the index
 	do_finalize($req);
 }
 
-sub _blob_missing ($$) { # called when $smsg->{blob} is bad
+sub _blob_missing ($$) { # called when a known $smsg->{blob} is gone
 	my ($req, $smsg) = @_;
-	my $self = $req->{self};
-	my $xref3 = $self->{oidx}->get_xref3($smsg->{num});
-	my @keep = grep(!/:$smsg->{blob}\z/, @$xref3);
-	if (@keep) {
-		warn "E: $smsg->{blob} gone, removing #$smsg->{num}\n";
-		$keep[0] =~ /:([a-f0-9]{40,}+)\z/ or
-			die "BUG: xref $keep[0] has no OID";
-		my $oidhex = $1;
-		$self->{oidx}->remove_xref3($smsg->{num}, $smsg->{blob});
-		$self->{oidx}->update_blob($smsg, $oidhex) or warn <<EOM;
-E: #$smsg->{num} gone ($smsg->{blob} => $oidhex)
-EOM
-	} else {
-		warn "E: $smsg->{blob} gone, removing #$smsg->{num}\n";
-		remove_doc($self, $smsg->{num});
-	}
+	# xnum and ibx are unknown, we only call this when an entry from
+	# /ei*/over.sqlite3 is bad, not on entries from xap*/over.sqlite3
+	my $oidbin = pack('H*', $smsg->{blob});
+	_unref_doc($req, $smsg, undef, undef, $oidbin);
 }
 
 sub ck_existing { # git->cat_async callback
@@ -548,7 +557,8 @@ sub _reindex_finalize ($$$) {
 	for my $ary (values %$by_chash) {
 		for my $x (reverse @$ary) {
 			warn "removing #$docid xref3 $x->{blob}\n";
-			my $n = $self->{oidx}->remove_xref3($docid, $x->{blob});
+			my $bin = pack('H*', $x->{blob});
+			my $n = _unref_doc($sync, $docid, undef, undef, $bin);
 			die "BUG: $x->{blob} invalidated #$docid" if $n == 0;
 		}
 		my $x = pop(@$ary) // die "BUG: #$docid {by_chash} empty";
@@ -579,16 +589,14 @@ sub _reindex_oid { # git->cat_async callback
 	my $expect_oid = $req->{xr3r}->[$req->{ix}]->[2];
 	my $docid = $orig_smsg->{num};
 	if (is_bad_blob($oid, $type, $size, $expect_oid)) {
-		my $remain = $self->{oidx}->remove_xref3($docid, $expect_oid);
+		my $oidbin = pack('H*', $expect_oid);
+		my $remain = _unref_doc($sync, $docid, undef, undef, $oidbin);
 		if ($remain == 0) {
 			warn "W: #$docid gone or corrupted\n";
-			remove_doc($self, $docid);
 		} elsif (my $next_oid = $req->{xr3r}->[++$req->{ix}]->[2]) {
-			# n.b. we can't remove_eidx_info here
 			$self->git->cat_async($next_oid, \&_reindex_oid, $req);
 		} else {
 			warn "BUG: #$docid gone (UNEXPECTED)\n";
-			remove_doc($self, $docid);
 		}
 		return;
 	}
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 46f7a066275a..d6d706f7fed0 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -606,50 +606,6 @@ INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?)
 	$sth->execute;
 }
 
-# returns remaining reference count to $docid
-sub remove_xref3 {
-	my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_;
-	begin_lazy($self);
-	my $oidbin = pack('H*', $oidhex);
-	my ($sth, $ibx_id);
-	if (defined $eidx_key) {
-		$ibx_id = ibx_id($self, $eidx_key);
-		$sth = $self->{dbh}->prepare_cached(<<'');
-DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ?
-
-		$sth->bind_param(1, $docid);
-		$sth->bind_param(2, $ibx_id);
-		$sth->bind_param(3, $oidbin, SQL_BLOB);
-	} else {
-		$sth = $self->{dbh}->prepare_cached(<<'');
-DELETE FROM xref3 WHERE docid = ? AND oidbin = ?
-
-		$sth->bind_param(1, $docid);
-		$sth->bind_param(2, $oidbin, SQL_BLOB);
-	}
-	$sth->execute;
-	$sth = $self->{dbh}->prepare_cached(<<'', undef, 1);
-SELECT COUNT(*) FROM xref3 WHERE docid = ?
-
-	$sth->execute($docid);
-	my $nr = $sth->fetchrow_array;
-	if ($nr == 0) {
-		delete_by_num($self, $docid);
-	} elsif (defined($ibx_id) && $rm_eidx_info) {
-		# if deduplication rules in ContentHash change, it's
-		# possible a docid can have multiple rows with the
-		# same ibx_id.  This governs whether or not we call
-		# ->remove_eidx_info in ExtSearchIdx.
-		$sth = $self->{dbh}->prepare_cached(<<'', undef, 1);
-SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ?
-
-		$sth->execute($docid, $ibx_id);
-		my $count = $sth->fetchrow_array;
-		$$rm_eidx_info = ($count == 0);
-	}
-	$nr;
-}
-
 # for when an xref3 goes missing, this does NOT update {ts}
 sub update_blob {
 	my ($self, $smsg, $oidhex) = @_;
diff --git a/t/extsearch.t b/t/extsearch.t
index 896e270414bd..1b6235ba0eca 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -375,7 +375,7 @@ if ('reindex catches content bifurcation') {
 	is($oidx->max, $oldmax, 'oidx->max unchanged');
 	$oidx->dbh_close;
 	ok(run_script([qw(-extindex --reindex --all), "$home/extindex"],
-		undef, $opt), 'extindex --reindex');
+		undef, $opt), 'extindex --reindex') or diag explain($opt);
 	$oidx->dbh;
 	ok($oidx->max > $oldmax, 'oidx->max bumped');
 	like($err, qr/split into 2 due to deduplication change/,
diff --git a/t/over.t b/t/over.t
index a92d2f77b039..1f2df7cf065a 100644
--- a/t/over.t
+++ b/t/over.t
@@ -90,8 +90,9 @@ $over->eidx_prep;
 			'example.kee:2018:deadbeefcafe' ],
 			'xref3 works forw two');
 
-	@arg = qw(1349 adeadba7cafe example.key);
-	is($over->remove_xref3(@arg), 1, 'remove first');
+	is($over->dbh->do(<<''), 1, 'remove first');
+DELETE FROM xref3 WHERE xnum = 2019 AND docid = 1349
+
 	$xref3 = $over->get_xref3(1349);
 	is_deeply($xref3, [ 'example.kee:2018:deadbeefcafe' ],
 		'confirm removal successful');

^ permalink raw reply related	[relevance 4%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2021-10-11  8:06  7% [PATCH 0/6] extindex: --reindex --fast gets faster Eric Wong
2021-10-11  8:06  4% ` [PATCH 6/6] extindex: avoid invalid blobs after unref Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).