user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH 0/6] extindex: --reindex --fast gets faster
@ 2021-10-11  8:06 Eric Wong
  2021-10-11  8:06 ` [PATCH 1/6] extindex: speed up --reindex --fast Eric Wong
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Eric Wong @ 2021-10-11  8:06 UTC (permalink / raw)
  To: meta

-extindex --reindex --fast --all performance is nearly doubled.
There's also a bunch of cleanups for more consistently handling
with various forms of message removal from an extindex.

Eric Wong (6):
  extindex: speed up --reindex --fast
  sqlite: PRAGMA optimize on close
  extindex: rename var: active => active_shards
  extindex: share unref logic in more places
  extindex: more consistent doc removal
  extindex: avoid invalid blobs after unref

 lib/PublicInbox/ExtSearchIdx.pm | 303 ++++++++++++++++----------------
 lib/PublicInbox/LeiMailSync.pm  |   3 +-
 lib/PublicInbox/Over.pm         |   4 +-
 lib/PublicInbox/OverIdx.pm      |  54 +-----
 lib/PublicInbox/V2Writable.pm   |  11 ++
 t/extsearch.t                   |   2 +-
 t/over.t                        |   5 +-
 7 files changed, 173 insertions(+), 209 deletions(-)

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 1/6] extindex: speed up --reindex --fast
  2021-10-11  8:06 [PATCH 0/6] extindex: --reindex --fast gets faster Eric Wong
@ 2021-10-11  8:06 ` Eric Wong
  2021-10-11  8:06 ` [PATCH 2/6] sqlite: PRAGMA optimize on close Eric Wong
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2021-10-11  8:06 UTC (permalink / raw)
  To: meta

This required some tweaking of xref3 indices in over.sqlite3,
but the end result is it brings no-op "--reindex --fast --all"
checks down to roughly 20 minutes (from 30-40 minutes) on
lore/all.

This is faster because a bunch of small SQLite queries are still
slower en-mass than a bunch of perlops.  Despite the lack of IPC
overhead, crossing .so boundaries and repeating lookups over
btrees is still slower than doing the same with Perl hash tables.
---
 lib/PublicInbox/ExtSearchIdx.pm | 171 ++++++++++++++++----------------
 lib/PublicInbox/Over.pm         |   4 +-
 lib/PublicInbox/OverIdx.pm      |  10 +-
 3 files changed, 94 insertions(+), 91 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index d589d2c00f1a..8da98ba44a9a 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -807,10 +807,53 @@ sub reindex_unseen ($$$$) {
 	$self->git->cat_async($xsmsg->{blob}, \&_reindex_unseen, $req);
 }
 
-sub _reindex_check_unseen ($$$) {
+sub _unref_stale ($$$$$) {
+	my ($sync, $docid, $ibx, $xnum, $oidbin) = @_;
+	my $del = $sync->{self}->{oidx}->dbh->prepare_cached(<<'');
+DELETE FROM xref3 WHERE ibx_id = ? AND xnum = ? AND oidbin = ?
+
+	$del->bind_param(1, $ibx->{-ibx_id});
+	$del->bind_param(2, $xnum);
+	$del->bind_param(3, $oidbin, SQL_BLOB);
+	$del->execute;
+	my $xr3 = $sync->{self}->{oidx}->get_xref3($docid, 1);
+	my $idx = $sync->{self}->idx_shard($docid);
+	if (scalar(@$xr3) == 0) { # all gone
+		$sync->{self}->{oidx}->delete_by_num($docid);
+		$sync->{self}->{oidx}->eidxq_del($docid);
+		$idx->ipc_do('xdb_remove', $docid);
+	} else { # enqueue for reindex of remaining messages
+		$idx->ipc_do('remove_eidx_info', $docid, $ibx->eidx_key);
+		$sync->{self}->{oidx}->eidxq_add($docid); # yes, add
+	}
+}
+
+sub _unref_stale_range ($$$) {
+	my ($sync, $ibx, $lt_or_gt) = @_;
+	my $r;
+	my $lim = 10000;
+	do {
+		$r = $sync->{self}->{oidx}->dbh->selectall_arrayref(
+			<<EOS, undef, $ibx->{-ibx_id});
+SELECT docid,xnum,oidbin FROM xref3
+WHERE ibx_id = ? AND xnum $lt_or_gt LIMIT $lim
+EOS
+		return if $sync->{quit};
+		for (@$r) { # hopefully rare, not worth optimizing:
+			my ($docid, $xnum, $oidbin) = @$_;
+			my $hex = unpack('H*', $oidbin);
+			warn("# $xnum:$hex (#$docid): stale\n");
+			_unref_stale($sync, $docid, $ibx, $xnum, $oidbin);
+		}
+	} while (scalar(@$r) == $lim);
+	1;
+}
+
+sub _reindex_check_ibx ($$$) {
 	my ($self, $sync, $ibx) = @_;
 	my $ibx_id = $ibx->{-ibx_id};
-	my $slice = 1000;
+	my $slice = 10000;
+	my $opt = { limit => $slice };
 	my ($beg, $end) = (1, $slice);
 	my $err = sync_inbox($self, $sync, $ibx) and return;
 	my $max = $ibx->over->max;
@@ -820,11 +863,12 @@ sub _reindex_check_unseen ($$$) {
 	my $msgs;
 	my $pr = $sync->{-opt}->{-progress};
 	my $ekey = $ibx->eidx_key;
-	local $sync->{-regen_fmt} =
-			"$ekey checking unseen %u/".$ibx->over->max."\n";
+	local $sync->{-regen_fmt} = "$ekey checking %u/$max\n";
 	${$sync->{nr}} = 0;
 	my $fast = $sync->{-opt}->{fast};
-	while (scalar(@{$msgs = $ibx->over->query_xover($beg, $end)})) {
+	my $dsu; # _unref_stale_range (< $lo) called
+	my ($lo, $hi);
+	while (scalar(@{$msgs = $ibx->over->query_xover($beg, $end, $opt)})) {
 		${$sync->{nr}} = $beg;
 		$beg = $msgs->[-1]->{num} + 1;
 		$end = $beg + $slice;
@@ -832,92 +876,48 @@ sub _reindex_check_unseen ($$$) {
 		if (checkpoint_due($sync)) {
 			reindex_checkpoint($self, $sync); # release lock
 		}
-
-		my $inx3 = $self->{oidx}->dbh->prepare_cached(<<'', undef, 1);
-SELECT DISTINCT(docid) FROM xref3 WHERE
-ibx_id = ? AND xnum = ? AND oidbin = ?
-
+		($lo, $hi) = ($msgs->[0]->{num}, $msgs->[-1]->{num});
+		$dsu //= _unref_stale_range($sync, $ibx, "< $lo");
+		my $x3a = $self->{oidx}->dbh->selectall_arrayref(
+			<<"", undef, $ibx_id, $lo, $hi);
+SELECT xnum,oidbin,docid FROM xref3 WHERE
+ibx_id = ? AND xnum >= ? AND xnum <= ?
+
+		my %x3m;
+		for (@$x3a) {
+			my $k = pack('J', $_->[0]) . $_->[1];
+			push @{$x3m{$k}}, $_->[2];
+		}
+		undef $x3a;
 		for my $xsmsg (@$msgs) {
-			my $oidbin = pack('H*', $xsmsg->{blob});
-			$inx3->bind_param(1, $ibx_id);
-			$inx3->bind_param(2, $xsmsg->{num});
-			$inx3->bind_param(3, $oidbin, SQL_BLOB);
-			$inx3->execute;
-			my $docids = $inx3->fetchall_arrayref;
-			# index messages which were totally missed
-			# the first time around ASAP:
-			if (scalar(@$docids) == 0) {
+			my $k = pack('JH*', $xsmsg->{num}, $xsmsg->{blob});
+			my $docids = delete($x3m{$k});
+			if (!defined($docids)) {
 				reindex_unseen($self, $sync, $ibx, $xsmsg);
-			} elsif (!$fast) { # already seen, reindex later
-				for my $r (@$docids) {
-					$self->{oidx}->eidxq_add($r->[0]);
+			} elsif (!$fast) {
+				for my $num (@$docids) {
+					$self->{oidx}->eidxq_add($num);
 				}
+				return if $sync->{quit};
 			}
-			last if $sync->{quit};
-		}
-		last if $sync->{quit};
-	}
-}
-
-sub _reindex_check_stale ($$$) {
-	my ($self, $sync, $ibx) = @_;
-	my $min = 0;
-	my $pr = $sync->{-opt}->{-progress};
-	my $fetching;
-	my $ekey = $ibx->eidx_key;
-	local $sync->{-regen_fmt} =
-			"$ekey checking stale/missing %u/".$ibx->over->max."\n";
-	${$sync->{nr}} = 0;
-	do {
-		if (checkpoint_due($sync)) {
-			reindex_checkpoint($self, $sync); # release lock
 		}
-		# now, check if there's stale xrefs
-		my $iter = $self->{oidx}->dbh->prepare_cached(<<'', undef, 1);
-SELECT docid,xnum,oidbin FROM xref3 WHERE ibx_id = ? AND docid > ?
-ORDER BY docid,xnum ASC LIMIT 10000
-
-		$iter->execute($ibx->{-ibx_id}, $min);
-		$fetching = undef;
-
-		while (my ($docid, $xnum, $oidbin) = $iter->fetchrow_array) {
-			return if $sync->{quit};
-			${$sync->{nr}} = $xnum;
-
-			$fetching = $min = $docid;
-			my $smsg = $ibx->over->get_art($xnum);
-			my $err;
-			if (!$smsg) {
-				$err = 'stale';
-			} elsif (pack('H*', $smsg->{blob}) ne $oidbin) {
-				$err = "mismatch (!= $smsg->{blob})";
-			} else {
-				next; # likely, all good
-			}
-			# current_info already has eidx_key
-			my $oidhex = unpack('H*', $oidbin);
-			warn "$xnum:$oidhex (#$docid): $err\n";
-			my $del = $self->{oidx}->dbh->prepare_cached(<<'');
-DELETE FROM xref3 WHERE ibx_id = ? AND xnum = ? AND oidbin = ?
-
-			$del->bind_param(1, $ibx->{-ibx_id});
-			$del->bind_param(2, $xnum);
-			$del->bind_param(3, $oidbin, SQL_BLOB);
-			$del->execute;
-
-			# get_xref3 over-fetches, but this is a rare path:
-			my $xr3 = $self->{oidx}->get_xref3($docid, 1);
-			my $idx = $self->idx_shard($docid);
-			if (scalar(@$xr3) == 0) { # all gone
-				$self->{oidx}->delete_by_num($docid);
-				$self->{oidx}->eidxq_del($docid);
-				$idx->ipc_do('xdb_remove', $docid);
-			} else { # enqueue for reindex of remaining messages
-				$idx->ipc_do('remove_eidx_info', $docid, $ekey);
-				$self->{oidx}->eidxq_add($docid); # yes, add
+		return if $sync->{quit};
+		next unless scalar keys %x3m;
+
+		# eliminate stale/mismatched entries
+		my %mismatch = map { $_->{num} => $_->{blob} } @$msgs;
+		while (my ($k, $docids) = each %x3m) {
+			my ($xnum, $hex) = unpack('JH*', $k);
+			my $bin = pack('H*', $hex);
+			my $exp = $mismatch{$xnum};
+			my $m = defined($exp) ? "mismatch (!= $exp)" : 'stale';
+			warn("# $xnum:$hex (#@$docids): $m\n");
+			for my $i (@$docids) {
+				_unref_stale($sync, $i, $ibx, $xnum, $bin);
 			}
 		}
-	} while (defined $fetching);
+	}
+	_unref_stale_range($sync, $ibx, "> $hi") if defined($hi);
 }
 
 sub _reindex_inbox ($$$) {
@@ -927,8 +927,7 @@ sub _reindex_inbox ($$$) {
 	if (defined(my $err = _ibx_index_reject($ibx))) {
 		warn "W: cannot reindex $ekey ($err)\n";
 	} else {
-		_reindex_check_unseen($self, $sync, $ibx);
-		_reindex_check_stale($self, $sync, $ibx) unless $sync->{quit};
+		_reindex_check_ibx($self, $sync, $ibx);
 	}
 	delete @$ibx{qw(over mm search git)}; # won't need these for a bit
 }
diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm
index 98de82c048c8..30ad949dd027 100644
--- a/lib/PublicInbox/Over.pm
+++ b/lib/PublicInbox/Over.pm
@@ -108,8 +108,8 @@ sub do_get {
 }
 
 sub query_xover {
-	my ($self, $beg, $end) = @_;
-	do_get($self, <<'', {}, $beg, $end);
+	my ($self, $beg, $end, $opt) = @_;
+	do_get($self, <<'', $opt, $beg, $end);
 SELECT num,ts,ds,ddd FROM over WHERE num >= ? AND num <= ?
 ORDER BY num ASC
 
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 985abbf4e693..46f7a066275a 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -543,9 +543,13 @@ CREATE TABLE IF NOT EXISTS xref3 (
 	$dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)');
 
 	# performance critical, this is not UNIQUE since we may need to
-	# tolerate some old bugs from indexing mirrors
-	$dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '.
-		'xref3 (oidbin,xnum,ibx_id)');
+	# tolerate some old bugs from indexing mirrors.  n.b. we used
+	# to index oidbin here, but leaving it out speeds up reindexing
+	# and "XHDR Xref <$MSGID>" isn't any slower w/o oidbin
+	$dbh->do('CREATE INDEX IF NOT EXISTS idx_reindex ON '.
+		'xref3 (xnum,ibx_id)');
+
+	$dbh->do('CREATE INDEX IF NOT EXISTS idx_oidbin ON xref3 (oidbin)');
 
 		$dbh->do(<<'');
 CREATE TABLE IF NOT EXISTS eidx_meta (

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 2/6] sqlite: PRAGMA optimize on close
  2021-10-11  8:06 [PATCH 0/6] extindex: --reindex --fast gets faster Eric Wong
  2021-10-11  8:06 ` [PATCH 1/6] extindex: speed up --reindex --fast Eric Wong
@ 2021-10-11  8:06 ` Eric Wong
  2021-10-11  8:06 ` [PATCH 3/6] extindex: rename var: active => active_shards Eric Wong
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2021-10-11  8:06 UTC (permalink / raw)
  To: meta

As recommended by SQLite documentation[1]:

  To achieve the best long-term query performance without the need
  to do a detailed engineering analysis of the application schema
  and SQL, it is recommended that applications run "PRAGMA optimize"
  (with no arguments) just before closing each database connection.

Hopefully that works for our use cases and can make things
faster for us.

[1] https://www.sqlite.org/pragma.html#pragma_optimize
---
 lib/PublicInbox/LeiMailSync.pm |  3 ++-
 lib/PublicInbox/V2Writable.pm  | 11 +++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm
index 91cd1c934a1f..c6cd1bc58d0a 100644
--- a/lib/PublicInbox/LeiMailSync.pm
+++ b/lib/PublicInbox/LeiMailSync.pm
@@ -47,7 +47,8 @@ sub lms_write_prepare { ($_[0]->{dbh} //= dbh_new($_[0], 1)); $_[0] }
 sub lms_pause {
 	my ($self) = @_;
 	$self->{fmap} = {};
-	delete $self->{dbh};
+	my $dbh = delete $self->{dbh};
+	$dbh->do('PRAGMA optimize') if $dbh;
 }
 
 sub create_tables {
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index fcd7ffe2317b..d04cdda6e3dc 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -622,7 +622,18 @@ sub done {
 		my $m = $err ? 'rollback' : 'commit';
 		eval { $mm->{dbh}->$m };
 		$err .= "msgmap $m: $@\n" if $@;
+		eval { $mm->{dbh}->do('PRAGMA optimize') };
+		$err .= "msgmap optimize: $@\n" if $@;
 	}
+	if ($self->{oidx} && $self->{oidx}->{dbh}) {
+		if ($err) {
+			eval { $self->{oidx}->rollback_lazy };
+			$err .= "overview rollback: $@\n" if $@;
+		}
+		eval { $self->{oidx}->{dbh}->do('PRAGMA optimize') };
+		$err .= "overview optimize: $@\n" if $@;
+	}
+
 	my $shards = delete $self->{idx_shards};
 	if ($shards) {
 		for (@$shards) {

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 3/6] extindex: rename var: active => active_shards
  2021-10-11  8:06 [PATCH 0/6] extindex: --reindex --fast gets faster Eric Wong
  2021-10-11  8:06 ` [PATCH 1/6] extindex: speed up --reindex --fast Eric Wong
  2021-10-11  8:06 ` [PATCH 2/6] sqlite: PRAGMA optimize on close Eric Wong
@ 2021-10-11  8:06 ` Eric Wong
  2021-10-11  8:06 ` [PATCH 4/6] extindex: share unref logic in more places Eric Wong
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2021-10-11  8:06 UTC (permalink / raw)
  To: meta

We also have the idea of active inboxes, too, so "active shards"
ought to make the purpose of the data structure more obvious.
---
 lib/PublicInbox/ExtSearchIdx.pm | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 8da98ba44a9a..64cd8641585d 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -434,7 +434,7 @@ DELETE FROM over WHERE num > 0 AND num NOT IN (SELECT docid FROM xref3)
 SELECT MIN(num) FROM over WHERE num > 0
 EOM
 	$cur // return; # empty
-	my ($r, $n, %active);
+	my ($r, $n, %active_shards);
 	$nr = 0;
 	while (1) {
 		$r = $self->{oidx}->dbh->selectcol_arrayref(<<"", undef, $cur);
@@ -445,15 +445,15 @@ SELECT num FROM over WHERE num >= ? ORDER BY num ASC LIMIT 10000
 			for my $i ($cur..($n - 1)) {
 				my $idx = idx_shard($self, $i);
 				$idx->ipc_do('xdb_remove_quiet', $i);
-				$active{$idx} = $idx;
+				$active_shards{$idx} = $idx;
 			}
 			$cur = $n + 1;
 		}
 		if (checkpoint_due($sync)) {
-			for my $idx (values %active) {
+			for my $idx (values %active_shards) {
 				$nr += $idx->ipc_do('nr_quiet_rm')
 			}
-			%active = ();
+			%active_shards = ();
 			reindex_checkpoint($self, $sync);
 		}
 	}

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 4/6] extindex: share unref logic in more places
  2021-10-11  8:06 [PATCH 0/6] extindex: --reindex --fast gets faster Eric Wong
                   ` (2 preceding siblings ...)
  2021-10-11  8:06 ` [PATCH 3/6] extindex: rename var: active => active_shards Eric Wong
@ 2021-10-11  8:06 ` Eric Wong
  2021-10-11  8:06 ` [PATCH 5/6] extindex: more consistent doc removal Eric Wong
  2021-10-11  8:06 ` [PATCH 6/6] extindex: avoid invalid blobs after unref Eric Wong
  5 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2021-10-11  8:06 UTC (permalink / raw)
  To: meta

We can use the same logic for --gc and --reindex and
'd' log entries

They're similar enough and the actual need to unref should
be fairly rare.  We could go a lot faster if we didn't show
progress for --gc and --reindex, actually.
---
 lib/PublicInbox/ExtSearchIdx.pm | 102 ++++++++++++--------------------
 1 file changed, 38 insertions(+), 64 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 64cd8641585d..c0fd282358f9 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -129,32 +129,46 @@ sub apply_boost ($$) {
 	$req->{self}->{oidx}->add_overview($req->{eml}, $new_smsg);
 }
 
+sub _unref_doc ($$$$$;$) {
+	my ($sync, $docid, $ibx, $xnum, $oidbin, $eml) = @_;
+	my $s = 'DELETE FROM xref3 WHERE ibx_id = ? AND oidbin = ?';
+	$s .= ' AND xnum = ?' if defined($xnum);
+	my $del = $sync->{self}->{oidx}->dbh->prepare_cached($s);
+	$del->bind_param(1, $ibx->{-ibx_id});
+	$del->bind_param(2, $oidbin, SQL_BLOB);
+	$del->bind_param(3, $xnum) if defined($xnum);
+	$del->execute;
+	my $xr3 = $sync->{self}->{oidx}->get_xref3($docid, 1);
+	my $idx = $sync->{self}->idx_shard($docid);
+	if (scalar(@$xr3) == 0) { # all gone
+		$sync->{self}->{oidx}->delete_by_num($docid);
+		$sync->{self}->{oidx}->eidxq_del($docid);
+		$idx->ipc_do('xdb_remove', $docid);
+	} else { # enqueue for reindex of remaining messages
+		my $ekey = $ibx->{-gc_eidx_key} // $ibx->eidx_key;
+		$idx->ipc_do('remove_eidx_info', $docid, $ekey, $eml);
+		$sync->{self}->{oidx}->eidxq_add($docid); # yes, add
+	}
+	@$xr3
+}
+
 sub do_xpost ($$) {
 	my ($req, $smsg) = @_;
 	my $self = $req->{self};
 	my $docid = $smsg->{num};
-	my $idx = $self->idx_shard($docid);
 	my $oid = $req->{oid};
 	my $xibx = $req->{ibx};
 	my $eml = $req->{eml};
-	my $eidx_key = $xibx->eidx_key;
 	if (my $new_smsg = $req->{new_smsg}) { # 'm' on cross-posted message
+		my $eidx_key = $xibx->eidx_key;
 		my $xnum = $req->{xnum};
 		$self->{oidx}->add_xref3($docid, $xnum, $oid, $eidx_key);
+		my $idx = $self->idx_shard($docid);
 		$idx->ipc_do('add_eidx_info', $docid, $eidx_key, $eml);
 		apply_boost($req, $smsg) if $req->{boost_in_use};
-	} else { # 'd'
-		my $rm_eidx_info;
-		my $nr = $self->{oidx}->remove_xref3($docid, $oid, $eidx_key,
-							\$rm_eidx_info);
-		if ($nr == 0) {
-			$self->{oidx}->eidxq_del($docid);
-			$idx->ipc_do('xdb_remove', $docid);
-		} elsif ($rm_eidx_info) {
-			$idx->ipc_do('remove_eidx_info',
-					$docid, $eidx_key, $eml);
-			$self->{oidx}->eidxq_add($docid); # yes, add
-		}
+	} else { # 'd' no {xnum}
+		$oid = pack('H*', $oid);
+		_unref_doc($req, $docid, $xibx, undef, $oid, $eml);
 	}
 }
 
@@ -345,36 +359,12 @@ sub _sync_inbox ($$$) {
 	undef;
 }
 
-sub gc_unref_doc ($$$$) {
-	my ($self, $ibx_id, $eidx_key, $docid) = @_;
-	my $remain = 0;
-	# for debug/info purposes, oids may no longer be accessible
-	my $dbh = $self->{oidx}->dbh;
-	my $sth = $dbh->prepare_cached(<<'', undef, 1);
-SELECT oidbin FROM xref3 WHERE docid = ? AND ibx_id = ?
-
-	$sth->execute($docid, $ibx_id);
-	my @oid = map { unpack('H*', $_->[0]) } @{$sth->fetchall_arrayref};
-	for my $oid (@oid) {
-		$remain += $self->{oidx}->remove_xref3($docid, $oid, $eidx_key);
-	}
-	if ($remain) {
-		$self->{oidx}->eidxq_add($docid); # enqueue for reindex
-		for my $oid (@oid) {
-			warn "I: unref #$docid $eidx_key $oid\n";
-		}
-	} else {
-		warn "I: remove #$docid $eidx_key @oid\n";
-		$self->idx_shard($docid)->ipc_do('xdb_remove', $docid);
-	}
-}
-
 sub eidx_gc_scan_inboxes ($$) {
 	my ($self, $sync) = @_;
 	my ($x3_doc, $ibx_ck);
 restart:
 	$x3_doc = $self->{oidx}->dbh->prepare(<<EOM);
-SELECT docid FROM xref3 WHERE ibx_id = ?
+SELECT docid,xnum,oidbin FROM xref3 WHERE ibx_id = ?
 EOM
 	$ibx_ck = $self->{oidx}->dbh->prepare(<<EOM);
 SELECT ibx_id,eidx_key FROM inboxes
@@ -385,8 +375,12 @@ EOM
 		$self->{midx}->remove_eidx_key($eidx_key);
 		warn "I: deleting messages for $eidx_key...\n";
 		$x3_doc->execute($ibx_id);
-		while (defined(my $docid = $x3_doc->fetchrow_array)) {
-			gc_unref_doc($self, $ibx_id, $eidx_key, $docid);
+		my $ibx = { -ibx_id => $ibx_id, -gc_eidx_key => $eidx_key };
+		while (my ($docid, $xnum, $oid) = $x3_doc->fetchrow_array) {
+			my $r = _unref_doc($sync, $docid, $ibx, $xnum, $oid);
+			$oid = unpack('H*', $oid);
+			$r = $r ? 'unref' : 'remove';
+			warn "I: $r #$docid $eidx_key $oid\n";
 			if (checkpoint_due($sync)) {
 				$x3_doc = $ibx_ck = undef;
 				reindex_checkpoint($self, $sync);
@@ -470,6 +464,7 @@ sub eidx_gc {
 		next_check => now() + 10,
 		checkpoint_unlocks => 1,
 		-opt => $opt,
+		self => $self,
 	};
 	$self->idx_init($opt); # acquire lock via V2Writable::_idx_init
 	eidx_gc_scan_inboxes($self, $sync);
@@ -807,27 +802,6 @@ sub reindex_unseen ($$$$) {
 	$self->git->cat_async($xsmsg->{blob}, \&_reindex_unseen, $req);
 }
 
-sub _unref_stale ($$$$$) {
-	my ($sync, $docid, $ibx, $xnum, $oidbin) = @_;
-	my $del = $sync->{self}->{oidx}->dbh->prepare_cached(<<'');
-DELETE FROM xref3 WHERE ibx_id = ? AND xnum = ? AND oidbin = ?
-
-	$del->bind_param(1, $ibx->{-ibx_id});
-	$del->bind_param(2, $xnum);
-	$del->bind_param(3, $oidbin, SQL_BLOB);
-	$del->execute;
-	my $xr3 = $sync->{self}->{oidx}->get_xref3($docid, 1);
-	my $idx = $sync->{self}->idx_shard($docid);
-	if (scalar(@$xr3) == 0) { # all gone
-		$sync->{self}->{oidx}->delete_by_num($docid);
-		$sync->{self}->{oidx}->eidxq_del($docid);
-		$idx->ipc_do('xdb_remove', $docid);
-	} else { # enqueue for reindex of remaining messages
-		$idx->ipc_do('remove_eidx_info', $docid, $ibx->eidx_key);
-		$sync->{self}->{oidx}->eidxq_add($docid); # yes, add
-	}
-}
-
 sub _unref_stale_range ($$$) {
 	my ($sync, $ibx, $lt_or_gt) = @_;
 	my $r;
@@ -843,7 +817,7 @@ EOS
 			my ($docid, $xnum, $oidbin) = @$_;
 			my $hex = unpack('H*', $oidbin);
 			warn("# $xnum:$hex (#$docid): stale\n");
-			_unref_stale($sync, $docid, $ibx, $xnum, $oidbin);
+			_unref_doc($sync, $docid, $ibx, $xnum, $oidbin);
 		}
 	} while (scalar(@$r) == $lim);
 	1;
@@ -913,7 +887,7 @@ ibx_id = ? AND xnum >= ? AND xnum <= ?
 			my $m = defined($exp) ? "mismatch (!= $exp)" : 'stale';
 			warn("# $xnum:$hex (#@$docids): $m\n");
 			for my $i (@$docids) {
-				_unref_stale($sync, $i, $ibx, $xnum, $bin);
+				_unref_doc($sync, $i, $ibx, $xnum, $bin);
 			}
 		}
 	}

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 5/6] extindex: more consistent doc removal
  2021-10-11  8:06 [PATCH 0/6] extindex: --reindex --fast gets faster Eric Wong
                   ` (3 preceding siblings ...)
  2021-10-11  8:06 ` [PATCH 4/6] extindex: share unref logic in more places Eric Wong
@ 2021-10-11  8:06 ` Eric Wong
  2021-10-11  8:06 ` [PATCH 6/6] extindex: avoid invalid blobs after unref Eric Wong
  5 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2021-10-11  8:06 UTC (permalink / raw)
  To: meta

We need to ensure a message is consistently removed from eidxq,
over and Xapian in all cases.  Removing from eidxq saves users
from some noisy error messages.
---
 lib/PublicInbox/ExtSearchIdx.pm | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index c0fd282358f9..ce9cea25da5e 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -129,6 +129,13 @@ sub apply_boost ($$) {
 	$req->{self}->{oidx}->add_overview($req->{eml}, $new_smsg);
 }
 
+sub remove_doc ($$) {
+	my ($self, $docid) = @_;
+	$self->{oidx}->delete_by_num($docid);
+	$self->{oidx}->eidxq_del($docid);
+	$self->idx_shard($docid)->ipc_do('xdb_remove', $docid);
+}
+
 sub _unref_doc ($$$$$;$) {
 	my ($sync, $docid, $ibx, $xnum, $oidbin, $eml) = @_;
 	my $s = 'DELETE FROM xref3 WHERE ibx_id = ? AND oidbin = ?';
@@ -139,13 +146,11 @@ sub _unref_doc ($$$$$;$) {
 	$del->bind_param(3, $xnum) if defined($xnum);
 	$del->execute;
 	my $xr3 = $sync->{self}->{oidx}->get_xref3($docid, 1);
-	my $idx = $sync->{self}->idx_shard($docid);
 	if (scalar(@$xr3) == 0) { # all gone
-		$sync->{self}->{oidx}->delete_by_num($docid);
-		$sync->{self}->{oidx}->eidxq_del($docid);
-		$idx->ipc_do('xdb_remove', $docid);
+		remove_doc($sync->{self}, $docid);
 	} else { # enqueue for reindex of remaining messages
 		my $ekey = $ibx->{-gc_eidx_key} // $ibx->eidx_key;
+		my $idx = $sync->{self}->idx_shard($docid);
 		$idx->ipc_do('remove_eidx_info', $docid, $ekey, $eml);
 		$sync->{self}->{oidx}->eidxq_add($docid); # yes, add
 	}
@@ -246,7 +251,7 @@ E: #$smsg->{num} gone ($smsg->{blob} => $oidhex)
 EOM
 	} else {
 		warn "E: $smsg->{blob} gone, removing #$smsg->{num}\n";
-		$self->{oidx}->delete_by_num($smsg->{num});
+		remove_doc($self, $smsg->{num});
 	}
 }
 
@@ -424,6 +429,12 @@ DELETE FROM over WHERE num > 0 AND num NOT IN (SELECT docid FROM xref3)
 	warn "I: eliminated $nr stale over entries\n" if $nr != 0;
 	reindex_checkpoint($self, $sync) if checkpoint_due($sync);
 
+	$nr = $self->{oidx}->dbh->do(<<'');
+DELETE FROM eidxq WHERE docid NOT IN (SELECT num FROM over)
+
+	warn "I: eliminated $nr stale reindex queue entries\n" if $nr != 0;
+	reindex_checkpoint($self, $sync) if checkpoint_due($sync);
+
 	my ($cur) = $self->{oidx}->dbh->selectrow_array(<<EOM);
 SELECT MIN(num) FROM over WHERE num > 0
 EOM
@@ -571,12 +582,13 @@ sub _reindex_oid { # git->cat_async callback
 		my $remain = $self->{oidx}->remove_xref3($docid, $expect_oid);
 		if ($remain == 0) {
 			warn "W: #$docid gone or corrupted\n";
-			$self->idx_shard($docid)->ipc_do('xdb_remove', $docid);
+			remove_doc($self, $docid);
 		} elsif (my $next_oid = $req->{xr3r}->[++$req->{ix}]->[2]) {
+			# n.b. we can't remove_eidx_info here
 			$self->git->cat_async($next_oid, \&_reindex_oid, $req);
 		} else {
 			warn "BUG: #$docid gone (UNEXPECTED)\n";
-			$self->idx_shard($docid)->ipc_do('xdb_remove', $docid);
+			remove_doc($self, $docid);
 		}
 		return;
 	}
@@ -609,8 +621,7 @@ sub _reindex_smsg ($$$) {
 		warn <<"";
 BUG? #$docid $smsg->{blob} is not referenced by inboxes during reindex
 
-		$self->{oidx}->delete_by_num($docid);
-		$self->idx_shard($docid)->ipc_do('xdb_remove', $docid);
+		remove_doc($self, $docid);
 		return;
 	}
 
@@ -957,8 +968,7 @@ sub dd_smsg { # git->cat_async callback
 		for my $smsg (@$ary) {
 			my $gone = $smsg->{num};
 			$oidx->merge_xref3($keep->{num}, $gone, $smsg->{blob});
-			$self->idx_shard($gone)->ipc_do('xdb_remove', $gone);
-			$oidx->delete_by_num($gone);
+			remove_doc($self, $gone);
 		}
 	}
 }

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 6/6] extindex: avoid invalid blobs after unref
  2021-10-11  8:06 [PATCH 0/6] extindex: --reindex --fast gets faster Eric Wong
                   ` (4 preceding siblings ...)
  2021-10-11  8:06 ` [PATCH 5/6] extindex: more consistent doc removal Eric Wong
@ 2021-10-11  8:06 ` Eric Wong
  5 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2021-10-11  8:06 UTC (permalink / raw)
  To: meta

When unref-ing a blob from xref3, make sure the "preferred"
smsg->{blob} doesn't point to the blob we just unrefed.  This
is necessary because we periodically checkpoint our extindex
process to allow -watch and -mda processes to run.

This also gets rid of a lot of redundant code for ->remove_xref3,
since it's all handled in ExtSearchIdx, now.
---
 lib/PublicInbox/ExtSearchIdx.pm | 70 ++++++++++++++++++---------------
 lib/PublicInbox/OverIdx.pm      | 44 ---------------------
 t/extsearch.t                   |  2 +-
 t/over.t                        |  5 ++-
 4 files changed, 43 insertions(+), 78 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index ce9cea25da5e..c2ab0447e176 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -138,21 +138,42 @@ sub remove_doc ($$) {
 
 sub _unref_doc ($$$$$;$) {
 	my ($sync, $docid, $ibx, $xnum, $oidbin, $eml) = @_;
-	my $s = 'DELETE FROM xref3 WHERE ibx_id = ? AND oidbin = ?';
+	my $smsg;
+	if (ref($docid)) {
+		$smsg = $docid;
+		$docid = $smsg->{num};
+	}
+	my $s = 'DELETE FROM xref3 WHERE oidbin = ?';
+	$s .= ' AND ibx_id = ?' if defined($ibx);
 	$s .= ' AND xnum = ?' if defined($xnum);
 	my $del = $sync->{self}->{oidx}->dbh->prepare_cached($s);
-	$del->bind_param(1, $ibx->{-ibx_id});
-	$del->bind_param(2, $oidbin, SQL_BLOB);
-	$del->bind_param(3, $xnum) if defined($xnum);
+	my $col = 0;
+	$del->bind_param(++$col, $oidbin, SQL_BLOB);
+	$del->bind_param(++$col, $ibx->{-ibx_id}) if $ibx;
+	$del->bind_param(++$col, $xnum) if defined($xnum);
 	$del->execute;
-	my $xr3 = $sync->{self}->{oidx}->get_xref3($docid, 1);
+	my $xr3 = $sync->{self}->{oidx}->get_xref3($docid);
 	if (scalar(@$xr3) == 0) { # all gone
 		remove_doc($sync->{self}, $docid);
 	} else { # enqueue for reindex of remaining messages
-		my $ekey = $ibx->{-gc_eidx_key} // $ibx->eidx_key;
-		my $idx = $sync->{self}->idx_shard($docid);
-		$idx->ipc_do('remove_eidx_info', $docid, $ekey, $eml);
-		$sync->{self}->{oidx}->eidxq_add($docid); # yes, add
+		if ($ibx) {
+			my $ekey = $ibx->{-gc_eidx_key} // $ibx->eidx_key;
+			my $idx = $sync->{self}->idx_shard($docid);
+			$idx->ipc_do('remove_eidx_info', $docid, $ekey, $eml);
+		} # else: we can't remove_eidx_info in reindex-only path
+
+		# replace invalidated blob ASAP with something which should be
+		# readable since we may commit the transaction on checkpoint.
+		# eidxq processing will re-apply boost
+		$smsg //= $sync->{self}->{oidx}->get_art($docid);
+		my $hex = unpack('H*', $oidbin);
+		if ($smsg && $smsg->{blob} eq $hex) {
+			$xr3->[0] =~ /:([a-f0-9]{40,}+)\z/ or
+				die "BUG: xref $xr3->[0] has no OID";
+			$sync->{self}->{oidx}->update_blob($smsg, $1);
+		}
+		# yes, add, we'll need to re-apply boost
+		$sync->{self}->{oidx}->eidxq_add($docid);
 	}
 	@$xr3
 }
@@ -235,24 +256,12 @@ sub do_step ($) { # main iterator for adding messages to the index
 	do_finalize($req);
 }
 
-sub _blob_missing ($$) { # called when $smsg->{blob} is bad
+sub _blob_missing ($$) { # called when a known $smsg->{blob} is gone
 	my ($req, $smsg) = @_;
-	my $self = $req->{self};
-	my $xref3 = $self->{oidx}->get_xref3($smsg->{num});
-	my @keep = grep(!/:$smsg->{blob}\z/, @$xref3);
-	if (@keep) {
-		warn "E: $smsg->{blob} gone, removing #$smsg->{num}\n";
-		$keep[0] =~ /:([a-f0-9]{40,}+)\z/ or
-			die "BUG: xref $keep[0] has no OID";
-		my $oidhex = $1;
-		$self->{oidx}->remove_xref3($smsg->{num}, $smsg->{blob});
-		$self->{oidx}->update_blob($smsg, $oidhex) or warn <<EOM;
-E: #$smsg->{num} gone ($smsg->{blob} => $oidhex)
-EOM
-	} else {
-		warn "E: $smsg->{blob} gone, removing #$smsg->{num}\n";
-		remove_doc($self, $smsg->{num});
-	}
+	# xnum and ibx are unknown, we only call this when an entry from
+	# /ei*/over.sqlite3 is bad, not on entries from xap*/over.sqlite3
+	my $oidbin = pack('H*', $smsg->{blob});
+	_unref_doc($req, $smsg, undef, undef, $oidbin);
 }
 
 sub ck_existing { # git->cat_async callback
@@ -548,7 +557,8 @@ sub _reindex_finalize ($$$) {
 	for my $ary (values %$by_chash) {
 		for my $x (reverse @$ary) {
 			warn "removing #$docid xref3 $x->{blob}\n";
-			my $n = $self->{oidx}->remove_xref3($docid, $x->{blob});
+			my $bin = pack('H*', $x->{blob});
+			my $n = _unref_doc($sync, $docid, undef, undef, $bin);
 			die "BUG: $x->{blob} invalidated #$docid" if $n == 0;
 		}
 		my $x = pop(@$ary) // die "BUG: #$docid {by_chash} empty";
@@ -579,16 +589,14 @@ sub _reindex_oid { # git->cat_async callback
 	my $expect_oid = $req->{xr3r}->[$req->{ix}]->[2];
 	my $docid = $orig_smsg->{num};
 	if (is_bad_blob($oid, $type, $size, $expect_oid)) {
-		my $remain = $self->{oidx}->remove_xref3($docid, $expect_oid);
+		my $oidbin = pack('H*', $expect_oid);
+		my $remain = _unref_doc($sync, $docid, undef, undef, $oidbin);
 		if ($remain == 0) {
 			warn "W: #$docid gone or corrupted\n";
-			remove_doc($self, $docid);
 		} elsif (my $next_oid = $req->{xr3r}->[++$req->{ix}]->[2]) {
-			# n.b. we can't remove_eidx_info here
 			$self->git->cat_async($next_oid, \&_reindex_oid, $req);
 		} else {
 			warn "BUG: #$docid gone (UNEXPECTED)\n";
-			remove_doc($self, $docid);
 		}
 		return;
 	}
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 46f7a066275a..d6d706f7fed0 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -606,50 +606,6 @@ INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?)
 	$sth->execute;
 }
 
-# returns remaining reference count to $docid
-sub remove_xref3 {
-	my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_;
-	begin_lazy($self);
-	my $oidbin = pack('H*', $oidhex);
-	my ($sth, $ibx_id);
-	if (defined $eidx_key) {
-		$ibx_id = ibx_id($self, $eidx_key);
-		$sth = $self->{dbh}->prepare_cached(<<'');
-DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ?
-
-		$sth->bind_param(1, $docid);
-		$sth->bind_param(2, $ibx_id);
-		$sth->bind_param(3, $oidbin, SQL_BLOB);
-	} else {
-		$sth = $self->{dbh}->prepare_cached(<<'');
-DELETE FROM xref3 WHERE docid = ? AND oidbin = ?
-
-		$sth->bind_param(1, $docid);
-		$sth->bind_param(2, $oidbin, SQL_BLOB);
-	}
-	$sth->execute;
-	$sth = $self->{dbh}->prepare_cached(<<'', undef, 1);
-SELECT COUNT(*) FROM xref3 WHERE docid = ?
-
-	$sth->execute($docid);
-	my $nr = $sth->fetchrow_array;
-	if ($nr == 0) {
-		delete_by_num($self, $docid);
-	} elsif (defined($ibx_id) && $rm_eidx_info) {
-		# if deduplication rules in ContentHash change, it's
-		# possible a docid can have multiple rows with the
-		# same ibx_id.  This governs whether or not we call
-		# ->remove_eidx_info in ExtSearchIdx.
-		$sth = $self->{dbh}->prepare_cached(<<'', undef, 1);
-SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ?
-
-		$sth->execute($docid, $ibx_id);
-		my $count = $sth->fetchrow_array;
-		$$rm_eidx_info = ($count == 0);
-	}
-	$nr;
-}
-
 # for when an xref3 goes missing, this does NOT update {ts}
 sub update_blob {
 	my ($self, $smsg, $oidhex) = @_;
diff --git a/t/extsearch.t b/t/extsearch.t
index 896e270414bd..1b6235ba0eca 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -375,7 +375,7 @@ if ('reindex catches content bifurcation') {
 	is($oidx->max, $oldmax, 'oidx->max unchanged');
 	$oidx->dbh_close;
 	ok(run_script([qw(-extindex --reindex --all), "$home/extindex"],
-		undef, $opt), 'extindex --reindex');
+		undef, $opt), 'extindex --reindex') or diag explain($opt);
 	$oidx->dbh;
 	ok($oidx->max > $oldmax, 'oidx->max bumped');
 	like($err, qr/split into 2 due to deduplication change/,
diff --git a/t/over.t b/t/over.t
index a92d2f77b039..1f2df7cf065a 100644
--- a/t/over.t
+++ b/t/over.t
@@ -90,8 +90,9 @@ $over->eidx_prep;
 			'example.kee:2018:deadbeefcafe' ],
 			'xref3 works forw two');
 
-	@arg = qw(1349 adeadba7cafe example.key);
-	is($over->remove_xref3(@arg), 1, 'remove first');
+	is($over->dbh->do(<<''), 1, 'remove first');
+DELETE FROM xref3 WHERE xnum = 2019 AND docid = 1349
+
 	$xref3 = $over->get_xref3(1349);
 	is_deeply($xref3, [ 'example.kee:2018:deadbeefcafe' ],
 		'confirm removal successful');

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-10-11  8:06 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-11  8:06 [PATCH 0/6] extindex: --reindex --fast gets faster Eric Wong
2021-10-11  8:06 ` [PATCH 1/6] extindex: speed up --reindex --fast Eric Wong
2021-10-11  8:06 ` [PATCH 2/6] sqlite: PRAGMA optimize on close Eric Wong
2021-10-11  8:06 ` [PATCH 3/6] extindex: rename var: active => active_shards Eric Wong
2021-10-11  8:06 ` [PATCH 4/6] extindex: share unref logic in more places Eric Wong
2021-10-11  8:06 ` [PATCH 5/6] extindex: more consistent doc removal Eric Wong
2021-10-11  8:06 ` [PATCH 6/6] extindex: avoid invalid blobs after unref Eric Wong

Code repositories for project(s) associated with this inbox:

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).