user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
 Warning: Initial query:
 %22extindex: fix delete (%60d%27) handling%22
 returned no results, used:
 "extindex: fix delete (`d') handling"
 instead

Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH] extindex: fix delete (`d') handling
@ 2020-11-28  8:45  7% Eric Wong
  0 siblings, 0 replies; 1+ results
From: Eric Wong @ 2020-11-28  8:45 UTC (permalink / raw)
  To: meta

We need to completely remove a message from over.sqlite3 and
Xapian when no references remain; otherwise, users will still see
the removed messages in NNTP overviews and WWW search
results/summaries.

References to messages are now solely handled by the `xref3'
table of over.sqlite3.  We can also trust `xref3' when deciding
whether to remove only the "O$eidx_key" and "G$lid" terms from a
document in Xapian or to remove the entire Xapian document.
---
  Welcome to episode #967 of "Deletes Are Hard"...

 lib/PublicInbox/ExtSearchIdx.pm | 13 ++++++++++---
 lib/PublicInbox/OverIdx.pm      | 27 ++++++++++++++++++++++++---
 t/extsearch.t                   | 20 ++++++++++++++++++++
 t/over.t                        |  2 +-
 4 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index cf90c562..d780776f 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -128,14 +128,21 @@ sub do_xpost ($$) {
 	my $oid = $req->{oid};
 	my $xibx = $req->{ibx};
 	my $eml = $req->{eml};
+	my $eidx_key = $xibx->eidx_key;
 	if (my $new_smsg = $req->{new_smsg}) { # 'm' on cross-posted message
 		my $xnum = $req->{xnum};
-		$self->{oidx}->add_xref3($docid, $xnum, $oid, $xibx->eidx_key);
+		$self->{oidx}->add_xref3($docid, $xnum, $oid, $eidx_key);
 		$idx->shard_add_eidx_info($docid, $oid, $xibx, $eml);
 		check_batch_limit($req);
 	} else { # 'd'
-		$self->{oidx}->remove_xref3($docid, $oid, $xibx->eidx_key);
-		$idx->shard_remove_eidx_info($docid, $oid, $xibx, $eml);
+		my $rm_eidx_info;
+		my $nr = $self->{oidx}->remove_xref3($docid, $oid, $eidx_key,
+							\$rm_eidx_info);
+		if ($nr == 0) {
+			$idx->shard_remove($oid, $docid);
+		} elsif ($rm_eidx_info) {
+			$idx->shard_remove_eidx_info($docid, $oid, $xibx, $eml);
+		}
 	}
 }
 
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 8bec08da..07cca4e5 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -595,13 +595,14 @@ INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?)
 	$sth->execute;
 }
 
+# returns remaining reference count to $docid
 sub remove_xref3 {
-	my ($self, $docid, $oidhex, $eidx_key) = @_;
+	my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_;
 	begin_lazy($self);
 	my $oidbin = pack('H*', $oidhex);
-	my $sth;
+	my ($sth, $ibx_id);
 	if (defined $eidx_key) {
-		my $ibx_id = id_for($self, 'inboxes', 'ibx_id',
+		$ibx_id = id_for($self, 'inboxes', 'ibx_id',
 					eidx_key => $eidx_key);
 		$sth = $self->{dbh}->prepare_cached(<<'');
 DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ?
@@ -617,6 +618,26 @@ DELETE FROM xref3 WHERE docid = ? AND oidbin = ?
 		$sth->bind_param(2, $oidbin, SQL_BLOB);
 	}
 	$sth->execute;
+	$sth = $self->{dbh}->prepare_cached(<<'', undef, 1);
+SELECT COUNT(*) FROM xref3 WHERE docid = ?
+
+	$sth->execute($docid);
+	my $nr = $sth->fetchrow_array;
+	if ($nr == 0) {
+		delete_by_num($self, $docid);
+	} elsif (defined($ibx_id) && $rm_eidx_info) {
+		# if deduplication rules in ContentHash change, it's
+		# possible a docid can have multiple rows with the
+		# same ibx_id.  This governs whether or not we call
+		# ->shard_remove_eidx_info in ExtSearchIdx.
+		$sth = $self->{dbh}->prepare_cached(<<'', undef, 1);
+SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ?
+
+		$sth->execute($docid, $ibx_id);
+		my $count = $sth->fetchrow_array;
+		$$rm_eidx_info = ($count == 0);
+	}
+	$nr;
 }
 
 # for when an xref3 goes missing, this does NOT update {ts}
diff --git a/t/extsearch.t b/t/extsearch.t
index f9f74e5c..f5855558 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -118,6 +118,26 @@ my $es = PublicInbox::ExtSearch->new("$home/extindex");
 	is(scalar(@$x1), 1, 'original only has one xref3');
 	is(scalar(@$x2), 1, 'new message has one xref3');
 	isnt($x1->[0], $x2->[0], 'xref3 differs');
+
+	my $mset = $es->mset('b:"BEST MSG"');
+	is($mset->size, 1, 'new message found');
+	$mset = $es->mset('b:"test message"');
+	is($mset->size, 1, 'old message found');
+
+	delete @$es{qw(git over xdb)}; # fork preparation
+
+	open my $rmfh, '+>', undef or BAIL_OUT $!;
+	$rmfh->autoflush(1);
+	print $rmfh $eml2->as_string or BAIL_OUT $!;
+	seek($rmfh, 0, SEEK_SET) or BAIL_OUT $!;
+	$opt->{0} = $rmfh;
+	ok(run_script([qw(-learn rm --all)], undef, $opt), '-learn rm');
+
+	ok(run_script([qw(-extindex --all), "$home/extindex"], undef, undef),
+		'extindex after rm');
+	is($es->over->get_art(2), undef, 'doc #2 gone');
+	$mset = $es->mset('b:"BEST MSG"');
+	is($mset->size, 0, 'new message gone');
 }
 
 my $misc = $es->misc;
diff --git a/t/over.t b/t/over.t
index 56c20d01..22061249 100644
--- a/t/over.t
+++ b/t/over.t
@@ -91,7 +91,7 @@ $over->eidx_prep;
 			'xref3 works forw two');
 
 	@arg = qw(1349 adeadba7cafe example.key);
-	ok($over->remove_xref3(@arg), 'remove first');
+	is($over->remove_xref3(@arg), 1, 'remove first');
 	$xref3 = $over->get_xref3(1349);
 	is_deeply($xref3, [ 'example.kee:2018:deadbeefcafe' ],
 		'confirm removal successful');

^ permalink raw reply related	[relevance 7%]

Results 1-1 of 1 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-11-28  8:45  7% [PATCH] extindex: fix delete (`d') handling Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).