* [PATCH] extindex: fix delete (`d') handling
@ 2020-11-28 8:45 7% Eric Wong
0 siblings, 0 replies; 1+ results
From: Eric Wong @ 2020-11-28 8:45 UTC (permalink / raw)
To: meta
We need to completely remove a message from over.sqlite3 and
Xapian when no references remain, otherwise users will still see
the removed messages in NNTP overviews and WWW search
results/summaries.
References to messages are now solely handled by the `xref3'
table of over.sqlite3. We can also trust `xref3' when deciding
whether to remove only the "O$eidx_key" and "G$lid" terms from a
document in Xapian or to remove the entire Xapian document.
---
Welcome to episode #967 of "Deletes Are Hard"...
lib/PublicInbox/ExtSearchIdx.pm | 13 ++++++++++---
lib/PublicInbox/OverIdx.pm | 27 ++++++++++++++++++++++++---
t/extsearch.t | 20 ++++++++++++++++++++
t/over.t | 2 +-
4 files changed, 55 insertions(+), 7 deletions(-)
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index cf90c562..d780776f 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -128,14 +128,21 @@ sub do_xpost ($$) {
my $oid = $req->{oid};
my $xibx = $req->{ibx};
my $eml = $req->{eml};
+ my $eidx_key = $xibx->eidx_key;
if (my $new_smsg = $req->{new_smsg}) { # 'm' on cross-posted message
my $xnum = $req->{xnum};
- $self->{oidx}->add_xref3($docid, $xnum, $oid, $xibx->eidx_key);
+ $self->{oidx}->add_xref3($docid, $xnum, $oid, $eidx_key);
$idx->shard_add_eidx_info($docid, $oid, $xibx, $eml);
check_batch_limit($req);
} else { # 'd'
- $self->{oidx}->remove_xref3($docid, $oid, $xibx->eidx_key);
- $idx->shard_remove_eidx_info($docid, $oid, $xibx, $eml);
+ my $rm_eidx_info;
+ my $nr = $self->{oidx}->remove_xref3($docid, $oid, $eidx_key,
+ \$rm_eidx_info);
+ if ($nr == 0) {
+ $idx->shard_remove($oid, $docid);
+ } elsif ($rm_eidx_info) {
+ $idx->shard_remove_eidx_info($docid, $oid, $xibx, $eml);
+ }
}
}
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 8bec08da..07cca4e5 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -595,13 +595,14 @@ INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?)
$sth->execute;
}
+# returns remaining reference count to $docid
sub remove_xref3 {
- my ($self, $docid, $oidhex, $eidx_key) = @_;
+ my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_;
begin_lazy($self);
my $oidbin = pack('H*', $oidhex);
- my $sth;
+ my ($sth, $ibx_id);
if (defined $eidx_key) {
- my $ibx_id = id_for($self, 'inboxes', 'ibx_id',
+ $ibx_id = id_for($self, 'inboxes', 'ibx_id',
eidx_key => $eidx_key);
$sth = $self->{dbh}->prepare_cached(<<'');
DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ?
@@ -617,6 +618,26 @@ DELETE FROM xref3 WHERE docid = ? AND oidbin = ?
$sth->bind_param(2, $oidbin, SQL_BLOB);
}
$sth->execute;
+ $sth = $self->{dbh}->prepare_cached(<<'', undef, 1);
+SELECT COUNT(*) FROM xref3 WHERE docid = ?
+
+ $sth->execute($docid);
+ my $nr = $sth->fetchrow_array;
+ if ($nr == 0) {
+ delete_by_num($self, $docid);
+ } elsif (defined($ibx_id) && $rm_eidx_info) {
+ # if deduplication rules in ContentHash change, it's
+ # possible a docid can have multiple rows with the
+ # same ibx_id. This governs whether or not we call
+ # ->shard_remove_eidx_info in ExtSearchIdx.
+ $sth = $self->{dbh}->prepare_cached(<<'', undef, 1);
+SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ?
+
+ $sth->execute($docid, $ibx_id);
+ my $count = $sth->fetchrow_array;
+ $$rm_eidx_info = ($count == 0);
+ }
+ $nr;
}
# for when an xref3 goes missing, this does NOT update {ts}
diff --git a/t/extsearch.t b/t/extsearch.t
index f9f74e5c..f5855558 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -118,6 +118,26 @@ my $es = PublicInbox::ExtSearch->new("$home/extindex");
is(scalar(@$x1), 1, 'original only has one xref3');
is(scalar(@$x2), 1, 'new message has one xref3');
isnt($x1->[0], $x2->[0], 'xref3 differs');
+
+ my $mset = $es->mset('b:"BEST MSG"');
+ is($mset->size, 1, 'new message found');
+ $mset = $es->mset('b:"test message"');
+ is($mset->size, 1, 'old message found');
+
+ delete @$es{qw(git over xdb)}; # fork preparation
+
+ open my $rmfh, '+>', undef or BAIL_OUT $!;
+ $rmfh->autoflush(1);
+ print $rmfh $eml2->as_string or BAIL_OUT $!;
+ seek($rmfh, 0, SEEK_SET) or BAIL_OUT $!;
+ $opt->{0} = $rmfh;
+ ok(run_script([qw(-learn rm --all)], undef, $opt), '-learn rm');
+
+ ok(run_script([qw(-extindex --all), "$home/extindex"], undef, undef),
+ 'extindex after rm');
+ is($es->over->get_art(2), undef, 'doc #2 gone');
+ $mset = $es->mset('b:"BEST MSG"');
+ is($mset->size, 0, 'new message gone');
}
my $misc = $es->misc;
diff --git a/t/over.t b/t/over.t
index 56c20d01..22061249 100644
--- a/t/over.t
+++ b/t/over.t
@@ -91,7 +91,7 @@ $over->eidx_prep;
'xref3 works forw two');
@arg = qw(1349 adeadba7cafe example.key);
- ok($over->remove_xref3(@arg), 'remove first');
+ is($over->remove_xref3(@arg), 1, 'remove first');
$xref3 = $over->get_xref3(1349);
is_deeply($xref3, [ 'example.kee:2018:deadbeefcafe' ],
'confirm removal successful');
^ permalink raw reply related [relevance 7%]
Results 1-1 of 1 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-11-28 8:45 7% [PATCH] extindex: fix delete (`d') handling Eric Wong
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).