From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id C81371F86C for ; Sat, 28 Nov 2020 08:45:21 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] extindex: fix delete (`d') handling Date: Sat, 28 Nov 2020 08:45:21 +0000 Message-Id: <20201128084521.12571-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We need to completely remove a message from over.sqlite3 and Xapian when no references remain, otherwise users will still see the removed messages in NNTP overviews and WWW search results/summaries. References to messages are now solely handled by the `xref3' table of over.sqlite3. We can also trust `xref3' when deciding whether to remove only the "O$eidx_key" and "G$lid" terms from a document in Xapian or to remove the entire Xapian document. --- Welcome to episode #967 of "Deletes Are Hard"... lib/PublicInbox/ExtSearchIdx.pm | 13 ++++++++++--- lib/PublicInbox/OverIdx.pm | 27 ++++++++++++++++++++++++--- t/extsearch.t | 20 ++++++++++++++++++++ t/over.t | 2 +- 4 files changed, 55 insertions(+), 7 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index cf90c562..d780776f 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -128,14 +128,21 @@ sub do_xpost ($$) { my $oid = $req->{oid}; my $xibx = $req->{ibx}; my $eml = $req->{eml}; + my $eidx_key = $xibx->eidx_key; if (my $new_smsg = $req->{new_smsg}) { # 'm' on cross-posted message my $xnum = $req->{xnum}; - $self->{oidx}->add_xref3($docid, $xnum, $oid, $xibx->eidx_key); + $self->{oidx}->add_xref3($docid, $xnum, $oid, $eidx_key); $idx->shard_add_eidx_info($docid, $oid, $xibx, $eml); check_batch_limit($req); } else { # 'd' - $self->{oidx}->remove_xref3($docid, $oid, $xibx->eidx_key); - $idx->shard_remove_eidx_info($docid, $oid, $xibx, $eml); + my $rm_eidx_info; + my $nr = $self->{oidx}->remove_xref3($docid, $oid, $eidx_key, + \$rm_eidx_info); + if ($nr == 0) { + $idx->shard_remove($oid, $docid); + } elsif ($rm_eidx_info) { + $idx->shard_remove_eidx_info($docid, $oid, $xibx, $eml); + } } } diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 8bec08da..07cca4e5 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -595,13 +595,14 @@ INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?) $sth->execute; } +# returns remaining reference count to $docid sub remove_xref3 { - my ($self, $docid, $oidhex, $eidx_key) = @_; + my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_; begin_lazy($self); my $oidbin = pack('H*', $oidhex); - my $sth; + my ($sth, $ibx_id); if (defined $eidx_key) { - my $ibx_id = id_for($self, 'inboxes', 'ibx_id', + $ibx_id = id_for($self, 'inboxes', 'ibx_id', eidx_key => $eidx_key); $sth = $self->{dbh}->prepare_cached(<<''); DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ? @@ -617,6 +618,26 @@ DELETE FROM xref3 WHERE docid = ? AND oidbin = ? $sth->bind_param(2, $oidbin, SQL_BLOB); } $sth->execute; + $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE docid = ? + + $sth->execute($docid); + my $nr = $sth->fetchrow_array; + if ($nr == 0) { + delete_by_num($self, $docid); + } elsif (defined($ibx_id) && $rm_eidx_info) { + # if deduplication rules in ContentHash change, it's + # possible a docid can have multiple rows with the + # same ibx_id. This governs whether or not we call + # ->shard_remove_eidx_info in ExtSearchIdx. + $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ? + + $sth->execute($docid, $ibx_id); + my $count = $sth->fetchrow_array; + $$rm_eidx_info = ($count == 0); + } + $nr; } # for when an xref3 goes missing, this does NOT update {ts} diff --git a/t/extsearch.t b/t/extsearch.t index f9f74e5c..f5855558 100644 --- a/t/extsearch.t +++ b/t/extsearch.t @@ -118,6 +118,26 @@ my $es = PublicInbox::ExtSearch->new("$home/extindex"); is(scalar(@$x1), 1, 'original only has one xref3'); is(scalar(@$x2), 1, 'new message has one xref3'); isnt($x1->[0], $x2->[0], 'xref3 differs'); + + my $mset = $es->mset('b:"BEST MSG"'); + is($mset->size, 1, 'new message found'); + $mset = $es->mset('b:"test message"'); + is($mset->size, 1, 'old message found'); + + delete @$es{qw(git over xdb)}; # fork preparation + + open my $rmfh, '+>', undef or BAIL_OUT $!; + $rmfh->autoflush(1); + print $rmfh $eml2->as_string or BAIL_OUT $!; + seek($rmfh, 0, SEEK_SET) or BAIL_OUT $!; + $opt->{0} = $rmfh; + ok(run_script([qw(-learn rm --all)], undef, $opt), '-learn rm'); + + ok(run_script([qw(-extindex --all), "$home/extindex"], undef, undef), + 'extindex after rm'); + is($es->over->get_art(2), undef, 'doc #2 gone'); + $mset = $es->mset('b:"BEST MSG"'); + is($mset->size, 0, 'new message gone'); } my $misc = $es->misc; diff --git a/t/over.t b/t/over.t index 56c20d01..22061249 100644 --- a/t/over.t +++ b/t/over.t @@ -91,7 +91,7 @@ $over->eidx_prep; 'xref3 works forw two'); @arg = qw(1349 adeadba7cafe example.key); - ok($over->remove_xref3(@arg), 'remove first'); + is($over->remove_xref3(@arg), 1, 'remove first'); $xref3 = $over->get_xref3(1349); is_deeply($xref3, [ 'example.kee:2018:deadbeefcafe' ], 'confirm removal successful');