From 50ac81092ba1034f3055ddabb3d7cc7853edfa41 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 28 Nov 2020 08:45:21 +0000 Subject: extindex: fix delete (`d') handling We need to completely remove a message from over.sqlite3 and Xapian when no references remain, otherwise users will still see the removed messages in NNTP overviews and WWW search results/summaries. References to messages are now solely handled by the `xref3' table of over.sqlite3. We can also trust `xref3' when deciding whether to remove only the "O$eidx_key" and "G$lid" terms from a document in Xapian or to remove the entire Xapian document. --- lib/PublicInbox/ExtSearchIdx.pm | 13 ++++++++++--- lib/PublicInbox/OverIdx.pm | 27 ++++++++++++++++++++++++--- t/extsearch.t | 20 ++++++++++++++++++++ t/over.t | 2 +- 4 files changed, 55 insertions(+), 7 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index cf90c562..d780776f 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -128,14 +128,21 @@ sub do_xpost ($$) { my $oid = $req->{oid}; my $xibx = $req->{ibx}; my $eml = $req->{eml}; + my $eidx_key = $xibx->eidx_key; if (my $new_smsg = $req->{new_smsg}) { # 'm' on cross-posted message my $xnum = $req->{xnum}; - $self->{oidx}->add_xref3($docid, $xnum, $oid, $xibx->eidx_key); + $self->{oidx}->add_xref3($docid, $xnum, $oid, $eidx_key); $idx->shard_add_eidx_info($docid, $oid, $xibx, $eml); check_batch_limit($req); } else { # 'd' - $self->{oidx}->remove_xref3($docid, $oid, $xibx->eidx_key); - $idx->shard_remove_eidx_info($docid, $oid, $xibx, $eml); + my $rm_eidx_info; + my $nr = $self->{oidx}->remove_xref3($docid, $oid, $eidx_key, + \$rm_eidx_info); + if ($nr == 0) { + $idx->shard_remove($oid, $docid); + } elsif ($rm_eidx_info) { + $idx->shard_remove_eidx_info($docid, $oid, $xibx, $eml); + } } } diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 8bec08da..07cca4e5 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -595,13 +595,14 @@ INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?) $sth->execute; } +# returns remaining reference count to $docid sub remove_xref3 { - my ($self, $docid, $oidhex, $eidx_key) = @_; + my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_; begin_lazy($self); my $oidbin = pack('H*', $oidhex); - my $sth; + my ($sth, $ibx_id); if (defined $eidx_key) { - my $ibx_id = id_for($self, 'inboxes', 'ibx_id', + $ibx_id = id_for($self, 'inboxes', 'ibx_id', eidx_key => $eidx_key); $sth = $self->{dbh}->prepare_cached(<<''); DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ? @@ -617,6 +618,26 @@ DELETE FROM xref3 WHERE docid = ? AND oidbin = ? $sth->bind_param(2, $oidbin, SQL_BLOB); } $sth->execute; + $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE docid = ? + + $sth->execute($docid); + my $nr = $sth->fetchrow_array; + if ($nr == 0) { + delete_by_num($self, $docid); + } elsif (defined($ibx_id) && $rm_eidx_info) { + # if deduplication rules in ContentHash change, it's + # possible a docid can have multiple rows with the + # same ibx_id. This governs whether or not we call + # ->shard_remove_eidx_info in ExtSearchIdx. + $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ? + + $sth->execute($docid, $ibx_id); + my $count = $sth->fetchrow_array; + $$rm_eidx_info = ($count == 0); + } + $nr; } # for when an xref3 goes missing, this does NOT update {ts} diff --git a/t/extsearch.t b/t/extsearch.t index f9f74e5c..f5855558 100644 --- a/t/extsearch.t +++ b/t/extsearch.t @@ -118,6 +118,26 @@ my $es = PublicInbox::ExtSearch->new("$home/extindex"); is(scalar(@$x1), 1, 'original only has one xref3'); is(scalar(@$x2), 1, 'new message has one xref3'); isnt($x1->[0], $x2->[0], 'xref3 differs'); + + my $mset = $es->mset('b:"BEST MSG"'); + is($mset->size, 1, 'new message found'); + $mset = $es->mset('b:"test message"'); + is($mset->size, 1, 'old message found'); + + delete @$es{qw(git over xdb)}; # fork preparation + + open my $rmfh, '+>', undef or BAIL_OUT $!; + $rmfh->autoflush(1); + print $rmfh $eml2->as_string or BAIL_OUT $!; + seek($rmfh, 0, SEEK_SET) or BAIL_OUT $!; + $opt->{0} = $rmfh; + ok(run_script([qw(-learn rm --all)], undef, $opt), '-learn rm'); + + ok(run_script([qw(-extindex --all), "$home/extindex"], undef, undef), + 'extindex after rm'); + is($es->over->get_art(2), undef, 'doc #2 gone'); + $mset = $es->mset('b:"BEST MSG"'); + is($mset->size, 0, 'new message gone'); } my $misc = $es->misc; diff --git a/t/over.t b/t/over.t index 56c20d01..22061249 100644 --- a/t/over.t +++ b/t/over.t @@ -91,7 +91,7 @@ $over->eidx_prep; 'xref3 works forw two'); @arg = qw(1349 adeadba7cafe example.key); - ok($over->remove_xref3(@arg), 'remove first'); + is($over->remove_xref3(@arg), 1, 'remove first'); $xref3 = $over->get_xref3(1349); is_deeply($xref3, [ 'example.kee:2018:deadbeefcafe' ], 'confirm removal successful'); -- cgit v1.2.3-24-ge0c7