From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id D12391F8ED for ; Fri, 17 Jul 2020 06:31:56 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 8/8] search: simplify unindexing Date: Fri, 17 Jul 2020 06:31:55 +0000 Message-Id: <20200717063155.3734-9-e@yhbt.net> In-Reply-To: <20200717063155.3734-1-e@yhbt.net> References: <20200717063155.3734-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Since over.sqlite3 seems here to stay, we no longer need to do Message-ID lookups against Xapian and can simply rely on the docid <=> NNTP article number equivalancy SCHEMA_VERSION=15 gave us. This rids us of the closure-using batch_do sub in the v1 code path and vastly simplifies both v1 and v2 unindexing. --- lib/PublicInbox/OverIdx.pm | 13 +-- lib/PublicInbox/SearchIdx.pm | 134 +++++++++++------------------- lib/PublicInbox/SearchIdxShard.pm | 6 +- lib/PublicInbox/V2Writable.pm | 7 +- t/search.t | 6 +- 5 files changed, 66 insertions(+), 100 deletions(-) diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 29c6e0b9..5601e602 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -326,22 +326,23 @@ INSERT INTO id2num (id, num) VALUES (?,?) } sub _remove_oid { - my ($self, $smsg, $oid, $nr) = @_; + my ($self, $smsg, $oid, $removed) = @_; if (!defined($oid) || $smsg->{blob} eq $oid) { delete_by_num($self, $smsg->{num}); - $$nr++; + push @$removed, $smsg->{num}; } 1; } -# returns number of removed messages +# returns number of removed messages in scalar context, +# array of removed article numbers in array context. # $oid may be undef to match only on $mid sub remove_oid { my ($self, $oid, $mid) = @_; - my $nr = 0; + my $removed = []; begin_lazy($self); - each_by_mid($self, $mid, ['ddd'], \&_remove_oid, $oid, \$nr); - $nr; + each_by_mid($self, $mid, ['ddd'], \&_remove_oid, $oid, $removed); + @$removed; } sub _num_mid0_for_oid { diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 9550847b..83162509 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -12,7 +12,7 @@ use v5.10.1; use parent qw(PublicInbox::Search PublicInbox::Lock); use PublicInbox::Eml; use PublicInbox::InboxWritable; -use PublicInbox::MID qw/mid_clean mid_mime mids_for_index/; +use PublicInbox::MID qw(mid_mime mids_for_index mids); use PublicInbox::MsgIter; use Carp qw(croak); use POSIX qw(strftime); @@ -413,88 +413,32 @@ sub add_message { $smsg->{num}; } -# returns begin and end PostingIterator -sub find_doc_ids { - my ($self, $termval) = @_; - my $db = $self->{xdb}; - - ($db->postlist_begin($termval), $db->postlist_end($termval)); -} - -# v1 only -sub batch_do { - my ($self, $termval, $cb) = @_; - my $batch_size = 1000; # don't let @ids grow too large to avoid OOM - while (1) { - my ($head, $tail) = $self->find_doc_ids($termval); - return if $head == $tail; - my @ids; - for (; $head != $tail && @ids < $batch_size; $head++) { - push @ids, $head->get_docid; +sub xdb_remove { + my ($self, $oid, @removed) = @_; + my $xdb = $self->{xdb} or return; + for my $num (@removed) { + my $doc = eval { $xdb->get_document($num) }; + unless ($doc) { + warn "E: $@\n" if $@; + warn "E: #$num $oid missing in Xapian\n"; + next; } - $cb->(\@ids); - } -} - -# v1 only, where $mid is unique -sub remove_message { - my ($self, $mid) = @_; - $mid = mid_clean($mid); - - if (my $over = $self->{over}) { - my $nr = eval { $over->remove_oid(undef, $mid) }; - if ($@) { - warn "failed to remove <$mid> from overview: $@\n"; - } elsif ($nr == 0) { - warn "<$mid> missing for removal from overview\n"; + my $smsg = bless {}, 'PublicInbox::Smsg'; + $smsg->load_expand($doc); + my $blob = $smsg->{blob} // '(unset)'; + if ($blob eq $oid) { + $xdb->delete_document($num); + } else { + warn "E: #$num $oid != $blob in Xapian\n"; } } - return unless need_xapian($self); - my $db = $self->{xdb}; - my $nr = 0; - eval { - batch_do($self, 'Q' . $mid, sub { - my ($ids) = @_; - $db->delete_document($_) for @$ids; - $nr += scalar @$ids; - }); - }; - if ($@) { - warn "failed to remove <$mid> from Xapian: $@\n"; - } elsif ($nr == 0) { - warn "<$mid> missing for removal from Xapian\n"; - } } -# MID is a hint in V2 sub remove_by_oid { - my ($self, $oid, $mid) = @_; - - $self->{over}->remove_oid($oid, $mid) if $self->{over}; - - return unless need_xapian($self); - my $db = $self->{xdb}; - - # XXX careful, we cannot use batch_do here since we conditionally - # delete documents based on other factors, so we cannot call - # find_doc_ids twice. - my ($head, $tail) = $self->find_doc_ids('Q' . $mid); - return if $head == $tail; - - # there is only ONE element in @delete unless we - # have bugs in our v2writable deduplication check - my @delete; - for (; $head != $tail; $head++) { - my $docid = $head->get_docid; - my $doc = $db->get_document($docid); - my $smsg = bless { mid => $mid }, 'PublicInbox::Smsg'; - $smsg->load_expand($doc); - if ($smsg->{blob} eq $oid) { - push(@delete, $docid); - } - } - $db->delete_document($_) foreach @delete; - scalar(@delete); + my ($self, $oid, $num) = @_; + die "BUG: remove_by_oid is v2-only\n" if $self->{over}; + $self->begin_txn_lazy; + xdb_remove($self, $oid, $num) if need_xapian($self); } sub index_git_blob_id { @@ -507,10 +451,29 @@ sub index_git_blob_id { } } -sub unindex_blob { - my ($self, $mime) = @_; - my $mid = eval { mid_mime($mime) }; - $self->remove_message($mid) if defined $mid; +# v1 only +sub unindex_eml { + my ($self, $oid, $eml) = @_; + my $mids = mids($eml); + my $nr = 0; + my %tmp; + for my $mid (@$mids) { + my @removed = eval { $self->{over}->remove_oid($oid, $mid) }; + if ($@) { + warn "E: failed to remove <$mid> from overview: $@\n"; + } else { + $nr += scalar @removed; + $tmp{$_}++ for @removed; + } + } + if (!$nr) { + $mids = join('> <', @$mids); + warn "W: <$mids> missing for removal from overview\n"; + } + while (my ($num, $nr) = each %tmp) { + warn "BUG: $num appears >1 times ($nr) for $oid\n" if $nr != 1; + } + xdb_remove($self, $oid, keys %tmp) if need_xapian($self); } sub index_mm { @@ -577,7 +540,7 @@ sub index_both { # git->cat_async callback sub unindex_both { # git->cat_async callback my ($bref, $oid, $type, $size, $self) = @_; my $eml = PublicInbox::Eml->new($bref); - unindex_blob($self, $eml); + unindex_eml($self, $oid, $eml); unindex_mm($self, $eml); } @@ -844,13 +807,12 @@ sub remote_close { } sub remote_remove { - my ($self, $oid, $mid) = @_; + my ($self, $oid, $num) = @_; if (my $w = $self->{w}) { # triggers remove_by_oid in a shard - print $w "D $oid $mid\n" or die "failed to write remove $!"; + print $w "D $oid $num\n" or die "failed to write remove $!"; } else { - $self->begin_txn_lazy; - $self->remove_by_oid($oid, $mid); + $self->remove_by_oid($oid, $num); } } diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index b51d148b..54426881 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -62,10 +62,8 @@ sub shard_worker_loop ($$$$$) { # no need to lock < 512 bytes is atomic under POSIX print $bnote "barrier $shard\n" or die "write failed for barrier $!\n"; - } elsif ($line =~ /\AD ([a-f0-9]{40,}) (.+)\n\z/s) { - my ($oid, $mid) = ($1, $2); - $self->begin_txn_lazy; - $self->remove_by_oid($oid, $mid); + } elsif ($line =~ /\AD ([a-f0-9]{40,}) ([0-9]+)\n\z/s) { + $self->remove_by_oid($1, $2 + 0); } else { chomp $line; # n.b. $mid may contain spaces(!) diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index b51c8525..dffe90d8 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -1185,8 +1185,11 @@ sub sync_prepare ($$$) { sub unindex_oid_remote ($$$) { my ($self, $oid, $mid) = @_; - $_->remote_remove($oid, $mid) foreach @{$self->{idx_shards}}; - $self->{over}->remove_oid($oid, $mid); + my @removed = $self->{over}->remove_oid($oid, $mid); + for my $num (@removed) { + my $idx = idx_shard($self, $num % $self->{shards}); + $idx->remote_remove($oid, $num); + } } sub unindex_oid ($$$;$) { diff --git a/t/search.t b/t/search.t index 82caf9e4..aa6f94bf 100644 --- a/t/search.t +++ b/t/search.t @@ -397,7 +397,9 @@ $ibx->with_umask(sub { $ibx->with_umask(sub { my $amsg = eml_load 't/search-amsg.eml'; - ok($rw->add_message($amsg), 'added attachment'); + my $oid = ('0'x40); + my $smsg = bless { blob => $oid }, 'PublicInbox::Smsg'; + ok($rw->add_message($amsg, $smsg), 'added attachment'); $rw_commit->(); $ro->reopen; my $n = $ro->query('n:attached_fart.txt'); @@ -418,7 +420,7 @@ $ibx->with_umask(sub { $art = $ro->{over_ro}->next_by_mid($mid, \$id, \$prev); ok($art, 'article exists in OVER DB'); } - $rw->unindex_blob($amsg); + $rw->unindex_eml($oid, $amsg); $rw->commit_txn_lazy; SKIP: { skip('$art not defined', 1) unless defined $art;