From eb48e7d6675babdda9a36be1a490c29a2ccddbdc Mon Sep 17 00:00:00 2001 From: "Eric Wong (Contractor, The Linux Foundation)" Date: Mon, 19 Mar 2018 08:14:39 +0000 Subject: v2writable: implement remove correctly We need to hide removals from anybody hitting the search engine. --- lib/PublicInbox/Msgmap.pm | 8 ++++++ lib/PublicInbox/SearchIdx.pm | 32 ++++++++++++++++++++++ lib/PublicInbox/SearchIdxPart.pm | 8 ++++++ lib/PublicInbox/SearchIdxSkeleton.pm | 18 +++++++++++++ lib/PublicInbox/SearchMsg.pm | 4 ++- lib/PublicInbox/V2Writable.pm | 51 ++++++++++++++++++++++++++++-------- 6 files changed, 109 insertions(+), 12 deletions(-) (limited to 'lib/PublicInbox') diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm index a147b9f3..8e81fba0 100644 --- a/lib/PublicInbox/Msgmap.pm +++ b/lib/PublicInbox/Msgmap.pm @@ -140,6 +140,14 @@ sub mid_delete { $sth->execute; } +sub num_delete { + my ($self, $num) = @_; + my $dbh = $self->{dbh}; + my $sth = $dbh->prepare('DELETE FROM msgmap WHERE num = ?'); + $sth->bind_param(1, $num); + $sth->execute; +} + sub create_tables { my ($dbh) = @_; my $e; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index ccec0181..ae2544da 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -440,6 +440,31 @@ sub remove_message { } } +# MID is a hint in V2 +sub remove_by_oid { + my ($self, $oid, $mid) = @_; + my $db = $self->{xdb}; + + # XXX careful, we cannot use batch_do here since we conditionally + # delete documents based on other factors, so we cannot call + # find_doc_ids twice. + my ($head, $tail) = $self->find_doc_ids('Q' . $mid); + return if $head == $tail; + + # there is only ONE element in @delete unless we + # have bugs in our v2writable deduplication check + my @delete; + for (; $head != $tail; $head->inc) { + my $docid = $head->get_docid; + my $doc = $db->get_document($docid); + my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid); + $smsg->load_expand; + push(@delete, $docid) if $smsg->{blob} eq $oid; + } + $db->delete_document($_) foreach @delete; + scalar(@delete); +} + sub term_generator { # write-only my ($self) = @_; @@ -896,4 +921,11 @@ sub remote_close { $? == 0 or die ref($self)." pid:$pid exited with: $?"; } +# triggers remove_by_oid in partition or skeleton +sub remote_remove { + my ($self, $oid, $mid) = @_; + print { $self->{w} } "D $oid $mid\n" or + die "failed to write remove $!"; +} + 1; diff --git a/lib/PublicInbox/SearchIdxPart.pm b/lib/PublicInbox/SearchIdxPart.pm index dd7ace67..c1660783 100644 --- a/lib/PublicInbox/SearchIdxPart.pm +++ b/lib/PublicInbox/SearchIdxPart.pm @@ -54,6 +54,14 @@ sub partition_worker_loop ($$$) { $txn = undef; print { $self->{skeleton}->{w} } "barrier $part\n" or die "write failed to skeleton: $!\n"; + } elsif ($line =~ /\AD ([a-f0-9]{40,}) (.+)\n\z/s) { + my ($oid, $mid) = ($1, $2); + $xdb ||= $self->_xdb_acquire; + if (!$txn) { + $xdb->begin_transaction; + $txn = 1; + } + $self->remove_by_oid($oid, $mid); } else { chomp $line; my ($len, $artnum, $oid, $mid0) = split(/ /, $line); diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm index 4cb10f59..beb17b9f 100644 --- a/lib/PublicInbox/SearchIdxSkeleton.pm +++ b/lib/PublicInbox/SearchIdxSkeleton.pm @@ -73,6 +73,14 @@ sub skeleton_worker_loop { print $barrier_note "barrier_done\n" or die "print failed to barrier note: $!"; } + } elsif ($line =~ /\AD ([a-f0-9]{40,}) (.*)\n\z/s) { + my ($oid, $mid) = ($1, $2); + $xdb ||= $self->_xdb_acquire; + if (!$txn) { + $xdb->begin_transaction; + $txn = 1; + } + $self->remove_by_oid($oid, $mid); } else { my $len = int($line); my $n = read($r, my $msg, $len) or die "read: $!\n"; @@ -110,6 +118,16 @@ sub index_skeleton { die "print failed: $err\n" if $err; } +sub remote_remove { + my ($self, $oid, $mid) = @_; + my $err; + $self->_lock_acquire; + eval { $self->SUPER::remote_remove($oid, $mid) }; + $err = $@; + $self->_lock_release; + die $err if $err; +} + # values: [ TS, NUM, BYTES, LINES, MID, XPATH, doc_data ] sub index_skeleton_real ($$) { my ($self, $values) = @_; diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 23478a2a..a1cd0c28 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -64,7 +64,9 @@ sub load_doc { # :bytes and :lines metadata in RFC 3977 sub bytes ($) { get_val($_[0]->{doc}, &PublicInbox::Search::BYTES) } sub lines ($) { get_val($_[0]->{doc}, &PublicInbox::Search::LINES) } -sub num ($) { get_val($_[0]->{doc}, &PublicInbox::Search::NUM) } +sub num ($) { + $_[0]->{num} ||= get_val($_[0]->{doc}, PublicInbox::Search::NUM) +} sub __hdr ($$) { my ($self, $field) = @_; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index e673c252..656f0693 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -199,18 +199,47 @@ sub idx_init { } sub remove { - my ($self, $mime, $msg) = @_; - my $existing = $self->lookup_content($mime) or return; - - # don't touch ghosts or already junked messages - return unless $existing->type eq 'mail'; - - # always write removals to the current (latest) git repo since - # we process chronologically + my ($self, $mime, $cmt_msg) = @_; + $self->barrier; + $self->idx_init; my $im = $self->importer; - my ($cmt, undef) = $im->remove($mime, $msg); - $cmt = $im->get_mark($cmt); - $self->unindex_msg($existing, $cmt); + my $ibx = $self->{-inbox}; + my $srch = $ibx->search; + my $cid = content_id($mime); + my $skel = $self->{skel}; + my $parts = $self->{idx_parts}; + my $mm = $skel->{mm}; + my $removed; + my $mids = mids($mime->header_obj); + foreach my $mid (@$mids) { + $srch->reopen->each_smsg_by_mid($mid, sub { + my ($smsg) = @_; + $smsg->load_expand; + my $msg = $ibx->msg_by_smsg($smsg); + if (!defined($msg)) { + warn "broken smsg for $mid\n"; + return 1; # continue + } + my $cur = PublicInbox::MIME->new($msg); + if (content_id($cur) eq $cid) { + $mm->num_delete($smsg->num); + # $removed should only be set once assuming + # no bugs in our deduplication code: + $removed = $smsg; + $removed->{mime} = $cur; + $im->remove($cur, $cmt_msg); + $removed->num; # memoize this for callers + + my $oid = $smsg->{blob}; + foreach my $idx (@$parts, $skel) { + $idx->remote_remove($oid, $mid); + } + } + 1; # continue + }); + $self->barrier; + } + $removed; } sub done { -- cgit v1.2.3-24-ge0c7