From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id A66991FAE8 for ; Tue, 6 Mar 2018 08:42:42 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Subject: [PATCH 06/34] v2writable: deduplicate detection on add Date: Tue, 6 Mar 2018 08:42:14 +0000 Message-Id: <20180306084242.19988-7-e@80x24.org> In-Reply-To: <20180306084242.19988-1-e@80x24.org> References: <20180306084242.19988-1-e@80x24.org> List-Id: This is a bit expensive in a multi-process situation because we need to make our indices and packs visible to the read-only pieces. --- lib/PublicInbox/Search.pm | 16 ++++++++++ lib/PublicInbox/SearchIdx.pm | 9 ++++-- lib/PublicInbox/V2Writable.pm | 68 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 81 insertions(+), 12 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 21c72b6..c074410 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -371,6 +371,22 @@ sub lookup_mail { # no ghosts! }); } +sub each_smsg_by_mid { + my ($self, $mid, $cb) = @_; + $mid = mid_clean($mid); + my $xdb = $self->{xdb}; + # XXX retry_reopen isn't necessary for V2Writable, but the PSGI + # interface will need it... + my ($head, $tail) = $self->find_doc_ids('XMID' . $mid); + for (; $head->nequal($tail); $head->inc) { + my $doc_id = $head->get_docid; + my $doc = $xdb->get_document($doc_id); + my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid); + $smsg->{doc_id} = $doc_id; + $cb->($smsg) or return; + } +} + sub find_unique_doc_id { my ($self, $termval) = @_; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index ec3a6f3..ed52e38 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -284,7 +284,11 @@ sub add_message { my $db = $self->{xdb}; my ($doc_id, $old_tid); - my $mid = mid_clean(mid_mime($mime)); + my @mids = mid_mime($mime); + if (@mids > 1) { + warn "Multi-MID: ( ",join(' | ', @mids)," )\n"; + } + my $mid = mid_clean($mids[0]); my $skel = $self->{skeleton}; eval { @@ -512,13 +516,12 @@ sub unindex_blob { } sub index_mm { - my ($self, $mime, $warn_existing) = @_; + my ($self, $mime) = @_; my $mid = mid_clean(mid_mime($mime)); my $mm = $self->{mm}; my $num = $mm->mid_insert($mid); return $num if defined $num; - warn "<$mid> reused\n" if $warn_existing; # fallback to num_for since filters like RubyLang set the number $mm->num_for($mid); } diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 0470fb0..57cb7d3 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -11,6 +11,9 @@ use PublicInbox::SearchIdxSkeleton; use PublicInbox::MIME; use PublicInbox::Git; use PublicInbox::Import; +use PublicInbox::MID qw(mid_clean mid_mime); +use PublicInbox::ContentId qw(content_id); +use PublicInbox::Inbox; # an estimate of the post-packed size to the raw uncompressed size my $PACKING_FACTOR = 0.4; @@ -46,22 +49,40 @@ sub new { # mimics Import::add and wraps it for v2 sub add { my ($self, $mime, $check_cb) = @_; - my $existing = $self->lookup_content($mime); - if ($existing) { - return undef if $existing->type eq 'mail'; # duplicate + # spam check: + if ($check_cb) { + $mime = $check_cb->($mime) or return; } - my $im = $self->importer; + # All pipes (> $^F) known to Perl 5.6+ have FD_CLOEXEC set, + # as does SQLite 3.4.1+ (released in 2007-07-20), and + # Xapian 1.3.2+ (released 2015-03-15). + # For the most part, we can spawn git-fast-import without + # leaking FDs to it... + $self->idx_init; + + my $mid = mid_clean(mid_mime($mime)); + my $num = $self->{skel}->{mm}->mid_insert($mid); + if (!defined($num)) { # mid is already known + $self->done; # ensure all subprocesses are done writing + + my $existing = $self->lookup_content($mime); + warn "<$mid> resent\n" if $existing; + return if $existing; # easy, don't store duplicates + + # reuse NNTP article number? + warn "<$mid> reused for mismatched content\n"; + $self->idx_init; + $num = $self->{skel}->{mm}->num_for($mid); + } - # im->add returns undef if check_cb fails - my $cmt = $im->add($mime, $check_cb) or return; + my $im = $self->importer; + my $cmt = $im->add($mime); $cmt = $im->get_mark($cmt); my $oid = $im->{last_object_id}; my ($len, $msgref) = @{$im->{last_object}}; - $self->idx_init; - my $num = $self->{skel}->index_mm($mime, 1); my $nparts = $self->{partitions}; my $part = $num % $nparts; my $idx = $self->idx_part($part); @@ -83,6 +104,12 @@ sub idx_part { sub idx_init { my ($self) = @_; return if $self->{idx_parts}; + my $ibx = $self->{-inbox}; + + # do not leak read-only FDs to child processes, we only have these + # FDs for duplicate detection so they should not be + # frequently activated. + delete $ibx->{$_} foreach (qw(git mm search)); # first time initialization, first we create the skeleton pipe: my $skel = $self->{skel} = PublicInbox::SearchIdxSkeleton->new($self); @@ -241,7 +268,30 @@ sub import_init { } sub lookup_content { - undef # TODO + my ($self, $mime) = @_; + my $ibx = $self->{-inbox}; + + my $srch = $ibx->search; + my $cid = content_id($mime); + my $found; + my $mid = mid_mime($mime); + $srch->each_smsg_by_mid($mid, sub { + my ($smsg) = @_; + $smsg->load_expand; + my $msg = $ibx->msg_by_smsg($smsg); + if (!defined($msg)) { + warn "broken smsg for $mid\n"; + return 1; # continue + } + my $cur = PublicInbox::MIME->new($msg); + if (content_id($cur) eq $cid) { + $smsg->{mime} = $cur; + $found = $smsg; + return 0; # break out of loop + } + 1; # continue + }); + $found; } sub atfork_child { -- EW