about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
author: Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-03-02 03:39:09 +0000
committer: Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-03-02 09:07:54 +0000
commit: faf36e5b451bd5dba0b1ae867f606ba0cb397af3 (patch)
tree: 67bea99b1ba855b13330a2156f894979052f99c0 /lib
parent: c062644c64bf44b48bd6469e1d4e77d8b9cfdc87 (diff)
download: public-inbox-faf36e5b451bd5dba0b1ae867f606ba0cb397af3.tar.gz
This is a bit expensive in a multi-process situation because
we need to make our indices and packs visible to the read-only
pieces.
Diffstat (limited to 'lib')
-rw-r--r-- lib/PublicInbox/Search.pm 16
-rw-r--r-- lib/PublicInbox/SearchIdx.pm 9
-rw-r--r-- lib/PublicInbox/V2Writable.pm 68
3 files changed, 81 insertions, 12 deletions
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 21c72b6f..c074410c 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -371,6 +371,22 @@ sub lookup_mail { # no ghosts!
         });
 }
 
+sub each_smsg_by_mid {
+        my ($self, $mid, $cb) = @_;
+        $mid = mid_clean($mid);
+        my $xdb = $self->{xdb};
+        # XXX retry_reopen isn't necessary for V2Writable, but the PSGI
+        # interface will need it...
+        my ($head, $tail) = $self->find_doc_ids('XMID' . $mid);
+        for (; $head->nequal($tail); $head->inc) {
+                my $doc_id = $head->get_docid;
+                my $doc = $xdb->get_document($doc_id);
+                my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid);
+                $smsg->{doc_id} = $doc_id;
+                $cb->($smsg) or return;
+        }
+}
+
 sub find_unique_doc_id {
         my ($self, $termval) = @_;
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index ec3a6f3e..ed52e386 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -284,7 +284,11 @@ sub add_message {
         my $db = $self->{xdb};
 
         my ($doc_id, $old_tid);
-        my $mid = mid_clean(mid_mime($mime));
+        my @mids = mid_mime($mime);
+        if (@mids > 1) {
+                warn "Multi-MID: ( ",join(' | ', @mids)," )\n";
+        }
+        my $mid = mid_clean($mids[0]);
         my $skel = $self->{skeleton};
 
         eval {
@@ -512,13 +516,12 @@ sub unindex_blob {
 }
 
 sub index_mm {
-        my ($self, $mime, $warn_existing) = @_;
+        my ($self, $mime) = @_;
         my $mid = mid_clean(mid_mime($mime));
         my $mm = $self->{mm};
         my $num = $mm->mid_insert($mid);
         return $num if defined $num;
 
-        warn "<$mid> reused\n" if $warn_existing;
         # fallback to num_for since filters like RubyLang set the number
         $mm->num_for($mid);
 }
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 0470fb0e..57cb7d38 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -11,6 +11,9 @@ use PublicInbox::SearchIdxSkeleton;
 use PublicInbox::MIME;
 use PublicInbox::Git;
 use PublicInbox::Import;
+use PublicInbox::MID qw(mid_clean mid_mime);
+use PublicInbox::ContentId qw(content_id);
+use PublicInbox::Inbox;
 
 # an estimate of the post-packed size to the raw uncompressed size
 my $PACKING_FACTOR = 0.4;
@@ -46,22 +49,40 @@ sub new {
 # mimics Import::add and wraps it for v2
 sub add {
         my ($self, $mime, $check_cb) = @_;
-        my $existing = $self->lookup_content($mime);
 
-        if ($existing) {
-                return undef if $existing->type eq 'mail'; # duplicate
+        # spam check:
+        if ($check_cb) {
+                $mime = $check_cb->($mime) or return;
         }
 
-        my $im = $self->importer;
+        # All pipes (> $^F) known to Perl 5.6+ have FD_CLOEXEC set,
+        # as does SQLite 3.4.1+ (released in 2007-07-20), and
+        # Xapian 1.3.2+ (released 2015-03-15).
+        # For the most part, we can spawn git-fast-import without
+        # leaking FDs to it...
+        $self->idx_init;
+
+        my $mid = mid_clean(mid_mime($mime));
+        my $num = $self->{skel}->{mm}->mid_insert($mid);
+        if (!defined($num)) { # mid is already known
+                $self->done; # ensure all subprocesses are done writing
+
+                my $existing = $self->lookup_content($mime);
+                warn "<$mid> resent\n" if $existing;
+                return if $existing; # easy, don't store duplicates
+
+                # reuse NNTP article number?
+                warn "<$mid> reused for mismatched content\n";
+                $self->idx_init;
+                $num = $self->{skel}->{mm}->num_for($mid);
+        }
 
-        # im->add returns undef if check_cb fails
-        my $cmt = $im->add($mime, $check_cb) or return;
+        my $im = $self->importer;
+        my $cmt = $im->add($mime);
         $cmt = $im->get_mark($cmt);
         my $oid = $im->{last_object_id};
         my ($len, $msgref) = @{$im->{last_object}};
 
-        $self->idx_init;
-        my $num = $self->{skel}->index_mm($mime, 1);
         my $nparts = $self->{partitions};
         my $part = $num % $nparts;
         my $idx = $self->idx_part($part);
@@ -83,6 +104,12 @@ sub idx_part {
 sub idx_init {
         my ($self) = @_;
         return if $self->{idx_parts};
+        my $ibx = $self->{-inbox};
+
+        # do not leak read-only FDs to child processes, we only have these
+        # FDs for duplicate detection so they should not be
+        # frequently activated.
+        delete $ibx->{$_} foreach (qw(git mm search));
 
         # first time initialization, first we create the skeleton pipe:
         my $skel = $self->{skel} = PublicInbox::SearchIdxSkeleton->new($self);
@@ -241,7 +268,30 @@ sub import_init {
 }
 
 sub lookup_content {
-        undef # TODO
+        my ($self, $mime) = @_;
+        my $ibx = $self->{-inbox};
+
+        my $srch = $ibx->search;
+        my $cid = content_id($mime);
+        my $found;
+        my $mid = mid_mime($mime);
+        $srch->each_smsg_by_mid($mid, sub {
+                my ($smsg) = @_;
+                $smsg->load_expand;
+                my $msg = $ibx->msg_by_smsg($smsg);
+                if (!defined($msg)) {
+                        warn "broken smsg for $mid\n";
+                        return 1; # continue
+                }
+                my $cur = PublicInbox::MIME->new($msg);
+                if (content_id($cur) eq $cid) {
+                        $smsg->{mime} = $cur;
+                        $found = $smsg;
+                        return 0; # break out of loop
+                }
+                1; # continue
+        });
+        $found;
 }
 
 sub atfork_child {