about summary refs log tree commit homepage
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2021-03-21 15:50:45 +0600
committer Eric Wong <e@80x24.org> 2021-03-21 10:35:11 +0000
commit 47d21e78be57a8f4615be5872e08ec4539423d1f (patch)
tree aac2f06ea9c24b4a616702e849bcbaf024597dc5
parent 076543017b7646a7324156ed5d5245d467d53c1c (diff)
download public-inbox-47d21e78be57a8f4615be5872e08ec4539423d1f.tar.gz
Keyword storage for external-only messages was preventing
messages from being explicitly imported.  Teach lei_store
to vivify keyword-only entries into fully-indexed messages
on import.
-rw-r--r-- lib/PublicInbox/Import.pm 14
-rw-r--r-- lib/PublicInbox/LeiImport.pm 22
-rw-r--r-- lib/PublicInbox/LeiSearch.pm 5
-rw-r--r-- lib/PublicInbox/LeiStore.pm 46
-rw-r--r-- lib/PublicInbox/Over.pm 2
-rw-r--r-- lib/PublicInbox/SearchIdx.pm 12
-rw-r--r-- t/lei-q-kw.t 44
7 files changed, 127 insertions, 18 deletions
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index b8fa5c21..34738279 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -413,7 +413,19 @@ sub add {
                 $smsg->{blob} = $self->get_mark(":$blob");
                 $smsg->set_bytes($raw_email, $n);
                 if (my $oidx = delete $smsg->{-oidx}) { # used by LeiStore
-                        return if $oidx->blob_exists($smsg->{blob});
+                        my @docids = $oidx->blob_exists($smsg->{blob});
+                        my @vivify_xvmd;
+                        for my $id (@docids) {
+                                if (my $cur = $oidx->get_art($id)) {
+                                        # already imported if bytes > 0
+                                        return if $cur->{bytes} > 0;
+                                        push @vivify_xvmd, $id;
+                                } else {
+                                        warn "W: $smsg->{blob} ",
+                                                "#$id gone (bug?)\n";
+                                }
+                        }
+                        $smsg->{-vivify_xvmd} = \@vivify_xvmd;
                 }
         }
         my $ref = $self->{ref};
diff --git a/lib/PublicInbox/LeiImport.pm b/lib/PublicInbox/LeiImport.pm
index 137c22fc..ae24a1fa 100644
--- a/lib/PublicInbox/LeiImport.pm
+++ b/lib/PublicInbox/LeiImport.pm
@@ -10,9 +10,14 @@ use PublicInbox::Eml;
 use PublicInbox::PktOp qw(pkt_do);
 
 sub _import_eml { # MboxReader callback
-        my ($eml, $sto, $set_kw) = @_;
-        $sto->ipc_do('set_eml', $eml, $set_kw ?
-                { kw => PublicInbox::MboxReader::mbox_keywords($eml) } : ());
+        my ($eml, $lei, $mbox_keywords) = @_;
+        my $vmd;
+        if ($mbox_keywords) {
+                my $kw = $mbox_keywords->($eml);
+                $vmd = { kw => $kw } if scalar(@$kw);
+        }
+        my $xoids = $lei->{ale}->xoids_for($eml);
+        $lei->{sto}->ipc_do('set_eml', $eml, $vmd, $xoids);
 }
 
 sub import_done_wait { # dwaitpid callback
@@ -41,6 +46,7 @@ sub net_merge_complete { # callback used by LeiAuth
 sub import_start {
         my ($lei) = @_;
         my $self = $lei->{imp};
+        $lei->ale;
         my $j = $lei->{opt}->{jobs} // scalar(@{$self->{inputs}}) || 1;
         if (my $net = $lei->{net}) {
                 # $j = $net->net_concurrency($j); TODO
@@ -130,7 +136,8 @@ sub ipc_atfork_child {
 
 sub _import_fh {
         my ($lei, $fh, $input, $ifmt) = @_;
-        my $set_kw = $lei->{opt}->{kw};
+        my $kw = $lei->{opt}->{kw} ?
+                PublicInbox::MboxReader->can('mbox_keywords') : undef;
         eval {
                 if ($ifmt eq 'eml') {
                         my $buf = do { local $/; <$fh> } //
@@ -138,11 +145,11 @@ sub _import_fh {
 error reading $input: $!
 
                         my $eml = PublicInbox::Eml->new(\$buf);
-                        _import_eml($eml, $lei->{sto}, $set_kw);
+                        _import_eml($eml, $lei, $kw);
                 } else { # some mbox (->can already checked in call);
                         my $cb = PublicInbox::MboxReader->can($ifmt) //
                                 die "BUG: bad fmt=$ifmt";
-                        $cb->(undef, $fh, \&_import_eml, $lei->{sto}, $set_kw);
+                        $cb->(undef, $fh, \&_import_eml, $lei, $kw);
                 }
         };
         $lei->child_error(1 << 8, "$input: $@") if $@;
@@ -193,7 +200,8 @@ EOM
 sub import_stdin {
         my ($self) = @_;
         my $lei = $self->{lei};
-        _import_fh($lei, delete $self->{0}, '<stdin>', $lei->{opt}->{'in-format'});
+        my $in = delete $self->{0};
+        _import_fh($lei, $in, '<stdin>', $lei->{opt}->{'in-format'});
 }
 
 no warnings 'once'; # the following works even when LeiAuth is lazy-loaded
diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm
index 360a37e5..bbb00661 100644
--- a/lib/PublicInbox/LeiSearch.pm
+++ b/lib/PublicInbox/LeiSearch.pm
@@ -63,7 +63,10 @@ sub _cmp_1st { # git->cat_async callback
         }
 }
 
-sub xoids_for { # returns { OID => docid } mapping for $eml matches
+# returns { OID => num } mapping for $eml matches
+# The `num' hash value only makes sense from LeiSearch itself
+# and is nonsense from the PublicInbox::LeiALE subclass
+sub xoids_for {
         my ($self, $eml, $min) = @_;
         my ($chash, $mids) = content_key($eml);
         my @overs = ($self->over // $self->overs_all);
diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm
index c66d3dc2..b390b318 100644
--- a/lib/PublicInbox/LeiStore.pm
+++ b/lib/PublicInbox/LeiStore.pm
@@ -161,7 +161,7 @@ sub remove_eml_vmd {
 }
 
 sub add_eml {
-        my ($self, $eml, $vmd) = @_;
+        my ($self, $eml, $vmd, $xoids) = @_;
         my $im = $self->importer; # may create new epoch
         my $eidx = eidx_init($self); # writes ALL.git/objects/info/alternates
         my $oidx = $eidx->{oidx}; # PublicInbox::Import::add checks this
@@ -169,7 +169,40 @@ sub add_eml {
         $im->add($eml, undef, $smsg) or return; # duplicate returns undef
 
         local $self->{current_info} = $smsg->{blob};
-        if (my @docids = _docids_for($self, $eml)) {
+        my $vivify_xvmd = delete($smsg->{-vivify_xvmd}) // []; # exact matches
+        if ($xoids) { # fuzzy matches from externals in ale->xoids_for
+                delete $xoids->{$smsg->{blob}}; # added later
+                if (scalar keys %$xoids) {
+                        my %docids = map { $_ => 1 } @$vivify_xvmd;
+                        for my $oid (keys %$xoids) {
+                                my @id = $oidx->blob_exists($oid);
+                                @docids{@id} = @id;
+                        }
+                        @$vivify_xvmd = sort { $a <=> $b } keys(%docids);
+                }
+        }
+        if (@$vivify_xvmd) {
+                $xoids //= {};
+                $xoids->{$smsg->{blob}} = 1;
+                for my $docid (@$vivify_xvmd) {
+                        my $cur = $oidx->get_art($docid);
+                        my $idx = $eidx->idx_shard($docid);
+                        if (!$cur || $cur->{bytes} == 0) { # really vivifying
+                                $smsg->{num} = $docid;
+                                $oidx->add_overview($eml, $smsg);
+                                $smsg->{-merge_vmd} = 1;
+                                $idx->index_eml($eml, $smsg);
+                        } else { # lse fuzzy hit off ale
+                                $idx->ipc_do('add_eidx_info', $docid, '.', $eml);
+                        }
+                        for my $oid (keys %$xoids) {
+                                $oidx->add_xref3($docid, -1, $oid, '.');
+                        }
+                        $idx->ipc_do('add_vmd', $docid, $vmd) if $vmd;
+                }
+                $vivify_xvmd;
+        } elsif (my @docids = _docids_for($self, $eml)) {
+                # fuzzy match from within lei/store
                 for my $docid (@docids) {
                         my $idx = $eidx->idx_shard($docid);
                         $oidx->add_xref3($docid, -1, $smsg->{blob}, '.');
@@ -178,20 +211,21 @@ sub add_eml {
                         $idx->ipc_do('add_vmd', $docid, $vmd) if $vmd;
                 }
                 \@docids;
-        } else {
+        } else { # totally new message
                 $smsg->{num} = $oidx->adj_counter('eidx_docid', '+');
                 $oidx->add_overview($eml, $smsg);
                 $oidx->add_xref3($smsg->{num}, -1, $smsg->{blob}, '.');
                 my $idx = $eidx->idx_shard($smsg->{num});
                 $idx->index_eml($eml, $smsg);
-                $idx->ipc_do('add_vmd', $smsg->{num}, $vmd ) if $vmd;
+                $idx->ipc_do('add_vmd', $smsg->{num}, $vmd) if $vmd;
                 $smsg;
         }
 }
 
 sub set_eml {
-        my ($self, $eml, $vmd) = @_;
-        add_eml($self, $eml, $vmd) // set_eml_vmd($self, $eml, $vmd);
+        my ($self, $eml, $vmd, $xoids) = @_;
+        add_eml($self, $eml, $vmd, $xoids) //
+                set_eml_vmd($self, $eml, $vmd);
 }
 
 # set or update keywords for external message, called via ipc_do
diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm
index 587e0516..0e191c47 100644
--- a/lib/PublicInbox/Over.pm
+++ b/lib/PublicInbox/Over.pm
@@ -353,7 +353,7 @@ sub blob_exists {
         my ($self, $oidhex) = @_;
         if (wantarray) {
                 my $sth = $self->dbh->prepare_cached(<<'', undef, 1);
-SELECT docid FROM xref3 WHERE oidbin = ?
+SELECT docid FROM xref3 WHERE oidbin = ? ORDER BY docid ASC
 
                 $sth->bind_param(1, pack('H*', $oidhex), SQL_BLOB);
                 $sth->execute;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 3237aadc..3f933121 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -11,6 +11,7 @@ use strict;
 use v5.10.1;
 use parent qw(PublicInbox::Search PublicInbox::Lock Exporter);
 use PublicInbox::Eml;
+use PublicInbox::Search qw(xap_terms);
 use PublicInbox::InboxWritable;
 use PublicInbox::MID qw(mids_for_index mids);
 use PublicInbox::MsgIter;
@@ -34,6 +35,7 @@ use constant DEBUG => !!$ENV{DEBUG};
 my $xapianlevels = qr/\A(?:full|medium)\z/;
 my $hex = '[a-f0-9]';
 my $OID = $hex .'{40,}';
+my @VMD_MAP = (kw => 'K', label => 'L');
 our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/;
 
 sub new {
@@ -428,7 +430,15 @@ sub eml2doc ($$$;$) {
 sub add_xapian ($$$$) {
         my ($self, $eml, $smsg, $mids) = @_;
         begin_txn_lazy($self);
+        my $merge_vmd = delete $smsg->{-merge_vmd};
         my $doc = eml2doc($self, $eml, $smsg, $mids);
+        if (my $old = $merge_vmd ? _get_doc($self, $smsg->{num}) : undef) {
+                my @x = @VMD_MAP;
+                while (my ($field, $pfx) = splice(@x, 0, 2)) {
+                        my $vals = xap_terms($pfx, $old);
+                        $doc->add_boolean_term($pfx.$_) for keys %$vals;
+                }
+        }
         $self->{xdb}->replace_document($smsg->{num}, $doc);
 }
 
@@ -531,8 +541,6 @@ sub remove_eidx_info {
         $self->{xdb}->replace_document($docid, $doc);
 }
 
-my @VMD_MAP = (kw => 'K', label => 'L');
-
 sub set_vmd {
         my ($self, $docid, $vmd) = @_;
         begin_txn_lazy($self);
diff --git a/t/lei-q-kw.t b/t/lei-q-kw.t
index b5e22e9b..4db27363 100644
--- a/t/lei-q-kw.t
+++ b/t/lei-q-kw.t
@@ -161,5 +161,49 @@ like($s, qr/^Status: O\nX-Status: AF\n/ms,
 lei_ok(qw(q --pretty), "m:$m", @inc);
 like($lei_out, qr/^  "kw": \["answered", "flagged"\],\n/sm,
         '--pretty JSON output shows kw: on one line');
+
+# ensure import on previously external-only message works
+lei_ok('q', "m:$m");
+is_deeply(json_utf8->decode($lei_out), [ undef ],
+        'to-be-imported message non-existent');
+lei_ok(qw(import -F eml t/x-unknown-alpine.eml));
+is($lei_err, '', 'no errors importing previous external-only message');
+lei_ok('q', "m:$m");
+$res = json_utf8->decode($lei_out);
+is($res->[1], undef, 'got one result');
+is_deeply($res->[0]->{kw}, [ qw(answered flagged) ], 'kw preserved on exact');
+
+# ensure fuzzy match import works, too
+$m = 'multipart@example.com';
+$o = "$ENV{HOME}/fuzz";
+lei_ok('q', '-o', $o, "m:$m", @inc);
+@fn = glob("$o/cur/*");
+scalar(@fn) == 1 or BAIL_OUT "wrote multiple or zero files: ".explain(\@fn);
+rename($fn[0], "$fn[0]S") or BAIL_OUT "rename $!";
+lei_ok('q', '-o', $o, "m:$m");
+is_deeply([glob("$o/cur/*")], [], 'clobbered output results');
+my $eml = eml_load('t/plack-2-txt-bodies.eml');
+$eml->header_set('List-Id', '<list.example.com>');
+my $in = $eml->as_string;
+lei_ok([qw(import -F eml --stdin)], undef, { 0 => \$in, %$lei_opt });
+is($lei_err, '', 'no errors from import');
+lei_ok(qw(q -f mboxrd), "m:$m");
+open $fh, '<', \$lei_out or BAIL_OUT $!;
+my @res;
+PublicInbox::MboxReader->mboxrd($fh, sub { push @res, shift });
+is($res[0]->header('Status'), 'RO', 'seen kw set');
+$res[0]->header_set('Status');
+is_deeply(\@res, [ $eml ], 'imported message matches w/ List-Id');
+
+$eml->header_set('List-Id', '<another.example.com>');
+$in = $eml->as_string;
+lei_ok([qw(import -F eml --stdin)], undef, { 0 => \$in, %$lei_opt });
+is($lei_err, '', 'no errors from 2nd import');
+lei_ok(qw(q -f mboxrd), "m:$m", 'l:another.example.com');
+my @another;
+open $fh, '<', \$lei_out or BAIL_OUT $!;
+PublicInbox::MboxReader->mboxrd($fh, sub { push @another, shift });
+is($another[0]->header('Status'), 'RO', 'seen kw set');
+
 }); # test_lei
 done_testing;