From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 1/3] lei import: vivify external-only messages
Date: Sun, 21 Mar 2021 15:50:45 +0600 [thread overview]
Message-ID: <20210321095047.13855-2-e@80x24.org> (raw)
In-Reply-To: <20210321095047.13855-1-e@80x24.org>
Keyword storage for external-only messages was preventing
those same messages from being explicitly imported. Teach
lei_store to vivify keyword-only entries into fully-indexed
messages on import.
---
lib/PublicInbox/Import.pm | 14 ++++++++++-
lib/PublicInbox/LeiImport.pm | 22 +++++++++++------
lib/PublicInbox/LeiSearch.pm | 5 +++-
lib/PublicInbox/LeiStore.pm | 46 +++++++++++++++++++++++++++++++-----
lib/PublicInbox/Over.pm | 2 +-
lib/PublicInbox/SearchIdx.pm | 12 ++++++++--
t/lei-q-kw.t | 44 ++++++++++++++++++++++++++++++++++
7 files changed, 127 insertions(+), 18 deletions(-)
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index b8fa5c21..34738279 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -413,7 +413,19 @@ sub add {
$smsg->{blob} = $self->get_mark(":$blob");
$smsg->set_bytes($raw_email, $n);
if (my $oidx = delete $smsg->{-oidx}) { # used by LeiStore
- return if $oidx->blob_exists($smsg->{blob});
+ my @docids = $oidx->blob_exists($smsg->{blob});
+ my @vivify_xvmd;
+ for my $id (@docids) {
+ if (my $cur = $oidx->get_art($id)) {
+ # already imported if bytes > 0
+ return if $cur->{bytes} > 0;
+ push @vivify_xvmd, $id;
+ } else {
+ warn "W: $smsg->{blob} ",
+ "#$id gone (bug?)\n";
+ }
+ }
+ $smsg->{-vivify_xvmd} = \@vivify_xvmd;
}
}
my $ref = $self->{ref};
diff --git a/lib/PublicInbox/LeiImport.pm b/lib/PublicInbox/LeiImport.pm
index 137c22fc..ae24a1fa 100644
--- a/lib/PublicInbox/LeiImport.pm
+++ b/lib/PublicInbox/LeiImport.pm
@@ -10,9 +10,14 @@ use PublicInbox::Eml;
use PublicInbox::PktOp qw(pkt_do);
sub _import_eml { # MboxReader callback
- my ($eml, $sto, $set_kw) = @_;
- $sto->ipc_do('set_eml', $eml, $set_kw ?
- { kw => PublicInbox::MboxReader::mbox_keywords($eml) } : ());
+ my ($eml, $lei, $mbox_keywords) = @_;
+ my $vmd;
+ if ($mbox_keywords) {
+ my $kw = $mbox_keywords->($eml);
+ $vmd = { kw => $kw } if scalar(@$kw);
+ }
+ my $xoids = $lei->{ale}->xoids_for($eml);
+ $lei->{sto}->ipc_do('set_eml', $eml, $vmd, $xoids);
}
sub import_done_wait { # dwaitpid callback
@@ -41,6 +46,7 @@ sub net_merge_complete { # callback used by LeiAuth
sub import_start {
my ($lei) = @_;
my $self = $lei->{imp};
+ $lei->ale;
my $j = $lei->{opt}->{jobs} // scalar(@{$self->{inputs}}) || 1;
if (my $net = $lei->{net}) {
# $j = $net->net_concurrency($j); TODO
@@ -130,7 +136,8 @@ sub ipc_atfork_child {
sub _import_fh {
my ($lei, $fh, $input, $ifmt) = @_;
- my $set_kw = $lei->{opt}->{kw};
+ my $kw = $lei->{opt}->{kw} ?
+ PublicInbox::MboxReader->can('mbox_keywords') : undef;
eval {
if ($ifmt eq 'eml') {
my $buf = do { local $/; <$fh> } //
@@ -138,11 +145,11 @@ sub _import_fh {
error reading $input: $!
my $eml = PublicInbox::Eml->new(\$buf);
- _import_eml($eml, $lei->{sto}, $set_kw);
+ _import_eml($eml, $lei, $kw);
} else { # some mbox (->can already checked in call);
my $cb = PublicInbox::MboxReader->can($ifmt) //
die "BUG: bad fmt=$ifmt";
- $cb->(undef, $fh, \&_import_eml, $lei->{sto}, $set_kw);
+ $cb->(undef, $fh, \&_import_eml, $lei, $kw);
}
};
$lei->child_error(1 << 8, "$input: $@") if $@;
@@ -193,7 +200,8 @@ EOM
sub import_stdin {
my ($self) = @_;
my $lei = $self->{lei};
- _import_fh($lei, delete $self->{0}, '<stdin>', $lei->{opt}->{'in-format'});
+ my $in = delete $self->{0};
+ _import_fh($lei, $in, '<stdin>', $lei->{opt}->{'in-format'});
}
no warnings 'once'; # the following works even when LeiAuth is lazy-loaded
diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm
index 360a37e5..bbb00661 100644
--- a/lib/PublicInbox/LeiSearch.pm
+++ b/lib/PublicInbox/LeiSearch.pm
@@ -63,7 +63,10 @@ sub _cmp_1st { # git->cat_async callback
}
}
-sub xoids_for { # returns { OID => docid } mapping for $eml matches
+# returns { OID => num } mapping for $eml matches
+# The `num' hash value only makes sense from LeiSearch itself
+# and is nonsense from the PublicInbox::LeiALE subclass
+sub xoids_for {
my ($self, $eml, $min) = @_;
my ($chash, $mids) = content_key($eml);
my @overs = ($self->over // $self->overs_all);
diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm
index c66d3dc2..b390b318 100644
--- a/lib/PublicInbox/LeiStore.pm
+++ b/lib/PublicInbox/LeiStore.pm
@@ -161,7 +161,7 @@ sub remove_eml_vmd {
}
sub add_eml {
- my ($self, $eml, $vmd) = @_;
+ my ($self, $eml, $vmd, $xoids) = @_;
my $im = $self->importer; # may create new epoch
my $eidx = eidx_init($self); # writes ALL.git/objects/info/alternates
my $oidx = $eidx->{oidx}; # PublicInbox::Import::add checks this
@@ -169,7 +169,40 @@ sub add_eml {
$im->add($eml, undef, $smsg) or return; # duplicate returns undef
local $self->{current_info} = $smsg->{blob};
- if (my @docids = _docids_for($self, $eml)) {
+ my $vivify_xvmd = delete($smsg->{-vivify_xvmd}) // []; # exact matches
+ if ($xoids) { # fuzzy matches from externals in ale->xoids_for
+ delete $xoids->{$smsg->{blob}}; # added later
+ if (scalar keys %$xoids) {
+ my %docids = map { $_ => 1 } @$vivify_xvmd;
+ for my $oid (keys %$xoids) {
+ my @id = $oidx->blob_exists($oid);
+ @docids{@id} = @id;
+ }
+ @$vivify_xvmd = sort { $a <=> $b } keys(%docids);
+ }
+ }
+ if (@$vivify_xvmd) {
+ $xoids //= {};
+ $xoids->{$smsg->{blob}} = 1;
+ for my $docid (@$vivify_xvmd) {
+ my $cur = $oidx->get_art($docid);
+ my $idx = $eidx->idx_shard($docid);
+ if (!$cur || $cur->{bytes} == 0) { # really vivifying
+ $smsg->{num} = $docid;
+ $oidx->add_overview($eml, $smsg);
+ $smsg->{-merge_vmd} = 1;
+ $idx->index_eml($eml, $smsg);
+ } else { # lse fuzzy hit off ale
+ $idx->ipc_do('add_eidx_info', $docid, '.', $eml);
+ }
+ for my $oid (keys %$xoids) {
+ $oidx->add_xref3($docid, -1, $oid, '.');
+ }
+ $idx->ipc_do('add_vmd', $docid, $vmd) if $vmd;
+ }
+ $vivify_xvmd;
+ } elsif (my @docids = _docids_for($self, $eml)) {
+ # fuzzy match from within lei/store
for my $docid (@docids) {
my $idx = $eidx->idx_shard($docid);
$oidx->add_xref3($docid, -1, $smsg->{blob}, '.');
@@ -178,20 +211,21 @@ sub add_eml {
$idx->ipc_do('add_vmd', $docid, $vmd) if $vmd;
}
\@docids;
- } else {
+ } else { # totally new message
$smsg->{num} = $oidx->adj_counter('eidx_docid', '+');
$oidx->add_overview($eml, $smsg);
$oidx->add_xref3($smsg->{num}, -1, $smsg->{blob}, '.');
my $idx = $eidx->idx_shard($smsg->{num});
$idx->index_eml($eml, $smsg);
- $idx->ipc_do('add_vmd', $smsg->{num}, $vmd ) if $vmd;
+ $idx->ipc_do('add_vmd', $smsg->{num}, $vmd) if $vmd;
$smsg;
}
}
sub set_eml {
- my ($self, $eml, $vmd) = @_;
- add_eml($self, $eml, $vmd) // set_eml_vmd($self, $eml, $vmd);
+ my ($self, $eml, $vmd, $xoids) = @_;
+ add_eml($self, $eml, $vmd, $xoids) //
+ set_eml_vmd($self, $eml, $vmd);
}
# set or update keywords for external message, called via ipc_do
diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm
index 587e0516..0e191c47 100644
--- a/lib/PublicInbox/Over.pm
+++ b/lib/PublicInbox/Over.pm
@@ -353,7 +353,7 @@ sub blob_exists {
my ($self, $oidhex) = @_;
if (wantarray) {
my $sth = $self->dbh->prepare_cached(<<'', undef, 1);
-SELECT docid FROM xref3 WHERE oidbin = ?
+SELECT docid FROM xref3 WHERE oidbin = ? ORDER BY docid ASC
$sth->bind_param(1, pack('H*', $oidhex), SQL_BLOB);
$sth->execute;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 3237aadc..3f933121 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -11,6 +11,7 @@ use strict;
use v5.10.1;
use parent qw(PublicInbox::Search PublicInbox::Lock Exporter);
use PublicInbox::Eml;
+use PublicInbox::Search qw(xap_terms);
use PublicInbox::InboxWritable;
use PublicInbox::MID qw(mids_for_index mids);
use PublicInbox::MsgIter;
@@ -34,6 +35,7 @@ use constant DEBUG => !!$ENV{DEBUG};
my $xapianlevels = qr/\A(?:full|medium)\z/;
my $hex = '[a-f0-9]';
my $OID = $hex .'{40,}';
+my @VMD_MAP = (kw => 'K', label => 'L');
our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/;
sub new {
@@ -428,7 +430,15 @@ sub eml2doc ($$$;$) {
sub add_xapian ($$$$) {
my ($self, $eml, $smsg, $mids) = @_;
begin_txn_lazy($self);
+ my $merge_vmd = delete $smsg->{-merge_vmd};
my $doc = eml2doc($self, $eml, $smsg, $mids);
+ if (my $old = $merge_vmd ? _get_doc($self, $smsg->{num}) : undef) {
+ my @x = @VMD_MAP;
+ while (my ($field, $pfx) = splice(@x, 0, 2)) {
+ my $vals = xap_terms($pfx, $old);
+ $doc->add_boolean_term($pfx.$_) for keys %$vals;
+ }
+ }
$self->{xdb}->replace_document($smsg->{num}, $doc);
}
@@ -531,8 +541,6 @@ sub remove_eidx_info {
$self->{xdb}->replace_document($docid, $doc);
}
-my @VMD_MAP = (kw => 'K', label => 'L');
-
sub set_vmd {
my ($self, $docid, $vmd) = @_;
begin_txn_lazy($self);
diff --git a/t/lei-q-kw.t b/t/lei-q-kw.t
index b5e22e9b..4db27363 100644
--- a/t/lei-q-kw.t
+++ b/t/lei-q-kw.t
@@ -161,5 +161,49 @@ like($s, qr/^Status: O\nX-Status: AF\n/ms,
lei_ok(qw(q --pretty), "m:$m", @inc);
like($lei_out, qr/^ "kw": \["answered", "flagged"\],\n/sm,
'--pretty JSON output shows kw: on one line');
+
+# ensure import on previously external-only message works
+lei_ok('q', "m:$m");
+is_deeply(json_utf8->decode($lei_out), [ undef ],
+ 'to-be-imported message non-existent');
+lei_ok(qw(import -F eml t/x-unknown-alpine.eml));
+is($lei_err, '', 'no errors importing previous external-only message');
+lei_ok('q', "m:$m");
+$res = json_utf8->decode($lei_out);
+is($res->[1], undef, 'got one result');
+is_deeply($res->[0]->{kw}, [ qw(answered flagged) ], 'kw preserved on exact');
+
+# ensure fuzzy match import works, too
+$m = 'multipart@example.com';
+$o = "$ENV{HOME}/fuzz";
+lei_ok('q', '-o', $o, "m:$m", @inc);
+@fn = glob("$o/cur/*");
+scalar(@fn) == 1 or BAIL_OUT "wrote multiple or zero files: ".explain(\@fn);
+rename($fn[0], "$fn[0]S") or BAIL_OUT "rename $!";
+lei_ok('q', '-o', $o, "m:$m");
+is_deeply([glob("$o/cur/*")], [], 'clobbered output results');
+my $eml = eml_load('t/plack-2-txt-bodies.eml');
+$eml->header_set('List-Id', '<list.example.com>');
+my $in = $eml->as_string;
+lei_ok([qw(import -F eml --stdin)], undef, { 0 => \$in, %$lei_opt });
+is($lei_err, '', 'no errors from import');
+lei_ok(qw(q -f mboxrd), "m:$m");
+open $fh, '<', \$lei_out or BAIL_OUT $!;
+my @res;
+PublicInbox::MboxReader->mboxrd($fh, sub { push @res, shift });
+is($res[0]->header('Status'), 'RO', 'seen kw set');
+$res[0]->header_set('Status');
+is_deeply(\@res, [ $eml ], 'imported message matches w/ List-Id');
+
+$eml->header_set('List-Id', '<another.example.com>');
+$in = $eml->as_string;
+lei_ok([qw(import -F eml --stdin)], undef, { 0 => \$in, %$lei_opt });
+is($lei_err, '', 'no errors from 2nd import');
+lei_ok(qw(q -f mboxrd), "m:$m", 'l:another.example.com');
+my @another;
+open $fh, '<', \$lei_out or BAIL_OUT $!;
+PublicInbox::MboxReader->mboxrd($fh, sub { push @another, shift });
+is($another[0]->header('Status'), 'RO', 'seen kw set');
+
}); # test_lei
done_testing;
next prev parent reply other threads:[~2021-03-21 9:50 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-03-21 9:50 [PATCH 0/3] lei import fix, other fixes Eric Wong
2021-03-21 9:50 ` Eric Wong [this message]
2021-03-21 9:50 ` [PATCH 2/3] lei q: fix warning on remote imports Eric Wong
2021-03-21 9:50 ` [PATCH 3/3] lei: fix some warnings in tests Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210321095047.13855-2-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).