user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 1/3] lei import: vivify external-only messages
Date: Sun, 21 Mar 2021 15:50:45 +0600	[thread overview]
Message-ID: <20210321095047.13855-2-e@80x24.org> (raw)
In-Reply-To: <20210321095047.13855-1-e@80x24.org>

Keyword storage for external-only messages was preventing
messages from being explicitly imported.  Teach lei_store
to vivify keyword-only entries into fully-indexed messages
on import.
---
 lib/PublicInbox/Import.pm    | 14 ++++++++++-
 lib/PublicInbox/LeiImport.pm | 22 +++++++++++------
 lib/PublicInbox/LeiSearch.pm |  5 +++-
 lib/PublicInbox/LeiStore.pm  | 46 +++++++++++++++++++++++++++++++-----
 lib/PublicInbox/Over.pm      |  2 +-
 lib/PublicInbox/SearchIdx.pm | 12 ++++++++--
 t/lei-q-kw.t                 | 44 ++++++++++++++++++++++++++++++++++
 7 files changed, 127 insertions(+), 18 deletions(-)

diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index b8fa5c21..34738279 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -413,7 +413,19 @@ sub add {
 		$smsg->{blob} = $self->get_mark(":$blob");
 		$smsg->set_bytes($raw_email, $n);
 		if (my $oidx = delete $smsg->{-oidx}) { # used by LeiStore
-			return if $oidx->blob_exists($smsg->{blob});
+			my @docids = $oidx->blob_exists($smsg->{blob});
+			my @vivify_xvmd;
+			for my $id (@docids) {
+				if (my $cur = $oidx->get_art($id)) {
+					# already imported if bytes > 0
+					return if $cur->{bytes} > 0;
+					push @vivify_xvmd, $id;
+				} else {
+					warn "W: $smsg->{blob} ",
+						"#$id gone (bug?)\n";
+				}
+			}
+			$smsg->{-vivify_xvmd} = \@vivify_xvmd;
 		}
 	}
 	my $ref = $self->{ref};
diff --git a/lib/PublicInbox/LeiImport.pm b/lib/PublicInbox/LeiImport.pm
index 137c22fc..ae24a1fa 100644
--- a/lib/PublicInbox/LeiImport.pm
+++ b/lib/PublicInbox/LeiImport.pm
@@ -10,9 +10,14 @@ use PublicInbox::Eml;
 use PublicInbox::PktOp qw(pkt_do);
 
 sub _import_eml { # MboxReader callback
-	my ($eml, $sto, $set_kw) = @_;
-	$sto->ipc_do('set_eml', $eml, $set_kw ?
-		{ kw => PublicInbox::MboxReader::mbox_keywords($eml) } : ());
+	my ($eml, $lei, $mbox_keywords) = @_;
+	my $vmd;
+	if ($mbox_keywords) {
+		my $kw = $mbox_keywords->($eml);
+		$vmd = { kw => $kw } if scalar(@$kw);
+	}
+	my $xoids = $lei->{ale}->xoids_for($eml);
+	$lei->{sto}->ipc_do('set_eml', $eml, $vmd, $xoids);
 }
 
 sub import_done_wait { # dwaitpid callback
@@ -41,6 +46,7 @@ sub net_merge_complete { # callback used by LeiAuth
 sub import_start {
 	my ($lei) = @_;
 	my $self = $lei->{imp};
+	$lei->ale;
 	my $j = $lei->{opt}->{jobs} // scalar(@{$self->{inputs}}) || 1;
 	if (my $net = $lei->{net}) {
 		# $j = $net->net_concurrency($j); TODO
@@ -130,7 +136,8 @@ sub ipc_atfork_child {
 
 sub _import_fh {
 	my ($lei, $fh, $input, $ifmt) = @_;
-	my $set_kw = $lei->{opt}->{kw};
+	my $kw = $lei->{opt}->{kw} ?
+		PublicInbox::MboxReader->can('mbox_keywords') : undef;
 	eval {
 		if ($ifmt eq 'eml') {
 			my $buf = do { local $/; <$fh> } //
@@ -138,11 +145,11 @@ sub _import_fh {
 error reading $input: $!
 
 			my $eml = PublicInbox::Eml->new(\$buf);
-			_import_eml($eml, $lei->{sto}, $set_kw);
+			_import_eml($eml, $lei, $kw);
 		} else { # some mbox (->can already checked in call);
 			my $cb = PublicInbox::MboxReader->can($ifmt) //
 				die "BUG: bad fmt=$ifmt";
-			$cb->(undef, $fh, \&_import_eml, $lei->{sto}, $set_kw);
+			$cb->(undef, $fh, \&_import_eml, $lei, $kw);
 		}
 	};
 	$lei->child_error(1 << 8, "$input: $@") if $@;
@@ -193,7 +200,8 @@ EOM
 sub import_stdin {
 	my ($self) = @_;
 	my $lei = $self->{lei};
-	_import_fh($lei, delete $self->{0}, '<stdin>', $lei->{opt}->{'in-format'});
+	my $in = delete $self->{0};
+	_import_fh($lei, $in, '<stdin>', $lei->{opt}->{'in-format'});
 }
 
 no warnings 'once'; # the following works even when LeiAuth is lazy-loaded
diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm
index 360a37e5..bbb00661 100644
--- a/lib/PublicInbox/LeiSearch.pm
+++ b/lib/PublicInbox/LeiSearch.pm
@@ -63,7 +63,10 @@ sub _cmp_1st { # git->cat_async callback
 	}
 }
 
-sub xoids_for { # returns { OID => docid } mapping for $eml matches
+# returns { OID => num } mapping for $eml matches
+# The `num' hash value only makes sense from LeiSearch itself
+# and is nonsense from the PublicInbox::LeiALE subclass
+sub xoids_for {
 	my ($self, $eml, $min) = @_;
 	my ($chash, $mids) = content_key($eml);
 	my @overs = ($self->over // $self->overs_all);
diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm
index c66d3dc2..b390b318 100644
--- a/lib/PublicInbox/LeiStore.pm
+++ b/lib/PublicInbox/LeiStore.pm
@@ -161,7 +161,7 @@ sub remove_eml_vmd {
 }
 
 sub add_eml {
-	my ($self, $eml, $vmd) = @_;
+	my ($self, $eml, $vmd, $xoids) = @_;
 	my $im = $self->importer; # may create new epoch
 	my $eidx = eidx_init($self); # writes ALL.git/objects/info/alternates
 	my $oidx = $eidx->{oidx}; # PublicInbox::Import::add checks this
@@ -169,7 +169,40 @@ sub add_eml {
 	$im->add($eml, undef, $smsg) or return; # duplicate returns undef
 
 	local $self->{current_info} = $smsg->{blob};
-	if (my @docids = _docids_for($self, $eml)) {
+	my $vivify_xvmd = delete($smsg->{-vivify_xvmd}) // []; # exact matches
+	if ($xoids) { # fuzzy matches from externals in ale->xoids_for
+		delete $xoids->{$smsg->{blob}}; # added later
+		if (scalar keys %$xoids) {
+			my %docids = map { $_ => 1 } @$vivify_xvmd;
+			for my $oid (keys %$xoids) {
+				my @id = $oidx->blob_exists($oid);
+				@docids{@id} = @id;
+			}
+			@$vivify_xvmd = sort { $a <=> $b } keys(%docids);
+		}
+	}
+	if (@$vivify_xvmd) {
+		$xoids //= {};
+		$xoids->{$smsg->{blob}} = 1;
+		for my $docid (@$vivify_xvmd) {
+			my $cur = $oidx->get_art($docid);
+			my $idx = $eidx->idx_shard($docid);
+			if (!$cur || $cur->{bytes} == 0) { # really vivifying
+				$smsg->{num} = $docid;
+				$oidx->add_overview($eml, $smsg);
+				$smsg->{-merge_vmd} = 1;
+				$idx->index_eml($eml, $smsg);
+			} else { # lse fuzzy hit off ale
+				$idx->ipc_do('add_eidx_info', $docid, '.', $eml);
+			}
+			for my $oid (keys %$xoids) {
+				$oidx->add_xref3($docid, -1, $oid, '.');
+			}
+			$idx->ipc_do('add_vmd', $docid, $vmd) if $vmd;
+		}
+		$vivify_xvmd;
+	} elsif (my @docids = _docids_for($self, $eml)) {
+		# fuzzy match from within lei/store
 		for my $docid (@docids) {
 			my $idx = $eidx->idx_shard($docid);
 			$oidx->add_xref3($docid, -1, $smsg->{blob}, '.');
@@ -178,20 +211,21 @@ sub add_eml {
 			$idx->ipc_do('add_vmd', $docid, $vmd) if $vmd;
 		}
 		\@docids;
-	} else {
+	} else { # totally new message
 		$smsg->{num} = $oidx->adj_counter('eidx_docid', '+');
 		$oidx->add_overview($eml, $smsg);
 		$oidx->add_xref3($smsg->{num}, -1, $smsg->{blob}, '.');
 		my $idx = $eidx->idx_shard($smsg->{num});
 		$idx->index_eml($eml, $smsg);
-		$idx->ipc_do('add_vmd', $smsg->{num}, $vmd ) if $vmd;
+		$idx->ipc_do('add_vmd', $smsg->{num}, $vmd) if $vmd;
 		$smsg;
 	}
 }
 
 sub set_eml {
-	my ($self, $eml, $vmd) = @_;
-	add_eml($self, $eml, $vmd) // set_eml_vmd($self, $eml, $vmd);
+	my ($self, $eml, $vmd, $xoids) = @_;
+	add_eml($self, $eml, $vmd, $xoids) //
+		set_eml_vmd($self, $eml, $vmd);
 }
 
 # set or update keywords for external message, called via ipc_do
diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm
index 587e0516..0e191c47 100644
--- a/lib/PublicInbox/Over.pm
+++ b/lib/PublicInbox/Over.pm
@@ -353,7 +353,7 @@ sub blob_exists {
 	my ($self, $oidhex) = @_;
 	if (wantarray) {
 		my $sth = $self->dbh->prepare_cached(<<'', undef, 1);
-SELECT docid FROM xref3 WHERE oidbin = ?
+SELECT docid FROM xref3 WHERE oidbin = ? ORDER BY docid ASC
 
 		$sth->bind_param(1, pack('H*', $oidhex), SQL_BLOB);
 		$sth->execute;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 3237aadc..3f933121 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -11,6 +11,7 @@ use strict;
 use v5.10.1;
 use parent qw(PublicInbox::Search PublicInbox::Lock Exporter);
 use PublicInbox::Eml;
+use PublicInbox::Search qw(xap_terms);
 use PublicInbox::InboxWritable;
 use PublicInbox::MID qw(mids_for_index mids);
 use PublicInbox::MsgIter;
@@ -34,6 +35,7 @@ use constant DEBUG => !!$ENV{DEBUG};
 my $xapianlevels = qr/\A(?:full|medium)\z/;
 my $hex = '[a-f0-9]';
 my $OID = $hex .'{40,}';
+my @VMD_MAP = (kw => 'K', label => 'L');
 our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/;
 
 sub new {
@@ -428,7 +430,15 @@ sub eml2doc ($$$;$) {
 sub add_xapian ($$$$) {
 	my ($self, $eml, $smsg, $mids) = @_;
 	begin_txn_lazy($self);
+	my $merge_vmd = delete $smsg->{-merge_vmd};
 	my $doc = eml2doc($self, $eml, $smsg, $mids);
+	if (my $old = $merge_vmd ? _get_doc($self, $smsg->{num}) : undef) {
+		my @x = @VMD_MAP;
+		while (my ($field, $pfx) = splice(@x, 0, 2)) {
+			my $vals = xap_terms($pfx, $old);
+			$doc->add_boolean_term($pfx.$_) for keys %$vals;
+		}
+	}
 	$self->{xdb}->replace_document($smsg->{num}, $doc);
 }
 
@@ -531,8 +541,6 @@ sub remove_eidx_info {
 	$self->{xdb}->replace_document($docid, $doc);
 }
 
-my @VMD_MAP = (kw => 'K', label => 'L');
-
 sub set_vmd {
 	my ($self, $docid, $vmd) = @_;
 	begin_txn_lazy($self);
diff --git a/t/lei-q-kw.t b/t/lei-q-kw.t
index b5e22e9b..4db27363 100644
--- a/t/lei-q-kw.t
+++ b/t/lei-q-kw.t
@@ -161,5 +161,49 @@ like($s, qr/^Status: O\nX-Status: AF\n/ms,
 lei_ok(qw(q --pretty), "m:$m", @inc);
 like($lei_out, qr/^  "kw": \["answered", "flagged"\],\n/sm,
 	'--pretty JSON output shows kw: on one line');
+
+# ensure import on previously external-only message works
+lei_ok('q', "m:$m");
+is_deeply(json_utf8->decode($lei_out), [ undef ],
+	'to-be-imported message non-existent');
+lei_ok(qw(import -F eml t/x-unknown-alpine.eml));
+is($lei_err, '', 'no errors importing previous external-only message');
+lei_ok('q', "m:$m");
+$res = json_utf8->decode($lei_out);
+is($res->[1], undef, 'got one result');
+is_deeply($res->[0]->{kw}, [ qw(answered flagged) ], 'kw preserved on exact');
+
+# ensure fuzzy match import works, too
+$m = 'multipart@example.com';
+$o = "$ENV{HOME}/fuzz";
+lei_ok('q', '-o', $o, "m:$m", @inc);
+@fn = glob("$o/cur/*");
+scalar(@fn) == 1 or BAIL_OUT "wrote multiple or zero files: ".explain(\@fn);
+rename($fn[0], "$fn[0]S") or BAIL_OUT "rename $!";
+lei_ok('q', '-o', $o, "m:$m");
+is_deeply([glob("$o/cur/*")], [], 'clobbered output results');
+my $eml = eml_load('t/plack-2-txt-bodies.eml');
+$eml->header_set('List-Id', '<list.example.com>');
+my $in = $eml->as_string;
+lei_ok([qw(import -F eml --stdin)], undef, { 0 => \$in, %$lei_opt });
+is($lei_err, '', 'no errors from import');
+lei_ok(qw(q -f mboxrd), "m:$m");
+open $fh, '<', \$lei_out or BAIL_OUT $!;
+my @res;
+PublicInbox::MboxReader->mboxrd($fh, sub { push @res, shift });
+is($res[0]->header('Status'), 'RO', 'seen kw set');
+$res[0]->header_set('Status');
+is_deeply(\@res, [ $eml ], 'imported message matches w/ List-Id');
+
+$eml->header_set('List-Id', '<another.example.com>');
+$in = $eml->as_string;
+lei_ok([qw(import -F eml --stdin)], undef, { 0 => \$in, %$lei_opt });
+is($lei_err, '', 'no errors from 2nd import');
+lei_ok(qw(q -f mboxrd), "m:$m", 'l:another.example.com');
+my @another;
+open $fh, '<', \$lei_out or BAIL_OUT $!;
+PublicInbox::MboxReader->mboxrd($fh, sub { push @another, shift });
+is($another[0]->header('Status'), 'RO', 'seen kw set');
+
 }); # test_lei
 done_testing;

  reply	other threads:[~2021-03-21  9:50 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-03-21  9:50 [PATCH 0/3] lei import fix, other fixes Eric Wong
2021-03-21  9:50 ` Eric Wong [this message]
2021-03-21  9:50 ` [PATCH 2/3] lei q: fix warning on remote imports Eric Wong
2021-03-21  9:50 ` [PATCH 3/3] lei: fix some warnings in tests Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210321095047.13855-2-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line
before the message body.

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).