user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH] lei q: do not import unnecessarily from externals
Date: Sun, 14 Mar 2021 13:12:00 +0200	[thread overview]
Message-ID: <20210314111200.22943-1-e@80x24.org> (raw)

We only want to auto-import messages that exist exclusively in
remote externals.  Messages already available in local externals
are not auto-imported, which saves space and reduces wear on the
storage device.
---
 lib/PublicInbox/LeiSearch.pm  | 37 ++++++++++++++++---------
 lib/PublicInbox/LeiStore.pm   | 52 +++++++++++++++++++++++++++++++----
 lib/PublicInbox/LeiToMail.pm  |  2 +-
 lib/PublicInbox/LeiXSearch.pm | 10 ++++++-
 t/lei-q-remote-import.t       | 45 +++++++++++++++++++++++++++++-
 5 files changed, 124 insertions(+), 22 deletions(-)

diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm
index ceb3624b..2e3f10fd 100644
--- a/lib/PublicInbox/LeiSearch.pm
+++ b/lib/PublicInbox/LeiSearch.pm
@@ -44,29 +44,40 @@ sub content_key ($) {
 
 sub _cmp_1st { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $cmp) = @_; # cmp: [chash, found, smsg]
-	return if defined($cmp->[1]->[0]); # $found->[0]
 	if (content_hash(PublicInbox::Eml->new($bref)) eq $cmp->[0]) {
-		push @{$cmp->[1]}, $cmp->[2]->{num};
+		$cmp->[1]->{$oid} = $cmp->[2]->{num};
 	}
 }
 
-# returns true if $eml is indexed by lei/store and keywords don't match
-sub kw_changed {
-	my ($self, $eml, $new_kw_sorted) = @_;
+sub xids_for { # returns { OID => docid } mapping for $eml matches
+	my ($self, $eml, $min) = @_;
 	my ($chash, $mids) = content_key($eml);
-	my $over = $self->over;
+	my @overs = ($self->over // $self->overs_all);
 	my $git = $self->git;
-	my $found = [];
+	my $found = {};
 	for my $mid (@$mids) {
-		my ($id, $prev);
-		while (my $cur = $over->next_by_mid($mid, \$id, \$prev)) {
-			$git->cat_async($cur->{blob}, \&_cmp_1st,
-					[ $chash, $found, $cur ]);
-			last if scalar(@$found);
+		for my $o (@overs) {
+			my ($id, $prev);
+			while (my $cur = $o->next_by_mid($mid, \$id, \$prev)) {
+				next if $found->{$cur->{blob}};
+				$git->cat_async($cur->{blob}, \&_cmp_1st,
+						[ $chash, $found, $cur ]);
+				if ($min && scalar(keys %$found) >= $min) {
+					$git->cat_async_wait;
+					return $found;
+				}
+			}
 		}
 	}
 	$git->cat_async_wait;
-	my $num = $found->[0] // return;
+	scalar(keys %$found) ? $found : undef;
+}
+
+# returns true if $eml is indexed by lei/store and keywords don't match
+sub kw_changed {
+	my ($self, $eml, $new_kw_sorted) = @_;
+	my $found = xids_for($self, $eml, 1) // return;
+	my ($num) = values %$found;
 	my @cur_kw = msg_keywords($self, $num);
 	join("\0", @$new_kw_sorted) eq join("\0", @cur_kw) ? 0 : 1;
 }
diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm
index 6ace2ad1..aaee5874 100644
--- a/lib/PublicInbox/LeiStore.pm
+++ b/lib/PublicInbox/LeiStore.pm
@@ -213,6 +213,24 @@ sub set_eml {
 	add_eml($self, $eml, @kw) // set_eml_keywords($self, $eml, @kw);
 }
 
+sub add_eml_maybe {
+	my ($self, $eml) = @_;
+	my $lxs = $self->{lxs_all_local} // die 'BUG: no {lxs_all_local}';
+	return if $lxs->xids_for($eml, 1);
+	add_eml($self, $eml);
+}
+
+# set or update keywords for external message, called via ipc_do
+sub set_xkw {
+	my ($self, $eml, $kw) = @_;
+	my $lxs = $self->{lxs_all_local} // die 'BUG: no {lxs_all_local}';
+	if ($lxs->xids_for($eml, 1)) { # is it in a local external?
+		# TODO: index keywords only
+	} else {
+		set_eml($self, $eml, @$kw);
+	}
+}
+
 sub checkpoint {
 	my ($self, $wait) = @_;
 	if (my $im = $self->{im}) {
@@ -237,18 +255,40 @@ sub done {
 
 sub ipc_atfork_child {
 	my ($self) = @_;
-	my $lei = delete $self->{lei};
+	my $lei = $self->{lei};
 	$lei->lei_atfork_child(1) if $lei;
 	$self->SUPER::ipc_atfork_child;
 }
 
+sub refresh_local_externals {
+	my ($self) = @_;
+	my $cfg = $self->{lei}->_lei_cfg or return;
+	my $cur_cfg = $self->{cur_cfg} // -1;
+	my $lxs = $self->{lxs_all_local};
+	if ($cfg != $cur_cfg || !$lxs) {
+		$lxs = PublicInbox::LeiXSearch->new;
+		my @loc = $self->{lei}->externals_each;
+		for my $loc (@loc) { # locals only
+			$lxs->prepare_external($loc) if -d $loc;
+		}
+		$self->{lxs_all_local} = $lxs;
+		$self->{cur_cfg} = $cfg;
+	}
+	($lxs->{git_tmp} //= $lxs->git_tmp)->{git_dir};
+}
+
 sub write_prepare {
 	my ($self, $lei) = @_;
-	$self->ipc_lock_init;
-	# Mail we import into lei are private, so headers filtered out
-	# by -mda for public mail are not appropriate
-	local @PublicInbox::MDA::BAD_HEADERS = ();
-	$self->ipc_worker_spawn('lei_store', $lei->oldset, { lei => $lei });
+	unless ($self->{-ipc_req}) {
+		require PublicInbox::LeiXSearch;
+		$self->ipc_lock_init;
+		# Mail we import into lei are private, so headers filtered out
+		# by -mda for public mail are not appropriate
+		local @PublicInbox::MDA::BAD_HEADERS = ();
+		$self->ipc_worker_spawn('lei_store', $lei->oldset,
+					{ lei => $lei });
+	}
+	$lei->{all_ext_git_dir} = $self->ipc_do('refresh_local_externals');
 	$lei->{sto} = $self;
 }
 
diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm
index 13764d79..587804bb 100644
--- a/lib/PublicInbox/LeiToMail.pm
+++ b/lib/PublicInbox/LeiToMail.pm
@@ -279,7 +279,7 @@ sub update_kw_maybe ($$$$) {
 	if ($x) {
 		$lei->{sto}->ipc_do('set_eml', $eml, @$kw);
 	} elsif (!defined($x)) {
-		# TODO: xkw
+		$lei->{sto}->ipc_do('set_xkw', $eml, $kw);
 	}
 }
 
diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm
index f2c8c02e..22c8026c 100644
--- a/lib/PublicInbox/LeiXSearch.pm
+++ b/lib/PublicInbox/LeiXSearch.pm
@@ -97,6 +97,11 @@ sub recent {
 
 sub over {}
 
+sub overs_all { # for xids_for
+	my ($self) = @_;
+	grep(defined, map { $_->over } locals($self))
+}
+
 sub _mset_more ($$) {
 	my ($mset, $mo) = @_;
 	my $size = $mset->size;
@@ -204,7 +209,9 @@ sub query_mset { # non-parallel for non-"--threads" users
 
 sub each_remote_eml { # callback for MboxReader->mboxrd
 	my ($eml, $self, $lei, $each_smsg) = @_;
-	$lei->{sto}->ipc_do('add_eml', $eml) if $lei->{opt}->{'import-remote'};
+	if (my $sto = $self->{import_sto}) {
+		$sto->ipc_do('add_eml_maybe', $eml);
+	}
 	my $smsg = bless {}, 'PublicInbox::Smsg';
 	$smsg->populate($eml);
 	$smsg->parse_references($eml, mids($eml));
@@ -249,6 +256,7 @@ sub query_remote_mboxrd {
 	my $curl = PublicInbox::LeiCurl->new($lei, $self->{curl}) or return;
 	push @$curl, '-s', '-d', '';
 	my $each_smsg = $lei->{ovv}->ovv_each_smsg_cb($lei);
+	$self->{import_sto} = $lei->{sto} if $lei->{opt}->{'import-remote'};
 	for my $uri (@$uris) {
 		$lei->{-current_url} = $uri->as_string;
 		$lei->{-nr_remote_eml} = 0;
diff --git a/t/lei-q-remote-import.t b/t/lei-q-remote-import.t
index 4088b6ad..8b82579c 100644
--- a/t/lei-q-remote-import.t
+++ b/t/lei-q-remote-import.t
@@ -5,6 +5,7 @@ use strict; use v5.10.1; use PublicInbox::TestCommon;
 require_git 2.6;
 require_mods(qw(json DBD::SQLite Search::Xapian));
 use PublicInbox::MboxReader;
+use PublicInbox::InboxWritable;
 my ($ro_home, $cfg_path) = setup_public_inboxes;
 my $sock = tcp_server;
 my ($tmpdir, $for_destroy) = tmpdir;
@@ -36,7 +37,8 @@ test_lei({ tmpdir => $tmpdir }, sub {
 	is_deeply($slurp_emls->($o), $exp1, 'got results after remote search');
 	unlink $o or BAIL_OUT $!;
 	lei_ok(@cmd);
-	ok(-f $o && -s _, 'output exists after import but is not empty');
+	ok(-f $o && -s _, 'output exists after import but is not empty') or
+		diag $lei_err;
 	is_deeply($slurp_emls->($o), $exp1, 'got results w/o remote search');
 	unlink $o or BAIL_OUT $!;
 
@@ -58,5 +60,46 @@ test_lei({ tmpdir => $tmpdir }, sub {
 	unlink "$o.lock" or BAIL_OUT $!;
 	lei_ok(@cmd, '--lock=dotlock,timeout=0.000001',
 		\'succeeds after lock removal');
+
+	# XXX memoize this external creation
+	my $inboxdir = "$ENV{HOME}/tmp_git";
+	my $ibx = PublicInbox::InboxWritable->new({
+		name => 'tmp',
+		-primary_address => 'lei@example.com',
+		inboxdir => $inboxdir,
+		indexlevel => 'medium',
+	}, { nproc => 1 });
+	my $im = $ibx->importer(0);
+	$im->add(eml_load('t/utf8.eml')) or BAIL_OUT '->add';
+	$im->done;
+
+	run_script(['-index', $inboxdir], undef) or BAIL_OUT '-init';
+	lei_ok(qw(add-external -q), $inboxdir);
+	lei_ok(qw(q -o), "mboxrd:$o", '--only', $url,
+		'm:testmessage@example.com');
+	ok(-s $o, 'got result from remote external');
+	my $exp = eml_load('t/utf8.eml');
+	is_deeply($slurp_emls->($o), [$exp], 'got expected result');
+	lei_ok(qw(q --no-external -o), "mboxrd:/dev/stdout",
+			'm:testmessage@example.com');
+	is($lei_out, '', 'message not imported when in local external');
+
+	open $fh, '>', $o or BAIL_OUT;
+	print $fh <<'EOF' or BAIL_OUT;
+From a@z Mon Sep 17 00:00:00 2001
+From: nobody@localhost
+Date: Sat, 13 Mar 2021 18:23:01 +0600
+Message-ID: <never-before-seen@example.com>
+Status: RO
+
+whatever
+EOF
+	close $fh or BAIL_OUT;
+	lei_ok(qw(q -o), "mboxrd:$o", 'm:testmessage@example.com');
+	is_deeply($slurp_emls->($o), [$exp],
+		'got expected result after clobber') or diag $lei_err;
+	lei_ok(qw(q -o mboxrd:/dev/stdout m:never-before-seen@example.com));
+	like($lei_out, qr/seen\@example\.com>\nStatus: OR\n\nwhatever/sm,
+		'--import-before imported totally unseen message');
 });
 done_testing;

                 reply	other threads:[~2021-03-14 11:12 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210314111200.22943-1-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).