From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 7C3B11F9FD for ; Sun, 14 Mar 2021 11:12:00 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] lei q: do not import unnecessarily from externals Date: Sun, 14 Mar 2021 13:12:00 +0200 Message-Id: <20210314111200.22943-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We only want to auto-import messages that are exclusively in remote externals. Messages in local externals are not auto-imported, to save space and reduce wear on storage devices. --- lib/PublicInbox/LeiSearch.pm | 37 ++++++++++++++++--------- lib/PublicInbox/LeiStore.pm | 52 +++++++++++++++++++++++++++++++---- lib/PublicInbox/LeiToMail.pm | 2 +- lib/PublicInbox/LeiXSearch.pm | 10 ++++++- t/lei-q-remote-import.t | 45 +++++++++++++++++++++++++++++- 5 files changed, 124 insertions(+), 22 deletions(-) diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm index ceb3624b..2e3f10fd 100644 --- a/lib/PublicInbox/LeiSearch.pm +++ b/lib/PublicInbox/LeiSearch.pm @@ -44,29 +44,40 @@ sub content_key ($) { sub _cmp_1st { # git->cat_async callback my ($bref, $oid, $type, $size, $cmp) = @_; # cmp: [chash, found, smsg] - return if defined($cmp->[1]->[0]); # $found->[0] if (content_hash(PublicInbox::Eml->new($bref)) eq $cmp->[0]) { - push @{$cmp->[1]}, $cmp->[2]->{num}; + $cmp->[1]->{$oid} = $cmp->[2]->{num}; } } -# returns true if $eml is indexed by lei/store and keywords don't match -sub kw_changed { - my ($self, $eml, $new_kw_sorted) = @_; +sub xids_for { # returns { OID => docid } mapping for $eml matches + my ($self, $eml, $min) = @_; my ($chash, $mids) = content_key($eml); - my $over = $self->over; 
+ my @overs = ($self->over // $self->overs_all); my $git = $self->git; - my $found = []; + my $found = {}; for my $mid (@$mids) { - my ($id, $prev); - while (my $cur = $over->next_by_mid($mid, \$id, \$prev)) { - $git->cat_async($cur->{blob}, \&_cmp_1st, - [ $chash, $found, $cur ]); - last if scalar(@$found); + for my $o (@overs) { + my ($id, $prev); + while (my $cur = $o->next_by_mid($mid, \$id, \$prev)) { + next if $found->{$cur->{blob}}; + $git->cat_async($cur->{blob}, \&_cmp_1st, + [ $chash, $found, $cur ]); + if ($min && scalar(keys %$found) >= $min) { + $git->cat_async_wait; + return $found; + } + } } } $git->cat_async_wait; - my $num = $found->[0] // return; + scalar(keys %$found) ? $found : undef; +} + +# returns true if $eml is indexed by lei/store and keywords don't match +sub kw_changed { + my ($self, $eml, $new_kw_sorted) = @_; + my $found = xids_for($self, $eml, 1) // return; + my ($num) = values %$found; my @cur_kw = msg_keywords($self, $num); join("\0", @$new_kw_sorted) eq join("\0", @cur_kw) ? 0 : 1; } diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm index 6ace2ad1..aaee5874 100644 --- a/lib/PublicInbox/LeiStore.pm +++ b/lib/PublicInbox/LeiStore.pm @@ -213,6 +213,24 @@ sub set_eml { add_eml($self, $eml, @kw) // set_eml_keywords($self, $eml, @kw); } +sub add_eml_maybe { + my ($self, $eml) = @_; + my $lxs = $self->{lxs_all_local} // die 'BUG: no {lxs_all_local}'; + return if $lxs->xids_for($eml, 1); + add_eml($self, $eml); +} + +# set or update keywords for external message, called via ipc_do +sub set_xkw { + my ($self, $eml, $kw) = @_; + my $lxs = $self->{lxs_all_local} // die 'BUG: no {lxs_all_local}'; + if ($lxs->xids_for($eml, 1)) { # is it in a local external? 
+ # TODO: index keywords only + } else { + set_eml($self, $eml, @$kw); + } +} + sub checkpoint { my ($self, $wait) = @_; if (my $im = $self->{im}) { @@ -237,18 +255,40 @@ sub done { sub ipc_atfork_child { my ($self) = @_; - my $lei = delete $self->{lei}; + my $lei = $self->{lei}; $lei->lei_atfork_child(1) if $lei; $self->SUPER::ipc_atfork_child; } +sub refresh_local_externals { + my ($self) = @_; + my $cfg = $self->{lei}->_lei_cfg or return; + my $cur_cfg = $self->{cur_cfg} // -1; + my $lxs = $self->{lxs_all_local}; + if ($cfg != $cur_cfg || !$lxs) { + $lxs = PublicInbox::LeiXSearch->new; + my @loc = $self->{lei}->externals_each; + for my $loc (@loc) { # locals only + $lxs->prepare_external($loc) if -d $loc; + } + $self->{lxs_all_local} = $lxs; + $self->{cur_cfg} = $cfg; + } + ($lxs->{git_tmp} //= $lxs->git_tmp)->{git_dir}; +} + sub write_prepare { my ($self, $lei) = @_; - $self->ipc_lock_init; - # Mail we import into lei are private, so headers filtered out - # by -mda for public mail are not appropriate - local @PublicInbox::MDA::BAD_HEADERS = (); - $self->ipc_worker_spawn('lei_store', $lei->oldset, { lei => $lei }); + unless ($self->{-ipc_req}) { + require PublicInbox::LeiXSearch; + $self->ipc_lock_init; + # Mail we import into lei are private, so headers filtered out + # by -mda for public mail are not appropriate + local @PublicInbox::MDA::BAD_HEADERS = (); + $self->ipc_worker_spawn('lei_store', $lei->oldset, + { lei => $lei }); + } + $lei->{all_ext_git_dir} = $self->ipc_do('refresh_local_externals'); $lei->{sto} = $self; } diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm index 13764d79..587804bb 100644 --- a/lib/PublicInbox/LeiToMail.pm +++ b/lib/PublicInbox/LeiToMail.pm @@ -279,7 +279,7 @@ sub update_kw_maybe ($$$$) { if ($x) { $lei->{sto}->ipc_do('set_eml', $eml, @$kw); } elsif (!defined($x)) { - # TODO: xkw + $lei->{sto}->ipc_do('set_xkw', $eml, $kw); } } diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm 
index f2c8c02e..22c8026c 100644 --- a/lib/PublicInbox/LeiXSearch.pm +++ b/lib/PublicInbox/LeiXSearch.pm @@ -97,6 +97,11 @@ sub recent { sub over {} +sub overs_all { # for xids_for + my ($self) = @_; + grep(defined, map { $_->over } locals($self)) +} + sub _mset_more ($$) { my ($mset, $mo) = @_; my $size = $mset->size; @@ -204,7 +209,9 @@ sub query_mset { # non-parallel for non-"--threads" users sub each_remote_eml { # callback for MboxReader->mboxrd my ($eml, $self, $lei, $each_smsg) = @_; - $lei->{sto}->ipc_do('add_eml', $eml) if $lei->{opt}->{'import-remote'}; + if (my $sto = $self->{import_sto}) { + $sto->ipc_do('add_eml_maybe', $eml); + } my $smsg = bless {}, 'PublicInbox::Smsg'; $smsg->populate($eml); $smsg->parse_references($eml, mids($eml)); @@ -249,6 +256,7 @@ sub query_remote_mboxrd { my $curl = PublicInbox::LeiCurl->new($lei, $self->{curl}) or return; push @$curl, '-s', '-d', ''; my $each_smsg = $lei->{ovv}->ovv_each_smsg_cb($lei); + $self->{import_sto} = $lei->{sto} if $lei->{opt}->{'import-remote'}; for my $uri (@$uris) { $lei->{-current_url} = $uri->as_string; $lei->{-nr_remote_eml} = 0; diff --git a/t/lei-q-remote-import.t b/t/lei-q-remote-import.t index 4088b6ad..8b82579c 100644 --- a/t/lei-q-remote-import.t +++ b/t/lei-q-remote-import.t @@ -5,6 +5,7 @@ use strict; use v5.10.1; use PublicInbox::TestCommon; require_git 2.6; require_mods(qw(json DBD::SQLite Search::Xapian)); use PublicInbox::MboxReader; +use PublicInbox::InboxWritable; my ($ro_home, $cfg_path) = setup_public_inboxes; my $sock = tcp_server; my ($tmpdir, $for_destroy) = tmpdir; @@ -36,7 +37,8 @@ test_lei({ tmpdir => $tmpdir }, sub { is_deeply($slurp_emls->($o), $exp1, 'got results after remote search'); unlink $o or BAIL_OUT $!; lei_ok(@cmd); - ok(-f $o && -s _, 'output exists after import but is not empty'); + ok(-f $o && -s _, 'output exists after import but is not empty') or + diag $lei_err; is_deeply($slurp_emls->($o), $exp1, 'got results w/o remote search'); unlink $o or BAIL_OUT 
$!; @@ -58,5 +60,46 @@ test_lei({ tmpdir => $tmpdir }, sub { unlink "$o.lock" or BAIL_OUT $!; lei_ok(@cmd, '--lock=dotlock,timeout=0.000001', \'succeeds after lock removal'); + + # XXX memoize this external creation + my $inboxdir = "$ENV{HOME}/tmp_git"; + my $ibx = PublicInbox::InboxWritable->new({ + name => 'tmp', + -primary_address => 'lei@example.com', + inboxdir => $inboxdir, + indexlevel => 'medium', + }, { nproc => 1 }); + my $im = $ibx->importer(0); + $im->add(eml_load('t/utf8.eml')) or BAIL_OUT '->add'; + $im->done; + + run_script(['-index', $inboxdir], undef) or BAIL_OUT '-init'; + lei_ok(qw(add-external -q), $inboxdir); + lei_ok(qw(q -o), "mboxrd:$o", '--only', $url, + 'm:testmessage@example.com'); + ok(-s $o, 'got result from remote external'); + my $exp = eml_load('t/utf8.eml'); + is_deeply($slurp_emls->($o), [$exp], 'got expected result'); + lei_ok(qw(q --no-external -o), "mboxrd:/dev/stdout", + 'm:testmessage@example.com'); + is($lei_out, '', 'message not imported when in local external'); + + open $fh, '>', $o or BAIL_OUT; + print $fh <<'EOF' or BAIL_OUT; +From a@z Mon Sep 17 00:00:00 2001 +From: nobody@localhost +Date: Sat, 13 Mar 2021 18:23:01 +0600 +Message-ID: +Status: RO + +whatever +EOF + close $fh or BAIL_OUT; + lei_ok(qw(q -o), "mboxrd:$o", 'm:testmessage@example.com'); + is_deeply($slurp_emls->($o), [$exp], + 'got expected result after clobber') or diag $lei_err; + lei_ok(qw(q -o mboxrd:/dev/stdout m:never-before-seen@example.com)); + like($lei_out, qr/seen\@example\.com>\nStatus: OR\n\nwhatever/sm, + '--import-before imported totally unseen message'); }); done_testing;