about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2021-03-14 13:12:00 +0200
committer Eric Wong <e@80x24.org> 2021-03-15 08:04:44 +0000
commit 42fc590f8cabd23455949d002e2ddf28bbec6d1e (patch)
tree cf521b1325e9d74d60ad5f75d0df1d12cf277cb0 /lib
parent 64b557420689476493d752968d99ab8ae62bad9a (diff)
download public-inbox-42fc590f8cabd23455949d002e2ddf28bbec6d1e.tar.gz
We only want to auto-import messages that are exclusively in
remote externals.  Messages in local externals are not
auto-imported to save space and reduce wear on storage devices.
Diffstat (limited to 'lib')
-rw-r--r-- lib/PublicInbox/LeiSearch.pm 37
-rw-r--r-- lib/PublicInbox/LeiStore.pm 52
-rw-r--r-- lib/PublicInbox/LeiToMail.pm 2
-rw-r--r-- lib/PublicInbox/LeiXSearch.pm 10
4 files changed, 80 insertions, 21 deletions
diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm
index ceb3624b..2e3f10fd 100644
--- a/lib/PublicInbox/LeiSearch.pm
+++ b/lib/PublicInbox/LeiSearch.pm
@@ -44,29 +44,40 @@ sub content_key ($) {
 
 sub _cmp_1st { # git->cat_async callback
         my ($bref, $oid, $type, $size, $cmp) = @_; # cmp: [chash, found, smsg]
-        return if defined($cmp->[1]->[0]); # $found->[0]
         if (content_hash(PublicInbox::Eml->new($bref)) eq $cmp->[0]) {
-                push @{$cmp->[1]}, $cmp->[2]->{num};
+                $cmp->[1]->{$oid} = $cmp->[2]->{num};
         }
 }
 
-# returns true if $eml is indexed by lei/store and keywords don't match
-sub kw_changed {
-        my ($self, $eml, $new_kw_sorted) = @_;
+sub xids_for { # returns { OID => docid } mapping for $eml matches
+        my ($self, $eml, $min) = @_;
         my ($chash, $mids) = content_key($eml);
-        my $over = $self->over;
+        my @overs = ($self->over // $self->overs_all);
         my $git = $self->git;
-        my $found = [];
+        my $found = {};
         for my $mid (@$mids) {
-                my ($id, $prev);
-                while (my $cur = $over->next_by_mid($mid, \$id, \$prev)) {
-                        $git->cat_async($cur->{blob}, \&_cmp_1st,
-                                        [ $chash, $found, $cur ]);
-                        last if scalar(@$found);
+                for my $o (@overs) {
+                        my ($id, $prev);
+                        while (my $cur = $o->next_by_mid($mid, \$id, \$prev)) {
+                                next if $found->{$cur->{blob}};
+                                $git->cat_async($cur->{blob}, \&_cmp_1st,
+                                                [ $chash, $found, $cur ]);
+                                if ($min && scalar(keys %$found) >= $min) {
+                                        $git->cat_async_wait;
+                                        return $found;
+                                }
+                        }
                 }
         }
         $git->cat_async_wait;
-        my $num = $found->[0] // return;
+        scalar(keys %$found) ? $found : undef;
+}
+
+# returns true if $eml is indexed by lei/store and keywords don't match
+sub kw_changed {
+        my ($self, $eml, $new_kw_sorted) = @_;
+        my $found = xids_for($self, $eml, 1) // return;
+        my ($num) = values %$found;
         my @cur_kw = msg_keywords($self, $num);
         join("\0", @$new_kw_sorted) eq join("\0", @cur_kw) ? 0 : 1;
 }
diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm
index 6ace2ad1..aaee5874 100644
--- a/lib/PublicInbox/LeiStore.pm
+++ b/lib/PublicInbox/LeiStore.pm
@@ -213,6 +213,24 @@ sub set_eml {
         add_eml($self, $eml, @kw) // set_eml_keywords($self, $eml, @kw);
 }
 
+sub add_eml_maybe {
+        my ($self, $eml) = @_;
+        my $lxs = $self->{lxs_all_local} // die 'BUG: no {lxs_all_local}';
+        return if $lxs->xids_for($eml, 1);
+        add_eml($self, $eml);
+}
+
+# set or update keywords for external message, called via ipc_do
+sub set_xkw {
+        my ($self, $eml, $kw) = @_;
+        my $lxs = $self->{lxs_all_local} // die 'BUG: no {lxs_all_local}';
+        if ($lxs->xids_for($eml, 1)) { # is it in a local external?
+                # TODO: index keywords only
+        } else {
+                set_eml($self, $eml, @$kw);
+        }
+}
+
 sub checkpoint {
         my ($self, $wait) = @_;
         if (my $im = $self->{im}) {
@@ -237,18 +255,40 @@ sub done {
 
 sub ipc_atfork_child {
         my ($self) = @_;
-        my $lei = delete $self->{lei};
+        my $lei = $self->{lei};
         $lei->lei_atfork_child(1) if $lei;
         $self->SUPER::ipc_atfork_child;
 }
 
+sub refresh_local_externals {
+        my ($self) = @_;
+        my $cfg = $self->{lei}->_lei_cfg or return;
+        my $cur_cfg = $self->{cur_cfg} // -1;
+        my $lxs = $self->{lxs_all_local};
+        if ($cfg != $cur_cfg || !$lxs) {
+                $lxs = PublicInbox::LeiXSearch->new;
+                my @loc = $self->{lei}->externals_each;
+                for my $loc (@loc) { # locals only
+                        $lxs->prepare_external($loc) if -d $loc;
+                }
+                $self->{lxs_all_local} = $lxs;
+                $self->{cur_cfg} = $cfg;
+        }
+        ($lxs->{git_tmp} //= $lxs->git_tmp)->{git_dir};
+}
+
 sub write_prepare {
         my ($self, $lei) = @_;
-        $self->ipc_lock_init;
-        # Mail we import into lei are private, so headers filtered out
-        # by -mda for public mail are not appropriate
-        local @PublicInbox::MDA::BAD_HEADERS = ();
-        $self->ipc_worker_spawn('lei_store', $lei->oldset, { lei => $lei });
+        unless ($self->{-ipc_req}) {
+                require PublicInbox::LeiXSearch;
+                $self->ipc_lock_init;
+                # Mail we import into lei are private, so headers filtered out
+                # by -mda for public mail are not appropriate
+                local @PublicInbox::MDA::BAD_HEADERS = ();
+                $self->ipc_worker_spawn('lei_store', $lei->oldset,
+                                        { lei => $lei });
+        }
+        $lei->{all_ext_git_dir} = $self->ipc_do('refresh_local_externals');
         $lei->{sto} = $self;
 }
 
diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm
index 13764d79..587804bb 100644
--- a/lib/PublicInbox/LeiToMail.pm
+++ b/lib/PublicInbox/LeiToMail.pm
@@ -279,7 +279,7 @@ sub update_kw_maybe ($$$$) {
         if ($x) {
                 $lei->{sto}->ipc_do('set_eml', $eml, @$kw);
         } elsif (!defined($x)) {
-                # TODO: xkw
+                $lei->{sto}->ipc_do('set_xkw', $eml, $kw);
         }
 }
 
diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm
index f2c8c02e..22c8026c 100644
--- a/lib/PublicInbox/LeiXSearch.pm
+++ b/lib/PublicInbox/LeiXSearch.pm
@@ -97,6 +97,11 @@ sub recent {
 
 sub over {}
 
+sub overs_all { # for xids_for
+        my ($self) = @_;
+        grep(defined, map { $_->over } locals($self))
+}
+
 sub _mset_more ($$) {
         my ($mset, $mo) = @_;
         my $size = $mset->size;
@@ -204,7 +209,9 @@ sub query_mset { # non-parallel for non-"--threads" users
 
 sub each_remote_eml { # callback for MboxReader->mboxrd
         my ($eml, $self, $lei, $each_smsg) = @_;
-        $lei->{sto}->ipc_do('add_eml', $eml) if $lei->{opt}->{'import-remote'};
+        if (my $sto = $self->{import_sto}) {
+                $sto->ipc_do('add_eml_maybe', $eml);
+        }
         my $smsg = bless {}, 'PublicInbox::Smsg';
         $smsg->populate($eml);
         $smsg->parse_references($eml, mids($eml));
@@ -249,6 +256,7 @@ sub query_remote_mboxrd {
         my $curl = PublicInbox::LeiCurl->new($lei, $self->{curl}) or return;
         push @$curl, '-s', '-d', '';
         my $each_smsg = $lei->{ovv}->ovv_each_smsg_cb($lei);
+        $self->{import_sto} = $lei->{sto} if $lei->{opt}->{'import-remote'};
         for my $uri (@$uris) {
                 $lei->{-current_url} = $uri->as_string;
                 $lei->{-nr_remote_eml} = 0;