about summary refs log tree commit homepage
diff options
context:
space:
mode:
-rw-r--r--MANIFEST1
-rw-r--r--lib/PublicInbox/LEI.pm12
-rw-r--r--lib/PublicInbox/LeiExternal.pm33
-rw-r--r--lib/PublicInbox/LeiQuery.pm176
-rw-r--r--lib/PublicInbox/LeiStore.pm2
-rw-r--r--lib/PublicInbox/LeiToMail.pm2
-rw-r--r--lib/PublicInbox/LeiXSearch.pm22
-rw-r--r--lib/PublicInbox/Search.pm10
-rw-r--r--lib/PublicInbox/SearchView.pm10
-rw-r--r--t/lei.t11
-rw-r--r--t/lei_xsearch.t5
11 files changed, 250 insertions, 34 deletions
diff --git a/MANIFEST b/MANIFEST
index 6dc08f01..609160dd 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -165,6 +165,7 @@ lib/PublicInbox/KQNotify.pm
 lib/PublicInbox/LEI.pm
 lib/PublicInbox/LeiDedupe.pm
 lib/PublicInbox/LeiExternal.pm
+lib/PublicInbox/LeiQuery.pm
 lib/PublicInbox/LeiSearch.pm
 lib/PublicInbox/LeiStore.pm
 lib/PublicInbox/LeiToMail.pm
diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm
index 9c3308ad..a5658e6d 100644
--- a/lib/PublicInbox/LEI.pm
+++ b/lib/PublicInbox/LEI.pm
@@ -8,7 +8,8 @@
 package PublicInbox::LEI;
 use strict;
 use v5.10.1;
-use parent qw(PublicInbox::DS PublicInbox::LeiExternal);
+use parent qw(PublicInbox::DS PublicInbox::LeiExternal
+        PublicInbox::LeiQuery);
 use Getopt::Long ();
 use Socket qw(AF_UNIX SOCK_STREAM pack_sockaddr_un);
 use Errno qw(EAGAIN ECONNREFUSED ENOENT);
@@ -80,7 +81,7 @@ sub _config_path ($) {
 our %CMD = ( # sorted in order of importance/use:
 'q' => [ 'SEARCH_TERMS...', 'search for messages matching terms', qw(
         save-as=s output|mfolder|o=s format|f=s dedupe|d=s thread|t augment|a
-        sort|s=s@ reverse|r offset=i remote local! external!
+        sort|s=s reverse|r offset=i remote local! external! pretty
         since|after=s until|before=s), opt_dash('limit|n=i', '[0-9]+') ],
 
 'show' => [ 'MID|OID', 'show a given object (Message-ID or object ID)',
@@ -202,8 +203,9 @@ my %OPTDESC = (
 'limit|n=i@' => ['NUM', 'limit on number of matches (default: 10000)' ],
 'offset=i' => ['OFF', 'search result offset (default: 0)'],
 
-'sort|s=s@' => [ 'VAL|internaldate,date,relevance,docid',
+'sort|s=s' => [ 'VAL|received,relevance,docid',
                 "order of results `--output'-dependent"],
+'reverse|r' => [ 'reverse search results' ], # like sort(1)
 
 'boost=i' => 'increase/decrease priority of results (default: 0)',
 
@@ -469,10 +471,6 @@ sub lei_show {
         my ($self, @argv) = @_;
 }
 
-sub lei_query {
-        my ($self, @argv) = @_;
-}
-
 sub lei_mark {
         my ($self, @argv) = @_;
 }
diff --git a/lib/PublicInbox/LeiExternal.pm b/lib/PublicInbox/LeiExternal.pm
index 4facd451..64faf5a0 100644
--- a/lib/PublicInbox/LeiExternal.pm
+++ b/lib/PublicInbox/LeiExternal.pm
@@ -8,24 +8,35 @@ use v5.10.1;
 use parent qw(Exporter);
 our @EXPORT = qw(lei_ls_external lei_add_external lei_forget_external);
 
-sub lei_ls_external {
-        my ($self, @argv) = @_;
-        my $stor = $self->_lei_store(0);
+sub _externals_each {
+        my ($self, $cb, @arg) = @_;
         my $cfg = $self->_lei_cfg(0);
-        my $out = $self->{1};
-        my ($OFS, $ORS) = $self->{opt}->{z} ? ("\0", "\0\0") : (" ", "\n");
-        my (%boost, @loc);
+        my %boost;
         for my $sec (grep(/\Aexternal\./, @{$cfg->{-section_order}})) {
                 my $loc = substr($sec, length('external.'));
                 $boost{$loc} = $cfg->{"$sec.boost"};
-                push @loc, $loc;
         }
-        use sort 'stable';
+        return \%boost if !wantarray && !$cb;
+
         # highest boost first, but stable for alphabetic tie break
-        for (sort { $boost{$b} <=> $boost{$a} } sort keys %boost) {
-                # TODO: use miscidx and show docid so forget/set is easier
-                print $out $_, $OFS, 'boost=', $boost{$_}, $ORS;
+        use sort 'stable';
+        my @order = sort { $boost{$b} <=> $boost{$a} } sort keys %boost;
+        return @order if !$cb;
+        for my $loc (@order) {
+                $cb->(@arg, $loc, $boost{$loc});
         }
+        @order; # scalar or array
+}
+
+sub lei_ls_external {
+        my ($self, @argv) = @_;
+        my $stor = $self->_lei_store(0);
+        my $out = $self->{1};
+        my ($OFS, $ORS) = $self->{opt}->{z} ? ("\0", "\0\0") : (" ", "\n");
+        $self->_externals_each(sub {
+                my ($loc, $boost_val) = @_;
+                print $out $loc, $OFS, 'boost=', $boost_val, $ORS;
+        });
 }
 
 sub lei_add_external {
diff --git a/lib/PublicInbox/LeiQuery.pm b/lib/PublicInbox/LeiQuery.pm
new file mode 100644
index 00000000..d14da1bc
--- /dev/null
+++ b/lib/PublicInbox/LeiQuery.pm
@@ -0,0 +1,176 @@
+# Copyright (C) 2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# handles lei <q|ls-query|rm-query|mv-query> commands
+package PublicInbox::LeiQuery;
+use strict;
+use v5.10.1;
+use PublicInbox::MID qw($MID_EXTRACT);
+use POSIX qw(strftime);
+use PublicInbox::Address qw(pairs);
+use PublicInbox::Search qw(get_pct);
+
+sub _iso8601 ($) { strftime('%Y-%m-%dT%H:%M:%SZ', gmtime($_[0])) } # epoch => UTC
+
+# prepares an smsg for JSON: mutates $smsg, returns a plain-hash shallow copy
+sub _smsg_unbless ($) {
+        my ($smsg) = @_;
+
+        delete @$smsg{qw(lines bytes)};
+        $smsg->{rcvd} = _iso8601(delete $smsg->{ts}); # JMAP receivedAt
+        $smsg->{dt} = _iso8601(delete $smsg->{ds}); # JMAP UTCDate
+
+        if (my $r = delete $smsg->{references}) {
+                $smsg->{references} = [
+                                map { "<$_>" } ($r =~ m/$MID_EXTRACT/go) ];
+        }
+        if (my $m = delete($smsg->{mid})) {
+                $smsg->{'m'} = "<$m>";
+        }
+        # XXX breaking to/cc into structured arrays or tables which
+        # distinguish "$phrase <$address>" causes pretty-printed JSON
+        # to take up too much vertical space.  I can't get any of
+        # Cpanel::JSON::XS, JSON::XS or jq(1) to only indent when
+        # wrapping is necessary, rather than blindly indenting and
+        # adding vertical space everywhere.
+        for my $f (qw(from to cc)) {
+                my $v = delete $smsg->{$f} or next;
+                $smsg->{substr($f, 0, 1)} = $v; # keys become f, t, c
+        }
+        $smsg->{'s'} = delete $smsg->{subject};
+        # can we be bothered to parse From/To/Cc into arrays?
+        scalar { %$smsg }; # unbless
+}
+
+sub _vivify_external { # _externals_each callback, appends to @$src
+        my ($src, $dir) = @_; # the boost value passed after $dir is unused
+        if (-f "$dir/ei.lock") {
+                require PublicInbox::ExtSearch;
+                push @$src, PublicInbox::ExtSearch->new($dir);
+        } elsif (-f "$dir/inbox.lock" || -d "$dir/public-inbox") { # v2, v1
+                require PublicInbox::Inbox;
+                push @$src, bless { inboxdir => $dir }, 'PublicInbox::Inbox';
+        } else {
+                warn "W: ignoring $dir, unable to determine type\n";
+        }
+}
+
+# the main "lei q SEARCH_TERMS" method; streams matches as a fake JSON array
+sub lei_q {
+        my ($self, @argv) = @_;
+        my $sto = $self->_lei_store(1);
+        my $cfg = $self->_lei_cfg(1);
+        my $opt = $self->{opt};
+        my $qstr = join(' ', map {;
+                # Consider spaces in argv to be for phrase search in Xapian.
+                # In other words, users should only need to care about
+                # normal shell quotes and not have to learn Xapian quoting.
+                /\s/ ? (s/\A(\w+:)// ? qq{$1"$_"} : qq{"$_"}) : $_
+        } @argv);
+        $opt->{limit} //= 10000;
+
+        # --local is enabled by default
+        my @src = $opt->{'local'} ? ($sto->search) : ();
+
+        # --external is enabled by default, but allow --no-external
+        if ($opt->{external} // 1) {
+                $self->_externals_each(\&_vivify_external, \@src);
+                # {tid} is not unique between indices, so we have to search
+                # each src individually
+                if (!$opt->{thread}) {
+                        require PublicInbox::LeiXSearch;
+                        my $lxs = PublicInbox::LeiXSearch->new;
+                        # local is always first
+                        $lxs->attach_external($_) for @src;
+                        @src = ($lxs);
+                }
+        }
+        my $out = $self->{output} // '-';
+        $out = 'json:/dev/stdout' if $out eq '-';
+        my $isatty = -t $self->{1};
+        $self->start_pager if $isatty;
+        my $json = substr($out, 0, 5) eq 'json:' ?
+                ref(PublicInbox::Config->json)->new : undef;
+        if ($json) {
+                if ($opt->{pretty} //= $isatty) {
+                        $json->pretty(1)->space_before(0);
+                        $json->indent_length($opt->{indent} // 2);
+                }
+                $json->utf8; # avoid Wide character in print warnings
+                $json->ascii(1) if $opt->{ascii}; # for "\uXXXX"
+                $json->canonical;
+        }
+
+        # src: LeiXSearch || LeiSearch || Inbox
+        my %mset_opt = map { $_ => $opt->{$_} } qw(thread limit offset);
+        delete $mset_opt{limit} if $opt->{limit} < 0;
+        $mset_opt{asc} = $opt->{'reverse'} ? 1 : 0;
+        if (defined(my $sort = $opt->{'sort'})) {
+                if ($sort eq 'relevance') {
+                        $mset_opt{relevance} = 1;
+                } elsif ($sort eq 'docid') {
+                        $mset_opt{relevance} = $mset_opt{asc} ? -1 : -2;
+                } elsif ($sort =~ /\Areceived(?:-?[aA]t)?\z/) {
+                        # the default
+                } else {
+                        die "unrecognized --sort=$sort\n";
+                }
+        }
+        # descending docid order
+        $mset_opt{relevance} //= -2 if $opt->{thread};
+        # my $wcb = PublicInbox::LeiToMail->write_cb($out, $self);
+
+        # even w/o pretty, do the equivalent of a --pretty=oneline
+        # output so "lei q SEARCH_TERMS | wc -l" can be useful:
+        my $ORS = $json ? ($opt->{pretty} ? ', ' : ",\n") : "\n";
+        my $buf;
+
+        # we can generate too many records to hold in RAM, so we stream
+        # and fake a JSON array starting here:
+        $self->out('[') if $json;
+        my $emit_cb = sub {
+                my ($smsg) = @_;
+                delete @$smsg{qw(tid num)}; # only makes sense if single src
+                chomp($buf = $json->encode(_smsg_unbless($smsg)));
+        };
+        for my $src (@src) {
+                my $srch = $src->search;
+                my $over = $src->over;
+                my $smsg_for = $src->can('smsg_for'); # LeiXSearch
+                my $mo = { %mset_opt };
+                my $mset = $srch->mset($qstr, $mo);
+                my $ctx = {};
+                if ($smsg_for) {
+                        for my $it ($mset->items) {
+                                my $smsg = $smsg_for->($srch, $it) or next;
+                                $self->out($buf .= $ORS) if defined $buf;
+                                $smsg->{relevance} = get_pct($it);
+                                $emit_cb->($smsg);
+                        }
+                } else { # --thread
+                        my $ids = $srch->mset_to_artnums($mset, $mo);
+                        $ctx->{ids} = $ids;
+                        my $i = 0;
+                        my %n2p = map {
+                                ($ids->[$i++], get_pct($_));
+                        } $mset->items;
+                        undef $mset;
+                        while ($over && $over->expand_thread($ctx)) {
+                                for my $n (@{$ctx->{xids}}) {
+                                        my $t = $over->get_art($n) or next;
+                                        if (my $p = delete $n2p{$t->{num}}) {
+                                                $t->{relevance} = $p;
+                                        }
+                                        # no separator before the 1st record
+                                        $self->out($buf .= $ORS)
+                                                if defined $buf;
+                                        $emit_cb->($t);
+                                }
+                                @{$ctx->{xids}} = ();
+                        }
+                }
+        }
+        $self->out($buf .= "]\n"); # done
+}
+
+1;
diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm
index 7cda7e44..a7d7d953 100644
--- a/lib/PublicInbox/LeiStore.pm
+++ b/lib/PublicInbox/LeiStore.pm
@@ -23,7 +23,7 @@ sub new {
         my (undef, $dir, $opt) = @_;
         my $eidx = PublicInbox::ExtSearchIdx->new($dir, $opt);
         my $self = bless { priv_eidx => $eidx }, __PACKAGE__;
-        eidx_init($self) if $opt->{creat};
+        eidx_init($self)->done if $opt->{creat};
         $self;
 }
 
diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm
index 851c015b..4c65dce2 100644
--- a/lib/PublicInbox/LeiToMail.pm
+++ b/lib/PublicInbox/LeiToMail.pm
@@ -354,6 +354,8 @@ sub write_cb { # returns a callback for git_to_mail
                 _mbox_write_cb($cls, $1, $dst, $lei);
         } elsif ($dst =~ s!\A[Mm]aildir:!!) { # typically capitalized
                 _maildir_write_cb($dst, $lei);
+        } else {
+                undef;
         }
         # TODO: Maildir, MH, IMAP, JMAP ...
 }
diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm
index 33e9c413..b670bc2f 100644
--- a/lib/PublicInbox/LeiXSearch.pm
+++ b/lib/PublicInbox/LeiXSearch.pm
@@ -20,9 +20,16 @@ sub new {
 
 sub attach_external {
         my ($self, $ibxish) = @_; # ibxish = ExtSearch or Inbox
-        if (!$ibxish->can('over')) {
-                push @{$self->{remotes}}, $ibxish
+
+        if (!$ibxish->can('over') || !$ibxish->over) {
+                return push(@{$self->{remotes}}, $ibxish)
         }
+        my $desc = $ibxish->{inboxdir} // $ibxish->{topdir};
+        my $srch = $ibxish->search or
+                return warn("$desc not indexed for Xapian\n");
+        my @shards = $srch->xdb_shards_flat or
+                return warn("$desc has no Xapian shardsXapian\n");
+
         if (delete $self->{xdb}) { # XXX: do we need this?
                 # clobber existing {xdb} if amending
                 my $expect = delete $self->{nshard};
@@ -41,13 +48,18 @@ sub attach_external {
                 $nr == $expect or die
                         "BUG: reloaded $nr shards, expected $expect"
         }
-        my @shards = $ibxish->search->xdb_shards_flat;
         push @{$self->{shards_flat}}, @shards;
         push(@{$self->{shard2ibx}}, $ibxish) for (@shards);
 }
 
+# returns the unique attached local inboxes (or their count in scalar context)
+sub locals {
+        my %uniq = map {; "$_" => $_ } @{$_[0]->{shard2ibx} // []};
+        values %uniq;
+}
+
 # called by PublicInbox::Search::xdb
-sub xdb_shards_flat { @{$_[0]->{shards_flat}} }
+sub xdb_shards_flat { @{$_[0]->{shards_flat} // []} }
 
 # like over->get_art
 sub smsg_for {
@@ -69,4 +81,6 @@ sub recent {
         $self->mset($qstr //= 'bytes:1..', $opt);
 }
 
+sub over {} # deliberately returns nothing (undef in scalar context)
+
 1;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 0bdf6fc6..7f68ee01 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -6,7 +6,7 @@
 package PublicInbox::Search;
 use strict;
 use parent qw(Exporter);
-our @EXPORT_OK = qw(retry_reopen int_val);
+our @EXPORT_OK = qw(retry_reopen int_val get_pct);
 use List::Util qw(max);
 
 # values for searching, changing the numeric value breaks
@@ -424,4 +424,12 @@ sub int_val ($$) {
         sortable_unserialise($val) + 0; # PV => IV conversion
 }
 
+sub get_pct ($) { # mset item => its get_percent value, at most 99
+        # Capped at "99%" since "100%" takes an extra column in the
+        # thread skeleton view.  <xapian/mset.h> says the value isn't
+        # very meaningful, anyway.
+        my $n = $_[0]->get_percent;
+        $n > 99 ? 99 : $n;
+}
+
 1;
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 6b36f795..d50d3cf6 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -14,7 +14,7 @@ use PublicInbox::WwwAtomStream;
 use PublicInbox::WwwStream qw(html_oneshot);
 use PublicInbox::SearchThread;
 use PublicInbox::SearchQuery;
-use PublicInbox::Search;
+use PublicInbox::Search qw(get_pct);
 my %rmap_inc;
 
 sub mbox_results {
@@ -276,14 +276,6 @@ sub sort_relevance {
         } @{$_[0]} ]
 }
 
-sub get_pct ($) {
-        # Capped at "99%" since "100%" takes an extra column in the
-        # thread skeleton view.  <xapian/mset.h> says the value isn't
-        # very meaningful, anyways.
-        my $n = $_[0]->get_percent;
-        $n > 99 ? 99 : $n;
-}
-
 sub mset_thread {
         my ($ctx, $mset, $q) = @_;
         my $ibx = $ctx->{ibx};
diff --git a/t/lei.t b/t/lei.t
index 6d47e307..72c50308 100644
--- a/t/lei.t
+++ b/t/lei.t
@@ -122,7 +122,7 @@ my $setup_publicinboxes = sub {
         return if $done eq $home;
         use PublicInbox::InboxWritable;
         for my $V (1, 2) {
-                run_script([qw(-init -Lmedium), "-V$V", "t$V",
+                run_script([qw(-init), "-V$V", "t$V",
                                 '--newsgroup', "t.$V",
                                 "$home/t$V", "http://example.com/t$V",
                                 "t$V\@example.com" ]) or BAIL_OUT "init v$V";
@@ -175,6 +175,15 @@ my $test_external = sub {
         });
         $lei->('ls-external');
         like($out, qr/boost=0\n/s, 'ls-external has output');
+
+        # note, on a Bourne shell users should be able to use either:
+        #        s:"use boolean prefix"
+        #        "s:use boolean prefix"
+        # or use single quotes, it should not matter.  Users only need
+        # to know shell quoting rules, not Xapian quoting rules.
+        # No double-quoting should be imposed on users on the CLI
+        $lei->('q', 's:use boolean prefix');
+        like($out, qr/search: use boolean prefix/, 'phrase search got result');
 };
 
 my $test_lei_common = sub {
diff --git a/t/lei_xsearch.t b/t/lei_xsearch.t
index 3774b4c1..8b03c1f2 100644
--- a/t/lei_xsearch.t
+++ b/t/lei_xsearch.t
@@ -70,4 +70,9 @@ my $max = max(map { $_->{docid} } @msgs);
 is($lxs->smsg_for(($mset->items)[0])->{docid}, $max,
         'got highest docid');
 
+my @ibxish = $lxs->locals;
+is(scalar(@ibxish), scalar(@ibx) + 1, 'got locals back');
+is($lxs->search, $lxs, '->search works');
+is($lxs->over, undef, '->over fails');
+
 done_testing;