From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 85BE01F934 for ; Sun, 10 Jan 2021 12:15:19 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 01/22] lei query + pagination sorta working Date: Sun, 10 Jan 2021 12:14:58 +0000 Message-Id: <20210110121519.17044-2-e@80x24.org> In-Reply-To: <20210110121519.17044-1-e@80x24.org> References: <20210110121519.17044-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Parallelism and interactivity with pager + SIGPIPE needs work; but results are shown and phrase search works without shell users having to apply Xapian quoting rules on top of standard shell quoting. --- MANIFEST | 1 + lib/PublicInbox/LEI.pm | 12 +-- lib/PublicInbox/LeiExternal.pm | 33 ++++--- lib/PublicInbox/LeiQuery.pm | 176 +++++++++++++++++++++++++++++++++ lib/PublicInbox/LeiStore.pm | 2 +- lib/PublicInbox/LeiToMail.pm | 2 + lib/PublicInbox/LeiXSearch.pm | 22 ++++- lib/PublicInbox/Search.pm | 10 +- lib/PublicInbox/SearchView.pm | 10 +- t/lei.t | 11 ++- t/lei_xsearch.t | 5 + 11 files changed, 250 insertions(+), 34 deletions(-) create mode 100644 lib/PublicInbox/LeiQuery.pm diff --git a/MANIFEST b/MANIFEST index 6dc08f01..609160dd 100644 --- a/MANIFEST +++ b/MANIFEST @@ -165,6 +165,7 @@ lib/PublicInbox/KQNotify.pm lib/PublicInbox/LEI.pm lib/PublicInbox/LeiDedupe.pm lib/PublicInbox/LeiExternal.pm +lib/PublicInbox/LeiQuery.pm lib/PublicInbox/LeiSearch.pm lib/PublicInbox/LeiStore.pm lib/PublicInbox/LeiToMail.pm diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm index 9c3308ad..a5658e6d 100644 --- a/lib/PublicInbox/LEI.pm +++ b/lib/PublicInbox/LEI.pm @@ -8,7 +8,8 @@ package PublicInbox::LEI; use strict; use v5.10.1; -use parent qw(PublicInbox::DS PublicInbox::LeiExternal); +use parent qw(PublicInbox::DS PublicInbox::LeiExternal + PublicInbox::LeiQuery); use Getopt::Long (); use Socket qw(AF_UNIX SOCK_STREAM pack_sockaddr_un); use Errno qw(EAGAIN ECONNREFUSED ENOENT); @@ -80,7 +81,7 @@ sub _config_path ($) { our %CMD = ( # sorted in order of importance/use: 'q' => [ 'SEARCH_TERMS...', 'search for messages matching terms', qw( save-as=s output|mfolder|o=s format|f=s dedupe|d=s thread|t augment|a - sort|s=s@ reverse|r offset=i remote local! external! + sort|s=s reverse|r offset=i remote local! external! pretty since|after=s until|before=s), opt_dash('limit|n=i', '[0-9]+') ], 'show' => [ 'MID|OID', 'show a given object (Message-ID or object ID)', @@ -202,8 +203,9 @@ my %OPTDESC = ( 'limit|n=i@' => ['NUM', 'limit on number of matches (default: 10000)' ], 'offset=i' => ['OFF', 'search result offset (default: 0)'], -'sort|s=s@' => [ 'VAL|internaldate,date,relevance,docid', +'sort|s=s' => [ 'VAL|received,relevance,docid', "order of results `--output'-dependent"], +'reverse|r' => [ 'reverse search results' ], # like sort(1) 'boost=i' => 'increase/decrease priority of results (default: 0)', @@ -469,10 +471,6 @@ sub lei_show { my ($self, @argv) = @_; } -sub lei_query { - my ($self, @argv) = @_; -} - sub lei_mark { my ($self, @argv) = @_; } diff --git a/lib/PublicInbox/LeiExternal.pm b/lib/PublicInbox/LeiExternal.pm index 4facd451..64faf5a0 100644 --- a/lib/PublicInbox/LeiExternal.pm +++ b/lib/PublicInbox/LeiExternal.pm @@ -8,24 +8,35 @@ use v5.10.1; use parent qw(Exporter); our @EXPORT = qw(lei_ls_external lei_add_external lei_forget_external); -sub lei_ls_external { - my ($self, @argv) = @_; - my $stor = $self->_lei_store(0); +sub _externals_each { + my ($self, $cb, @arg) = @_; my $cfg = $self->_lei_cfg(0); - my $out = $self->{1}; - my ($OFS, $ORS) = $self->{opt}->{z} ? ("\0", "\0\0") : (" ", "\n"); - my (%boost, @loc); + my %boost; for my $sec (grep(/\Aexternal\./, @{$cfg->{-section_order}})) { my $loc = substr($sec, length('external.')); $boost{$loc} = $cfg->{"$sec.boost"}; - push @loc, $loc; } - use sort 'stable'; + return \%boost if !wantarray && !$cb; + # highest boost first, but stable for alphabetic tie break - for (sort { $boost{$b} <=> $boost{$a} } sort keys %boost) { - # TODO: use miscidx and show docid so forget/set is easier - print $out $_, $OFS, 'boost=', $boost{$_}, $ORS; + use sort 'stable'; + my @order = sort { $boost{$b} <=> $boost{$a} } sort keys %boost; + return @order if !$cb; + for my $loc (@order) { + $cb->(@arg, $loc, $boost{$loc}); } + @order; # scalar or array +} + +sub lei_ls_external { + my ($self, @argv) = @_; + my $stor = $self->_lei_store(0); + my $out = $self->{1}; + my ($OFS, $ORS) = $self->{opt}->{z} ? ("\0", "\0\0") : (" ", "\n"); + $self->_externals_each(sub { + my ($loc, $boost_val) = @_; + print $out $loc, $OFS, 'boost=', $boost_val, $ORS; + }); } sub lei_add_external { diff --git a/lib/PublicInbox/LeiQuery.pm b/lib/PublicInbox/LeiQuery.pm new file mode 100644 index 00000000..d14da1bc --- /dev/null +++ b/lib/PublicInbox/LeiQuery.pm @@ -0,0 +1,176 @@ +# Copyright (C) 2021 all contributors +# License: AGPL-3.0+ + +# handles lei commands +package PublicInbox::LeiQuery; +use strict; +use v5.10.1; +use PublicInbox::MID qw($MID_EXTRACT); +use POSIX qw(strftime); +use PublicInbox::Address qw(pairs); +use PublicInbox::Search qw(get_pct); + +sub _iso8601 ($) { strftime('%Y-%m-%dT%H:%M:%SZ', gmtime($_[0])) } + +# prepares an smsg for JSON +sub _smsg_unbless ($) { + my ($smsg) = @_; + + delete @$smsg{qw(lines bytes)}; + $smsg->{rcvd} = _iso8601(delete $smsg->{ts}); # JMAP receivedAt + $smsg->{dt} = _iso8601(delete $smsg->{ds}); # JMAP UTCDate + + if (my $r = delete $smsg->{references}) { + $smsg->{references} = [ + map { "<$_>" } ($r =~ m/$MID_EXTRACT/go) ]; + } + if (my $m = delete($smsg->{mid})) { + $smsg->{'m'} = "<$m>"; + } + # XXX breaking to/cc, into structured arrays or tables which + # distinguish "$phrase <$address>" causes pretty printing JSON + # to take up too much vertical space. I can't get either + # Cpanel::JSON::XS or JSON::XS or jq(1) only indent when + # wrapping is necessary, rather than blindly indenting and + # adding vertical space everywhere. + for my $f (qw(from to cc)) { + my $v = delete $smsg->{$f} or next; + $smsg->{substr($f, 0, 1)} = $v; + } + $smsg->{'s'} = delete $smsg->{subject}; + # can we be bothered to parse From/To/Cc into arrays? + scalar { %$smsg }; # unbless +} + +sub _vivify_external { # _externals_each callback + my ($src, $dir) = @_; + if (-f "$dir/ei.lock") { + require PublicInbox::ExtSearch; + push @$src, PublicInbox::ExtSearch->new($dir); + } elsif (-f "$dir/inbox.lock" || -d "$dir/public-inbox") { # v2, v1 + require PublicInbox::Inbox; + push @$src, bless { inboxdir => $dir }, 'PublicInbox::Inbox'; + } else { + warn "W: ignoring $dir, unable to determine type\n"; + } +} + +# the main "lei q SEARCH_TERMS" method +sub lei_q { + my ($self, @argv) = @_; + my $sto = $self->_lei_store(1); + my $cfg = $self->_lei_cfg(1); + my $opt = $self->{opt}; + my $qstr = join(' ', map {; + # Consider spaces in argv to be for phrase search in Xapian. + # In other words, the users should need only care about + # normal shell quotes and not have to learn Xapian quoting. + /\s/ ? (s/\A(\w+:)// ? qq{$1"$_"} : qq{"$_"}) : $_ + } @argv); + $opt->{limit} //= 10000; + my $lxs; + + # --local is enabled by default + my @src = $opt->{'local'} ? ($sto->search) : (); + + # --external is enabled by default, but allow --no-external + if ($opt->{external} // 1) { + $self->_externals_each(\&_vivify_external, \@src); + # {tid} is not unique between indices, so we have to search + # each src individually + if (!$opt->{thread}) { + require PublicInbox::LeiXSearch; + my $lxs = PublicInbox::LeiXSearch->new; + # local is always first + $lxs->attach_external($_) for @src; + @src = ($lxs); + } + } + my $out = $self->{output} // '-'; + $out = 'json:/dev/stdout' if $out eq '-'; + my $isatty = -t $self->{1}; + $self->start_pager if $isatty; + my $json = substr($out, 0, 5) eq 'json:' ? + ref(PublicInbox::Config->json)->new : undef; + if ($json) { + if ($opt->{pretty} //= $isatty) { + $json->pretty(1)->space_before(0); + $json->indent_length($opt->{indent} // 2); + } + $json->utf8; # avoid Wide character in print warnings + $json->ascii(1) if $opt->{ascii}; # for "\uXXXX" + $json->canonical; + } + + # src: LeiXSearch || LeiSearch || Inbox + my %mset_opt = map { $_ => $opt->{$_} } qw(thread limit offset); + delete $mset_opt{limit} if $opt->{limit} < 0; + $mset_opt{asc} = $opt->{'reverse'} ? 1 : 0; + if (defined(my $sort = $opt->{'sort'})) { + if ($sort eq 'relevance') { + $mset_opt{relevance} = 1; + } elsif ($sort eq 'docid') { + $mset_opt{relevance} = $mset_opt{asc} ? -1 : -2; + } elsif ($sort =~ /\Areceived(?:-?[aA]t)?\z/) { + # the default + } else { + die "unrecognized --sort=$sort\n"; + } + } + # $self->out($json->encode(\%mset_opt)); + # descending docid order + $mset_opt{relevance} //= -2 if $opt->{thread}; + # my $wcb = PublicInbox::LeiToMail->write_cb($out, $self); + + # even w/o pretty, do the equivalent of a --pretty=oneline + # output so "lei q SEARCH_TERMS | wc -l" can be useful: + my $ORS = $json ? ($opt->{pretty} ? ', ' : ",\n") : "\n"; + my $buf; + + # we can generate too many records to hold in RAM, so we stream + # and fake a JSON array starting here: + $self->out('[') if $json; + my $emit_cb = sub { + my ($smsg) = @_; + delete @$smsg{qw(tid num)}; # only makes sense if single src + chomp($buf = $json->encode(_smsg_unbless($smsg))); + }; + for my $src (@src) { + my $srch = $src->search; + my $over = $src->over; + my $smsg_for = $src->can('smsg_for'); # LeiXSearch + my $mo = { %mset_opt }; + my $mset = $srch->mset($qstr, $mo); + my $ctx = {}; + if ($smsg_for) { + for my $it ($mset->items) { + my $smsg = $smsg_for->($srch, $it) or next; + $self->out($buf .= $ORS) if defined $buf; + $smsg->{relevance} = get_pct($it); + $emit_cb->($smsg); + } + } else { # --thread + my $ids = $srch->mset_to_artnums($mset, $mo); + $ctx->{ids} = $ids; + my $i = 0; + my %n2p = map { + ($ids->[$i++], get_pct($_)); + } $mset->items; + undef $mset; + while ($over && $over->expand_thread($ctx)) { + for my $n (@{$ctx->{xids}}) { + my $t = $over->get_art($n) or next; + if (my $p = delete $n2p{$t->{num}}) { + $t->{relevance} = $p; + } + $self->out($buf .= $ORS); + $emit_cb->($t); + } + @{$ctx->{xids}} = (); + } + } + } + $self->out($buf .= "]\n"); # done +} + +1; diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm index 7cda7e44..a7d7d953 100644 --- a/lib/PublicInbox/LeiStore.pm +++ b/lib/PublicInbox/LeiStore.pm @@ -23,7 +23,7 @@ sub new { my (undef, $dir, $opt) = @_; my $eidx = PublicInbox::ExtSearchIdx->new($dir, $opt); my $self = bless { priv_eidx => $eidx }, __PACKAGE__; - eidx_init($self) if $opt->{creat}; + eidx_init($self)->done if $opt->{creat}; $self; } diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm index 851c015b..4c65dce2 100644 --- a/lib/PublicInbox/LeiToMail.pm +++ b/lib/PublicInbox/LeiToMail.pm @@ -354,6 +354,8 @@ sub write_cb { # returns a callback for git_to_mail _mbox_write_cb($cls, $1, $dst, $lei); } elsif ($dst =~ s!\A[Mm]aildir:!!) { # typically capitalized _maildir_write_cb($dst, $lei); + } else { + undef; } # TODO: Maildir, MH, IMAP, JMAP ... } diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm index 33e9c413..b670bc2f 100644 --- a/lib/PublicInbox/LeiXSearch.pm +++ b/lib/PublicInbox/LeiXSearch.pm @@ -20,9 +20,16 @@ sub new { sub attach_external { my ($self, $ibxish) = @_; # ibxish = ExtSearch or Inbox - if (!$ibxish->can('over')) { - push @{$self->{remotes}}, $ibxish + + if (!$ibxish->can('over') || !$ibxish->over) { + return push(@{$self->{remotes}}, $ibxish) } + my $desc = $ibxish->{inboxdir} // $ibxish->{topdir}; + my $srch = $ibxish->search or + return warn("$desc not indexed for Xapian\n"); + my @shards = $srch->xdb_shards_flat or + return warn("$desc has no Xapian shardsXapian\n"); + if (delete $self->{xdb}) { # XXX: do we need this? # clobber existing {xdb} if amending my $expect = delete $self->{nshard}; @@ -41,13 +48,18 @@ sub attach_external { $nr == $expect or die "BUG: reloaded $nr shards, expected $expect" } - my @shards = $ibxish->search->xdb_shards_flat; push @{$self->{shards_flat}}, @shards; push(@{$self->{shard2ibx}}, $ibxish) for (@shards); } +# returns a list of local inboxes (or count in scalar context) +sub locals { + my %uniq = map {; "$_" => $_ } @{$_[0]->{shard2ibx} // []}; + values %uniq; +} + # called by PublicInbox::Search::xdb -sub xdb_shards_flat { @{$_[0]->{shards_flat}} } +sub xdb_shards_flat { @{$_[0]->{shards_flat} // []} } # like over->get_art sub smsg_for { @@ -69,4 +81,6 @@ sub recent { $self->mset($qstr //= 'bytes:1..', $opt); } +sub over {} + 1; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 0bdf6fc6..7f68ee01 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -6,7 +6,7 @@ package PublicInbox::Search; use strict; use parent qw(Exporter); -our @EXPORT_OK = qw(retry_reopen int_val); +our @EXPORT_OK = qw(retry_reopen int_val get_pct); use List::Util qw(max); # values for searching, changing the numeric value breaks @@ -424,4 +424,12 @@ sub int_val ($$) { sortable_unserialise($val) + 0; # PV => IV conversion } +sub get_pct ($) { # mset item + # Capped at "99%" since "100%" takes an extra column in the + # thread skeleton view. says the value isn't + # very meaningful, anyways. + my $n = $_[0]->get_percent; + $n > 99 ? 99 : $n; +} + 1; diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index 6b36f795..d50d3cf6 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -14,7 +14,7 @@ use PublicInbox::WwwAtomStream; use PublicInbox::WwwStream qw(html_oneshot); use PublicInbox::SearchThread; use PublicInbox::SearchQuery; -use PublicInbox::Search; +use PublicInbox::Search qw(get_pct); my %rmap_inc; sub mbox_results { @@ -276,14 +276,6 @@ sub sort_relevance { } @{$_[0]} ] } -sub get_pct ($) { - # Capped at "99%" since "100%" takes an extra column in the - # thread skeleton view. says the value isn't - # very meaningful, anyways. - my $n = $_[0]->get_percent; - $n > 99 ? 99 : $n; -} - sub mset_thread { my ($ctx, $mset, $q) = @_; my $ibx = $ctx->{ibx}; diff --git a/t/lei.t b/t/lei.t index 6d47e307..72c50308 100644 --- a/t/lei.t +++ b/t/lei.t @@ -122,7 +122,7 @@ my $setup_publicinboxes = sub { return if $done eq $home; use PublicInbox::InboxWritable; for my $V (1, 2) { - run_script([qw(-init -Lmedium), "-V$V", "t$V", + run_script([qw(-init), "-V$V", "t$V", '--newsgroup', "t.$V", "$home/t$V", "http://example.com/t$V", "t$V\@example.com" ]) or BAIL_OUT "init v$V"; @@ -175,6 +175,15 @@ my $test_external = sub { }); $lei->('ls-external'); like($out, qr/boost=0\n/s, 'ls-external has output'); + + # note, on a Bourne shell users should be able to use either: + # s:"use boolean prefix" + # "s:use boolean prefix" + # or use single quotes, it should not matter. Users only need + # to know shell quoting rules, not Xapian quoting rules. + # No double-quoting should be imposed on users on the CLI + $lei->('q', 's:use boolean prefix'); + like($out, qr/search: use boolean prefix/, 'phrase search got result'); }; my $test_lei_common = sub { diff --git a/t/lei_xsearch.t b/t/lei_xsearch.t index 3774b4c1..8b03c1f2 100644 --- a/t/lei_xsearch.t +++ b/t/lei_xsearch.t @@ -70,4 +70,9 @@ my $max = max(map { $_->{docid} } @msgs); is($lxs->smsg_for(($mset->items)[0])->{docid}, $max, 'got highest docid'); +my @ibxish = $lxs->locals; +is(scalar(@ibxish), scalar(@ibx) + 1, 'got locals back'); +is($lxs->search, $lxs, '->search works'); +is($lxs->over, undef, '->over fails'); + done_testing;