From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id E0A6A1F9FC for ; Wed, 10 Feb 2021 19:57:59 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 1/2] search: use git approxidate in WWW and "lei q --stdin" Date: Wed, 10 Feb 2021 18:57:58 -0100 Message-Id: <20210210195759.11108-2-e@80x24.org> In-Reply-To: <20210210195759.11108-1-e@80x24.org> References: <20210210195759.11108-1-e@80x24.org> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit List-Id: This greatly improves the usability of d:, dt:, and rt: search prefixes for users already familiar with git's "approxidate" feature. That is, users familiar with the --(since|after|until|before)= options in git-log(1) and similar commands will be able to use those dates in the WWW UI. --- lib/PublicInbox/Isearch.pm | 1 + lib/PublicInbox/LeiQuery.pm | 8 +++++++- lib/PublicInbox/Mbox.pm | 1 + lib/PublicInbox/Search.pm | 35 +++++++++++++++++++++++++-------- lib/PublicInbox/SearchView.pm | 3 ++- t/lei-externals.t | 2 +- t/psgi_search.t | 37 ++++++++++++++++++++--------------- t/search.t | 25 +++++++++++++++++++++++ 8 files changed, 85 insertions(+), 27 deletions(-) diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm index 342d7913..9ed2d9e5 100644 --- a/lib/PublicInbox/Isearch.pm +++ b/lib/PublicInbox/Isearch.pm @@ -25,6 +25,7 @@ SELECT ibx_id FROM inboxes WHERE eidx_key = ? 
LIMIT 1 die "E: `$self->{eidx_key}' not in $self->{es}->{topdir}\n"; } +sub query_approxidate { $_[0]->{es}->query_approxidate($_[1], $_[2]) } sub mset { my ($self, $str, $opt) = @_; diff --git a/lib/PublicInbox/LeiQuery.pm b/lib/PublicInbox/LeiQuery.pm index d637b1ae..f71beae6 100644 --- a/lib/PublicInbox/LeiQuery.pm +++ b/lib/PublicInbox/LeiQuery.pm @@ -14,7 +14,12 @@ sub prep_ext { # externals_each callback sub qstr_add { # for --stdin my ($self) = @_; # $_[1] = $rbuf if (defined($_[1])) { - return eval { $self->{lxs}->do_query($self) } if $_[1] eq ''; + $_[1] eq '' and return eval { + my $lse = delete $self->{lse}; + $lse->query_approxidate($lse->git, + $self->{mset_opt}->{qstr}); + $self->{lxs}->do_query($self); + }; $self->{mset_opt}->{qstr} .= $_[1]; } else { $self->fail("error reading stdin: $!"); @@ -105,6 +110,7 @@ sub lei_q { no query allowed on command-line with --stdin require PublicInbox::InputPipe; + $self->{lse} = $lse; # for query_approxidate PublicInbox::InputPipe::consume($self->{0}, \&qstr_add, $self); return; } diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index 94f733bc..844099aa 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -237,6 +237,7 @@ sub mbox_all { my $qopts = $ctx->{qopts} = { relevance => -2 }; # ORDER BY docid DESC $qopts->{threads} = 1 if $q->{t}; + $srch->query_approxidate($ctx->{ibx}->git, $q_string); my $mset = $srch->mset($q_string, $qopts); $qopts->{offset} = $mset->size or return [404, [qw(Content-Type text/plain)], diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index b3fd532d..8e4cce33 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -321,6 +321,16 @@ sub date_parse_prepare { "$pfx:".join('..', @r).$end; } +sub date_parse_finalize { + my ($git, $to_parse) = @_; + # git-rev-parse can handle any number of args up to system + # limits (around (4096*32) bytes on Linux). 
+ my @r = $git->date_parse(@$to_parse); + my $i; + $_[2] =~ s/\0(%[%YmdHMSs]+)([0-9\+]+)\0/strftime($1, + gmtime($2 eq '+' ? ($r[$i]+86400) : $r[$i=$2+0]))/sge; +} + # n.b. argv never has NUL, though we'll need to filter it out # if this $argv isn't from a command execution sub query_argv_to_string { @@ -336,17 +346,26 @@ sub query_argv_to_string { $_ } } @$argv); - # git-rev-parse can handle any number of args up to system - # limits (around (4096*32) bytes on Linux). - if ($to_parse) { - my @r = $git->date_parse(@$to_parse); - my $i; - $tmp =~ s/\0(%[%YmdHMSs]+)([0-9\+]+)\0/strftime($1, - gmtime($2 eq '+' ? ($r[$i]+86400) : $r[$i=$2+0]))/sge; - } + date_parse_finalize($git, $to_parse, $tmp) if $to_parse; $tmp } +# this is for the WWW "q=" query parameter and "lei q --stdin" +# it can't do d:"5 days ago", but it will do d:5.days.ago +sub query_approxidate { + my (undef, $git) = @_; # $_[2] = $query_string (modified in-place) + my $DQ = qq<"\x{201c}\x{201d}>; # Xapian can use curly quotes + $_[2] =~ tr/\x00/ /; # Xapian doesn't do NUL, we use it as a placeholder + my ($terms, $phrase, $to_parse); + $_[2] =~ s{([^$DQ]*)([${DQ}][^\"]*[$DQ])?}{ + ($terms, $phrase) = ($1, $2); + $terms =~ s!\b(d|rt|dt):(\S+)! 
+ date_parse_prepare($to_parse //= [], $1, $2)!sge; + $terms.($phrase // ''); + }sge; + date_parse_finalize($git, $to_parse, $_[2]) if $to_parse; +} + # read-only sub mset { my ($self, $query_string, $opts) = @_; diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index 08c77f35..2d0b8e13 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -34,7 +34,6 @@ sub sres_top_html { return PublicInbox::WWW::need($ctx, 'Search'); my $q = PublicInbox::SearchQuery->new($ctx->{qp}); my $x = $q->{x}; - my $query = $q->{'q'}; my $o = $q->{o}; my $asc; if ($o < 0) { @@ -54,6 +53,8 @@ sub sres_top_html { my ($mset, $total, $err, $html); retry: eval { + my $query = $q->{'q'}; + $srch->query_approxidate($ctx->{ibx}->git, $query); $mset = $srch->mset($query, $opts); $total = $mset->get_matches_estimated; }; diff --git a/t/lei-externals.t b/t/lei-externals.t index 9fc8bae9..f61b7e52 100644 --- a/t/lei-externals.t +++ b/t/lei-externals.t @@ -151,7 +151,7 @@ SKIP: { { open my $fh, '+>', undef or BAIL_OUT $!; $fh->autoflush(1); - print $fh 's:use' or BAIL_OUT $!; + print $fh 's:use d:..5.days.from.now' or BAIL_OUT $!; seek($fh, 0, SEEK_SET) or BAIL_OUT $!; ok($lei->([qw(q -q --stdin)], undef, { %$lei_opt, 0 => $fh }), '--stdin on regular file works'); diff --git a/t/psgi_search.t b/t/psgi_search.t index 8ba431bc..514df005 100644 --- a/t/psgi_search.t +++ b/t/psgi_search.t @@ -74,20 +74,25 @@ EOF my $www = PublicInbox::WWW->new($cfg); test_psgi(sub { $www->call(@_) }, sub { my ($cb) = @_; - my $res; - $res = $cb->(GET('/test/?q=%C3%86var')); - my $html = $res->content; - like($html, qr/Ævar - /, 'HTML escaped in title'); - my @res = ($html =~ m/\?q=(.+var)\b/g); - ok(scalar(@res), 'saw query strings'); - my %uniq = map { $_ => 1 } @res; - is(1, scalar keys %uniq, 'all query values identical in HTML'); - is('%C3%86var', (keys %uniq)[0], 'matches original query'); - ok(index($html, 'by Ævar Arnfjörð Bjarmason') >= 0, - "displayed Ævar's 
name properly in HTML"); - - like($html, qr/download mbox\.gz: .*?"full threads"/s, - '"full threads" download option shown'); + my ($html, $res); + my $approxidate = '1.hour.from.now'; + for my $req ('/test/?q=%C3%86var', '/test/?q=%25C3%2586var') { + $res = $cb->(GET($req."+d:..$approxidate")); + $html = $res->content; + like($html, qr/<title>Ævar d:\.\.\Q$approxidate\E/, + 'HTML escaped in title, "d:..$APPROXIDATE" preserved'); + my @res = ($html =~ m/\?q=(.+var)\+d:\.\.\Q$approxidate\E/g); + ok(scalar(@res), 'saw query strings'); + my %uniq = map { $_ => 1 } @res; + is(1, scalar keys %uniq, 'all query values identical in HTML'); + is('%C3%86var', (keys %uniq)[0], 'matches original query'); + ok(index($html, 'by Ævar Arnfjörð Bjarmason') + >= 0, "displayed Ævar's name properly in HTML"); + like($html, qr/download mbox\.gz: .*?"full threads"/s, + '"full threads" download option shown'); + } + like($html, qr/Initial query\b.*?returned no.results, used:.*instead/s, + 'noted retry on double-escaped query {-uxs_retried}'); my $warn = []; local $SIG{__WARN__} = sub { push @$warn, @_ }; @@ -130,7 +135,7 @@ test_psgi(sub { $www->call(@_) }, sub { qr/filename=no-subject\.mbox\.gz/); # "full threads" mbox.gz download - $res = $cb->(POST('/test/?q=s:test&x=m&t')); + $res = $cb->(POST('/test/?q=s:test+d:..1.hour.from.now&x=m&t')); is($res->code, 200, 'successful mbox download with threads'); gunzip(\($res->content) => \(my $before)); is_deeply([ "Message-ID: <$mid>\n", "Message-ID: <reply\@asdf>\n" ], @@ -151,7 +156,7 @@ test_psgi(sub { $www->call(@_) }, sub { '"full threads" download option not shown w/o has_threadid'); # in case somebody uses curl to bypass <form> - $res = $cb->(POST('/test/?q=s:test&x=m&t')); + $res = $cb->(POST("/test/?q=s:test+d:..$approxidate&x=m&t")); is($res->code, 200, 'successful mbox download w/ threads'); gunzip(\($res->content) => \(my $after)); isnt($before, $after); diff --git a/t/search.t b/t/search.t index bcfe91f5..77081231 100644 --- 
a/t/search.t +++ b/t/search.t @@ -583,6 +583,31 @@ SKIP: { $q = $s->query_argv_to_string($g, [qw{OR (rt:1993-10-02)}]); like($q, qr/\AOR \(rt:749\d{6}\.\.749\d{6}\)\z/, 'trailing parentheses preserved'); + + my $qs = qq[f:bob rt:1993-10-02..2010-10-02]; + $s->query_approxidate($g, $qs); + like($qs, qr/\Af:bob rt:749\d{6}\.\.1286\d{6}\z/, + 'no phrases, no problem'); + + my $orig = $qs = qq[f:bob "d:1993-10-02..2010-10-02"]; + $s->query_approxidate($g, $qs); + is($qs, $orig, 'phrase preserved'); + + $orig = $qs = qq[f:bob "d:1993-10-02..2010-10-02 "] . + qq["dt:1993-10-02..2010-10-02 " \x{201c}]; + $s->query_approxidate($g, $qs); + is($qs, $orig, 'phrase preserved even with escaped ""'); + + $orig = $qs = qq[f:bob "hello world" d:1993-10-02..2010-10-02]; + $s->query_approxidate($g, $qs); + is($qs, qq[f:bob "hello world" d:19931002..20101002], + 'post-phrase date corrected'); + + my $x_days_ago = strftime('%Y%m%d', gmtime(time - (5 * 86400))); + $orig = $qs = qq[broken d:5.days.ago..]; + $s->query_approxidate($g, $qs); + is($qs, qq[broken d:$x_days_ago..], 'date.phrase.with.dots'); + $ENV{TEST_EXPENSIVE} or skip 'TEST_EXPENSIVE not set for argv overflow check', 1; my @w;