From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 30FAF20711 for ; Fri, 9 Sep 2016 00:01:34 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 03/10] search: more granular message body searching Date: Fri, 9 Sep 2016 00:01:24 +0000 Message-Id: <20160909000131.18584-4-e@80x24.org> In-Reply-To: <20160909000131.18584-1-e@80x24.org> References: <20160909000131.18584-1-e@80x24.org> List-Id: "bs:" and "b:" are adapted from mairix(1). We will also support searching explicitly for quoted vs non-quoted text via "q:" and "nq:" prefixes since sometimes readers will not care for quoted text. In the future, we will support parsing diffs (perhaps when repobrowse integration is complete). Note: this roughly doubles the size of the Xapian database due to the additional information; so this change may not be worth it. 
--- lib/PublicInbox/Search.pm | 18 ++++++++++++------ lib/PublicInbox/SearchIdx.pm | 17 ++++++++++++++--- t/search.t | 25 +++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 9 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 3b25b66..f74129d 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -58,16 +58,22 @@ my %bool_pfx_external = ( ); my %prob_prefix = ( - s => 'S', # for mairix compatibility + # for mairix compatibility + s => 'S', m => 'Q', # 'mid' is exact, 'm' can do partial - f => 'A', # for mairix compatibility - t => 'XTO', # for mairix compatibility - tc => 'XTC', # for mairix compatibility - c => 'XCC', # for mairix compatibility - tcf => 'XTCF', # for mairix compatibility + f => 'A', + t => 'XTO', + tc => 'XTC', + c => 'XCC', + tcf => 'XTCF', + b => 'XBODY', + bs => 'XBS', + # n.b.: leaving out "a:" alias for "tcf:" even though # mairix supports it. It is only mentioned in passing in mairix(1) # and the extra two letters are not significantly longer. 
+ q => 'XQUOT', + nq => 'XNQ', ); # not documenting m: and mid: for now, the using the URLs works w/o Xapian diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 37fefbe..cd27a29 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -173,7 +173,10 @@ sub add_message { my $tg = $self->term_generator; $tg->set_document($doc); - $tg->index_text($subj, 1, 'S') if $subj; + if ($subj) { + $tg->index_text($subj, 1, 'S'); + $tg->index_text($subj, 1, 'XBS'); + } $tg->increase_termpos; $tg->index_text($subj) if $subj; $tg->increase_termpos; @@ -199,13 +202,21 @@ sub add_message { } } if (@quot) { - $tg->index_text(join("\n", @quot), 0); + my $s = join("\n", @quot); @quot = (); + $tg->index_text($s, 1, 'XQUOT'); + $tg->index_text($s, 0, 'XBS'); + $tg->index_text($s, 0, 'XBODY'); + $tg->index_text($s, 0); $tg->increase_termpos; } if (@orig) { - $tg->index_text(join("\n", @orig)); + my $s = join("\n", @orig); @orig = (); + $tg->index_text($s, 1, 'XNQ'); + $tg->index_text($s, 1, 'XBS'); + $tg->index_text($s, 1, 'XBODY'); + $tg->index_text($s); $tg->increase_termpos; } }); diff --git a/t/search.t b/t/search.t index 7abaf83..bddb545 100644 --- a/t/search.t +++ b/t/search.t @@ -361,6 +361,31 @@ sub filter_mids { } } +{ + $rw_commit->(); + $ro->reopen; + my $res = $ro->query('b:hello'); + is(scalar @{$res->{msgs}}, 0, 'no match on body search only'); + $res = $ro->query('bs:smith'); + is(scalar @{$res->{msgs}}, 0, + 'no match on body+subject search for From'); + + $res = $ro->query('q:theatre'); + is(scalar @{$res->{msgs}}, 1, 'only one quoted body'); + like($res->{msgs}->[0]->from, qr/\AQuoter/, 'got quoted body'); + + $res = $ro->query('nq:theatre'); + is(scalar @{$res->{msgs}}, 1, 'only one non-quoted body'); + like($res->{msgs}->[0]->from, qr/\ANon-Quoter/, 'got non-quoted body'); + + foreach my $pfx (qw(b: bs:)) { + $res = $ro->query($pfx . 
'theatre'); + is(scalar @{$res->{msgs}}, 2, "searched both bodies for $pfx"); + like($res->{msgs}->[0]->from, qr/\ANon-Quoter/, + "non-quoter first for $pfx"); + } +} + done_testing(); 1; -- EW