From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 792DE20705 for ; Fri, 9 Sep 2016 00:01:33 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 01/10] search: allow searching user fields (To/Cc/From) Date: Fri, 9 Sep 2016 00:01:22 +0000 Message-Id: <20160909000131.18584-2-e@80x24.org> In-Reply-To: <20160909000131.18584-1-e@80x24.org> References: <20160909000131.18584-1-e@80x24.org> List-Id: Sometimes it can be useful to search based on who the message was sent to, sent by, or Cc:-ed. Of course, headers can be faked, but they usually are not... Anyways this mostly matches the behavior of mairix(1). --- lib/PublicInbox/Search.pm | 10 +++++++- lib/PublicInbox/SearchIdx.pm | 59 +++++++++++++++++++++++++++++++------------- t/search.t | 37 +++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 18 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 445c2d8..aec459b 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -51,8 +51,8 @@ my %bool_pfx_internal = ( thread => 'G', # newsGroup (or similar entity - e.g. a web forum name) ); -# do we still need these? probably not.. my %bool_pfx_external = ( + # do we still need these? probably not.. path => 'XPATH', mid => 'Q', # uniQue id (Message-ID) ); @@ -61,6 +61,14 @@ my %prob_prefix = ( subject => 'S', s => 'S', # for mairix compatibility m => 'Q', # 'mid' is exact, 'm' can do partial + f => 'A', # for mairix compatibility + t => 'XTO', # for mairix compatibility + tc => 'XTC', # for mairix compatibility + c => 'XCC', # for mairix compatibility + tcf => 'XTCF', # for mairix compatibility + # n.b.: leaving out "a:" alias for "tcf:" even though + # mairix supports it. It is only mentioned in passing in mairix(1) + # and the extra two letters are not significantly longer. ); # not documenting m: and mid: for now, the using the URLs works w/o Xapian diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index f54f5f2..37fefbe 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -96,12 +96,51 @@ sub _lock_release { close $lockfh or die "close failed: $!\n"; } -sub add_val { +sub add_val ($$$) { my ($doc, $col, $num) = @_; $num = Search::Xapian::sortable_serialise($num); $doc->add_value($col, $num); } +sub add_values ($$$) { + my ($smsg, $bytes, $num) = @_; + + my $ts = $smsg->ts; + my $doc = $smsg->{doc}; + add_val($doc, &PublicInbox::Search::TS, $ts); + + defined($num) and add_val($doc, &PublicInbox::Search::NUM, $num); + + defined($bytes) and add_val($doc, &PublicInbox::Search::BYTES, $bytes); + + add_val($doc, &PublicInbox::Search::LINES, + $smsg->{mime}->body_raw =~ tr!\n!\n!); + + my $yyyymmdd = strftime('%Y%m%d', gmtime($ts)); + $doc->add_value(&PublicInbox::Search::YYYYMMDD, $yyyymmdd); +} + +sub index_users ($$) { + my ($tg, $smsg) = @_; + + my $from = $smsg->from; + my $to = $smsg->to; + my $cc = $smsg->cc; + + $tg->index_text($from, 1, 'A'); # A - author + $tg->increase_termpos; + + $tg->index_text($to, 1, 'XTO') if $to ne ''; + $tg->index_text($cc, 1, 'XCC') if $cc ne ''; + my $tc = join("\t", $to, $cc); + $tg->index_text($tc, 1, 'XTC') if $tc ne ''; + my $tcf = join("\t", $tc, $from); + $tg->index_text($tcf, 1, 'XTCF') if $tcf ne ''; + + $tg->index_text($from); + $tg->increase_termpos; +} + sub add_message { my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object my $db = $self->{xdb}; @@ -129,20 +168,7 @@ sub add_message { $doc->add_term(xpfx('path') . id_compress($path)); } - my $ts = $smsg->ts; - add_val($doc, &PublicInbox::Search::TS, $ts); - - defined($num) and - add_val($doc, &PublicInbox::Search::NUM, $num); - - defined($bytes) and - add_val($doc, &PublicInbox::Search::BYTES, $bytes); - - add_val($doc, &PublicInbox::Search::LINES, - $mime->body_raw =~ tr!\n!\n!); - - my $yyyymmdd = strftime('%Y%m%d', gmtime($ts)); - $doc->add_value(&PublicInbox::Search::YYYYMMDD, $yyyymmdd); + add_values($smsg, $bytes, $num); my $tg = $self->term_generator; @@ -152,8 +178,7 @@ sub add_message { $tg->index_text($subj) if $subj; $tg->increase_termpos; - $tg->index_text($smsg->from); - $tg->increase_termpos; + index_users($tg, $smsg); msg_iter($mime, sub { my ($part, $depth, @idx) = @{$_[0]}; diff --git a/t/search.t b/t/search.t index db94c0a..bb0861a 100644 --- a/t/search.t +++ b/t/search.t @@ -86,6 +86,7 @@ my $rw_commit = sub { 'Message-ID' => '', From => 'John Smith ', To => 'list@example.com', + Cc => 'foo@example.com', ], body => "goodbye forever :<\n"); @@ -324,6 +325,42 @@ sub filter_mids { is(scalar @{$res->{msgs}}, 0, 'nothing before 19931001'); } +# names and addresses +{ + my $res = $ro->query('t:list@example.com'); + is(scalar @{$res->{msgs}}, 6, 'searched To: successfully'); + foreach my $smsg (@{$res->{msgs}}) { + like($smsg->to, qr/\blist\@example\.com\b/, 'to appears'); + } + + $res = $ro->query('tc:list@example.com'); + is(scalar @{$res->{msgs}}, 6, 'searched To+Cc: successfully'); + foreach my $smsg (@{$res->{msgs}}) { + my $tocc = join("\n", $smsg->to, $smsg->cc); + like($tocc, qr/\blist\@example\.com\b/, 'tocc appears'); + } + + foreach my $pfx ('tcf:', 'c:') { + $res = $ro->query($pfx . 'foo@example.com'); + is(scalar @{$res->{msgs}}, 1, + "searched $pfx successfully for Cc:"); + foreach my $smsg (@{$res->{msgs}}) { + like($smsg->cc, qr/\bfoo\@example\.com\b/, + 'cc appears'); + } + } + + foreach my $pfx ('', 'tcf:', 'f:') { + $res = $ro->query($pfx . 'Laggy'); + is(scalar @{$res->{msgs}}, 1, + "searched $pfx successfully for From:"); + foreach my $smsg (@{$res->{msgs}}) { + like($smsg->from, qr/Laggy Sender/, + "From appears with $pfx"); + } + } +} + done_testing(); 1; -- EW