about summary refs log tree commit homepage
diff options
authorEric Wong <e@yhbt.net>2020-05-07 03:00:09 +0000
committerEric Wong <e@yhbt.net>2020-05-09 00:54:34 +0000
commitb714ab45d30d6f0298d73ef4281c1d0263a02493 (patch)
parentc2bc9ebcb770a27823d8e989707f434826333b0e (diff)
We'll support both probabilistic matches via `l:' and boolean
matches via `lid:' for exact matches, similar to how both `m:'
and `mid:' are supported.  Only text inside angle brackets (`<'
and `>') is supported, since I'm not sure if there's value in
searching on the optional phrases (which would require decoding
with ->header_str instead of ->header_raw).
3 files changed, 46 insertions, 0 deletions
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 86a6ad67..b7db2b9f 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -77,11 +77,17 @@ use constant {
         # 15 - see public-inbox-v2-format(5)
         #      further bumps likely unnecessary, we'll suggest in-place
         #      "--reindex" use for further fixes and tweaks
+        #
+        #      public-inbox v1.5.0 adds (still SCHEMA_VERSION=15):
+        #      * "lid:" and "l:" for List-Id searches
         SCHEMA_VERSION => 15,
+# note: the non-X term prefix allocations are shared with
+# Xapian omega, see xapian-applications/omega/docs/termprefixes.rst
 my %bool_pfx_external = (
         mid => 'Q', # Message-ID (full/exact), this is mostly uniQue
+        lid => 'G', # newsGroup (or similar entity), just inside <>
         dfpre => 'XDFPRE',
         dfpost => 'XDFPOST',
         dfblob => 'XDFPRE XDFPOST',
@@ -92,6 +98,7 @@ my %prob_prefix = (
         # for mairix compatibility
         s => 'S',
         m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
+        l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial
         f => 'A',
         t => 'XTO',
         tc => 'XTO XCC',
@@ -134,6 +141,8 @@ EOF
         'f:' => 'match within the From header',
         'a:' => 'match within the To, Cc, and From headers',
         'tc:' => 'match within the To and Cc headers',
+        'lid:' => 'exact contents of the List-Id',
+        'l:' => 'partial match contents of the List-Id header',
         'bs:' => 'match within the Subject and body',
         'dfn:' => 'match filename from diff',
         'dfa:' => 'match diff removed (-) lines',
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 25118f43..998341a7 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -352,6 +352,12 @@ sub add_xapian ($$$$) {
         $doc->add_boolean_term('Q' . $_) foreach @$mids;
+        for my $l ($hdr->header_raw('List-Id')) {
+                $l =~ /<([^>]+)>/ or next;
+                my $lid = $1;
+                $doc->add_boolean_term('G' . $lid);
+                index_text($self, $lid, 1, 'XL'); # probabilistic
+        }
         $self->{xdb}->replace_document($smsg->{num}, $doc);
diff --git a/t/search.t b/t/search.t
index 83986837..92f3305d 100644
--- a/t/search.t
+++ b/t/search.t
@@ -66,6 +66,7 @@ Subject: Hello world
 Message-ID: <root@s>
 From: John Smith <js@example.com>
 To: list@example.com
+List-Id: I'm not mad <i.m.just.bored>
@@ -77,6 +78,7 @@ Message-ID: <last@s>
 From: John Smith <js@example.com>
 To: list@example.com
 Cc: foo@example.com
+List-Id: there's nothing <left.for.me.to.do>
 goodbye forever :<
@@ -448,6 +450,35 @@ EOF
         is($ro->query("m:Pine m:LNX m:10010260936330", {mset=>1})->size, 1);
+{ # List-Id searching
+        my $found = $ro->query('lid:i.m.just.bored');
+        is_deeply([ filter_mids($found) ], [ 'root@s' ],
+                'got expected mid on exact lid: search');
+        $found = $ro->query('lid:just.bored');
+        is_deeply($found, [], 'got nothing on lid: search');
+        $found = $ro->query('lid:*.just.bored');
+        is_deeply($found, [], 'got nothing on lid: search');
+        $found = $ro->query('l:i.m.just.bored');
+        is_deeply([ filter_mids($found) ], [ 'root@s' ],
+                'probabilistic search works on full List-Id contents');
+        $found = $ro->query('l:just.bored');
+        is_deeply([ filter_mids($found) ], [ 'root@s' ],
+                'probabilistic search works on partial List-Id contents');
+        $found = $ro->query('lid:mad');
+        is_deeply($found, [], 'no match on phrase with lid:');
+        $found = $ro->query('lid:bored');
+        is_deeply($found, [], 'no match on partial List-Id with lid:');
+        $found = $ro->query('l:nothing');
+        is_deeply($found, [], 'matched on phrase with l:');