about summary refs log tree commit homepage
diff options
context:
space:
mode:
-rw-r--r-- lib/PublicInbox/SearchIdx.pm 47
-rw-r--r-- t/lei_xsearch.t 31
2 files changed, 68 insertions, 10 deletions
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 3372bea5..772f5a64 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -22,6 +22,7 @@ use PublicInbox::OverIdx;
 use PublicInbox::Spawn qw(spawn nodatacow_dir);
 use PublicInbox::Git qw(git_unquote);
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+use PublicInbox::Address;
 our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack
         index_text term_generator add_val is_bad_blob);
 my $X = \%PublicInbox::Search::X;
@@ -158,22 +159,44 @@ sub term_generator ($) { # write-only
         }
 }
 
+sub index_phrase ($$$$) {
+        my ($self, $text, $wdf_inc, $prefix) = @_;
+
+        my $tg = term_generator($self);
+        $tg->index_text($text, $wdf_inc, $prefix);
+        $tg->increase_termpos;
+}
+
 sub index_text ($$$$) {
         my ($self, $text, $wdf_inc, $prefix) = @_;
-        my $tg = term_generator($self); # man Search::Xapian::TermGenerator
 
         if ($self->{indexlevel} eq 'full') {
-                $tg->index_text($text, $wdf_inc, $prefix);
-                $tg->increase_termpos;
+                index_phrase($self, $text, $wdf_inc, $prefix);
         } else {
+                my $tg = term_generator($self);
                 $tg->index_text_without_positions($text, $wdf_inc, $prefix);
         }
 }
 
 sub index_headers ($$) {
         my ($self, $smsg) = @_;
-        my @x = (from => 'A', # Author
-                subject => 'S', to => 'XTO', cc => 'XCC');
+        my @x = (from => 'A', to => 'XTO', cc => 'XCC'); # A: Author
+        while (my ($field, $pfx) = splice(@x, 0, 2)) {
+                my $val = $smsg->{$field};
+                next if $val eq '';
+                # include "(comments)" after the address, too, so not using
+                # PublicInbox::Address::names or pairs
+                index_text($self, $val, 1, $pfx);
+
+                # we need positional info for email addresses since they
+                # can be considered phrases
+                if ($self->{indexlevel} eq 'medium') {
+                        for my $addr (PublicInbox::Address::emails($val)) {
+                                index_phrase($self, $addr, 1, $pfx);
+                        }
+                }
+        }
+        @x = (subject => 'S');
         while (my ($field, $pfx) = splice(@x, 0, 2)) {
                 my $val = $smsg->{$field};
                 index_text($self, $val, 1, $pfx) if $val ne '';
@@ -186,7 +209,11 @@ sub index_diff_inc ($$$$) {
                 index_text($self, join("\n", @$xnq), 1, 'XNQ');
                 @$xnq = ();
         }
-        index_text($self, $text, 1, $pfx);
+        if ($pfx eq 'XDFN') {
+                index_phrase($self, $text, 1, $pfx);
+        } else {
+                index_text($self, $text, 1, $pfx);
+        }
 }
 
 sub index_old_diff_fn {
@@ -292,7 +319,7 @@ sub index_xapian { # msg_iter callback
         my $ct = $part->content_type || 'text/plain';
         my $fn = $part->filename;
         if (defined $fn && $fn ne '') {
-                index_text($self, $fn, 1, 'XFN');
+                index_phrase($self, $fn, 1, 'XFN');
         }
         if ($part->{is_submsg}) {
                 my $mids = mids_for_index($part);
@@ -330,20 +357,20 @@ sub index_list_id ($$$) {
                 $l =~ /<([^>]+)>/ or next;
                 my $lid = lc $1;
                 $doc->add_boolean_term('G' . $lid);
-                index_text($self, $lid, 1, 'XL'); # probabilistic
+                index_phrase($self, $lid, 1, 'XL'); # probabilistic
         }
 }
 
 sub index_ids ($$$$) {
         my ($self, $doc, $hdr, $mids) = @_;
         for my $mid (@$mids) {
-                index_text($self, $mid, 1, 'XM');
+                index_phrase($self, $mid, 1, 'XM');
 
                 # because too many Message-IDs are prefixed with
                 # "Pine.LNX."...
                 if ($mid =~ /\w{12,}/) {
                         my @long = ($mid =~ /(\w{3,}+)/g);
-                        index_text($self, join(' ', @long), 1, 'XM');
+                        index_phrase($self, join(' ', @long), 1, 'XM');
                 }
         }
         $doc->add_boolean_term('Q' . $_) for @$mids;
diff --git a/t/lei_xsearch.t b/t/lei_xsearch.t
index f865ff43..5bfbcfe6 100644
--- a/t/lei_xsearch.t
+++ b/t/lei_xsearch.t
@@ -78,4 +78,35 @@ is(scalar(@ibxish), scalar(@ibx) + 1, 'got locals back');
 is($lxs->search, $lxs, '->search works');
 is($lxs->over, undef, '->over fails');
 
+{
+        $lxs = PublicInbox::LeiXSearch->new;
+        my $v2ibx = PublicInbox::InboxWritable->new({
+                inboxdir => "$home/v2full",
+                name => 'v2full',
+                version => 2,
+                indexlevel => 'full',
+                -primary_address => 'v2full@example.com',
+        }, {});
+        my $im = $v2ibx->importer(0);
+        $im->add(eml_load('t/plack-qp.eml'));
+        $im->done;
+        my $v1ibx = PublicInbox::InboxWritable->new({
+                inboxdir => "$home/v1medium",
+                name => 'v1medium',
+                version => 1,
+                indexlevel => 'medium',
+                -primary_address => 'v1medium@example.com',
+        }, {});
+        $im = $v1ibx->importer(0);
+        $im->add(eml_load('t/utf8.eml'));
+        $im->done;
+        $lxs->prepare_external($v1ibx);
+        $lxs->prepare_external($v2ibx);
+        for my $loc ($lxs->locals) {
+                $lxs->attach_external($loc);
+        }
+        my $mset = $lxs->mset('m:testmessage@example.com');
+        is($mset->size, 1, 'got m: match on medium+full XSearch mix');
+}
+
 done_testing;