searchidx: fix -Lmedium for IDs and filenames

This fixes "m:", "l:", "f:", "t:", "c:", "dfn:", and "n:" search prefixes under indexlevel=medium when mixed with indexlevel=full inboxish. We need positional data for Message-IDs, List-Id, email addresses and filenames for exact matches, though we still want to support wildcards. Fortunately, the storage cost is still small as these prefixes tend to be small compared to message bodies. These are NOT boolean terms since wildcard support and partial matching are desired.
author: Eric Wong <e@80x24.org> 2021-03-13 15:40:27 +0000
committer: Eric Wong <e@80x24.org> 2021-03-13 18:39:36 -0400
commit: 64b557420689476493d752968d99ab8ae62bad9a (patch)
tree: 0987e5bf1a1028adc065d5691ed9ee9d9d0142bd /lib
parent: 037cc7637baf074b9dc14a67a629940ae84375d9 (diff)
download: public-inbox-64b557420689476493d752968d99ab8ae62bad9a.tar.gz
1 files changed, 37 insertions, 10 deletions
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 3372bea5..772f5a64 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -22,6 +22,7 @@ use PublicInbox::OverIdx;
  use PublicInbox::Spawn qw(spawn nodatacow_dir);
  use PublicInbox::Git qw(git_unquote);
  use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+use PublicInbox::Address;
  our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack
          index_text term_generator add_val is_bad_blob);
  my $X = \%PublicInbox::Search::X;
@@ -158,22 +159,44 @@ sub term_generator ($) { # write-only
          }
  }
  
+sub index_phrase ($$$$) {
+        my ($self, $text, $wdf_inc, $prefix) = @_;
+
+        my $tg = term_generator($self);
+        $tg->index_text($text, $wdf_inc, $prefix);
+        $tg->increase_termpos;
+}
+
  sub index_text ($$$$) {
          my ($self, $text, $wdf_inc, $prefix) = @_;
-        my $tg = term_generator($self); # man Search::Xapian::TermGenerator
  
          if ($self->{indexlevel} eq 'full') {
-                $tg->index_text($text, $wdf_inc, $prefix);
-                $tg->increase_termpos;
+                index_phrase($self, $text, $wdf_inc, $prefix);
          } else {
+                my $tg = term_generator($self);
                  $tg->index_text_without_positions($text, $wdf_inc, $prefix);
          }
  }
  
  sub index_headers ($$) {
          my ($self, $smsg) = @_;
-        my @x = (from => 'A', # Author
-                subject => 'S', to => 'XTO', cc => 'XCC');
+        my @x = (from => 'A', to => 'XTO', cc => 'XCC'); # A: Author
+        while (my ($field, $pfx) = splice(@x, 0, 2)) {
+                my $val = $smsg->{$field};
+                next if $val eq '';
+                # include "(comments)" after the address, too, so not using
+                # PublicInbox::Address::names or pairs
+                index_text($self, $val, 1, $pfx);
+
+                # we need positional info for email addresses since they
+                # can be considered phrases
+                if ($self->{indexlevel} eq 'medium') {
+                        for my $addr (PublicInbox::Address::emails($val)) {
+                                index_phrase($self, $addr, 1, $pfx);
+                        }
+                }
+        }
+        @x = (subject => 'S');
          while (my ($field, $pfx) = splice(@x, 0, 2)) {
                  my $val = $smsg->{$field};
                  index_text($self, $val, 1, $pfx) if $val ne '';
@@ -186,7 +209,11 @@ sub index_diff_inc ($$$$) {
                  index_text($self, join("\n", @$xnq), 1, 'XNQ');
                  @$xnq = ();
          }
-        index_text($self, $text, 1, $pfx);
+        if ($pfx eq 'XDFN') {
+                index_phrase($self, $text, 1, $pfx);
+        } else {
+                index_text($self, $text, 1, $pfx);
+        }
  }
  
  sub index_old_diff_fn {
@@ -292,7 +319,7 @@ sub index_xapian { # msg_iter callback
          my $ct = $part->content_type || 'text/plain';
          my $fn = $part->filename;
          if (defined $fn && $fn ne '') {
-                index_text($self, $fn, 1, 'XFN');
+                index_phrase($self, $fn, 1, 'XFN');
          }
          if ($part->{is_submsg}) {
                  my $mids = mids_for_index($part);
@@ -330,20 +357,20 @@ sub index_list_id ($$$) {
                  $l =~ /<([^>]+)>/ or next;
                  my $lid = lc $1;
                  $doc->add_boolean_term('G' . $lid);
-                index_text($self, $lid, 1, 'XL'); # probabilistic
+                index_phrase($self, $lid, 1, 'XL'); # probabilistic
          }
  }
  
  sub index_ids ($$$$) {
          my ($self, $doc, $hdr, $mids) = @_;
          for my $mid (@$mids) {
-                index_text($self, $mid, 1, 'XM');
+                index_phrase($self, $mid, 1, 'XM');
  
                  # because too many Message-IDs are prefixed with
                  # "Pine.LNX."...
                  if ($mid =~ /\w{12,}/) {
                          my @long = ($mid =~ /(\w{3,}+)/g);
-                        index_text($self, join(' ', @long), 1, 'XM');
+                        index_phrase($self, join(' ', @long), 1, 'XM');
                  }
          }
          $doc->add_boolean_term('Q' . $_) for @$mids;
author	Eric Wong <e@80x24.org>	2021-03-13 15:40:27 +0000
committer	Eric Wong <e@80x24.org>	2021-03-13 18:39:36 -0400
commit	64b557420689476493d752968d99ab8ae62bad9a (patch)
tree	0987e5bf1a1028adc065d5691ed9ee9d9d0142bd /lib
parent	037cc7637baf074b9dc14a67a629940ae84375d9 (diff)
download	public-inbox-64b557420689476493d752968d99ab8ae62bad9a.tar.gz