about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
author Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-03-03 05:55:26 +0000
committer Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-03-03 06:47:19 +0000
commit 5230930a798ef261a68385d9026acb81137d640f (patch)
tree 44f334ad64c8de60012b4952dad916a62926d970 /lib
parent 4030525cb228eb3837f5260637bd7a5a861e81e2 (diff)
download public-inbox-5230930a798ef261a68385d9026acb81137d640f.tar.gz
When indexing diffs, we can avoid indexing the diff parts under
XNQ and instead combine the parts in the read-only search
interface.  This results in better indexing performance and
10-15% smaller Xapian indices.
Diffstat (limited to 'lib')
-rw-r--r-- lib/PublicInbox/Search.pm 9
-rw-r--r-- lib/PublicInbox/SearchIdx.pm 77
2 files changed, 55 insertions, 31 deletions
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index fb7a126a..a1c423c8 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -59,6 +59,7 @@ my %bool_pfx_external = (
         mid => 'Q', # Message-ID (full/exact), this is mostly uniQue
 );
 
+my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST';
 my %prob_prefix = (
         # for mairix compatibility
         s => 'S',
@@ -69,12 +70,12 @@ my %prob_prefix = (
         c => 'XCC',
         tcf => 'XTO XCC A',
         a => 'XTO XCC A',
-        b => 'XNQ XQUOT',
-        bs => 'XNQ XQUOT S',
+        b => $non_quoted_body . ' XQUOT',
+        bs => $non_quoted_body . ' XQUOT S',
         n => 'XFN',
 
         q => 'XQUOT',
-        nq => 'XNQ',
+        nq => $non_quoted_body,
         dfn => 'XDFN',
         dfa => 'XDFA',
         dfb => 'XDFB',
@@ -85,7 +86,7 @@ my %prob_prefix = (
         dfblob => 'XDFPRE XDFPOST',
 
         # default:
-        '' => 'XM S A XNQ XQUOT XFN',
+        '' => 'XM S A XQUOT XFN ' . $non_quoted_body,
 );
 
 # not documenting m: and mid: for now, the using the URLs works w/o Xapian
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 1c107282..1bca3a64 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -175,14 +175,19 @@ sub index_users ($$) {
         $tg->increase_termpos;
 }
 
-sub index_text_inc ($$$) {
-        my ($tg, $text, $pfx) = @_;
+sub index_diff_inc ($$$$) {
+        my ($tg, $text, $pfx, $xnq) = @_;
+        if (@$xnq) {
+                $tg->index_text(join("\n", @$xnq), 1, 'XNQ');
+                $tg->increase_termpos;
+                @$xnq = ();
+        }
         $tg->index_text($text, 1, $pfx);
         $tg->increase_termpos;
 }
 
 sub index_old_diff_fn {
-        my ($tg, $seen, $fa, $fb) = @_;
+        my ($tg, $seen, $fa, $fb, $xnq) = @_;
 
         # no renames or space support for traditional diffs,
         # find the number of leading common paths to strip:
@@ -192,7 +197,9 @@ sub index_old_diff_fn {
                 $fa = join('/', @fa);
                 $fb = join('/', @fb);
                 if ($fa eq $fb) {
-                        index_text_inc($tg, $fa,'XDFN') unless $seen->{$fa}++;
+                        unless ($seen->{$fa}++) {
+                                index_diff_inc($tg, $fa, 'XDFN', $xnq);
+                        }
                         return 1;
                 }
                 shift @fa;
@@ -205,40 +212,46 @@ sub index_diff ($$$) {
         my ($tg, $lines, $doc) = @_;
         my %seen;
         my $in_diff;
+        my @xnq;
+        my $xnq = \@xnq;
         foreach (@$lines) {
                 if ($in_diff && s/^ //) { # diff context
-                        index_text_inc($tg, $_, 'XDFCTX');
+                        index_diff_inc($tg, $_, 'XDFCTX', $xnq);
                 } elsif (/^-- $/) { # email signature begins
                         $in_diff = undef;
                 } elsif (m!^diff --git ("?a/.+) ("?b/.+)\z!) {
                         my ($fa, $fb) = ($1, $2);
                         my $fn = (split('/', git_unquote($fa), 2))[1];
-                        index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+                        $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq);
                         $fn = (split('/', git_unquote($fb), 2))[1];
-                        index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+                        $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq);
                         $in_diff = 1;
                 # traditional diff:
                 } elsif (m/^diff -(.+) (\S+) (\S+)$/) {
                         my ($opt, $fa, $fb) = ($1, $2, $3);
+                        push @xnq, $_;
                         # only support unified:
                         next unless $opt =~ /[uU]/;
-                        $in_diff = index_old_diff_fn($tg, \%seen, $fa, $fb);
+                        $in_diff = index_old_diff_fn($tg, \%seen, $fa, $fb,
+                                                        $xnq);
                 } elsif (m!^--- ("?a/.+)!) {
                         my $fn = (split('/', git_unquote($1), 2))[1];
-                        index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+                        $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq);
                         $in_diff = 1;
                 } elsif (m!^\+\+\+ ("?b/.+)!)  {
                         my $fn = (split('/', git_unquote($1), 2))[1];
-                        index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++;
+                        $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq);
                         $in_diff = 1;
                 } elsif (/^--- (\S+)/) {
                         $in_diff = $1;
+                        push @xnq, $_;
                 } elsif (defined $in_diff && /^\+\+\+ (\S+)/) {
-                        $in_diff = index_old_diff_fn($tg, \%seen, $in_diff, $1);
+                        $in_diff = index_old_diff_fn($tg, \%seen, $in_diff, $1,
+                                                        $xnq);
                 } elsif ($in_diff && s/^\+//) { # diff added
-                        index_text_inc($tg, $_, 'XDFB');
+                        index_diff_inc($tg, $_, 'XDFB', $xnq);
                 } elsif ($in_diff && s/^-//) { # diff removed
-                        index_text_inc($tg, $_, 'XDFA');
+                        index_diff_inc($tg, $_, 'XDFA', $xnq);
                 } elsif (m!^index ([a-f0-9]+)\.\.([a-f0-9]+)!) {
                         my ($ba, $bb) = ($1, $2);
                         index_git_blob_id($doc, 'XDFPRE', $ba);
@@ -248,34 +261,44 @@ sub index_diff ($$$) {
                         # traditional diff w/o -p
                 } elsif (/^@@ (?:\S+) (?:\S+) @@\s*(\S+.*)$/) {
                         # hunk header context
-                        index_text_inc($tg, $1, 'XDFHH');
+                        index_diff_inc($tg, $1, 'XDFHH', $xnq);
                 # ignore the following lines:
-                } elsif (/^(?:dis)similarity index/) {
-                } elsif (/^(?:old|new) mode/) {
-                } elsif (/^(?:deleted|new) file mode/) {
-                } elsif (/^(?:copy|rename) (?:from|to) /) {
-                } elsif (/^(?:dis)?similarity index /) {
-                } elsif (/^\\ No newline at end of file/) {
-                } elsif (/^Binary files .* differ/) {
+                } elsif (/^(?:dis)similarity index/ ||
+                                /^(?:old|new) mode/ ||
+                                /^(?:deleted|new) file mode/ ||
+                                /^(?:copy|rename) (?:from|to) / ||
+                                /^(?:dis)?similarity index / ||
+                                /^\\ No newline at end of file/ ||
+                                /^Binary files .* differ/) {
+                        push @xnq, $_;
                 } elsif ($_ eq '') {
                         $in_diff = undef;
                 } else {
+                        push @xnq, $_;
                         warn "non-diff line: $_\n" if DEBUG && $_ ne '';
                         $in_diff = undef;
                 }
         }
+
+        $tg->index_text(join("\n", @xnq), 1, 'XNQ');
+        $tg->increase_termpos;
 }
 
 sub index_body ($$$) {
         my ($tg, $lines, $doc) = @_;
         my $txt = join("\n", @$lines);
-        $tg->index_text($txt, !!$doc, $doc ? 'XNQ' : 'XQUOT');
-        $tg->increase_termpos;
-        # does it look like a diff?
-        if ($doc && $txt =~ /^(?:diff|---|\+\+\+) /ms) {
-                $txt = undef;
-                index_diff($tg, $lines, $doc);
+        if ($doc) {
+                # does it look like a diff?
+                if ($txt =~ /^(?:diff|---|\+\+\+) /ms) {
+                        $txt = undef;
+                        index_diff($tg, $lines, $doc);
+                } else {
+                        $tg->index_text($txt, 1, 'XNQ');
+                }
+        } else {
+                $tg->index_text($txt, 0, 'XQUOT');
         }
+        $tg->increase_termpos;
         @$lines = ();
 }