From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 579111FAF3 for ; Tue, 6 Mar 2018 08:42:43 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Subject: [PATCH 16/34] searchidx: avoid excessive XNQ indexing with diffs Date: Tue, 6 Mar 2018 08:42:24 +0000 Message-Id: <20180306084242.19988-17-e@80x24.org> In-Reply-To: <20180306084242.19988-1-e@80x24.org> References: <20180306084242.19988-1-e@80x24.org> List-Id: When indexing diffs, we can avoid indexing the diff parts under XNQ and instead combine the parts in the read-only search interface. This results in better indexing performance and 10-15% smaller Xapian indices. --- lib/PublicInbox/Search.pm | 9 +++--- lib/PublicInbox/SearchIdx.pm | 77 ++++++++++++++++++++++++++++---------------- 2 files changed, 55 insertions(+), 31 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index fb7a126..a1c423c 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -59,6 +59,7 @@ my %bool_pfx_external = ( mid => 'Q', # Message-ID (full/exact), this is mostly uniQue ); +my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST'; my %prob_prefix = ( # for mairix compatibility s => 'S', @@ -69,12 +70,12 @@ my %prob_prefix = ( c => 'XCC', tcf => 'XTO XCC A', a => 'XTO XCC A', - b => 'XNQ XQUOT', - bs => 'XNQ XQUOT S', + b => $non_quoted_body . ' XQUOT', + bs => $non_quoted_body . ' XQUOT S', n => 'XFN', q => 'XQUOT', - nq => 'XNQ', + nq => $non_quoted_body, dfn => 'XDFN', dfa => 'XDFA', dfb => 'XDFB', @@ -85,7 +86,7 @@ my %prob_prefix = ( dfblob => 'XDFPRE XDFPOST', # default: - '' => 'XM S A XNQ XQUOT XFN', + '' => 'XM S A XQUOT XFN ' . $non_quoted_body, ); # not documenting m: and mid: for now, the using the URLs works w/o Xapian diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 1c10728..1bca3a6 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -175,14 +175,19 @@ sub index_users ($$) { $tg->increase_termpos; } -sub index_text_inc ($$$) { - my ($tg, $text, $pfx) = @_; +sub index_diff_inc ($$$$) { + my ($tg, $text, $pfx, $xnq) = @_; + if (@$xnq) { + $tg->index_text(join("\n", @$xnq), 1, 'XNQ'); + $tg->increase_termpos; + @$xnq = (); + } $tg->index_text($text, 1, $pfx); $tg->increase_termpos; } sub index_old_diff_fn { - my ($tg, $seen, $fa, $fb) = @_; + my ($tg, $seen, $fa, $fb, $xnq) = @_; # no renames or space support for traditional diffs, # find the number of leading common paths to strip: @@ -192,7 +197,9 @@ sub index_old_diff_fn { $fa = join('/', @fa); $fb = join('/', @fb); if ($fa eq $fb) { - index_text_inc($tg, $fa,'XDFN') unless $seen->{$fa}++; + unless ($seen->{$fa}++) { + index_diff_inc($tg, $fa, 'XDFN', $xnq); + } return 1; } shift @fa; @@ -205,40 +212,46 @@ sub index_diff ($$$) { my ($tg, $lines, $doc) = @_; my %seen; my $in_diff; + my @xnq; + my $xnq = \@xnq; foreach (@$lines) { if ($in_diff && s/^ //) { # diff context - index_text_inc($tg, $_, 'XDFCTX'); + index_diff_inc($tg, $_, 'XDFCTX', $xnq); } elsif (/^-- $/) { # email signature begins $in_diff = undef; } elsif (m!^diff --git ("?a/.+) ("?b/.+)\z!) { my ($fa, $fb) = ($1, $2); my $fn = (split('/', git_unquote($fa), 2))[1]; - index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++; + $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq); $fn = (split('/', git_unquote($fb), 2))[1]; - index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++; + $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq); $in_diff = 1; # traditional diff: } elsif (m/^diff -(.+) (\S+) (\S+)$/) { my ($opt, $fa, $fb) = ($1, $2, $3); + push @xnq, $_; # only support unified: next unless $opt =~ /[uU]/; - $in_diff = index_old_diff_fn($tg, \%seen, $fa, $fb); + $in_diff = index_old_diff_fn($tg, \%seen, $fa, $fb, + $xnq); } elsif (m!^--- ("?a/.+)!) { my $fn = (split('/', git_unquote($1), 2))[1]; - index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++; + $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq); $in_diff = 1; } elsif (m!^\+\+\+ ("?b/.+)!) { my $fn = (split('/', git_unquote($1), 2))[1]; - index_text_inc($tg, $fn, 'XDFN') unless $seen{$fn}++; + $seen{$fn}++ or index_diff_inc($tg, $fn, 'XDFN', $xnq); $in_diff = 1; } elsif (/^--- (\S+)/) { $in_diff = $1; + push @xnq, $_; } elsif (defined $in_diff && /^\+\+\+ (\S+)/) { - $in_diff = index_old_diff_fn($tg, \%seen, $in_diff, $1); + $in_diff = index_old_diff_fn($tg, \%seen, $in_diff, $1, + $xnq); } elsif ($in_diff && s/^\+//) { # diff added - index_text_inc($tg, $_, 'XDFB'); + index_diff_inc($tg, $_, 'XDFB', $xnq); } elsif ($in_diff && s/^-//) { # diff removed - index_text_inc($tg, $_, 'XDFA'); + index_diff_inc($tg, $_, 'XDFA', $xnq); } elsif (m!^index ([a-f0-9]+)\.\.([a-f0-9]+)!) { my ($ba, $bb) = ($1, $2); index_git_blob_id($doc, 'XDFPRE', $ba); @@ -248,34 +261,44 @@ sub index_diff ($$$) { # traditional diff w/o -p } elsif (/^@@ (?:\S+) (?:\S+) @@\s*(\S+.*)$/) { # hunk header context - index_text_inc($tg, $1, 'XDFHH'); + index_diff_inc($tg, $1, 'XDFHH', $xnq); # ignore the following lines: - } elsif (/^(?:dis)similarity index/) { - } elsif (/^(?:old|new) mode/) { - } elsif (/^(?:deleted|new) file mode/) { - } elsif (/^(?:copy|rename) (?:from|to) /) { - } elsif (/^(?:dis)?similarity index /) { - } elsif (/^\\ No newline at end of file/) { - } elsif (/^Binary files .* differ/) { + } elsif (/^(?:dis)similarity index/ || + /^(?:old|new) mode/ || + /^(?:deleted|new) file mode/ || + /^(?:copy|rename) (?:from|to) / || + /^(?:dis)?similarity index / || + /^\\ No newline at end of file/ || + /^Binary files .* differ/) { + push @xnq, $_; } elsif ($_ eq '') { $in_diff = undef; } else { + push @xnq, $_; warn "non-diff line: $_\n" if DEBUG && $_ ne ''; $in_diff = undef; } } + + $tg->index_text(join("\n", @xnq), 1, 'XNQ'); + $tg->increase_termpos; } sub index_body ($$$) { my ($tg, $lines, $doc) = @_; my $txt = join("\n", @$lines); - $tg->index_text($txt, !!$doc, $doc ? 'XNQ' : 'XQUOT'); - $tg->increase_termpos; - # does it look like a diff? - if ($doc && $txt =~ /^(?:diff|---|\+\+\+) /ms) { - $txt = undef; - index_diff($tg, $lines, $doc); + if ($doc) { + # does it look like a diff? + if ($txt =~ /^(?:diff|---|\+\+\+) /ms) { + $txt = undef; + index_diff($tg, $lines, $doc); + } else { + $tg->index_text($txt, 1, 'XNQ'); + } + } else { + $tg->index_text($txt, 0, 'XQUOT'); } + $tg->increase_termpos; @$lines = (); } -- EW