about summary refs log tree commit homepage
diff options
context:
space:
mode:
author: Eric Wong <e@80x24.org> 2022-06-20 19:27:30 +0000
committer: Eric Wong <e@80x24.org> 2022-06-21 10:39:11 +0000
commit: 8fda04081acde7053458023fde3b1c784cbcfc81 (patch)
tree: 330f61bf4cb414a778d6732f4b171686361be185
parent: d0079a334fe2b769e4f81d6bd3c7e4346a8fa2b2 (diff)
download: public-inbox-8fda04081acde7053458023fde3b1c784cbcfc81.tar.gz
Base-85 binary patches generated by git lead to many false
positives, so skip over gibberish words which may occur in them.
To avoid regressions in search results, continue to allow
searching for exact size matches (via "literal $SIZE") and the
phrase "GIT binary patch" for the mere presence of a binary
patch.
-rw-r--r--  MANIFEST                      1
-rw-r--r--  TODO                          2
-rw-r--r--  lib/PublicInbox/SearchIdx.pm  52
-rw-r--r--  t/data/binary.patch           20
-rw-r--r--  t/search.t                    15
5 files changed, 72 insertions, 18 deletions
diff --git a/MANIFEST b/MANIFEST
index ce2cf4a5..607a4c5b 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -397,6 +397,7 @@ t/content_hash.t
 t/convert-compact.t
 t/data-gen/.gitignore
 t/data/0001.patch
+t/data/binary.patch
 t/data/message_embed.eml
 t/dir_idle.t
 t/ds-kqxs.t
diff --git a/TODO b/TODO
index 43eee063..7a27fdd2 100644
--- a/TODO
+++ b/TODO
@@ -153,8 +153,6 @@ all need to be considered for everything we introduce)
 
 * support UUCP addresses for legacy archives
 
-* decode (skip indexing of) base-85 binary patches to avoid false-positives
-
 * support pipelining as an IMAP/NNTP client for -watch + lei
 
 * auto-detect and reload on TLS cert+key changes in daemons
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 53ec23a5..cbfe7816 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -36,9 +36,8 @@ our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff :
         # assume a typical 64-bit system has 8x more RAM than a
         # typical 32-bit system:
         (($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024);
-
 use constant DEBUG => !!$ENV{DEBUG};
-
+my $BASE85 = qr/\A[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+\z/;
 my $xapianlevels = qr/\A(?:full|medium)\z/;
 my $hex = '[a-f0-9]';
 my $OID = $hex .'{40,}';
@@ -258,21 +257,42 @@ sub index_diff ($$$) {
         my ($self, $txt, $doc) = @_;
         my %seen;
         my $in_diff;
-        my @xnq;
-        my $xnq = \@xnq;
-        foreach (split(/\n/, $txt)) {
-                if ($in_diff && s/^ //) { # diff context
+        my $xnq = [];
+        my @l = split(/\n/, $$txt);
+        undef $$txt;
+        while (defined($_ = shift @l)) {
+                if ($in_diff && /^GIT binary patch/) {
+                        push @$xnq, $_;
+                        while (@l && $l[0] =~ /^literal /) {
+                                # TODO allow searching by size range?
+                                # allows searching by exact size via:
+                                # "literal $SIZE"
+                                push @$xnq, shift(@l);
+
+                                # skip base85 and empty lines
+                                while (@l && ($l[0] =~ /$BASE85/o ||
+                                                $l[0] !~ /\S/)) {
+                                        shift @l;
+                                }
+                                # loop hits trailing "literal 0\nHcmV?d00001\n"
+                        }
+                } elsif ($in_diff && s/^ //) { # diff context
                         index_diff_inc($self, $_, 'XDFCTX', $xnq);
                 } elsif (/^-- $/) { # email signature begins
                         $in_diff = undef;
-                } elsif (m!^diff --git "?[^/]+/.+ "?[^/]+/.+\z!) {
-                        # wait until "---" and "+++" to capture filenames
+                } elsif (m!^diff --git ("?[^/]+/.+) ("?[^/]+/.+)\z!) {
+                        # capture filenames here for binary diffs:
+                        my ($fa, $fb) = ($1, $2);
+                        push @$xnq, $_;
                         $in_diff = 1;
-                        push @xnq, $_;
+                        $fa = (split(m'/', git_unquote($fa), 2))[1];
+                        $fb = (split(m'/', git_unquote($fb), 2))[1];
+                        $seen{$fa}++ or index_diff_inc($self, $fa, 'XDFN', $xnq);
+                        $seen{$fb}++ or index_diff_inc($self, $fb, 'XDFN', $xnq);
                 # traditional diff:
                 } elsif (m/^diff -(.+) (\S+) (\S+)$/) {
                         my ($opt, $fa, $fb) = ($1, $2, $3);
-                        push @xnq, $_;
+                        push @$xnq, $_;
                         # only support unified:
                         next unless $opt =~ /[uU]/;
                         $in_diff = index_old_diff_fn($self, \%seen, $fa, $fb,
@@ -288,8 +308,8 @@ sub index_diff ($$$) {
                         $seen{$fn}++ or index_diff_inc($self, $fn, 'XDFN', $xnq);
                         $in_diff = 1;
                 } elsif (/^--- (\S+)/) {
-                        $in_diff = $1;
-                        push @xnq, $_;
+                        $in_diff = $1; # old diff filename
+                        push @$xnq, $_;
                 } elsif (defined $in_diff && /^\+\+\+ (\S+)/) {
                         $in_diff = index_old_diff_fn($self, \%seen, $in_diff,
                                                         $1, $xnq);
@@ -315,19 +335,19 @@ sub index_diff ($$$) {
                                 /^(?:dis)?similarity index / ||
                                 /^\\ No newline at end of file/ ||
                                 /^Binary files .* differ/) {
-                        push @xnq, $_;
+                        push @$xnq, $_;
                 } elsif ($_ eq '') {
                         # possible to be in diff context, some mail may be
                         # stripped by MUA or even GNU diff(1).  "git apply"
                         # treats a bare "\n" as diff context, too
                 } else {
-                        push @xnq, $_;
+                        push @$xnq, $_;
                         warn "non-diff line: $_\n" if DEBUG && $_ ne '';
                         $in_diff = undef;
                 }
         }
 
-        index_text($self, join("\n", @xnq), 1, 'XNQ');
+        index_text($self, join("\n", @$xnq), 1, 'XNQ');
 }
 
 sub index_xapian { # msg_iter callback
@@ -373,7 +393,7 @@ sub index_xapian { # msg_iter callback
                 } else {
                         # does it look like a diff?
                         if ($txt =~ /^(?:diff|---|\+\+\+) /ms) {
-                                index_diff($self, $txt, $doc);
+                                index_diff($self, \$txt, $doc);
                         } else {
                                 index_text($self, $txt, 1, 'XNQ');
                         }
diff --git a/t/data/binary.patch b/t/data/binary.patch
new file mode 100644
index 00000000..58717abe
--- /dev/null
+++ b/t/data/binary.patch
@@ -0,0 +1,20 @@
+From 7a1921ba7bd99c63ad6dc6ec0791691ee80e279a Mon Sep 17 00:00:00 2001
+From: BOFH <bofh@example.com>
+Date: Fri, 13 May 2022 23:04:14 +0000
+Subject: [PATCH] binary patch test
+Message-ID: <binary-patch-test@example>
+
+---
+ zero | Bin 0 -> 1 bytes
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 zero
+
+diff --git a/zero b/zero
+new file mode 100644
+index 0000000000000000000000000000000000000000..f76dd238ade08917e6712764a16a22005a50573d
+GIT binary patch
+literal 1
+IcmZPo000310RR91
+
+literal 0
+HcmV?d00001
diff --git a/t/search.t b/t/search.t
index 47a67f7f..13210ff5 100644
--- a/t/search.t
+++ b/t/search.t
@@ -533,6 +533,21 @@ $ibx->with_umask(sub {
         is($query->('s:"mail header experiments"')->[0]->{mid},
                 '20200418222508.GA13918@dcvr',
                 'Subject search reaches inside message/rfc822');
+
+        $doc_id = $rw->add_message(eml_load('t/data/binary.patch'));
+        $rw->commit_txn_lazy;
+        $ibx->search->reopen;
+        my $res = $query->('HcmV');
+        is_deeply($res, [], 'no results against trailer');
+        $res = $query->('IcmZPo000310RR91');
+        is_deeply($res, [], 'no results against 1-byte binary patch');
+        $res = $query->('"GIT binary patch"');
+        is(scalar(@$res), 1, 'got binary result from "GIT binary patch"');
+        is($res->[0]->{mid}, 'binary-patch-test@example', 'msgid for binary');
+        my $s = $query->('"literal 1"');
+        is_deeply($s, $res, 'got binary result from exact literal size');
+        $s = $query->('"literal 2"');
+        is_deeply($s, [], 'no results for wrong size');
 });
 
 SKIP: {