about summary refs log tree commit homepage
diff options
context:
space:
mode:
-rw-r--r--MANIFEST1
-rw-r--r--TODO2
-rw-r--r--lib/PublicInbox/SearchIdx.pm52
-rw-r--r--t/data/binary.patch20
-rw-r--r--t/search.t15
5 files changed, 72 insertions, 18 deletions
diff --git a/MANIFEST b/MANIFEST
index ce2cf4a5..607a4c5b 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -397,6 +397,7 @@ t/content_hash.t
 t/convert-compact.t
 t/data-gen/.gitignore
 t/data/0001.patch
+t/data/binary.patch
 t/data/message_embed.eml
 t/dir_idle.t
 t/ds-kqxs.t
diff --git a/TODO b/TODO
index 43eee063..7a27fdd2 100644
--- a/TODO
+++ b/TODO
@@ -153,8 +153,6 @@ all need to be considered for everything we introduce)
 
 * support UUCP addresses for legacy archives
 
-* decode (skip indexing of) base-85 binary patches to avoid false-positives
-
 * support pipelining as an IMAP/NNTP client for -watch + lei
 
 * auto-detect and reload on TLS cert+key changes in daemons
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 53ec23a5..cbfe7816 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -36,9 +36,8 @@ our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff :
         # assume a typical 64-bit system has 8x more RAM than a
         # typical 32-bit system:
         (($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024);
-
 use constant DEBUG => !!$ENV{DEBUG};
-
+my $BASE85 = qr/\A[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+\z/;
 my $xapianlevels = qr/\A(?:full|medium)\z/;
 my $hex = '[a-f0-9]';
 my $OID = $hex .'{40,}';
@@ -258,21 +257,42 @@ sub index_diff ($$$) {
         my ($self, $txt, $doc) = @_;
         my %seen;
         my $in_diff;
-        my @xnq;
-        my $xnq = \@xnq;
-        foreach (split(/\n/, $txt)) {
-                if ($in_diff && s/^ //) { # diff context
+        my $xnq = [];
+        my @l = split(/\n/, $$txt);
+        undef $$txt;
+        while (defined($_ = shift @l)) {
+                if ($in_diff && /^GIT binary patch/) {
+                        push @$xnq, $_;
+                        while (@l && $l[0] =~ /^literal /) {
+                                # TODO allow searching by size range?
+                                # allows searching by exact size via:
+                                # "literal $SIZE"
+                                push @$xnq, shift(@l);
+
+                                # skip base85 and empty lines
+                                while (@l && ($l[0] =~ /$BASE85/o ||
+                                                $l[0] !~ /\S/)) {
+                                        shift @l;
+                                }
+                                # loop hits trailing "literal 0\nHcmV?d00001\n"
+                        }
+                } elsif ($in_diff && s/^ //) { # diff context
                         index_diff_inc($self, $_, 'XDFCTX', $xnq);
                 } elsif (/^-- $/) { # email signature begins
                         $in_diff = undef;
-                } elsif (m!^diff --git "?[^/]+/.+ "?[^/]+/.+\z!) {
-                        # wait until "---" and "+++" to capture filenames
+                } elsif (m!^diff --git ("?[^/]+/.+) ("?[^/]+/.+)\z!) {
+                        # capture filenames here for binary diffs:
+                        my ($fa, $fb) = ($1, $2);
+                        push @$xnq, $_;
                         $in_diff = 1;
-                        push @xnq, $_;
+                        $fa = (split(m'/', git_unquote($fa), 2))[1];
+                        $fb = (split(m'/', git_unquote($fb), 2))[1];
+                        $seen{$fa}++ or index_diff_inc($self, $fa, 'XDFN', $xnq);
+                        $seen{$fb}++ or index_diff_inc($self, $fb, 'XDFN', $xnq);
                 # traditional diff:
                 } elsif (m/^diff -(.+) (\S+) (\S+)$/) {
                         my ($opt, $fa, $fb) = ($1, $2, $3);
-                        push @xnq, $_;
+                        push @$xnq, $_;
                         # only support unified:
                         next unless $opt =~ /[uU]/;
                         $in_diff = index_old_diff_fn($self, \%seen, $fa, $fb,
@@ -288,8 +308,8 @@ sub index_diff ($$$) {
                         $seen{$fn}++ or index_diff_inc($self, $fn, 'XDFN', $xnq);
                         $in_diff = 1;
                 } elsif (/^--- (\S+)/) {
-                        $in_diff = $1;
-                        push @xnq, $_;
+                        $in_diff = $1; # old diff filename
+                        push @$xnq, $_;
                 } elsif (defined $in_diff && /^\+\+\+ (\S+)/) {
                         $in_diff = index_old_diff_fn($self, \%seen, $in_diff,
                                                         $1, $xnq);
@@ -315,19 +335,19 @@ sub index_diff ($$$) {
                                 /^(?:dis)?similarity index / ||
                                 /^\\ No newline at end of file/ ||
                                 /^Binary files .* differ/) {
-                        push @xnq, $_;
+                        push @$xnq, $_;
                 } elsif ($_ eq '') {
                         # possible to be in diff context, some mail may be
                         # stripped by MUA or even GNU diff(1).  "git apply"
                         # treats a bare "\n" as diff context, too
                 } else {
-                        push @xnq, $_;
+                        push @$xnq, $_;
                         warn "non-diff line: $_\n" if DEBUG && $_ ne '';
                         $in_diff = undef;
                 }
         }
 
-        index_text($self, join("\n", @xnq), 1, 'XNQ');
+        index_text($self, join("\n", @$xnq), 1, 'XNQ');
 }
 
 sub index_xapian { # msg_iter callback
@@ -373,7 +393,7 @@ sub index_xapian { # msg_iter callback
                 } else {
                         # does it look like a diff?
                         if ($txt =~ /^(?:diff|---|\+\+\+) /ms) {
-                                index_diff($self, $txt, $doc);
+                                index_diff($self, \$txt, $doc);
                         } else {
                                 index_text($self, $txt, 1, 'XNQ');
                         }
diff --git a/t/data/binary.patch b/t/data/binary.patch
new file mode 100644
index 00000000..58717abe
--- /dev/null
+++ b/t/data/binary.patch
@@ -0,0 +1,20 @@
+From 7a1921ba7bd99c63ad6dc6ec0791691ee80e279a Mon Sep 17 00:00:00 2001
+From: BOFH <bofh@example.com>
+Date: Fri, 13 May 2022 23:04:14 +0000
+Subject: [PATCH] binary patch test
+Message-ID: <binary-patch-test@example>
+
+---
+ zero | Bin 0 -> 1 bytes
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 zero
+
+diff --git a/zero b/zero
+new file mode 100644
+index 0000000000000000000000000000000000000000..f76dd238ade08917e6712764a16a22005a50573d
+GIT binary patch
+literal 1
+IcmZPo000310RR91
+
+literal 0
+HcmV?d00001
diff --git a/t/search.t b/t/search.t
index 47a67f7f..13210ff5 100644
--- a/t/search.t
+++ b/t/search.t
@@ -533,6 +533,21 @@ $ibx->with_umask(sub {
         is($query->('s:"mail header experiments"')->[0]->{mid},
                 '20200418222508.GA13918@dcvr',
                 'Subject search reaches inside message/rfc822');
+
+        $doc_id = $rw->add_message(eml_load('t/data/binary.patch'));
+        $rw->commit_txn_lazy;
+        $ibx->search->reopen;
+        my $res = $query->('HcmV');
+        is_deeply($res, [], 'no results against trailer');
+        $res = $query->('IcmZPo000310RR91');
+        is_deeply($res, [], 'no results against 1-byte binary patch');
+        $res = $query->('"GIT binary patch"');
+        is(scalar(@$res), 1, 'got binary result from "GIT binary patch"');
+        is($res->[0]->{mid}, 'binary-patch-test@example', 'msgid for binary');
+        my $s = $query->('"literal 1"');
+        is_deeply($s, $res, 'got binary result from exact literal size');
+        $s = $query->('"literal 2"');
+        is_deeply($s, [], 'no results for wrong size');
 });
 
 SKIP: {