From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.1 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF, T_SCC_BODY_TEXT_LINE shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 7F3241F72A; Mon, 20 Jun 2022 19:27:30 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1655753250; bh=HL+VzS/3ocGPLxW18FpS8q9UdEoVxViCsFbcmOIOXBw=; h=From:To:Subject:Date:In-Reply-To:References:From; b=kWFivbPTjdK+Qc/DnOSeV2ysWSygl6Vougt2bpzTtatO2k9vw5iJ18+dKDcbuG0dX WHmBYTa8Z7Nqr/cimFT1B+mxHX+nwUx9H7aIWNNlM/tz9jmT3x/tH63gO29QhvReRJ 6EKUCZubse/Cn/MzUHEBJkjHooVILVFzyYn7slM4= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 3/3] search: do not index base-85 binary patches Date: Mon, 20 Jun 2022 19:27:30 +0000 Message-Id: <20220620192730.550803-4-e@80x24.org> In-Reply-To: <20220620192730.550803-1-e@80x24.org> References: <20220620192730.550803-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Base-85 binary patches generated by git lead to many false positives, so skip over gibberish words which may occur in them. To avoid regressions in search results, continue to allow searching for exact size matches (via "literal $SIZE") and the phrase "GIT binary patch" for the mere presence of a binary patch. 
--- MANIFEST | 1 + TODO | 2 -- lib/PublicInbox/SearchIdx.pm | 52 +++++++++++++++++++++++++----------- t/data/binary.patch | 20 ++++++++++++++ t/search.t | 15 +++++++++++ 5 files changed, 72 insertions(+), 18 deletions(-) create mode 100644 t/data/binary.patch diff --git a/MANIFEST b/MANIFEST index ce2cf4a5..607a4c5b 100644 --- a/MANIFEST +++ b/MANIFEST @@ -397,6 +397,7 @@ t/content_hash.t t/convert-compact.t t/data-gen/.gitignore t/data/0001.patch +t/data/binary.patch t/data/message_embed.eml t/dir_idle.t t/ds-kqxs.t diff --git a/TODO b/TODO index 43eee063..7a27fdd2 100644 --- a/TODO +++ b/TODO @@ -153,8 +153,6 @@ all need to be considered for everything we introduce) * support UUCP addresses for legacy archives -* decode (skip indexing of) base-85 binary patches to avoid false-positives - * support pipelining as an IMAP/NNTP client for -watch + lei * auto-detect and reload on TLS cert+key changes in daemons diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 53ec23a5..cbfe7816 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -36,9 +36,8 @@ our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff : # assume a typical 64-bit system has 8x more RAM than a # typical 32-bit system: (($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024); - use constant DEBUG => !!$ENV{DEBUG}; - +my $BASE85 = qr/\A[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+\z/; my $xapianlevels = qr/\A(?:full|medium)\z/; my $hex = '[a-f0-9]'; my $OID = $hex .'{40,}'; @@ -258,21 +257,42 @@ sub index_diff ($$$) { my ($self, $txt, $doc) = @_; my %seen; my $in_diff; - my @xnq; - my $xnq = \@xnq; - foreach (split(/\n/, $txt)) { - if ($in_diff && s/^ //) { # diff context + my $xnq = []; + my @l = split(/\n/, $$txt); + undef $$txt; + while (defined($_ = shift @l)) { + if ($in_diff && /^GIT binary patch/) { + push @$xnq, $_; + while (@l && $l[0] =~ /^literal /) { + # TODO allow searching by size range? 
+ # allows searching by exact size via: + # "literal $SIZE" + push @$xnq, shift(@l); + + # skip base85 and empty lines + while (@l && ($l[0] =~ /$BASE85/o || + $l[0] !~ /\S/)) { + shift @l; + } + # loop hits trailing "literal 0\nHcmV?d00001\n" + } + } elsif ($in_diff && s/^ //) { # diff context index_diff_inc($self, $_, 'XDFCTX', $xnq); } elsif (/^-- $/) { # email signature begins $in_diff = undef; - } elsif (m!^diff --git "?[^/]+/.+ "?[^/]+/.+\z!) { - # wait until "---" and "+++" to capture filenames + } elsif (m!^diff --git ("?[^/]+/.+) ("?[^/]+/.+)\z!) { + # capture filenames here for binary diffs: + my ($fa, $fb) = ($1, $2); + push @$xnq, $_; $in_diff = 1; - push @xnq, $_; + $fa = (split(m'/', git_unquote($fa), 2))[1]; + $fb = (split(m'/', git_unquote($fb), 2))[1]; + $seen{$fa}++ or index_diff_inc($self, $fa, 'XDFN', $xnq); + $seen{$fb}++ or index_diff_inc($self, $fb, 'XDFN', $xnq); # traditional diff: } elsif (m/^diff -(.+) (\S+) (\S+)$/) { my ($opt, $fa, $fb) = ($1, $2, $3); - push @xnq, $_; + push @$xnq, $_; # only support unified: next unless $opt =~ /[uU]/; $in_diff = index_old_diff_fn($self, \%seen, $fa, $fb, @@ -288,8 +308,8 @@ sub index_diff ($$$) { $seen{$fn}++ or index_diff_inc($self, $fn, 'XDFN', $xnq); $in_diff = 1; } elsif (/^--- (\S+)/) { - $in_diff = $1; - push @xnq, $_; + $in_diff = $1; # old diff filename + push @$xnq, $_; } elsif (defined $in_diff && /^\+\+\+ (\S+)/) { $in_diff = index_old_diff_fn($self, \%seen, $in_diff, $1, $xnq); @@ -315,19 +335,19 @@ sub index_diff ($$$) { /^(?:dis)?similarity index / || /^\\ No newline at end of file/ || /^Binary files .* differ/) { - push @xnq, $_; + push @$xnq, $_; } elsif ($_ eq '') { # possible to be in diff context, some mail may be # stripped by MUA or even GNU diff(1). 
"git apply" # treats a bare "\n" as diff context, too } else { - push @xnq, $_; + push @$xnq, $_; warn "non-diff line: $_\n" if DEBUG && $_ ne ''; $in_diff = undef; } } - index_text($self, join("\n", @xnq), 1, 'XNQ'); + index_text($self, join("\n", @$xnq), 1, 'XNQ'); } sub index_xapian { # msg_iter callback @@ -373,7 +393,7 @@ sub index_xapian { # msg_iter callback } else { # does it look like a diff? if ($txt =~ /^(?:diff|---|\+\+\+) /ms) { - index_diff($self, $txt, $doc); + index_diff($self, \$txt, $doc); } else { index_text($self, $txt, 1, 'XNQ'); } diff --git a/t/data/binary.patch b/t/data/binary.patch new file mode 100644 index 00000000..58717abe --- /dev/null +++ b/t/data/binary.patch @@ -0,0 +1,20 @@ +From 7a1921ba7bd99c63ad6dc6ec0791691ee80e279a Mon Sep 17 00:00:00 2001 +From: BOFH +Date: Fri, 13 May 2022 23:04:14 +0000 +Subject: [PATCH] binary patch test +Message-ID: <binary-patch-test@example> + +--- + zero | Bin 0 -> 1 bytes + 1 file changed, 0 insertions(+), 0 deletions(-) + create mode 100644 zero + +diff --git a/zero b/zero +new file mode 100644 +index 0000000000000000000000000000000000000000..f76dd238ade08917e6712764a16a22005a50573d +GIT binary patch +literal 1 +IcmZPo000310RR91 + +literal 0 +HcmV?d00001 diff --git a/t/search.t b/t/search.t index 47a67f7f..13210ff5 100644 --- a/t/search.t +++ b/t/search.t @@ -533,6 +533,21 @@ $ibx->with_umask(sub { is($query->('s:"mail header experiments"')->[0]->{mid}, '20200418222508.GA13918@dcvr', 'Subject search reaches inside message/rfc822'); + + $doc_id = $rw->add_message(eml_load('t/data/binary.patch')); + $rw->commit_txn_lazy; + $ibx->search->reopen; + my $res = $query->('HcmV'); + is_deeply($res, [], 'no results against trailer'); + $res = $query->('IcmZPo000310RR91'); + is_deeply($res, [], 'no results against 1-byte binary patch'); + $res = $query->('"GIT binary patch"'); + is(scalar(@$res), 1, 'got binary result from "GIT binary patch"'); + is($res->[0]->{mid}, 'binary-patch-test@example', 'msgid for binary'); + my $s = 
$query->('"literal 1"'); + is_deeply($s, $res, 'got binary result from exact literal size'); + $s = $query->('"literal 2"'); + is_deeply($s, [], 'no results for wrong size'); }); SKIP: {