user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 3/3] search: do not index base-85 binary patches
  2022-06-20 19:27  6% [PATCH 0/3] search indexing improvements Eric Wong
@ 2022-06-20 19:27  5% ` Eric Wong
  0 siblings, 0 replies; 3+ results
From: Eric Wong @ 2022-06-20 19:27 UTC (permalink / raw)
  To: meta

Base-85 binary patches generated by git lead to many false
positives, so skip over gibberish words which may occur in them.
To avoid regressions in search results, continue to allow
searching for exact size matches (via "literal $SIZE") and the
phrase "GIT binary patch" for the mere presence of a binary
patch.
---
 MANIFEST                     |  1 +
 TODO                         |  2 --
 lib/PublicInbox/SearchIdx.pm | 52 +++++++++++++++++++++++++-----------
 t/data/binary.patch          | 20 ++++++++++++++
 t/search.t                   | 15 +++++++++++
 5 files changed, 72 insertions(+), 18 deletions(-)
 create mode 100644 t/data/binary.patch

diff --git a/MANIFEST b/MANIFEST
index ce2cf4a5..607a4c5b 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -397,6 +397,7 @@ t/content_hash.t
 t/convert-compact.t
 t/data-gen/.gitignore
 t/data/0001.patch
+t/data/binary.patch
 t/data/message_embed.eml
 t/dir_idle.t
 t/ds-kqxs.t
diff --git a/TODO b/TODO
index 43eee063..7a27fdd2 100644
--- a/TODO
+++ b/TODO
@@ -153,8 +153,6 @@ all need to be considered for everything we introduce)
 
 * support UUCP addresses for legacy archives
 
-* decode (skip indexing of) base-85 binary patches to avoid false-positives
-
 * support pipelining as an IMAP/NNTP client for -watch + lei
 
 * auto-detect and reload on TLS cert+key changes in daemons
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 53ec23a5..cbfe7816 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -36,9 +36,8 @@ our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff :
 	# assume a typical 64-bit system has 8x more RAM than a
 	# typical 32-bit system:
 	(($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024);
-
 use constant DEBUG => !!$ENV{DEBUG};
-
+my $BASE85 = qr/\A[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+\z/;
 my $xapianlevels = qr/\A(?:full|medium)\z/;
 my $hex = '[a-f0-9]';
 my $OID = $hex .'{40,}';
@@ -258,21 +257,42 @@ sub index_diff ($$$) {
 	my ($self, $txt, $doc) = @_;
 	my %seen;
 	my $in_diff;
-	my @xnq;
-	my $xnq = \@xnq;
-	foreach (split(/\n/, $txt)) {
-		if ($in_diff && s/^ //) { # diff context
+	my $xnq = [];
+	my @l = split(/\n/, $$txt);
+	undef $$txt;
+	while (defined($_ = shift @l)) {
+		if ($in_diff && /^GIT binary patch/) {
+			push @$xnq, $_;
+			while (@l && $l[0] =~ /^literal /) {
+				# TODO allow searching by size range?
+				# allows searching by exact size via:
+				# "literal $SIZE"
+				push @$xnq, shift(@l);
+
+				# skip base85 and empty lines
+				while (@l && ($l[0] =~ /$BASE85/o ||
+						$l[0] !~ /\S/)) {
+					shift @l;
+				}
+				# loop hits trailing "literal 0\nHcmV?d00001\n"
+			}
+		} elsif ($in_diff && s/^ //) { # diff context
 			index_diff_inc($self, $_, 'XDFCTX', $xnq);
 		} elsif (/^-- $/) { # email signature begins
 			$in_diff = undef;
-		} elsif (m!^diff --git "?[^/]+/.+ "?[^/]+/.+\z!) {
-			# wait until "---" and "+++" to capture filenames
+		} elsif (m!^diff --git ("?[^/]+/.+) ("?[^/]+/.+)\z!) {
+			# capture filenames here for binary diffs:
+			my ($fa, $fb) = ($1, $2);
+			push @$xnq, $_;
 			$in_diff = 1;
-			push @xnq, $_;
+			$fa = (split(m'/', git_unquote($fa), 2))[1];
+			$fb = (split(m'/', git_unquote($fb), 2))[1];
+			$seen{$fa}++ or index_diff_inc($self, $fa, 'XDFN', $xnq);
+			$seen{$fb}++ or index_diff_inc($self, $fb, 'XDFN', $xnq);
 		# traditional diff:
 		} elsif (m/^diff -(.+) (\S+) (\S+)$/) {
 			my ($opt, $fa, $fb) = ($1, $2, $3);
-			push @xnq, $_;
+			push @$xnq, $_;
 			# only support unified:
 			next unless $opt =~ /[uU]/;
 			$in_diff = index_old_diff_fn($self, \%seen, $fa, $fb,
@@ -288,8 +308,8 @@ sub index_diff ($$$) {
 			$seen{$fn}++ or index_diff_inc($self, $fn, 'XDFN', $xnq);
 			$in_diff = 1;
 		} elsif (/^--- (\S+)/) {
-			$in_diff = $1;
-			push @xnq, $_;
+			$in_diff = $1; # old diff filename
+			push @$xnq, $_;
 		} elsif (defined $in_diff && /^\+\+\+ (\S+)/) {
 			$in_diff = index_old_diff_fn($self, \%seen, $in_diff,
 							$1, $xnq);
@@ -315,19 +335,19 @@ sub index_diff ($$$) {
 				/^(?:dis)?similarity index / ||
 				/^\\ No newline at end of file/ ||
 				/^Binary files .* differ/) {
-			push @xnq, $_;
+			push @$xnq, $_;
 		} elsif ($_ eq '') {
 			# possible to be in diff context, some mail may be
 			# stripped by MUA or even GNU diff(1).  "git apply"
 			# treats a bare "\n" as diff context, too
 		} else {
-			push @xnq, $_;
+			push @$xnq, $_;
 			warn "non-diff line: $_\n" if DEBUG && $_ ne '';
 			$in_diff = undef;
 		}
 	}
 
-	index_text($self, join("\n", @xnq), 1, 'XNQ');
+	index_text($self, join("\n", @$xnq), 1, 'XNQ');
 }
 
 sub index_xapian { # msg_iter callback
@@ -373,7 +393,7 @@ sub index_xapian { # msg_iter callback
 		} else {
 			# does it look like a diff?
 			if ($txt =~ /^(?:diff|---|\+\+\+) /ms) {
-				index_diff($self, $txt, $doc);
+				index_diff($self, \$txt, $doc);
 			} else {
 				index_text($self, $txt, 1, 'XNQ');
 			}
diff --git a/t/data/binary.patch b/t/data/binary.patch
new file mode 100644
index 00000000..58717abe
--- /dev/null
+++ b/t/data/binary.patch
@@ -0,0 +1,20 @@
+From 7a1921ba7bd99c63ad6dc6ec0791691ee80e279a Mon Sep 17 00:00:00 2001
+From: BOFH <bofh@example.com>
+Date: Fri, 13 May 2022 23:04:14 +0000
+Subject: [PATCH] binary patch test
+Message-ID: <binary-patch-test@example>
+
+---
+ zero | Bin 0 -> 1 bytes
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 zero
+
+diff --git a/zero b/zero
+new file mode 100644
+index 0000000000000000000000000000000000000000..f76dd238ade08917e6712764a16a22005a50573d
+GIT binary patch
+literal 1
+IcmZPo000310RR91
+
+literal 0
+HcmV?d00001
diff --git a/t/search.t b/t/search.t
index 47a67f7f..13210ff5 100644
--- a/t/search.t
+++ b/t/search.t
@@ -533,6 +533,21 @@ $ibx->with_umask(sub {
 	is($query->('s:"mail header experiments"')->[0]->{mid},
 		'20200418222508.GA13918@dcvr',
 		'Subject search reaches inside message/rfc822');
+
+	$doc_id = $rw->add_message(eml_load('t/data/binary.patch'));
+	$rw->commit_txn_lazy;
+	$ibx->search->reopen;
+	my $res = $query->('HcmV');
+	is_deeply($res, [], 'no results against trailer');
+	$res = $query->('IcmZPo000310RR91');
+	is_deeply($res, [], 'no results against 1-byte binary patch');
+	$res = $query->('"GIT binary patch"');
+	is(scalar(@$res), 1, 'got binary result from "GIT binary patch"');
+	is($res->[0]->{mid}, 'binary-patch-test@example', 'msgid for binary');
+	my $s = $query->('"literal 1"');
+	is_deeply($s, $res, 'got binary result from exact literal size');
+	$s = $query->('"literal 2"');
+	is_deeply($s, [], 'no results for wrong size');
 });
 
 SKIP: {

^ permalink raw reply related	[relevance 5%]

* [PATCH 0/3] search indexing improvements
@ 2022-06-20 19:27  6% Eric Wong
  2022-06-20 19:27  5% ` [PATCH 3/3] search: do not index base-85 binary patches Eric Wong
  0 siblings, 1 reply; 3+ results
From: Eric Wong @ 2022-06-20 19:27 UTC (permalink / raw)
  To: meta

Still stuck on POP3 account manglement, but here's some easy-ish
indexing changes for public-inbox-* tools.  These require a full
reindex with either public-inbox-index or public-inbox-extindex,
but old/new indexes should be fully compatible and should be
doable hot:

   public-inbox-index --no-fsync --reindex /path/to/v1-or-v2
   public-inbox-extindex --no-fsync --reindex --all /path/to/eidx

Will probably take 2 days or so on my own machine.

Note: lei doesn't support reindexing, yet, but will, soon...

Eric Wong (3):
  searchidx: use regexp as first arg for `split' op
  search: support "patchid:" prefix (git patch-id --stable)
  search: do not index base-85 binary patches

 MANIFEST                     |  1 +
 TODO                         |  5 ---
 lib/PublicInbox/Search.pm    |  5 ++-
 lib/PublicInbox/SearchIdx.pm | 75 ++++++++++++++++++++++++++----------
 t/data/binary.patch          | 20 ++++++++++
 t/extsearch.t                |  7 +++-
 t/search.t                   | 15 ++++++++
 t/v2mda.t                    | 10 ++++-
 8 files changed, 108 insertions(+), 30 deletions(-)
 create mode 100644 t/data/binary.patch

^ permalink raw reply	[relevance 6%]

* [PATCH] searchidx: do not index quoted Base-85 patches
@ 2023-02-20  9:21  7% Eric Wong
  0 siblings, 0 replies; 3+ results
From: Eric Wong @ 2023-02-20  9:21 UTC (permalink / raw)
  To: meta

Base-85 binary patches were a source of false-positives in results
and we've filtered out in non-quoted text since July 2022.
Unfortunately, people were quoting binary patch contents
in replies (*sigh*) and triggering false positives in search
results.  So we must filter out base-85-looking contents from
quoted text, too.

Followup-to: 8fda04081acde705 (search: do not index base-85 binary patches, 2022-06-20)
Followup-to: 840785917bc74c8e (searchidx: skip "delta $N" sections for base-85, 2022-07-19)
---
 lib/PublicInbox/SearchIdx.pm | 10 ++++++++--
 t/search.t                   | 13 +++++++++++--
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 257b83a5..fc464383 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -37,7 +37,7 @@ our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff :
 	# typical 32-bit system:
 	(($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024);
 use constant DEBUG => !!$ENV{DEBUG};
-my $BASE85 = qr/\A[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+\z/;
+my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/;
 my $xapianlevels = qr/\A(?:full|medium)\z/;
 my $hex = '[a-f0-9]';
 my $OID = $hex .'{40,}';
@@ -270,7 +270,7 @@ sub index_diff ($$$) {
 				push @$xnq, shift(@l);
 
 				# skip base85 and empty lines
-				while (@l && ($l[0] =~ /$BASE85/o ||
+				while (@l && ($l[0] =~ /\A$BASE85\h*\z/o ||
 						$l[0] !~ /\S/)) {
 					shift @l;
 				}
@@ -389,6 +389,12 @@ sub index_xapian { # msg_iter callback
 	undef $s; # free memory
 	for my $txt (@sections) {
 		if ($txt =~ /\A>/) {
+			if ($txt =~ /^[>\t ]+GIT binary patch\r?/sm) {
+				# get rid of Base-85 noise
+				$txt =~ s/^([>\h]+(?:literal|delta)
+						\x20[0-9]+\r?\n)
+					(?:[>\h]+$BASE85\h*\r?\n)+/$1/gsmx;
+			}
 			index_text($self, $txt, 0, 'XQUOT');
 		} else {
 			# does it look like a diff?
diff --git a/t/search.t b/t/search.t
index dded6c40..cf639a6d 100644
--- a/t/search.t
+++ b/t/search.t
@@ -534,7 +534,15 @@ $ibx->with_umask(sub {
 		'20200418222508.GA13918@dcvr',
 		'Subject search reaches inside message/rfc822');
 
-	$doc_id = $rw->add_message(eml_load('t/data/binary.patch'));
+	my $eml = eml_load('t/data/binary.patch');
+	my $body = $eml->body;
+	$rw->add_message($eml);
+
+	$body =~ s/^/> /gsm;
+	$eml = PublicInbox::Eml->new($eml->header_obj->as_string."\n".$body);
+	$eml->header_set('Message-ID', '<binary-patch-reply@example>');
+	$rw->add_message($eml);
+
 	$rw->commit_txn_lazy;
 	$ibx->search->reopen;
 	my $res = $query->('HcmV');
@@ -542,8 +550,9 @@ $ibx->with_umask(sub {
 	$res = $query->('IcmZPo000310RR91');
 	is_deeply($res, [], 'no results against 1-byte binary patch');
 	$res = $query->('"GIT binary patch"');
-	is(scalar(@$res), 1, 'got binary result from "GIT binary patch"');
+	is(scalar(@$res), 2, 'got binary results from "GIT binary patch"');
 	is($res->[0]->{mid}, 'binary-patch-test@example', 'msgid for binary');
+	is($res->[1]->{mid}, 'binary-patch-reply@example', 'msgid for reply');
 	my $s = $query->('"literal 1"');
 	is_deeply($s, $res, 'got binary result from exact literal size');
 	$s = $query->('"literal 2"');

^ permalink raw reply related	[relevance 7%]

Results 1-3 of 3 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2023-02-20  9:21  7% [PATCH] searchidx: do not index quoted Base-85 patches Eric Wong
2022-06-20 19:27  6% [PATCH 0/3] search indexing improvements Eric Wong
2022-06-20 19:27  5% ` [PATCH 3/3] search: do not index base-85 binary patches Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).