From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.1 (2015-04-28) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.1 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 8E7EA208EB for ; Sun, 29 Jul 2018 09:34:41 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 3/3] mda: v2: ensure message bodies are indexed Date: Sun, 29 Jul 2018 09:34:41 +0000 Message-Id: <20180729093441.5250-4-e@80x24.org> In-Reply-To: <20180729093441.5250-1-e@80x24.org> References: <20180729093441.5250-1-e@80x24.org> List-Id: We must not clobber the original message string, as Email::MIME(*) still needs it for iterating through parts in SearchIdx (but not when handing it as a raw string to git-fast-import). I've noticed message bodies (especially dfpre/dfpost) were not getting indexed when going through -mda (no problems with -watch). This also did not affect v1 repos, since indexing is a separate process for v1 and requires re-reading the data from git. 
(*) tested Email::MIME 1.937 on Debian stretch --- MANIFEST | 1 + script/public-inbox-mda | 1 - t/data/0001.patch | 46 +++++++++++++++++++++++++++++++++++++++++ t/v2mda.t | 10 +++++++++ t/watch_maildir_v2.t | 15 ++++++++++++++ 5 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 t/data/0001.patch diff --git a/MANIFEST b/MANIFEST index fd74a43..003c3c5 100644 --- a/MANIFEST +++ b/MANIFEST @@ -146,6 +146,7 @@ t/config.t t/config_limiter.t t/content_id.t t/convert-compact.t +t/data/0001.patch t/emergency.t t/fail-bin/spamc t/feed.t diff --git a/script/public-inbox-mda b/script/public-inbox-mda index 2a31537..2b7f298 100755 --- a/script/public-inbox-mda +++ b/script/public-inbox-mda @@ -51,7 +51,6 @@ $emm = PublicInbox::Emergency->new($emergency); $emm->prepare(\$str); $ems = $ems->abort; my $mime = PublicInbox::MIME->new(\$str); -$str = ''; do_exit(0) unless $spam_ok; my $fcfg = $dst->{filter} || ''; diff --git a/t/data/0001.patch b/t/data/0001.patch new file mode 100644 index 0000000..b7964a2 --- /dev/null +++ b/t/data/0001.patch @@ -0,0 +1,46 @@ +From: Eric Wong +Date: Fri, 20 Jul 2018 07:21:41 +0000 +To: test@example.com +Subject: [PATCH] search: use boolean prefix for filenames in diffs, too +Message-ID: <20180720072141.GA15957@example> + +Filenames within a project tend to be reasonably stable within a +project and I plan on having automated searches hit these. + +Also, using no term prefix at all (the default for searching) +still allows probabilistic searches on everything that's in a +"git diff", including the blob names which were just made +boolean. + +Note, attachment filenames ("n:" prefix) will stil use +probabilistic search, as they're hardly standardized. 
+--- + lib/PublicInbox/Search.pm | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm +index 090d998b6c2c..6e006fd73b1d 100644 +--- a/lib/PublicInbox/Search.pm ++++ b/lib/PublicInbox/Search.pm +@@ -53,6 +53,9 @@ my %bool_pfx_external = ( + dfpre => 'XDFPRE', + dfpost => 'XDFPOST', + dfblob => 'XDFPRE XDFPOST', ++ dfn => 'XDFN', ++ dfa => 'XDFA', ++ dfb => 'XDFB', + ); + + my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST'; +@@ -72,9 +75,6 @@ my %prob_prefix = ( + + q => 'XQUOT', + nq => $non_quoted_body, +- dfn => 'XDFN', +- dfa => 'XDFA', +- dfb => 'XDFB', + dfhh => 'XDFHH', + dfctx => 'XDFCTX', + +-- +^_^ diff --git a/t/v2mda.t b/t/v2mda.t index 7df3a43..6145720 100644 --- a/t/v2mda.t +++ b/t/v2mda.t @@ -65,4 +65,14 @@ my $msgs = $ibx->search->query(''); my $saved = $ibx->smsg_mime($msgs->[0]); is($saved->{mime}->as_string, $mime->as_string, 'injected message'); +my $patch = 't/data/0001.patch'; +open my $fh, '<', $patch or die "failed to open $patch: $!\n"; +$rdr = { 0 => fileno($fh) }; +ok(PublicInbox::Import::run_die(['public-inbox-mda'], undef, $rdr), + 'mda delivered a patch'); +my $post = $ibx->search->reopen->query('dfpost:6e006fd7'); +is(scalar(@$post), 1, 'got one result for dfpost'); +my $pre = $ibx->search->query('dfpre:090d998'); +is(scalar(@$pre), 1, 'got one result for dfpre'); +is($post->[0]->{blob}, $pre->[0]->{blob}, 'same message in both cases'); done_testing(); diff --git a/t/watch_maildir_v2.t b/t/watch_maildir_v2.t index a76e413..fc002dc 100644 --- a/t/watch_maildir_v2.t +++ b/t/watch_maildir_v2.t @@ -120,6 +120,21 @@ More majordomo info at http://vger.kernel.org/majordomo-info.html\n); is($nr, 1, 'inbox has one mail after spamc OK-ed a message'); my $mref = $ibx->msg_by_smsg($msgs->[0]); like($$mref, qr/something\n\z/s, 'message scrubbed on import'); + delete $config->{'publicinboxwatch.spamcheck'}; +} + +{ + my $patch = 't/data/0001.patch'; + 
open my $fh, '<', $patch or die "failed to open $patch: $!\n"; + $msg = eval { local $/; <$fh> }; + PublicInbox::Emergency->new($maildir)->prepare(\$msg); + PublicInbox::WatchMaildir->new($config)->scan('full'); + ($nr, $msgs) = $srch->reopen->query('dfpost:6e006fd7'); + is($nr, 1, 'diff postimage found'); + my $post = $msgs->[0]; + ($nr, $msgs) = $srch->query('dfpre:090d998b6c2c'); + is($nr, 1, 'diff preimage found'); + is($post->{blob}, $msgs->[0]->{blob}, 'same message'); } done_testing; -- EW