From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 5C7071F5AF for ; Mon, 29 Jun 2020 10:34:23 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 1/5] watch: check for duplicates in ->over before spamcheck Date: Mon, 29 Jun 2020 10:34:17 +0000 Message-Id: <20200629103421.31016-2-e@yhbt.net> In-Reply-To: <20200629103421.31016-1-e@yhbt.net> References: <20200629103421.31016-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: It's cheaper to check for duplicates than run `spamc' repeatedly when rechecking. We already do this for v1 by using the "ls" command with fast-import, but v2 requires checking against over.sqlite3. --- lib/PublicInbox/Import.pm | 2 +- lib/PublicInbox/V2Writable.pm | 2 +- lib/PublicInbox/WatchMaildir.pm | 21 ++++++++++++++++++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index ae508cd8013..fb813159ef7 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -387,7 +387,7 @@ sub add { # spam check: if ($check_cb) { - $mime = $check_cb->($mime) or return; + $mime = $check_cb->($mime, $self->{-inbox}) or return; } my $blob = $self->{mark}++; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 8b31b69a62f..528f5e9a565 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -171,7 +171,7 @@ sub _add { # spam check: if ($check_cb) { - $mime = $check_cb->($mime) or return; + $mime = $check_cb->($mime, $self->{-inbox}) or return; } # All pipes (> $^F) known to Perl 5.6+ have FD_CLOEXEC set, diff --git a/lib/PublicInbox/WatchMaildir.pm 
b/lib/PublicInbox/WatchMaildir.pm index efc9849a6ef..ec28a3034ff 100644 --- a/lib/PublicInbox/WatchMaildir.pm +++ b/lib/PublicInbox/WatchMaildir.pm @@ -12,6 +12,8 @@ use PublicInbox::Filter::Base qw(REJECT); use PublicInbox::Spamcheck; use PublicInbox::Sigfd; use PublicInbox::DS qw(now); +use PublicInbox::MID qw(mids); +use PublicInbox::ContentHash qw(content_hash); use POSIX qw(_exit); *mime_from_path = \&PublicInbox::InboxWritable::mime_from_path; @@ -988,10 +990,27 @@ sub _importer_for { $importers->{"$ibx"} = $im; } +# XXX consider sharing with V2Writable, this only requires read-only access +sub content_exists ($$) { + my ($ibx, $eml) = @_; + my $over = $ibx->over or return; + my $mids = mids($eml); + my $chash = content_hash($eml); + my ($id, $prev); + for my $mid (@$mids) { + while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) { + my $cmp = $ibx->smsg_eml($smsg) or return; + return 1 if $chash eq content_hash($cmp); + } + } + undef; +} + sub _spamcheck_cb { my ($sc) = @_; sub { - my ($mime) = @_; + my ($mime, $ibx) = @_; + return if content_exists($ibx, $mime); my $tmp = ''; if ($sc->spamcheck($mime, \$tmp)) { return PublicInbox::Eml->new(\$tmp);