From 1b356e8d587a9c1bb92a11ffce255a3d3c25747c Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 29 Jun 2020 10:34:17 +0000 Subject: watch: check for duplicates in ->over before spamcheck It's cheaper to check for duplicates than run `spamc' repeatedly when rechecking. We already do this for v1 by using the "ls" command with fast-import, but v2 requires checking against over.sqlite3. --- lib/PublicInbox/WatchMaildir.pm | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'lib/PublicInbox/WatchMaildir.pm') diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm index efc9849a..ec28a303 100644 --- a/lib/PublicInbox/WatchMaildir.pm +++ b/lib/PublicInbox/WatchMaildir.pm @@ -12,6 +12,8 @@ use PublicInbox::Filter::Base qw(REJECT); use PublicInbox::Spamcheck; use PublicInbox::Sigfd; use PublicInbox::DS qw(now); +use PublicInbox::MID qw(mids); +use PublicInbox::ContentHash qw(content_hash); use POSIX qw(_exit); *mime_from_path = \&PublicInbox::InboxWritable::mime_from_path; @@ -988,10 +990,27 @@ sub _importer_for { $importers->{"$ibx"} = $im; } +# XXX consider sharing with V2Writable, this only requires read-only access +sub content_exists ($$) { + my ($ibx, $eml) = @_; + my $over = $ibx->over or return; + my $mids = mids($eml); + my $chash = content_hash($eml); + my ($id, $prev); + for my $mid (@$mids) { + while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) { + my $cmp = $ibx->smsg_eml($smsg) or return; + return 1 if $chash eq content_hash($cmp); + } + } + undef; +} + sub _spamcheck_cb { my ($sc) = @_; sub { - my ($mime) = @_; + my ($mime, $ibx) = @_; + return if content_exists($ibx, $mime); my $tmp = ''; if ($sc->spamcheck($mime, \$tmp)) { return PublicInbox::Eml->new(\$tmp); -- cgit v1.2.3-24-ge0c7