about summary refs log tree commit
diff options
context:
space:
mode:
author Eric Wong <e@yhbt.net> 2020-06-29 10:34:17 +0000
committer Eric Wong <e@yhbt.net> 2020-06-30 03:05:21 +0000
commit 1b356e8d587a9c1bb92a11ffce255a3d3c25747c (patch)
tree 883f7f9a1efc0525d85542af26cd9726744f7155
parent fd7140782db39585e90e59f5fd0801bf42490570 (diff)
download public-inbox-1b356e8d587a9c1bb92a11ffce255a3d3c25747c.tar.gz
watch: check for duplicates in ->over before spamcheck
It's cheaper to check for duplicates than run `spamc'
repeatedly when rechecking.  We already do this for
v1 by using the "ls" command with fast-import,
but v2 requires checking against over.sqlite3.
-rw-r--r-- lib/PublicInbox/Import.pm 2
-rw-r--r-- lib/PublicInbox/V2Writable.pm 2
-rw-r--r-- lib/PublicInbox/WatchMaildir.pm 21
3 files changed, 22 insertions, 3 deletions
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index ae508cd8..fb813159 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -387,7 +387,7 @@ sub add {
 
         # spam check:
         if ($check_cb) {
-                $mime = $check_cb->($mime) or return;
+                $mime = $check_cb->($mime, $self->{-inbox}) or return;
         }
 
         my $blob = $self->{mark}++;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 8b31b69a..528f5e9a 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -171,7 +171,7 @@ sub _add {
 
         # spam check:
         if ($check_cb) {
-                $mime = $check_cb->($mime) or return;
+                $mime = $check_cb->($mime, $self->{-inbox}) or return;
         }
 
         # All pipes (> $^F) known to Perl 5.6+ have FD_CLOEXEC set,
diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index efc9849a..ec28a303 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -12,6 +12,8 @@ use PublicInbox::Filter::Base qw(REJECT);
 use PublicInbox::Spamcheck;
 use PublicInbox::Sigfd;
 use PublicInbox::DS qw(now);
+use PublicInbox::MID qw(mids);
+use PublicInbox::ContentHash qw(content_hash);
 use POSIX qw(_exit);
 *mime_from_path = \&PublicInbox::InboxWritable::mime_from_path;
 
@@ -988,10 +990,27 @@ sub _importer_for {
         $importers->{"$ibx"} = $im;
 }
 
+# XXX consider sharing with V2Writable, this only requires read-only access
+sub content_exists ($$) { # ($ibx, $eml): returns 1 if a message stored in $ibx has the same content_hash as $eml
+        my ($ibx, $eml) = @_;
+        my $over = $ibx->over or return; # nothing to search when $ibx->over is false; caller sees "no duplicate"
+        my $mids = mids($eml); # array ref (dereferenced below); presumably the Message-IDs of $eml
+        my $chash = content_hash($eml); # computed once, compared against every candidate
+        my ($id, $prev); # cursor state handed to next_by_mid by reference; NOTE(review): not reset between $mid iterations -- confirm next_by_mid tolerates that
+        for my $mid (@$mids) {
+                while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) { # loop ends when next_by_mid returns false
+                        my $cmp = $ibx->smsg_eml($smsg) or return; # a candidate that cannot be loaded ends the whole check with a false result
+                        return 1 if $chash eq content_hash($cmp); # identical hash => treat as duplicate
+                }
+        }
+        undef; # no candidate matched
+}
+
 sub _spamcheck_cb {
         my ($sc) = @_;
         sub {
-                my ($mime) = @_;
+                my ($mime, $ibx) = @_;
+                return if content_exists($ibx, $mime);
                 my $tmp = '';
                 if ($sc->spamcheck($mime, \$tmp)) {
                         return PublicInbox::Eml->new(\$tmp);