[PATCH] watchmaildir: spam checking and learning improvements

user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed

* [PATCH] watchmaildir: spam checking and learning improvements
@ 2019-01-10  9:02 Eric Wong
  2019-01-10  9:19 ` [REJECT] " Eric Wong
  0 siblings, 1 reply; 2+ messages in thread
From: Eric Wong @ 2019-01-10  9:02 UTC (permalink / raw)
  To: meta

It turns out spam training ("spamc -L spam") was never implemented
for -watch.  So implement that, and take advantage of an
existing feature to pass FDs directly to spamc(1) instead of
reconverting the MIME object into a string.

While we're at it, the spam-checking call is now only performed
once per message when a Maildir supports multiple inboxes.

This changes PublicInbox::Import::add (and V2Writable
equivalent) API to no longer perform spam-checking.

While ->add is one of the few documented API functions in POD,
the spamcheck callback was never documented, so it's gone.  I'm
not sure what I was thinking when I made the callback in ->add
instead of just checking spam before it was called...
---
 MANIFEST                        |  1 +
 lib/PublicInbox/Import.pm       |  7 +--
 lib/PublicInbox/V2Writable.pm   | 13 ++----
 lib/PublicInbox/WatchMaildir.pm | 76 ++++++++++++++++++++++-----------
 t/import.t                      |  3 +-
 t/log-bin/spamc                 |  6 +++
 t/watch_maildir.t               | 31 +++++++++++++-
 7 files changed, 92 insertions(+), 45 deletions(-)
 create mode 100755 t/log-bin/spamc

diff --git a/MANIFEST b/MANIFEST
index 73d1047..cabcf7b 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -173,6 +173,7 @@ t/import.t
 t/inbox.t
 t/init.t
 t/linkify.t
+t/log-bin/spamc
 t/main-bin/spamc
 t/mda.t
 t/mda_filter_rubylang.t
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index fd4255c..65a7e8c 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -356,7 +356,7 @@ ($$$)
 # returns undef on duplicate
 # returns the :MARK of the most recent commit
 sub add {
-	my ($self, $mime, $check_cb) = @_; # mime = Email::MIME
+	my ($self, $mime) = @_; # mime = Email::MIME
 
 	my ($name, $email) = extract_author_info($mime);
 	my $hdr = $mime->header_obj;
@@ -383,11 +383,6 @@ sub add {
 
 	drop_unwanted_headers($mime);
 
-	# spam check:
-	if ($check_cb) {
-		$mime = $check_cb->($mime) or return;
-	}
-
 	my $blob = $self->{mark}++;
 	my $str = $mime->as_string;
 	my $n = length($str);
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 222df5c..f4f0cfe 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -99,19 +99,12 @@ sub init_inbox {
 # returns undef on duplicate or spam
 # mimics Import::add and wraps it for v2
 sub add {
-	my ($self, $mime, $check_cb) = @_;
-	$self->{-inbox}->with_umask(sub {
-		_add($self, $mime, $check_cb)
-	});
+	my ($self, $mime) = @_;
+	$self->{-inbox}->with_umask(sub { _add($self, $mime) });
 }
 
 sub _add {
-	my ($self, $mime, $check_cb) = @_;
-
-	# spam check:
-	if ($check_cb) {
-		$mime = $check_cb->($mime) or return;
-	}
+	my ($self, $mime) = @_;
 
 	# All pipes (> $^F) known to Perl 5.6+ have FD_CLOEXEC set,
 	# as does SQLite 3.4.1+ (released in 2007-07-20), and
diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index 2d4c6f4..17ace92 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -16,6 +16,7 @@ package PublicInbox::WatchMaildir;
 use PublicInbox::Filter::Base;
 use PublicInbox::Spamcheck;
 *REJECT = *PublicInbox::Filter::Base::REJECT;
+use Fcntl qw(SEEK_SET);
 
 sub new {
 	my ($class, $config) = @_;
@@ -52,7 +53,6 @@ sub new {
 	my $k = 'publicinboxwatch.spamcheck';
 	my $default = undef;
 	my $spamcheck = PublicInbox::Spamcheck::get($config, $k, $default);
-	$spamcheck = _spamcheck_cb($spamcheck) if $spamcheck;
 
 	$config->each_inbox(sub {
 		# need to make all inboxes writable for spam removal:
@@ -115,7 +115,8 @@ sub _remove_spam {
 	my ($self, $path) = @_;
 	# path must be marked as (S)een
 	$path =~ /:2,[A-R]*S[T-Za-z]*\z/ or return;
-	my $mime = _path_to_mime($path) or return;
+	my $fh;
+	my $mime = _path_to_mime($path, \$fh) or return;
 	$self->{config}->each_inbox(sub {
 		my ($ibx) = @_;
 		eval {
@@ -132,7 +133,22 @@ sub _remove_spam {
 			warn "error removing spam at: ", $path,
 				" from ", $ibx->{name}, ': ', $@, "\n";
 		}
-	})
+	});
+
+	# This ignores the scrubbed version if it differs.
+	# Probably not worth training both...
+	my $spamcheck = $self->{spamcheck};
+	$mime = undef;
+	if ($spamcheck && $spamcheck->can('spamlearn')) {
+		sysseek($fh, 0, SEEK_SET) or die "sysseek failed: $!";
+		$spamcheck->spamlearn($fh);
+	}
+}
+
+sub spam_fail ($$) {
+	my ($mime, $path) = @_;
+	warn $mime->header('Message-ID')." failed spam check\n",
+		'path: ', $path, "\n";
 }
 
 sub _try_path {
@@ -150,22 +166,37 @@ sub _try_path {
 	if (!ref($inboxes) && $inboxes eq 'watchspam') {
 		return _remove_spam($self, $path);
 	}
+
+	my $spamcheck = $self->{spamcheck};
+	my $fh;
 	foreach my $ibx (@$inboxes) {
-		my $mime = _path_to_mime($path) or next;
-		my $im = _importer_for($self, $ibx);
+		my $mime = $fh ? _fh_to_mime($fh, 0) :
+				_path_to_mime($path, \$fh) or next;
 
+		my $im = _importer_for($self, $ibx);
 		my $wm = $ibx->{-watchheader};
 		if ($wm) {
 			my $v = $mime->header_obj->header_raw($wm->[0]);
 			next unless ($v && $v =~ $wm->[1]);
 		}
 
+		# we only check each message once:
+		if ($spamcheck) {
+			my $dst = '';
+			sysseek($fh, 0, SEEK_SET) or die "sysseek failed: $!";
+			$spamcheck->spamcheck($mime, \$dst) or
+				return spam_fail($mime, $path);
+
+			$mime = PublicInbox::MIME->new(\$dst);
+			$spamcheck = undef; # no need to check again this loop
+		}
+
 		if (my $scrub = $ibx->filter($im)) {
 			my $ret = $scrub->scrub($mime) or next;
 			$ret == REJECT() and next;
 			$mime = $ret;
 		}
-		$im->add($mime, $self->{spamcheck});
+		$im->add($mime);
 	}
 }
 
@@ -237,18 +268,24 @@ sub scan {
 	trigger_scan($self, 'cont') if keys %$opendirs;
 }
 
+sub _fh_to_mime {
+	my ($fh, $off) = @_;
+	if (defined $off) {
+		seek($fh, $off, SEEK_SET) or die "seek failed: $!";
+	}
+	defined(my $str = do { local $/; <$fh> }) or return;
+	PublicInbox::MIME->new(\$str);
+}
+
 sub _path_to_mime {
-	my ($path) = @_;
+	my ($path, $fhref) = @_;
 	if (open my $fh, '<', $path) {
-		local $/;
-		my $str = <$fh>;
-		$str or return;
-		return PublicInbox::MIME->new(\$str);
+		$$fhref = $fh;
+		_fh_to_mime($fh);
 	} elsif ($!{ENOENT}) {
-		return;
+		# race with another process, just ignore the missing message
 	} else {
 		warn "failed to open $path: $!\n";
-		return;
 	}
 }
 
@@ -264,19 +301,6 @@ sub _importer_for {
 	$importers->{"$ibx"} = $im;
 }
 
-sub _spamcheck_cb {
-	my ($sc) = @_;
-	sub {
-		my ($mime) = @_;
-		my $tmp = '';
-		if ($sc->spamcheck($mime, \$tmp)) {
-			return PublicInbox::MIME->new(\$tmp);
-		}
-		warn $mime->header('Message-ID')." failed spam check\n";
-		undef;
-	}
-}
-
 sub is_maildir {
 	$_[0] =~ s!\Amaildir:!! or return;
 	$_[0] =~ tr!/!/!s;
diff --git a/t/import.t b/t/import.t
index eee4744..1016ee8 100644
--- a/t/import.t
+++ b/t/import.t
@@ -56,7 +56,7 @@ is(scalar @revs, 1, 'one revision created');
 
 $mime->header_set('Message-ID', '<b@example.com>');
 $mime->header_set('Subject', 'msg2');
-like($im->add($mime, sub { $mime }), qr/\A:\d+\z/, 'added 2nd message');
+like($im->add($mime), qr/\A:\d+\z/, 'added 2nd message');
 $im->done;
 @revs = $git->qx(qw(rev-list HEAD));
 is(scalar @revs, 2, '2 revisions exist');
@@ -88,7 +88,6 @@ is($msg->header('Message-ID'), '<a@example.com>', 'Message-ID matches');
 isnt($msg->header('Subject'), $mime->header('Subject'), 'subject mismatch');
 
 $mime->header_set('Message-Id', '<failcheck@example.com>');
-is($im->add($mime, sub { undef }), undef, 'check callback fails');
 is($im->remove($mime), undef, 'message not added, so not removed');
 is(undef, $im->checkpoint, 'checkpoint works before ->done');
 $im->done;
diff --git a/t/log-bin/spamc b/t/log-bin/spamc
new file mode 100755
index 0000000..9a1551f
--- /dev/null
+++ b/t/log-bin/spamc
@@ -0,0 +1,6 @@
+#!/bin/sh
+# mock spamc which writes its args to LOG_PATH
+if test -n "$LOG_PATH"
+then
+	echo >>"$LOG_PATH" "ARGS: $@"
+fi
diff --git a/t/watch_maildir.t b/t/watch_maildir.t
index b85ddc5..ee84069 100644
--- a/t/watch_maildir.t
+++ b/t/watch_maildir.t
@@ -95,10 +95,14 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	local $ENV{PATH} = $fail_path;
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
 	$config->{'publicinboxwatch.spamcheck'} = 'spamc';
+	my @warn;
 	{
-		local $SIG{__WARN__} = sub {}; # quiet spam check warning
+		local $SIG{__WARN__} = sub { push @warn, @_ };
 		PublicInbox::WatchMaildir->new($config)->scan('full');
 	}
+	my $warn = join("\n", @warn);
+	like($warn, qr/failed spam check/, 'warned about spam');
+	like($warn, qr/^path: /m, 'shows failing path');
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
 	is(scalar @list, 0, 'tree has no files spamc checked');
 	is(unlink(glob("$maildir/new/*")), 1);
@@ -124,6 +128,31 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	like($$mref, qr/something\n\z/s, 'message scrubbed on import');
 }
 
+my $write_spam = sub {
+	is(scalar glob("$spamdir/new/*"), undef, 'no spam existing');
+	PublicInbox::Emergency->new($spamdir)->prepare(\$msg);
+	my @new = glob("$spamdir/new/*");
+	is(scalar @new, 1);
+	my @p = split(m!/+!, $new[0]);
+	ok(link($new[0], "$spamdir/cur/".$p[-1].":2,S"));
+	is(unlink($new[0]), 1);
+};
+
+{
+	my $log_bin = getcwd()."/t/log-bin";
+	ok(-x "$log_bin/spamc", "mock spamc exists for logging");
+	my $log_path = "$log_bin:$ENV{PATH}"; # for spamc ham mock
+	local $ENV{PATH} = $log_path;
+	my $log_path = "$tmpdir/spamc.log";
+	local $ENV{LOG_PATH} = $log_path;
+	ok(unlink(glob("$maildir/*/*")));
+	ok(unlink(glob("$spamdir/*/*")));
+	$write_spam->(\$msg);
+	PublicInbox::WatchMaildir->new($config)->scan('full');
+	ok(open(my $fh, '<', $log_path), 'opened spamc.log');
+	like(<$fh>, qr/-L spam/, '"spamc -L spam" called');
+}
+
 sub is_maildir {
 	my ($dir) = @_;
 	PublicInbox::WatchMaildir::is_maildir($dir);
-- 
EW


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* [REJECT] watchmaildir: spam checking and learning improvements
  2019-01-10  9:02 [PATCH] watchmaildir: spam checking and learning improvements Eric Wong
@ 2019-01-10  9:19 ` Eric Wong
  0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2019-01-10  9:19 UTC (permalink / raw)
  To: meta

Eric Wong <e@80x24.org> wrote:
> It turns out spam training ("spamc -L spam") was never implemented
> for -watch.  So implement that, and take advantage of an
> existing feature to pass FDs directly to spamc(1) instead of
> reconverting the MIME object into a string.

Will keep that...

> While we're at it, the spam-checking call is now only performed
> once per message when a Maildir supports multiple inboxes.

And that.

> This changes PublicInbox::Import::add (and V2Writable
> equivalent) API to no longer perform spam-checking.

But nope, we need to do spam-checking inside ->add;
because we don't want to waste cycles re-spam-checking
messages which are already in our archives on -watch
restarts.

And the V2Writable->add spam-checking is in the wrong place.

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-01-10  9:19 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-10  9:02 [PATCH] watchmaildir: spam checking and learning improvements Eric Wong
2019-01-10  9:19 ` [REJECT] " Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).