From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 110101F803 for ; Thu, 10 Jan 2019 09:02:52 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] watchmaildir: spam checking and learning improvements Date: Thu, 10 Jan 2019 09:02:51 +0000 Message-Id: <20190110090251.18736-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: It turns out spam training ("spamc -L spam") was never implemented for -watch. So implement that, and take advantage of an existing feature to pass FDs directly to spamc(1) instead of reconverting the MIME object into a string. While we're at it, the spam-checking call is now only performed once per message when a Maildir supports multiple inboxes. This changes PublicInbox::Import::add (and V2Writable equivalent) API to no longer perform spam-checking. While ->add is one of the few documented API functions in POD, the spamcheck callback was never documented, so it's gone. I'm not sure what I was thinking when I made the callback in ->add instead of just checking spam before it was called... --- MANIFEST | 1 + lib/PublicInbox/Import.pm | 7 +-- lib/PublicInbox/V2Writable.pm | 13 ++---- lib/PublicInbox/WatchMaildir.pm | 76 ++++++++++++++++++++++----------- t/import.t | 3 +- t/log-bin/spamc | 6 +++ t/watch_maildir.t | 31 +++++++++++++- 7 files changed, 92 insertions(+), 45 deletions(-) create mode 100755 t/log-bin/spamc diff --git a/MANIFEST b/MANIFEST index 73d1047..cabcf7b 100644 --- a/MANIFEST +++ b/MANIFEST @@ -173,6 +173,7 @@ t/import.t t/inbox.t t/init.t t/linkify.t +t/log-bin/spamc t/main-bin/spamc t/mda.t t/mda_filter_rubylang.t diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index fd4255c..65a7e8c 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -356,7 +356,7 @@ ($$$) # returns undef on duplicate # returns the :MARK of the most recent commit sub add { - my ($self, $mime, $check_cb) = @_; # mime = Email::MIME + my ($self, $mime) = @_; # mime = Email::MIME my ($name, $email) = extract_author_info($mime); my $hdr = $mime->header_obj; @@ -383,11 +383,6 @@ sub add { drop_unwanted_headers($mime); - # spam check: - if ($check_cb) { - $mime = $check_cb->($mime) or return; - } - my $blob = $self->{mark}++; my $str = $mime->as_string; my $n = length($str); diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 222df5c..f4f0cfe 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -99,19 +99,12 @@ sub init_inbox { # returns undef on duplicate or spam # mimics Import::add and wraps it for v2 sub add { - my ($self, $mime, $check_cb) = @_; - $self->{-inbox}->with_umask(sub { - _add($self, $mime, $check_cb) - }); + my ($self, $mime) = @_; + $self->{-inbox}->with_umask(sub { _add($self, $mime) }); } sub _add { - my ($self, $mime, $check_cb) = @_; - - # spam check: - if ($check_cb) { - $mime = $check_cb->($mime) or return; - } + my ($self, $mime) = @_; # All pipes (> $^F) known to Perl 5.6+ have FD_CLOEXEC set, # as does SQLite 3.4.1+ (released in 2007-07-20), and diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm index 2d4c6f4..17ace92 100644 --- a/lib/PublicInbox/WatchMaildir.pm +++ b/lib/PublicInbox/WatchMaildir.pm @@ -16,6 +16,7 @@ package PublicInbox::WatchMaildir; use PublicInbox::Filter::Base; use PublicInbox::Spamcheck; *REJECT = *PublicInbox::Filter::Base::REJECT; +use Fcntl qw(SEEK_SET); sub new { my ($class, $config) = @_; @@ -52,7 +53,6 @@ sub new { my $k = 'publicinboxwatch.spamcheck'; my $default = undef; my $spamcheck = PublicInbox::Spamcheck::get($config, $k, $default); - $spamcheck = _spamcheck_cb($spamcheck) if $spamcheck; $config->each_inbox(sub { # need to make all inboxes writable for spam removal: @@ -115,7 +115,8 @@ sub _remove_spam { my ($self, $path) = @_; # path must be marked as (S)een $path =~ /:2,[A-R]*S[T-Za-z]*\z/ or return; - my $mime = _path_to_mime($path) or return; + my $fh; + my $mime = _path_to_mime($path, \$fh) or return; $self->{config}->each_inbox(sub { my ($ibx) = @_; eval { @@ -132,7 +133,22 @@ sub _remove_spam { warn "error removing spam at: ", $path, " from ", $ibx->{name}, ': ', $@, "\n"; } - }) + }); + + # This ignores the scrubbed version if it differs. + # Probably not worth training both... + my $spamcheck = $self->{spamcheck}; + $mime = undef; + if ($spamcheck && $spamcheck->can('spamlearn')) { + sysseek($fh, 0, SEEK_SET) or die "sysseek failed: $!"; + $spamcheck->spamlearn($fh); + } +} + +sub spam_fail ($$) { + my ($mime, $path) = @_; + warn $mime->header('Message-ID')." failed spam check\n", + 'path: ', $path, "\n"; } sub _try_path { @@ -150,22 +166,37 @@ sub _try_path { if (!ref($inboxes) && $inboxes eq 'watchspam') { return _remove_spam($self, $path); } + + my $spamcheck = $self->{spamcheck}; + my $fh; foreach my $ibx (@$inboxes) { - my $mime = _path_to_mime($path) or next; - my $im = _importer_for($self, $ibx); + my $mime = $fh ? _fh_to_mime($fh, 0) : + _path_to_mime($path, \$fh) or next; + my $im = _importer_for($self, $ibx); my $wm = $ibx->{-watchheader}; if ($wm) { my $v = $mime->header_obj->header_raw($wm->[0]); next unless ($v && $v =~ $wm->[1]); } + # we only check each message once: + if ($spamcheck) { + my $dst = ''; + sysseek($fh, 0, SEEK_SET) or die "sysseek failed: $!"; + $spamcheck->spamcheck($mime, \$dst) or + return spam_fail($mime, $path); + + $mime = PublicInbox::MIME->new(\$dst); + $spamcheck = undef; # no need to check again this loop + } + if (my $scrub = $ibx->filter($im)) { my $ret = $scrub->scrub($mime) or next; $ret == REJECT() and next; $mime = $ret; } - $im->add($mime, $self->{spamcheck}); + $im->add($mime); } } @@ -237,18 +268,24 @@ sub scan { trigger_scan($self, 'cont') if keys %$opendirs; } +sub _fh_to_mime { + my ($fh, $off) = @_; + if (defined $off) { + seek($fh, $off, SEEK_SET) or die "seek failed: $!"; + } + defined(my $str = do { local $/; <$fh> }) or return; + PublicInbox::MIME->new(\$str); +} + sub _path_to_mime { - my ($path) = @_; + my ($path, $fhref) = @_; if (open my $fh, '<', $path) { - local $/; - my $str = <$fh>; - $str or return; - return PublicInbox::MIME->new(\$str); + $$fhref = $fh; + _fh_to_mime($fh); } elsif ($!{ENOENT}) { - return; + # race with another process, just ignore the missing message } else { warn "failed to open $path: $!\n"; - return; } } @@ -264,19 +301,6 @@ sub _importer_for { $importers->{"$ibx"} = $im; } -sub _spamcheck_cb { - my ($sc) = @_; - sub { - my ($mime) = @_; - my $tmp = ''; - if ($sc->spamcheck($mime, \$tmp)) { - return PublicInbox::MIME->new(\$tmp); - } - warn $mime->header('Message-ID')." failed spam check\n"; - undef; - } -} - sub is_maildir { $_[0] =~ s!\Amaildir:!! or return; $_[0] =~ tr!/!/!s; diff --git a/t/import.t b/t/import.t index eee4744..1016ee8 100644 --- a/t/import.t +++ b/t/import.t @@ -56,7 +56,7 @@ is(scalar @revs, 1, 'one revision created'); $mime->header_set('Message-ID', ''); $mime->header_set('Subject', 'msg2'); -like($im->add($mime, sub { $mime }), qr/\A:\d+\z/, 'added 2nd message'); +like($im->add($mime), qr/\A:\d+\z/, 'added 2nd message'); $im->done; @revs = $git->qx(qw(rev-list HEAD)); is(scalar @revs, 2, '2 revisions exist'); @@ -88,7 +88,6 @@ is($msg->header('Message-ID'), '', 'Message-ID matches'); isnt($msg->header('Subject'), $mime->header('Subject'), 'subject mismatch'); $mime->header_set('Message-Id', ''); -is($im->add($mime, sub { undef }), undef, 'check callback fails'); is($im->remove($mime), undef, 'message not added, so not removed'); is(undef, $im->checkpoint, 'checkpoint works before ->done'); $im->done; diff --git a/t/log-bin/spamc b/t/log-bin/spamc new file mode 100755 index 0000000..9a1551f --- /dev/null +++ b/t/log-bin/spamc @@ -0,0 +1,6 @@ +#!/bin/sh +# mock spamc which writes its args to LOG_PATH +if test -n "$LOG_PATH" +then + echo >>"$LOG_PATH" "ARGS: $@" +fi diff --git a/t/watch_maildir.t b/t/watch_maildir.t index b85ddc5..ee84069 100644 --- a/t/watch_maildir.t +++ b/t/watch_maildir.t @@ -95,10 +95,14 @@ More majordomo info at http://vger.kernel.org/majordomo-info.html\n); local $ENV{PATH} = $fail_path; PublicInbox::Emergency->new($maildir)->prepare(\$msg); $config->{'publicinboxwatch.spamcheck'} = 'spamc'; + my @warn; { - local $SIG{__WARN__} = sub {}; # quiet spam check warning + local $SIG{__WARN__} = sub { push @warn, @_ }; PublicInbox::WatchMaildir->new($config)->scan('full'); } + my $warn = join("\n", @warn); + like($warn, qr/failed spam check/, 'warned about spam'); + like($warn, qr/^path: /m, 'shows failing path'); @list = $git->qx(qw(ls-tree -r --name-only refs/heads/master)); is(scalar @list, 0, 'tree has no files spamc checked'); is(unlink(glob("$maildir/new/*")), 1); @@ -124,6 +128,31 @@ More majordomo info at http://vger.kernel.org/majordomo-info.html\n); like($$mref, qr/something\n\z/s, 'message scrubbed on import'); } +my $write_spam = sub { + is(scalar glob("$spamdir/new/*"), undef, 'no spam existing'); + PublicInbox::Emergency->new($spamdir)->prepare(\$msg); + my @new = glob("$spamdir/new/*"); + is(scalar @new, 1); + my @p = split(m!/+!, $new[0]); + ok(link($new[0], "$spamdir/cur/".$p[-1].":2,S")); + is(unlink($new[0]), 1); +}; + +{ + my $log_bin = getcwd()."/t/log-bin"; + ok(-x "$log_bin/spamc", "mock spamc exists for logging"); + my $log_path = "$log_bin:$ENV{PATH}"; # for spamc ham mock + local $ENV{PATH} = $log_path; + my $log_path = "$tmpdir/spamc.log"; + local $ENV{LOG_PATH} = $log_path; + ok(unlink(glob("$maildir/*/*"))); + ok(unlink(glob("$spamdir/*/*"))); + $write_spam->(\$msg); + PublicInbox::WatchMaildir->new($config)->scan('full'); + ok(open(my $fh, '<', $log_path), 'opened spamc.log'); + like(<$fh>, qr/-L spam/, '"spamc -L spam" called'); +} + sub is_maildir { my ($dir) = @_; PublicInbox::WatchMaildir::is_maildir($dir); -- EW