user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH 1/6] implement ListMirror SpamAssassin plugin
@ 2016-06-24 20:47 Eric Wong
  2016-06-24 20:47 ` [PATCH 2/6] split out spamcheck/spamc to its own module Eric Wong
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Eric Wong @ 2016-06-24 20:47 UTC (permalink / raw)
  To: meta

When running mailing list mirrors, one needs to be careful that
spammers do not try to sidestep the list server we want to
mirror from and inject email into our mailbox directly by setting
the appropriate list headers (e.g. "X-Mailing-List" or
"List-Id").  We trust the top-most Received: header is
the one our own mail server got the mail from.

Bcc:-ing a public mailing list is a very likely indicator of
spam in my experience, so throw in an extra rule to mark it.
While public-inbox-mda rejects Bcc: entirely, public-inbox-watch
needs to mirror lists which allow Bcc.

==> list_mirror.cf <==
loadplugin PublicInbox::SaPlugin::ListMirror

ifplugin PublicInbox::SaPlugin::ListMirror
  header LIST_MIRROR_RECEIVED eval:check_list_mirror_received()
  describe LIST_MIRROR_RECEIVED Received does not match trusted list server
  score LIST_MIRROR_RECEIVED 10

  header LIST_MIRROR_BCC eval:check_list_mirror_bcc()
  describe LIST_MIRROR_BCC Mailing list was Bcc-ed
  score LIST_MIRROR_BCC 1
endif

==> ~/.spamassassin/user_prefs <==
ifplugin PublicInbox::SaPlugin::ListMirror
  list_mirror X-Mailing-List git@vger.kernel.org *.kernel.org git@vger.kernel.org
endif
---
 MANIFEST                               |   1 +
 lib/PublicInbox/SaPlugin/ListMirror.pm | 107 +++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)
 create mode 100644 lib/PublicInbox/SaPlugin/ListMirror.pm

diff --git a/MANIFEST b/MANIFEST
index 17a2a31..bc7d54c 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -64,6 +64,7 @@ lib/PublicInbox/NewsWWW.pm
 lib/PublicInbox/ParentPipe.pm
 lib/PublicInbox/ProcessPipe.pm
 lib/PublicInbox/Qspawn.pm
+lib/PublicInbox/SaPlugin/ListMirror.pm
 lib/PublicInbox/Search.pm
 lib/PublicInbox/SearchIdx.pm
 lib/PublicInbox/SearchMsg.pm
diff --git a/lib/PublicInbox/SaPlugin/ListMirror.pm b/lib/PublicInbox/SaPlugin/ListMirror.pm
new file mode 100644
index 0000000..1010188
--- /dev/null
+++ b/lib/PublicInbox/SaPlugin/ListMirror.pm
@@ -0,0 +1,107 @@
+# Copyright (C) 2016 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# Rules useful for running a mailing list mirror.  We want to:
+# * ensure Received: headers are really from the list mail server
+#   users expect.  This is to prevent malicious users from
+#   injecting spam into mirrors without going through the expected
+#   server
+# * flag messages where the mailing list is Bcc:-ed since it is
+#   common for spam to have wrong or non-existent To:/Cc: headers.
+
+package PublicInbox::SaPlugin::ListMirror;
+use strict;
+use warnings;
+use base qw(Mail::SpamAssassin::Plugin);
+
+# constructor: register the eval rules
+sub new {
+	my ($class, $mail) = @_;
+
+	# some boilerplate...
+	$class = ref($class) || $class;
+	my $self = $class->SUPER::new($mail);
+	bless $self, $class;
+	$mail->{conf}->{list_mirror_check} = [];
+	$self->register_eval_rule('check_list_mirror_received');
+	$self->register_eval_rule('check_list_mirror_bcc');
+	$self->set_config($mail->{conf});
+	$self;
+}
+
+sub check_list_mirror_received {
+	my ($self, $pms) = @_;
+	my $recvd = $pms->get('Received') || '';
+	$recvd =~ s/\n.*\z//s;
+
+	foreach my $cfg (@{$pms->{conf}->{list_mirror_check}}) {
+		my ($hdr, $hval, $host_re, $addr_re) = @$cfg;
+		my $v = $pms->get($hdr) or next;
+		chomp $v;
+		next if $v ne $hval;
+		return 1 if $recvd !~ $host_re;
+	}
+
+	0;
+}
+
+sub check_list_mirror_bcc {
+	my ($self, $pms) = @_;
+	my $tocc = $pms->get('ToCc');
+
+	foreach my $cfg (@{$pms->{conf}->{list_mirror_check}}) {
+		my ($hdr, $hval, $host_re, $addr_re) = @$cfg;
+		defined $addr_re or next;
+		my $v = $pms->get($hdr) or next;
+		chomp $v;
+		next if $v ne $hval;
+		return 1 if !$tocc || $tocc !~ $addr_re;
+	}
+
+	0;
+}
+
+# list_mirror HEADER HEADER_VALUE HOSTNAME_GLOB [LIST_ADDRESS]
+# list_mirror X-Mailing-List git@vger.kernel.org *.kernel.org
+# list_mirror List-Id <foo.example.org> *.example.org foo@example.org
+sub config_list_mirror {
+	my ($self, $key, $value, $line) = @_;
+
+	defined $value or
+		return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
+
+	my ($hdr, $hval, $host_glob, @extra) = split(/\s+/, $value);
+	my $addr = shift @extra;
+
+	if (defined $addr) {
+		$addr !~ /\@/ and
+			return $Mail::SpamAssassin::Conf::INVALID_VALUE;
+		$addr = join('|', map { quotemeta } split(/,/, $addr));
+		$addr = qr/\b$addr\b/i;
+	}
+
+	@extra and return $Mail::SpamAssassin::Conf::INVALID_VALUE;
+
+	defined $host_glob or
+		return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
+
+	my %patmap = ('*' => '\S+', '?' => '.', '[' => '[', ']' => ']');
+	$host_glob =~ s!(.)!$patmap{$1} || "\Q$1"!ge;
+	my $host_re = qr/\A\s*from\s+$host_glob(?:\s|$)/si;
+
+	push @{$self->{list_mirror_check}}, [ $hdr, $hval, $host_re, $addr ];
+}
+
+sub set_config {
+	my ($self, $conf) = @_;
+	my @cmds;
+	push @cmds, {
+		setting => 'list_mirror',
+		default => '',
+		type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
+		code => *config_list_mirror,
+	};
+	$conf->{parser}->register_commands(\@cmds);
+}
+
+1;

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/6] split out spamcheck/spamc to its own module
  2016-06-24 20:47 [PATCH 1/6] implement ListMirror SpamAssassin plugin Eric Wong
@ 2016-06-24 20:47 ` Eric Wong
  2016-06-24 20:47 ` [PATCH 3/6] document Filesys::Notify::Simple dependency Eric Wong
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2016-06-24 20:47 UTC (permalink / raw)
  To: meta

This should hopefully make it easier to try other anti-spam
systems (or none at all) in the future.
---
 MANIFEST                           |  2 +
 lib/PublicInbox/Spamcheck/Spamc.pm | 94 ++++++++++++++++++++++++++++++++++++++
 script/public-inbox-learn          | 21 +++------
 script/public-inbox-mda            | 23 ++--------
 t/spamcheck_spamc.t                | 49 ++++++++++++++++++++
 5 files changed, 156 insertions(+), 33 deletions(-)
 create mode 100644 lib/PublicInbox/Spamcheck/Spamc.pm
 create mode 100644 t/spamcheck_spamc.t

diff --git a/MANIFEST b/MANIFEST
index bc7d54c..834cb5d 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -69,6 +69,7 @@ lib/PublicInbox/Search.pm
 lib/PublicInbox/SearchIdx.pm
 lib/PublicInbox/SearchMsg.pm
 lib/PublicInbox/SearchView.pm
+lib/PublicInbox/Spamcheck/Spamc.pm
 lib/PublicInbox/Spawn.pm
 lib/PublicInbox/SpawnPP.pm
 lib/PublicInbox/Thread.pm
@@ -133,6 +134,7 @@ t/psgi_attach.t
 t/psgi_mount.t
 t/qspawn.t
 t/search.t
+t/spamcheck_spamc.t
 t/spawn.t
 t/utf8.mbox
 t/view.t
diff --git a/lib/PublicInbox/Spamcheck/Spamc.pm b/lib/PublicInbox/Spamcheck/Spamc.pm
new file mode 100644
index 0000000..312e52d
--- /dev/null
+++ b/lib/PublicInbox/Spamcheck/Spamc.pm
@@ -0,0 +1,94 @@
+# Copyright (C) 2016 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+package PublicInbox::Spamcheck::Spamc;
+use strict;
+use warnings;
+use PublicInbox::Spawn qw(popen_rd spawn);
+use IO::File;
+use Fcntl qw(:DEFAULT SEEK_SET);
+
+sub new {
+	my ($class) = @_;
+	bless {
+		checkcmd => [qw(spamc -E --headers)],
+		hamcmd => [qw(spamc -L ham)],
+		spamcmd => [qw(spamc -L spam)],
+	}, $class;
+}
+
+sub spamcheck {
+	my ($self, $msg, $out) = @_;
+
+	my $tmp;
+	my $fd = _msg_to_fd($self, $msg, \$tmp);
+	my $rdr = { 0 => $fd };
+	my ($fh, $pid) = popen_rd($self->{checkcmd}, undef, $rdr);
+	defined $pid or die "failed to popen_rd spamc: $!\n";
+	my $r;
+	unless (ref $out) {
+		my $buf = '';
+		$out = \$buf;
+	}
+	do {
+		$r = sysread($fh, $$out, 65536, length($$out));
+	} while (defined($r) && $r != 0);
+	defined $r or die "read failed: $!";
+	close $fh or die "close failed: $!";
+	waitpid($pid, 0);
+	($? || $$out eq '') ? 0 : 1;
+}
+
+sub hamlearn {
+	my ($self, $msg, $rdr) = @_;
+	_learn($self, $msg, $rdr, 'hamcmd');
+}
+
+sub spamlearn {
+	my ($self, $msg, $rdr) = @_;
+	_learn($self, $msg, $rdr, 'spamcmd');
+}
+
+sub _learn {
+	my ($self, $msg, $rdr, $field) = @_;
+	$rdr ||= {};
+	$rdr->{1} ||= $self->_devnull;
+	$rdr->{2} ||= $self->_devnull;
+	my $tmp;
+	$rdr->{0} = _msg_to_fd($self, $msg, \$tmp);
+	my $pid = spawn($self->{$field}, undef, $rdr);
+	waitpid($pid, 0);
+	!$?;
+}
+
+sub _devnull {
+	my ($self) = @_;
+	my $fd = $self->{-devnullfd};
+	return $fd if defined $fd;
+	open my $fh, '+>', '/dev/null' or
+				die "failed to open /dev/null: $!";
+	$self->{-devnull} = $fh;
+	$self->{-devnullfd} = fileno($fh);
+}
+
+sub _msg_to_fd {
+	my ($self, $msg, $tmpref) = @_;
+	my $tmpfh;
+	my $fd;
+	if (my $ref = ref($msg)) {
+
+		return $msg->fileno if $ref ne 'SCALAR' && $msg->can('fileno');
+
+		$tmpfh = IO::File->new_tmpfile;
+		$tmpfh->autoflush(1);
+		$msg = \($msg->as_string) if $ref ne 'SCALAR';
+		print $tmpfh $$msg or die "failed to print: $!";
+		sysseek($tmpfh, 0, SEEK_SET) or
+			die "sysseek(fh) failed: $!";
+		$$tmpref = $tmpfh;
+
+		return fileno($tmpfh);
+	}
+	$msg;
+}
+
+1;
diff --git a/script/public-inbox-learn b/script/public-inbox-learn
index b05ef05..7ef2a31 100755
--- a/script/public-inbox-learn
+++ b/script/public-inbox-learn
@@ -14,12 +14,13 @@ use Email::MIME;
 use Email::MIME::ContentType;
 $Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect
 use PublicInbox::Address;
-use PublicInbox::Spawn qw(spawn);
+use PublicInbox::Spamcheck::Spamc;
 my $train = shift or die "usage: $usage\n";
 if ($train !~ /\A(?:ham|spam)\z/) {
 	die "`$train' not recognized.\nusage: $usage\n";
 }
 
+my $spamc = PublicInbox::Spamcheck::Spamc->new;
 my $pi_config = PublicInbox::Config->new;
 my $err;
 my $mime = Email::MIME->new(eval {
@@ -27,19 +28,11 @@ my $mime = Email::MIME->new(eval {
 	my $data = scalar <STDIN>;
 	$data =~ s/\AFrom [^\r\n]*\r?\n//s;
 	eval {
-		my @cmd = (qw(spamc -L), $train);
-		my ($r, $w);
-		pipe($r, $w) or die "pipe failed: $!";
-		open my $null, '>', '/dev/null' or
-					die "failed to open /dev/null: $!";
-		my $nullfd = fileno($null);
-		my %rdr = (0 => fileno($r), 1 => $nullfd, 2 => $nullfd);
-		my $pid = spawn(\@cmd, undef, \%rdr);
-		close $null;
-		close $r or die "close \$r failed: $!";
-		print $w $data or die "print \$w failed: $!";
-		close $w or die "close \$w failed: $!";
-		waitpid($pid, 0);
+		if ($train eq 'ham') {
+			$spamc->hamlearn(\$data);
+		} else {
+			$spamc->spamlearn(\$data);
+		}
 		die "spamc failed with: $?\n" if $?;
 	};
 	$err = $@;
diff --git a/script/public-inbox-mda b/script/public-inbox-mda
index 013642d..f739ad0 100755
--- a/script/public-inbox-mda
+++ b/script/public-inbox-mda
@@ -24,7 +24,7 @@ use PublicInbox::Import;
 use PublicInbox::Git;
 use PublicInbox::Emergency;
 use PublicInbox::Filter::Base;
-use PublicInbox::Spawn qw(popen_rd);
+use PublicInbox::Spamcheck::Spamc;
 
 # n.b: hopefully we can setup the emergency path without bailing due to
 # user error, we really want to setup the emergency destination ASAP
@@ -44,9 +44,9 @@ my $main_repo = $dst->{mainrepo} or do_exit(1);
 
 # pre-check, MDA has stricter rules than an importer might;
 do_exit(0) unless PublicInbox::MDA->precheck($simple, $dst->{address});
-
+my $spamc = PublicInbox::Spamcheck::Spamc->new;
 $str = '';
-my $spam_ok = do_spamc($ems->fh, \$str);
+my $spam_ok = $spamc->spamcheck($ems->fh, \$str);
 $simple = undef;
 $emm = PublicInbox::Emergency->new($emergency);
 $emm->prepare(\$str);
@@ -90,20 +90,5 @@ if (defined $im->add($mime)) {
 			$mime->header_obj->header_raw('Message-ID'),
 			" exists\n";
 }
-do_exit(0);
-
-# we depend on "report_safe 0" in /etc/spamassassin/*.cf with --headers
-sub do_spamc {
-	my ($in, $out) = @_;
-	my $rdr = { 0 => fileno($in) };
-	my ($fh, $pid) = popen_rd([qw/spamc -E --headers/], undef, $rdr);
-	defined $pid or die "failed to popen_rd spamc: $!\n";
-	my $r;
-	do {
-		$r = sysread($fh, $$out, 65536, length($$out));
-	} while (defined($r) && $r != 0);
-	close $fh or die "close failed: $!\n";
-	waitpid($pid, 0);
 
-	($? || $$out eq '') ? 0 : 1;
-}
+do_exit(0);
diff --git a/t/spamcheck_spamc.t b/t/spamcheck_spamc.t
new file mode 100644
index 0000000..65ac5c2
--- /dev/null
+++ b/t/spamcheck_spamc.t
@@ -0,0 +1,49 @@
+# Copyright (C) 2016 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use warnings;
+use Test::More;
+use Cwd;
+use Email::Simple;
+use IO::File;
+use File::Temp qw/tempdir/;
+use Fcntl qw(:DEFAULT SEEK_SET);
+my $tmpdir = tempdir('spamcheck_spamc-XXXXXX', TMPDIR => 1, CLEANUP => 1);
+
+use_ok 'PublicInbox::Spamcheck::Spamc';
+my $spamc = PublicInbox::Spamcheck::Spamc->new;
+$spamc->{checkcmd} = [qw(cat)];
+
+{
+	open my $fh, '+>', "$tmpdir/file" or die "open failed: $!";
+	ok(!$spamc->spamcheck($fh), 'empty '.ref($fh));
+}
+ok(!$spamc->spamcheck(IO::File->new_tmpfile), 'IO::File->new_tmpfile');
+
+my $dst = '';
+my $src = <<'EOF';
+Date: Thu, 01 Jan 1970 00:00:00 +0000
+To: <e@example.com>
+From: <e@example.com>
+Subject: test
+Message-ID: <testmessage@example.com>
+
+EOF
+ok($spamc->spamcheck(Email::Simple->new($src), \$dst), 'Email::Simple works');
+is($dst, $src, 'input == output');
+
+$dst = '';
+$spamc->{checkcmd} = ['sh', '-c', 'cat; false'];
+ok(!$spamc->spamcheck(Email::Simple->new($src), \$dst), 'Failed check works');
+is($dst, $src, 'input == output for spammy example');
+
+for my $l (qw(ham spam)) {
+	my $file = "$tmpdir/$l.out";
+	$spamc->{$l.'cmd'} = ['tee', $file ];
+	my $method = $l.'learn';
+	ok($spamc->$method(Email::Simple->new($src)), "$method OK");
+	open my $fh, '<', $file or die "failed to open $file: $!";
+	is(eval { local $/, <$fh> }, $src, "$l command ran alright");
+}
+
+done_testing();

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/6] document Filesys::Notify::Simple dependency
  2016-06-24 20:47 [PATCH 1/6] implement ListMirror SpamAssassin plugin Eric Wong
  2016-06-24 20:47 ` [PATCH 2/6] split out spamcheck/spamc to its own module Eric Wong
@ 2016-06-24 20:47 ` Eric Wong
  2016-06-24 20:47 ` [PATCH 4/6] watch_maildir: rename _check_spam => _remove_spam Eric Wong
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2016-06-24 20:47 UTC (permalink / raw)
  To: meta

And improve documentation for existing dependencies, too.
---
 INSTALL                         | 24 ++++++++++++++----------
 lib/PublicInbox/WatchMaildir.pm |  2 ++
 t/watch_maildir.t               |  5 +++++
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/INSTALL b/INSTALL
index e7d4b75..7f12fbe 100644
--- a/INSTALL
+++ b/INSTALL
@@ -19,8 +19,8 @@ standard MakeMaker installation (Perl)
 	make test
 	make install # root permissions may be needed
 
-Requirements (server MDA)
--------------------------
+Requirements
+------------
 
 * git
 * SpamAssassin (spamc/spamd)
@@ -36,19 +36,23 @@ Optional modules:
   - Plack[1]                   libplack-perl
   - Mail::Thread (2.5+)[1]     libmail-thread-perl
   - URI::Escape[1]             liburi-perl
-  - Search::Xapian[3]          libsearch-xapian-perl
-  - IO::Compress::Gzip[3]      libio-compress-perl
+  - Search::Xapian[2][3]       libsearch-xapian-perl
+  - IO::Compress::Gzip[3]      perl-modules (or libio-compress-perl)
   - DBI[3]                     libdbi-perl
-  - DBD::SQLite[3]             libdbd-sqlite3-perl
+  - DBD::SQLite[2][3]          libdbd-sqlite3-perl
   - Danga::Socket[4]           libdanga-socket-perl
-  - Net::Server[4]             libnet-server-perl
+  - Net::Server[5]             libnet-server-perl
+  - Filesys::Notify::Simple[6] libfilesys-notify-simple-perl
 
-[1] - Only required for serving/generating Atom and HTML pages.
-[3] - Optional for HTML web interface and HTTP/NNTP servers
-[4] - Optional for HTTP and NNTP servers
+[1] - Optional, needed for serving/generating Atom and HTML pages
+[2] - Optional, only required for NNTP server
+[3] - Optional, needed for gzipped mbox support over HTTP
+[4] - Optional, needed for bundled HTTP and NNTP servers
+[5] - Optional, needed for standalone daemonization of HTTP+NNTP servers
+[6] - Optional, needed for public-inbox-watch Maildir watcher
 
 When installing Search::Xapian, make sure the underlying Xapian
-is patched against the index corruption bug documented in:
+installation is not affected by an index corruption bug:
 
 	https://bugs.debian.org/808610
 
diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index 4468a44..abf1df7 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -144,6 +144,8 @@ sub watch {
 	my $cb = sub { _try_fsn_paths($self, \@_) };
 	my $mdir = $self->{mdir};
 
+	# lazy load here, we may support watching via IMAP IDLE
+	# in the future...
 	require Filesys::Notify::Simple;
 	my $watcher = Filesys::Notify::Simple->new($mdir);
 	$watcher->wait($cb) while (1);
diff --git a/t/watch_maildir.t b/t/watch_maildir.t
index e8c9740..be1a312 100644
--- a/t/watch_maildir.t
+++ b/t/watch_maildir.t
@@ -4,6 +4,11 @@ use Test::More;
 use File::Temp qw/tempdir/;
 use Email::MIME;
 use PublicInbox::Config;
+my @mods = qw(Filesys::Notify::Simple);
+foreach my $mod (@mods) {
+	eval "require $mod";
+	plan skip_all => "$mod missing for watch_maildir.t" if $@;
+}
 
 my $tmpdir = tempdir('watch_maildir-XXXXXX', TMPDIR => 1, CLEANUP => 1);
 my $git_dir = "$tmpdir/test.git";

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 4/6] watch_maildir: rename _check_spam => _remove_spam
  2016-06-24 20:47 [PATCH 1/6] implement ListMirror SpamAssassin plugin Eric Wong
  2016-06-24 20:47 ` [PATCH 2/6] split out spamcheck/spamc to its own module Eric Wong
  2016-06-24 20:47 ` [PATCH 3/6] document Filesys::Notify::Simple dependency Eric Wong
@ 2016-06-24 20:47 ` Eric Wong
  2016-06-24 20:47 ` [PATCH 5/6] watch_maildir: implement optional spam checking Eric Wong
  2016-06-24 20:47 ` [PATCH 6/6] watch_maildir: ignore Trash and Drafts, support Dovecot Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2016-06-24 20:47 UTC (permalink / raw)
  To: meta

We do not actually do spam checking here, but will
do spam checking before adding a message in the future.
---
 lib/PublicInbox/WatchMaildir.pm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index abf1df7..c1fe81e 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -69,7 +69,7 @@ sub _try_fsn_paths {
 	_done_for_now($self);
 }
 
-sub _check_spam {
+sub _remove_spam {
 	my ($self, $path) = @_;
 	$path =~ /:2,[A-R]*S[T-Z]*\z/ or return;
 	my $mime = _path_to_mime($path) or return;
@@ -121,7 +121,7 @@ sub _try_path {
 		return;
 	}
 	if (!ref($inbox) && $inbox eq 'watchspam') {
-		return _check_spam($self, $path);
+		return _remove_spam($self, $path);
 	}
 	my $im = _importer_for($self, $inbox);
 	my $mime = _path_to_mime($path) or return;

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 5/6] watch_maildir: implement optional spam checking
  2016-06-24 20:47 [PATCH 1/6] implement ListMirror SpamAssassin plugin Eric Wong
                   ` (2 preceding siblings ...)
  2016-06-24 20:47 ` [PATCH 4/6] watch_maildir: rename _check_spam => _remove_spam Eric Wong
@ 2016-06-24 20:47 ` Eric Wong
  2016-06-24 20:47 ` [PATCH 6/6] watch_maildir: ignore Trash and Drafts, support Dovecot Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2016-06-24 20:47 UTC (permalink / raw)
  To: meta

Mailing lists I watch and mirror may not have the best spam
filtering, and an extra layer should not hurt.
---
 lib/PublicInbox/Import.pm       |  6 +++++-
 lib/PublicInbox/WatchMaildir.pm | 34 ++++++++++++++++++++++++++++++++--
 t/import.t                      |  6 +++++-
 t/watch_maildir.t               | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 5ffc26e..27f36a7 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -140,7 +140,7 @@ sub remove {
 
 # returns undef on duplicate
 sub add {
-	my ($self, $mime) = @_; # mime = Email::MIME
+	my ($self, $mime, $check_cb) = @_; # mime = Email::MIME
 
 	my $from = $mime->header('From');
 	my ($email) = ($from =~ /([^<\s]+\@[^>\s]+)/g);
@@ -170,6 +170,10 @@ sub add {
 
 	# kill potentially confusing/misleading headers
 	$mime->header_set($_) for qw(bytes lines content-length status);
+	if ($check_cb) {
+		$mime = $check_cb->($mime) or return;
+	}
+
 	$mime = $mime->as_string;
 	my $blob = $self->{mark}++;
 	print $w "blob\nmark :$blob\ndata ", length($mime), "\n" or wfail;
diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index c1fe81e..72bd3d0 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -13,7 +13,9 @@ use PublicInbox::Spawn qw(spawn);
 
 sub new {
 	my ($class, $config) = @_;
-	my (%mdmap, @mdir);
+	my (%mdmap, @mdir, $spamc);
+
+	# XXX is "publicinboxlearn" really a good namespace for this?
 	my $k = 'publicinboxlearn.watchspam';
 	if (my $spamdir = $config->{$k}) {
 		if ($spamdir =~ s/\Amaildir://) {
@@ -26,6 +28,21 @@ sub new {
 			warn "unsupported $k=$spamdir\n";
 		}
 	}
+
+	$k = 'publicinboxwatch.spamcheck';
+	my $spamcheck = $config->{$k};
+	if ($spamcheck) {
+		if ($spamcheck eq 'spamc') {
+			$spamcheck = 'PublicInbox::Spamcheck::Spamc';
+		}
+		if ($spamcheck =~ /::/) {
+			eval "require $spamcheck";
+			$spamcheck = _spamcheck_cb($spamcheck->new);
+		} else {
+			warn "unsupported $k=$spamcheck\n";
+			$spamcheck = undef;
+		}
+	}
 	foreach $k (keys %$config) {
 		$k =~ /\Apublicinbox\.([^\.]+)\.watch\z/ or next;
 		my $name = $1;
@@ -52,6 +69,7 @@ sub new {
 	my $mdre = join('|', map { quotemeta($_) } @mdir);
 	$mdre = qr!\A($mdre)/!;
 	bless {
+		spamcheck => $spamcheck,
 		mdmap => \%mdmap,
 		mdir => \@mdir,
 		mdre => $mdre,
@@ -136,7 +154,7 @@ sub _try_path {
 	}
 
 	_force_mid($mime);
-	$im->add($mime);
+	$im->add($mime, $self->{spamcheck});
 }
 
 sub watch {
@@ -208,4 +226,16 @@ sub _scrubber_for {
 	undef;
 }
 
+sub _spamcheck_cb {
+	my ($sc) = @_;
+	sub {
+		my ($mime) = @_;
+		my $tmp = '';
+		if ($sc->spamcheck($mime, \$tmp)) {
+			return Email::MIME->new(\$tmp);
+		}
+		undef;
+	}
+}
+
 1;
diff --git a/t/import.t b/t/import.t
index 09c0036..73f92ad 100644
--- a/t/import.t
+++ b/t/import.t
@@ -30,7 +30,7 @@ is(scalar @revs, 1, 'one revision created');
 
 $mime->header_set('Message-ID', '<b@example.com>');
 $mime->header_set('Subject', 'msg2');
-like($im->add($mime), qr/\A:\d+\z/, 'added 2nd message');
+like($im->add($mime, sub { $mime }), qr/\A:\d+\z/, 'added 2nd message');
 $im->done;
 @revs = $git->qx(qw(rev-list HEAD));
 is(scalar @revs, 2, '2 revisions exist');
@@ -61,5 +61,9 @@ is($mark, 'MISMATCH', 'mark == MISMATCH on mismatch');
 is($msg->header('Message-ID'), '<a@example.com>', 'Message-ID matches');
 isnt($msg->header('Subject'), $mime->header('Subject'), 'subject mismatch');
 
+$mime->header_set('Message-Id', '<failcheck@example.com>');
+is($im->add($mime, sub { undef }), undef, 'check callback fails');
+is($im->remove($mime), undef, 'message not added, so not removed');
+
 $im->done;
 done_testing();
diff --git a/t/watch_maildir.t b/t/watch_maildir.t
index be1a312..2138963 100644
--- a/t/watch_maildir.t
+++ b/t/watch_maildir.t
@@ -3,6 +3,7 @@
 use Test::More;
 use File::Temp qw/tempdir/;
 use Email::MIME;
+use Cwd;
 use PublicInbox::Config;
 my @mods = qw(Filesys::Notify::Simple);
 foreach my $mod (@mods) {
@@ -86,4 +87,37 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	is(scalar @list, 4, 'four revisions in rev-list');
 }
 
+{
+	my $fail_bin = getcwd()."/t/fail-bin";
+	ok(-x "$fail_bin/spamc", "mock spamc exists");
+	my $fail_path = "$fail_bin:$ENV{PATH}"; # for spamc ham mock
+	local $ENV{PATH} = $fail_path;
+	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
+	$config->{'publicinboxwatch.spamcheck'} = 'spamc';
+	PublicInbox::WatchMaildir->new($config)->scan;
+	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
+	is(scalar @list, 0, 'tree has no files spamc checked');
+	is(unlink(glob("$maildir/new/*")), 1);
+}
+
+{
+	my $main_bin = getcwd()."/t/main-bin";
+	ok(-x "$main_bin/spamc", "mock spamc exists");
+	my $main_path = "$main_bin:$ENV{PATH}"; # for spamc ham mock
+	local $ENV{PATH} = $main_path;
+	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
+	$config->{'publicinboxwatch.spamcheck'} = 'spamc';
+	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
+	PublicInbox::WatchMaildir->new($config)->scan;
+	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
+	is(scalar @list, 1, 'tree has one file after spamc checked');
+
+	# XXX: workaround some weird caching/memoization in cat-file,
+	# shouldn't be an issue in real-world use, though...
+	$git = PublicInbox::Git->new($git_dir);
+
+	my $mref = $git->cat_file('refs/heads/master:'.$list[0]);
+	like($$mref, qr/something\n\z/s, 'message scrubbed on import');
+}
+
 done_testing;

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 6/6] watch_maildir: ignore Trash and Drafts, support Dovecot
  2016-06-24 20:47 [PATCH 1/6] implement ListMirror SpamAssassin plugin Eric Wong
                   ` (3 preceding siblings ...)
  2016-06-24 20:47 ` [PATCH 5/6] watch_maildir: implement optional spam checking Eric Wong
@ 2016-06-24 20:47 ` Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2016-06-24 20:47 UTC (permalink / raw)
  To: meta

Trashed messages and drafts are probably not intended for
importing, so do not import them.  Dovecot uses extra flags via
lowercase letters, so we must support those (as that's the
server I use).
---
 lib/PublicInbox/WatchMaildir.pm | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index 72bd3d0..b25704e 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -1,5 +1,8 @@
 # Copyright (C) 2016 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+#
+# ref: https://cr.yp.to/proto/maildir.html
+#	http://wiki2.dovecot.org/MailboxFormat/Maildir
 package PublicInbox::WatchMaildir;
 use strict;
 use warnings;
@@ -89,7 +92,7 @@ sub _try_fsn_paths {
 
 sub _remove_spam {
 	my ($self, $path) = @_;
-	$path =~ /:2,[A-R]*S[T-Z]*\z/ or return;
+	$path =~ /:2,[A-R]*S[T-Z]*\z/i or return;
 	my $mime = _path_to_mime($path) or return;
 	_force_mid($mime);
 	foreach my $inbox (values %{$self->{mdmap}}) {
@@ -127,7 +130,11 @@ sub _force_mid {
 sub _try_path {
 	my ($self, $path) = @_;
 	my @p = split(m!/+!, $path);
-	return unless $p[-1] =~ /\A[a-zA-Z0-9][\w:,=\.]+\z/;
+	return if $p[-1] !~ /\A[a-zA-Z0-9][\w:,=\.]+\z/;
+	if ($p[-1] =~ /:2,([A-Z]+)\z/i) {
+		my $flags = $1;
+		return if $flags =~ /[DT]/; # no [D]rafts or [T]rashed mail
+	}
 	return unless -f $path;
 	if ($path !~ $self->{mdre}) {
 		warn "unrecognized path: $path\n";

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2016-06-24 20:47 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-06-24 20:47 [PATCH 1/6] implement ListMirror SpamAssassin plugin Eric Wong
2016-06-24 20:47 ` [PATCH 2/6] split out spamcheck/spamc to its own module Eric Wong
2016-06-24 20:47 ` [PATCH 3/6] document Filesys::Notify::Simple dependency Eric Wong
2016-06-24 20:47 ` [PATCH 4/6] watch_maildir: rename _check_spam => _remove_spam Eric Wong
2016-06-24 20:47 ` [PATCH 5/6] watch_maildir: implement optional spam checking Eric Wong
2016-06-24 20:47 ` [PATCH 6/6] watch_maildir: ignore Trash and Drafts, support Dovecot Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).