about summary refs log tree commit homepage
diff options
context:
space:
mode:
-rw-r--r--Makefile.PL3
-rwxr-xr-xpublic-inbox-learn78
-rw-r--r--t/mda.t68
3 files changed, 146 insertions, 3 deletions
diff --git a/Makefile.PL b/Makefile.PL
index 7f2d586d..e1bd1cf0 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -8,7 +8,8 @@ WriteMakefile(
         VERSION => '0.0.0',
         AUTHOR => 'Eric Wong <normalperson@yhbt.net>',
         ABSTRACT => 'public-inbox server infrastructure',
-        EXE_FILES => [qw/public-inbox-mda public-inbox-cgi/],
+        EXE_FILES => [qw/public-inbox-mda public-inbox-cgi
+                        public-inbox-learn/],
         PREREQ_PM => {
                 # note: we use ssoma(1) and spamc(1), NOT the Perl modules
                 # We also depend on git through ssoma.
diff --git a/public-inbox-learn b/public-inbox-learn
new file mode 100755
index 00000000..d770f0f7
--- /dev/null
+++ b/public-inbox-learn
@@ -0,0 +1,78 @@
+#!/usr/bin/perl -w
+# Copyright (C) 2014, Eric Wong <normalperson@yhbt.net> and all contributors
+# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt)
+my $usage = "$0 (spam|ham) < /path/to/message";
+use strict;
+use warnings;
+use PublicInbox::Config;
+use Email::Simple;
+use Email::Address;
+use IPC::Run qw/run/;
+my $train = shift or die "usage: $usage\n";
+if ($train !~ /\A(?:ham|spam)\z/) {
+        die "`$train' not recognized.\nusage: $usage\n";
+}
+
+my $pi_config = PublicInbox::Config->new;
+my $simple;
+{
+        local $/;
+        $simple = Email::Simple->new(<>);
+}
+
+# get all recipients
+my %dests;
+foreach my $h (qw(Cc To)) {
+        foreach my $recipient (Email::Address->parse($simple->header($h))) {
+                $dests{lc($recipient->address)} = 1;
+        }
+}
+
+my $in = $simple->as_string;
+$simple->body_set("");
+
+my $err = 0;
+my @output = qw(> /dev/null > /dev/null);
+
+# n.b. message may be cross-posted to multiple public-inboxes
+foreach my $recipient (keys %dests) {
+        my $dst = $pi_config->lookup($recipient) or next;
+        my $git_dir = $dst->{mainrepo} or next;
+        my ($out, $err) = ("", "");
+
+        # We do not touch GIT_COMMITTER_* env here so we can track
+        # who trained the message.
+        # We will not touch GIT_AUTHOR_* when learning spam messages, either
+        if ($train eq "spam") {
+                # This needs to be idempotent, as my inotify trainer
+                # may train for each cross-posted message, and this
+                # script already learns for every list in
+                # ~/.public-inbox/config
+                if (!run(["ssoma-rm", $git_dir], \$in, \$out, \$err)) {
+                        if ($err !~ /^git cat-file .+ failed: 32768$/) {
+                                $err = 1;
+                        }
+                }
+        } else { # $train eq "ham"
+                my $from = $simple->header("From");
+                my @from = Email::Address->parse($from);
+                my $name = $from[0]->name;
+                defined $name or $name = "";
+                my $email = $from[0]->address;
+                defined $email or $email = "";
+                local $ENV{GIT_AUTHOR_NAME} = $name;
+                local $ENV{GIT_AUTHOR_EMAIL} = $email;
+                local $ENV{GIT_AUTHOR_DATE} = $simple->header("Date");
+
+                # Ham messages are trained when they're marked into
+                # a SEEN state, so this is idempotent
+                run([qw(ssoma-mda -1), $git_dir], \$in, \$out, \$err);
+                if ($err !~ /CONFLICT/) {
+                        $err = 1;
+                }
+        }
+        if (!run([qw(spamc -L), $train], \$in, @output)) {
+                $err = 1;
+        }
+}
+exit $err;
diff --git a/t/mda.t b/t/mda.t
index d37c03cd..b403c6b3 100644
--- a/t/mda.t
+++ b/t/mda.t
@@ -9,6 +9,7 @@ use Cwd;
 use IPC::Run qw(run);
 
 my $mda = "blib/script/public-inbox-mda";
+my $learn = "blib/script/public-inbox-learn";
 my $tmpdir = tempdir(CLEANUP => 1);
 my $home = "$tmpdir/pi-home";
 my $pi_home = "$home/.public-inbox";
@@ -130,6 +131,71 @@ Date: deadbeef
 
 }
 
+# spam training
+{
+        local $ENV{PI_FAILBOX} = $failbox;
+        local $ENV{HOME} = $home;
+        local $ENV{RECIPIENT} = $addr;
+        local $ENV{PATH} = $main_path;
+        my $mid = 'spam-train@example.com';
+        my $simple = Email::Simple->new(<<EOF);
+From: Spammer <spammer\@example.com>
+To: You <you\@example.com>
+Cc: $addr
+Message-ID: <$mid>
+Subject: this message will be trained as spam
+Date: Thu, 01 Jan 1970 00:00:00 +0000
+
+EOF
+        my $in = $simple->as_string;
+
+        {
+                # deliver the spam message, first
+                run([$mda], \$in);
+                my $msg = `ssoma cat $mid $maindir`;
+                like($msg, qr/\Q$mid\E/, "message delivered");
+
+                # now train it
+                local $ENV{GIT_AUTHOR_EMAIL} = 'trainer@example.com';
+                local $ENV{GIT_COMMITTER_EMAIL} = 'trainer@example.com';
+                run([$learn, "spam"], \$msg);
+                is($?, 0, "no failure from learning spam");
+                run([$learn, "spam"], \$msg);
+                is($?, 0, "no failure from learning spam idempotently");
+        }
+}
+
+# train ham message
+{
+        local $ENV{PI_FAILBOX} = $failbox;
+        local $ENV{HOME} = $home;
+        local $ENV{RECIPIENT} = $addr;
+        local $ENV{PATH} = $main_path;
+        my $mid = 'ham-train@example.com';
+        my $simple = Email::Simple->new(<<EOF);
+From: False-positive <hammer\@example.com>
+To: You <you\@example.com>
+Cc: $addr
+Message-ID: <$mid>
+Subject: this message will be trained as spam
+Date: Thu, 01 Jan 1970 00:00:00 +0000
+
+EOF
+        my $in = $simple->as_string;
+
+        # now train it
+        local $ENV{GIT_AUTHOR_EMAIL} = 'trainer@example.com';
+        local $ENV{GIT_COMMITTER_EMAIL} = 'trainer@example.com';
+        run([$learn, "ham"], \$in);
+        is($?, 0, "learned ham without failure");
+        my $msg = `ssoma cat $mid $maindir`;
+        like($msg, qr/\Q$mid\E/, "ham message delivered");
+        run([$learn, "ham"], \$in);
+        is($?, 0, "learned ham idempotently ");
+}
+
+done_testing();
+
 sub fail_bad_header {
         my ($good_rev, $msg, $in) = @_;
         open my $fh, '>', $failbox or die "failed to open $failbox: $!\n";
@@ -144,5 +210,3 @@ sub fail_bad_header {
         ok(-s $failbox > 0, "PI_FAILBOX is written to ($msg)");
         [ $in, $out, $err ];
 }
-
-done_testing();