about summary refs log tree commit homepage
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2014-04-19 23:23:10 +0000
committer Eric Wong <e@80x24.org> 2014-04-19 23:27:18 +0000
commit a8d9e2f1853032016db6ff177979873b3bdadd85 (patch)
tree 8f76f5c29c205412d843f3c58ebccc9c74cdc936
parent 4024aae69fe08c0aa14a69a12d55ca2b7dd4a4ab (diff)
download public-inbox-a8d9e2f1853032016db6ff177979873b3bdadd85.tar.gz
We need -learn to do many of the same things as -mda
when we have a false-positive.  We also need -learn to
do HTML filtering in case the training user screws up.
-rw-r--r-- lib/PublicInbox/MDA.pm 30
-rwxr-xr-x public-inbox-learn 21
-rwxr-xr-x public-inbox-mda 30
3 files changed, 47 insertions, 34 deletions
diff --git a/lib/PublicInbox/MDA.pm b/lib/PublicInbox/MDA.pm
index 22879236..bb14ae50 100644
--- a/lib/PublicInbox/MDA.pm
+++ b/lib/PublicInbox/MDA.pm
@@ -4,8 +4,10 @@ package PublicInbox::MDA;
 use strict;
 use warnings;
 use Email::Address;
+use Encode qw/decode encode/;
 use Date::Parse qw(strptime);
 use constant MAX_SIZE => 1024 * 500; # same as spamc default
+use constant cmd => qw/ssoma-mda -1/;
 
 # drop plus addressing for matching
 sub __drop_plus {
@@ -53,4 +55,32 @@ sub recipient_specified {
         return 0;
 }
 
+# RFC2919 and RFC2369
+sub set_list_headers {
+        my ($class, $simple, $dst) = @_;
+        my $pa = "<$dst->{-primary_address}>";
+        $simple->header_set("List-Id", $pa);
+        $simple->header_set("List-Post", $pa);
+
+        my $url = $dst->{url};
+        if (defined $url) {
+                $simple->header_set("List-Archive", "<$url>");
+                $simple->header_set("List-Help", "<${url}help>");
+        }
+}
+
+# returns a 3-element array: name, email, date
+sub author_info {
+        my ($class, $simple) = @_;
+
+        my $from = decode('MIME-Header', $simple->header('From'));
+        $from = encode('utf8', $from);
+        my @from = Email::Address->parse($from);
+        my $name = $from[0]->name;
+        defined $name or $name = '';
+        my $email = $from[0]->address;
+        defined $email or $email = '';
+        ($name, $email, $simple->header('Date'));
+}
+
 1;
diff --git a/public-inbox-learn b/public-inbox-learn
index d770f0f7..2c2bbfb5 100755
--- a/public-inbox-learn
+++ b/public-inbox-learn
@@ -54,19 +54,22 @@ foreach my $recipient (keys %dests) {
                         }
                 }
         } else { # $train eq "ham"
-                my $from = $simple->header("From");
-                my @from = Email::Address->parse($from);
-                my $name = $from[0]->name;
-                defined $name or $name = "";
-                my $email = $from[0]->address;
-                defined $email or $email = "";
+                require PublicInbox::MDA;
+                require PublicInbox::Filter;
+
+                # no checking for errors here, we assume the message has
+                # been reviewed by a human at this point:
+                PublicInbox::Filter->run($simple);
+
+                my ($name, $email, $date) =
+                                PublicInbox::MDA->author_info($simple);
                 local $ENV{GIT_AUTHOR_NAME} = $name;
                 local $ENV{GIT_AUTHOR_EMAIL} = $email;
-                local $ENV{GIT_AUTHOR_DATE} = $simple->header("Date");
+                local $ENV{GIT_AUTHOR_DATE} = $date;
 
                 # Ham messages are trained when they're marked into
-                # a SEEN state, so this is idempotent
-                run([qw(ssoma-mda -1), $git_dir], \$in, \$out, \$err);
+                # a SEEN state, so this is idempotent:
+                run([PublicInbox::MDA->cmd, $git_dir], \$in, \$out, \$err);
                 if ($err !~ /CONFLICT/) {
                         $err = 1;
                 }
diff --git a/public-inbox-mda b/public-inbox-mda
index dd3dac80..504fefd7 100755
--- a/public-inbox-mda
+++ b/public-inbox-mda
@@ -11,7 +11,6 @@ use Encode qw/decode encode/;
 use Encode::MIME::Header;
 use File::Path::Expand qw/expand_filename/;
 use IPC::Run qw(run);
-use constant MDA => 'ssoma-mda';
 use PublicInbox::MDA;
 use PublicInbox::Filter;
 use PublicInbox::Config;
@@ -44,23 +43,18 @@ if (PublicInbox::MDA->precheck($filter, $recipient) &&
                 # run spamc again on the HTML-free message
                 if (do_spamc($simple, \$filtered)) {
                         $simple = Email::Simple->new($filtered);
-                        set_list_headers($simple, $dst);
+                        PublicInbox::MDA->set_list_headers($simple, $dst);
                         $filter->simple($simple);
 
-                        my $from = decode('MIME-Header', $filter->from);
-                        $from = encode("utf8", $from);
-                        my @from = Email::Address->parse($from);
-                        my $name = $from[0]->name;
-                        defined $name or $name = "";
-                        my $email = $from[0]->address;
-                        defined $email or $email = "";
+                        my ($name, $email, $date) =
+                                        PublicInbox::MDA->author_info($simple);
                         local $ENV{GIT_AUTHOR_NAME} = $name;
                         local $ENV{GIT_AUTHOR_EMAIL} = $email;
-                        local $ENV{GIT_AUTHOR_DATE} = $simple->header("Date");
+                        local $ENV{GIT_AUTHOR_DATE} = $date;
                         local $ENV{GIT_COMMITTER_EMAIL} = $recipient;
                         local $ENV{GIT_COMMITTER_NAME} = $dst->{listname};
 
-                        $filter->pipe(MDA, '-1', $main_repo);
+                        $filter->pipe(PublicInbox::MDA->cmd, $main_repo);
                 }
         }
 }
@@ -78,17 +72,3 @@ sub do_spamc {
 
         return ($@ || $? || !defined($$out) || length($$out) == 0) ? 0 : 1;
 }
-
-# RFC2919 and RFC2369
-sub set_list_headers {
-        my ($simple, $dst) = @_;
-        my $pa = "<$dst->{-primary_address}>";
-        $simple->header_set("List-Id", $pa);
-        $simple->header_set("List-Post", $pa);
-
-        my $url = $dst->{url};
-        if (defined $url) {
-                $simple->header_set("List-Archive", "<$url>");
-                $simple->header_set("List-Help", "<${url}help>");
-        }
-}