From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.3.2 (2011-06-06) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-3.7 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, RP_MATCHES_RCVD shortcircuit=no autolearn=unavailable version=3.3.2 X-Original-To: meta@public-inbox.org Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 53B89633806; Thu, 13 Nov 2014 21:53:06 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Cc: Eric Wong Subject: [PATCH 2/2] -learn: nuke HTML portions when training as ham Date: Thu, 13 Nov 2014 21:53:01 +0000 Message-Id: <1415915581-2522-2-git-send-email-e@80x24.org> X-Mailer: git-send-email 2.2.0.rc0.1.g384e9bc In-Reply-To: <1415915581-2522-1-git-send-email-e@80x24.org> References: <1415915581-2522-1-git-send-email-e@80x24.org> List-Id: Sometimes people send HTML email and I forget to fixup in my MUA during moderation. Automatically strip out HTML portions instead. --- public-inbox-learn | 19 ++++++++++--------- t/mda.t | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/public-inbox-learn b/public-inbox-learn index 13b75b7..db0a1bb 100755 --- a/public-inbox-learn +++ b/public-inbox-learn @@ -24,9 +24,16 @@ foreach my $h (qw(Cc To)) { } } -my $in = $mime->as_string; -$mime->body_set(''); +my ($name, $email, $date); + +if ($train eq "ham") { + require PublicInbox::MDA; + require PublicInbox::Filter; + PublicInbox::Filter->run($mime); + ($name, $email, $date) = PublicInbox::MDA->author_info($mime); +} +my $in = $mime->as_string; my $err = 0; my @output = qw(> /dev/null > /dev/null); @@ -50,16 +57,10 @@ foreach my $recipient (keys %dests) { } } } else { # $train eq "ham" - require PublicInbox::MDA; - require PublicInbox::Filter; - - # no checking for errors here, we assume the message has + # no checking for spam here, we assume the message has # been reviewed by a human at this point: - PublicInbox::Filter->run($mime); PublicInbox::MDA->set_list_headers($mime, $dst); - my ($name, $email, $date) = - PublicInbox::MDA->author_info($mime); local $ENV{GIT_AUTHOR_NAME} = $name; local $ENV{GIT_AUTHOR_EMAIL} = $email; local $ENV{GIT_AUTHOR_DATE} = $date; diff --git a/t/mda.t b/t/mda.t index fad96e5..53712a5 100644 --- a/t/mda.t +++ b/t/mda.t @@ -205,14 +205,55 @@ EOF my $in = $simple->as_string; # now train it + # these should be overridden local $ENV{GIT_AUTHOR_EMAIL} = 'trainer@example.com'; local $ENV{GIT_COMMITTER_EMAIL} = 'trainer@example.com'; + run([$learn, "ham"], \$in); is($?, 0, "learned ham without failure"); my $msg = `ssoma cat $mid $maindir`; like($msg, qr/\Q$mid\E/, "ham message delivered"); run([$learn, "ham"], \$in); is($?, 0, "learned ham idempotently "); + + # ensure trained email is filtered, too + my $html_body = "hi"; + my $parts = [ + Email::MIME->create( + attributes => { + content_type => 'text/html; charset=UTF-8', + encoding => 'base64', + }, + body => $html_body, + ), + Email::MIME->create( + attributes => { + content_type => 'text/plain', + encoding => 'quoted-printable', + }, + body => 'hi = "bye"', + ) + ]; + $mid = 'multipart-html-sucks@11'; + my $mime = Email::MIME->create( + header_str => [ + From => 'a@example.com', + Subject => 'blah', + Cc => $addr, + 'Message-ID' => "<$mid>", + 'Content-Type' => 'multipart/alternative', + ], + parts => $parts, + ); + + { + $in = $mime->as_string; + run([$learn, "ham"], \$in); + is($?, 0, "learned ham without failure"); + $msg = `ssoma cat $mid $maindir`; + like($msg, qr/<\Q$mid\E>/, "ham message delivered"); + unlike($msg, qr//i, ' filtered'); + } } # faildir - emergency destination is maildir -- EW