user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH 1/2] view: account for filter bugs which leak HTML into the repo
@ 2014-11-13 21:53 Eric Wong
  2014-11-13 21:53 ` [PATCH 2/2] -learn: nuke HTML portions when training as ham Eric Wong
  0 siblings, 1 reply; 2+ messages in thread
From: Eric Wong @ 2014-11-13 21:53 UTC (permalink / raw)
  To: meta; +Cc: Eric Wong

Ugh, apparently there's a (yet-to-be-fixed) bug in the Filter
code which caused an HTML message portion of a multipart message
to be displayed on the web UI.  Account for that by skipping
HTML parts in View.pm.
---
 lib/PublicInbox/View.pm | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index e76d904..b09c3ba 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -104,7 +104,12 @@ sub index_entry {
 	$mime->walk_parts(sub {
 		my ($part) = @_;
 		return if $part->subparts; # walk_parts already recurses
-		my $enc = enc_for($part->content_type) || $enc_msg || $enc_utf8;
+		my $ct = $part->content_type;
+
+		# account for filter bugs...
+		return if defined $ct && $ct =~ m!\btext/[xh]+tml\b!i;
+
+		my $enc = enc_for($ct) || $enc_msg || $enc_utf8;
 
 		if ($part_nr > 0) {
 			my $fn = $part->filename;
@@ -178,7 +183,12 @@ sub multipart_text_as_html {
 	$mime->walk_parts(sub {
 		my ($part) = @_;
 		return if $part->subparts; # walk_parts already recurses
-		my $enc = enc_for($part->content_type) || $enc_msg || $enc_utf8;
+		my $ct = $part->content_type;
+
+		# account for filter bugs...
+		return if defined $ct && $ct =~ m!\btext/[xh]+tml\b!i;
+
+		my $enc = enc_for($ct) || $enc_msg || $enc_utf8;
 
 		if ($part_nr > 0) {
 			my $fn = $part->filename;
-- 
EW


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* [PATCH 2/2] -learn: nuke HTML portions when training as ham
  2014-11-13 21:53 [PATCH 1/2] view: account for filter bugs which leak HTML into the repo Eric Wong
@ 2014-11-13 21:53 ` Eric Wong
  0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2014-11-13 21:53 UTC (permalink / raw)
  To: meta; +Cc: Eric Wong

Sometimes people send HTML email and I forget to fix it up in my
MUA during moderation.  Automatically strip out HTML portions
instead.
---
 public-inbox-learn | 19 ++++++++++---------
 t/mda.t            | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/public-inbox-learn b/public-inbox-learn
index 13b75b7..db0a1bb 100755
--- a/public-inbox-learn
+++ b/public-inbox-learn
@@ -24,9 +24,16 @@ foreach my $h (qw(Cc To)) {
 	}
 }
 
-my $in = $mime->as_string;
-$mime->body_set('');
+my ($name, $email, $date);
+
+if ($train eq "ham") {
+	require PublicInbox::MDA;
+	require PublicInbox::Filter;
+	PublicInbox::Filter->run($mime);
+	($name, $email, $date) = PublicInbox::MDA->author_info($mime);
+}
 
+my $in = $mime->as_string;
 my $err = 0;
 my @output = qw(> /dev/null > /dev/null);
 
@@ -50,16 +57,10 @@ foreach my $recipient (keys %dests) {
 			}
 		}
 	} else { # $train eq "ham"
-		require PublicInbox::MDA;
-		require PublicInbox::Filter;
-
-		# no checking for errors here, we assume the message has
+		# no checking for spam here, we assume the message has
 		# been reviewed by a human at this point:
-		PublicInbox::Filter->run($mime);
 		PublicInbox::MDA->set_list_headers($mime, $dst);
 
-		my ($name, $email, $date) =
-				PublicInbox::MDA->author_info($mime);
 		local $ENV{GIT_AUTHOR_NAME} = $name;
 		local $ENV{GIT_AUTHOR_EMAIL} = $email;
 		local $ENV{GIT_AUTHOR_DATE} = $date;
diff --git a/t/mda.t b/t/mda.t
index fad96e5..53712a5 100644
--- a/t/mda.t
+++ b/t/mda.t
@@ -205,14 +205,55 @@ EOF
 	my $in = $simple->as_string;
 
 	# now train it
+	# these should be overridden
 	local $ENV{GIT_AUTHOR_EMAIL} = 'trainer@example.com';
 	local $ENV{GIT_COMMITTER_EMAIL} = 'trainer@example.com';
+
 	run([$learn, "ham"], \$in);
 	is($?, 0, "learned ham without failure");
 	my $msg = `ssoma cat $mid $maindir`;
 	like($msg, qr/\Q$mid\E/, "ham message delivered");
 	run([$learn, "ham"], \$in);
 	is($?, 0, "learned ham idempotently ");
+
+	# ensure trained email is filtered, too
+	my $html_body = "<html><body>hi</body></html>";
+	my $parts = [
+		Email::MIME->create(
+			attributes => {
+				content_type => 'text/html; charset=UTF-8',
+				encoding => 'base64',
+			},
+			body => $html_body,
+		),
+		Email::MIME->create(
+			attributes => {
+				content_type => 'text/plain',
+				encoding => 'quoted-printable',
+			},
+			body => 'hi = "bye"',
+		)
+	];
+	$mid = 'multipart-html-sucks@11';
+	my $mime = Email::MIME->create(
+		header_str => [
+		  From => 'a@example.com',
+		  Subject => 'blah',
+		  Cc => $addr,
+		  'Message-ID' => "<$mid>",
+		  'Content-Type' => 'multipart/alternative',
+		],
+		parts => $parts,
+	);
+
+	{
+		$in = $mime->as_string;
+		run([$learn, "ham"], \$in);
+		is($?, 0, "learned ham without failure");
+		$msg = `ssoma cat $mid $maindir`;
+		like($msg, qr/<\Q$mid\E>/, "ham message delivered");
+		unlike($msg, qr/<html>/i, '<html> filtered');
+	}
 }
 
 # faildir - emergency destination is maildir
-- 
EW


^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2014-11-13 21:53 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-11-13 21:53 [PATCH 1/2] view: account for filter bugs which leak HTML into the repo Eric Wong
2014-11-13 21:53 ` [PATCH 2/2] -learn: nuke HTML portions when training as ham Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).