user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH] msg_part_text: discover text in application/octet-stream
@ 2021-03-11  1:45 Eric Wong
  2021-03-12  0:31 ` [SQUASH] " Eric Wong
  0 siblings, 1 reply; 2+ messages in thread
From: Eric Wong @ 2021-03-11  1:45 UTC (permalink / raw)
  To: meta

Some poorly-configured MUAs will send application/octet-stream
even for text-only attachments.  We can't expect all MUAs to be
configured with proper MIME types, and there is plenty of
historical mail that falls into this unfortunate category.
---
 MANIFEST                   |  1 +
 lib/PublicInbox/MsgIter.pm | 12 ++++++
 t/msg_iter.t               | 64 ++++++++++++++++++++++++++++---
 xt/eml_octet-stream.t      | 77 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 149 insertions(+), 5 deletions(-)
 create mode 100644 xt/eml_octet-stream.t

diff --git a/MANIFEST b/MANIFEST
index 8c9c86a0..4757b4fc 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -471,6 +471,7 @@ xt/cmp-msgstr.t
 xt/cmp-msgview.t
 xt/create-many-inboxes.t
 xt/eml_check_limits.t
+xt/eml_octet-stream.t
 xt/git-http-backend.t
 xt/git_async_cmp.t
 xt/httpd-async-stream.t
diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm
index c503eb98..e2819523 100644
--- a/lib/PublicInbox/MsgIter.pm
+++ b/lib/PublicInbox/MsgIter.pm
@@ -84,6 +84,18 @@ sub msg_part_text ($$) {
 		# If forcing charset=UTF-8 failed,
 		# caller will warn further down...
 		$s = $part->body if $@;
+	} elsif ($err && $ct =~ m!\bapplication/octet-stream\b!i) {
+		# Some unconfigured/poorly-configured MUAs will set
+		# application/octet-stream even for all text attachments.
+		# Try to see if it's printable text that we can index
+		# and display:
+		$s = $part->body;
+		if ($s =~ /[^\p{XPosixPrint}\s]/s) {
+			utf8::decode($s);
+			$s =~ /[^\p{XPosixPrint}\s]/s ? undef($s) : undef($err);
+		} else {
+			undef($err);
+		}
 	}
 	($s, $err);
 }
diff --git a/t/msg_iter.t b/t/msg_iter.t
index e46d515c..6c52eec8 100644
--- a/t/msg_iter.t
+++ b/t/msg_iter.t
@@ -1,10 +1,8 @@
 # Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
-use strict;
-use warnings;
-use Test::More;
-use PublicInbox::TestCommon;
+use strict; use v5.10.1; use PublicInbox::TestCommon;
 use PublicInbox::Hval qw(ascii_html);
+use MIME::QuotedPrint 3.05 qw(encode_qp);
 use_ok('PublicInbox::MsgIter');
 
 {
@@ -88,5 +86,61 @@ use_ok('PublicInbox::MsgIter');
 	is($check[1], $nq, 'long quoted section matches');
 }
 
+{
+	open my $fh, '<', 't/utf8.eml' or BAIL_OUT $!;
+	my $expect = do { local $/; <$fh>  };
+	my $qp_patch = encode_qp($expect, "\r\n");
+	my $common = <<EOM;
+Content-Type: multipart/mixed; boundary="DEADBEEF"
+MIME-Version: 1.0
+
+--DEADBEEF
+Content-Transfer-Encoding: quoted-printable
+Content-Type: text/plain;
+	charset=utf-8
+
+blah
+
+--DEADBEEF
+Content-Disposition: attachment;
+	filename=foo.patch
+Content-Type: application/octet-stream;
+	x-unix-mode=0644;
+	name="foo.patch"
+Content-Transfer-Encoding: quoted-printable
+EOM
+	my $eml = PublicInbox::Eml->new(<<EOM);
+$common
+$qp_patch
+--DEADBEEF--
+EOM
+	my @parts;
+	$eml->each_part(sub {
+		my ($part, $level, @ex) = @{$_[0]};
+		my ($s, $err) = msg_part_text($part, $part->content_type);
+		push @parts, $s;
+	});
+	$expect =~ s/\n/\r\n/sg;
+	is_deeply(\@parts, [ "blah\r\n", $expect ],
+		'fallback to application/octet-stream as UTF-8 text');
+
+	my $qp_binary = encode_qp("Binary\0crap", "\r\n");
+	$eml = PublicInbox::Eml->new(<<EOM);
+$common
+$qp_binary
+--DEADBEEF--
+EOM
+	@parts = ();
+	my @err;
+	$eml->each_part(sub {
+		my ($part, $level, @ex) = @{$_[0]};
+		my ($s, $err) = msg_part_text($part, $part->content_type);
+		push @parts, $s;
+		push @err, $err;
+	});
+	is_deeply(\@parts, [ "blah\r\n", undef ],
+		'non-text ignored in octet-stream');
+	ok($err[1], 'got error for second element');
+}
+
 done_testing();
-1;
diff --git a/xt/eml_octet-stream.t b/xt/eml_octet-stream.t
new file mode 100644
index 00000000..8173aec2
--- /dev/null
+++ b/xt/eml_octet-stream.t
@@ -0,0 +1,77 @@
+#!perl -w
+# Copyright (C) 2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict; use v5.10.1; use PublicInbox::TestCommon;
+use PublicInbox::Git;
+use PublicInbox::Eml;
+use PublicInbox::MsgIter qw(msg_part_text);
+use PublicInbox::LeiToMail;
+my $eml2mboxcl2 = PublicInbox::LeiToMail->can('eml2mboxcl2');
+my $git_dir = $ENV{GIANT_GIT_DIR};
+plan 'skip_all' => "GIANT_GIT_DIR not defined for $0" unless defined($git_dir);
+use Data::Dumper;
+$Data::Dumper::Useqq = 1;
+my $mboxfh;
+if (my $out = $ENV{DEBUG_MBOXCL2}) {
+	BAIL_OUT("$out exists") if -s $out;
+	open $mboxfh, '>', $out or BAIL_OUT "open $out: $!";
+} else {
+	diag "DEBUG_MBOXCL2 unset, not saving debug output";
+}
+
+my $git = PublicInbox::Git->new($git_dir);
+my @cat = qw(cat-file --buffer --batch-check --batch-all-objects);
+if (require_git(2.19, 1)) {
+	push @cat, '--unordered';
+} else {
+	warn "git <2.19, cat-file lacks --unordered, locality suffers\n";
+}
+my ($errs, $ok, $tot);
+$errs = $ok = $tot = 0;
+my $ep = sub { # eml->each_part callback
+	my ($part, $level, @ex) = @{$_[0]};
+	++$tot;
+	my $ct = $part->content_type // return;
+	$ct =~ m!\bapplication/octet-stream\b!i or return;
+	my ($s, $err) = msg_part_text($part, $ct);
+	if (defined $s) {
+		++$ok;
+	} else {
+		warn "binary $err\n";
+		++$errs;
+		my $x = eval { $part->body };
+		if ($@) {
+			warn "decode totally failed: $@";
+		} else {
+			my ($bad) = ($x =~ m/([\p{XPosixPrint}\s]{0,10}
+						[^\p{XPosixPrint}\s]+
+						[\p{XPosixPrint}\s]{0,10})/sx);
+			warn Dumper([$bad]);
+		}
+
+		push @{$_[1]}, $err; # $fail
+	}
+};
+
+my $cb = sub {
+	my ($bref, $oid) = @_;
+	my $eml = PublicInbox::Eml->new($bref);
+	local $SIG{__WARN__} = sub { diag("$oid ", @_) };
+	$eml->each_part($ep, my $fail = []);
+	if (@$fail && $mboxfh) {
+		diag "@$fail";
+		print $mboxfh ${$eml2mboxcl2->($eml, { blob => $oid })} or
+			BAIL_OUT "print: $!";
+	}
+};
+my $cat = $git->popen(@cat);
+while (<$cat>) {
+	my ($oid, $type, $size) = split(/ /);
+	$git->cat_async($oid, $cb) if $size && $type eq 'blob';
+}
+$git->cat_async_wait;
+note "$errs errors";
+note "$ok/$tot messages had text as application/octet-stream";
+ok 1;
+
+done_testing;

^ permalink raw reply	[flat|nested] 2+ messages in thread

* [SQUASH] msg_part_text: discover text in application/octet-stream
  2021-03-11  1:45 [PATCH] msg_part_text: discover text in application/octet-stream Eric Wong
@ 2021-03-12  0:31 ` Eric Wong
  0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2021-03-12  0:31 UTC (permalink / raw)
  To: meta

This simplifies the check and ensures returned text is Perl "utf8"
text (that is, Perl's internal "utf8" and not the strict "UTF-8").

diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm
index e2819523..9c6581cc 100644
--- a/lib/PublicInbox/MsgIter.pm
+++ b/lib/PublicInbox/MsgIter.pm
@@ -90,12 +90,8 @@ sub msg_part_text ($$) {
 		# Try to see if it's printable text that we can index
 		# and display:
 		$s = $part->body;
-		if ($s =~ /[^\p{XPosixPrint}\s]/s) {
-			utf8::decode($s);
-			$s =~ /[^\p{XPosixPrint}\s]/s ? undef($s) : undef($err);
-		} else {
-			undef($err);
-		}
+		utf8::decode($s);
+		undef($s =~ /[^\p{XPosixPrint}\s]/s ? $s : $err);
 	}
 	($s, $err);
 }
diff --git a/t/msg_iter.t b/t/msg_iter.t
index 6c52eec8..ae3594da 100644
--- a/t/msg_iter.t
+++ b/t/msg_iter.t
@@ -121,6 +121,7 @@ EOM
 		push @parts, $s;
 	});
 	$expect =~ s/\n/\r\n/sg;
+	utf8::decode($expect); # aka "bytes2str"
 	is_deeply(\@parts, [ "blah\r\n", $expect ],
 		'fallback to application/octet-stream as UTF-8 text');
 

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2021-03-12  0:31 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-11  1:45 [PATCH] msg_part_text: discover text in application/octet-stream Eric Wong
2021-03-12  0:31 ` [SQUASH] " Eric Wong

user/dev discussion of public-inbox itself

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V1 meta meta/ https://public-inbox.org/meta \
		meta@public-inbox.org
	public-inbox-index meta

Example config snippet for mirrors.
Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/inbox.comp.mail.public-inbox.meta
	nntp://ie5yzdi7fg72h7s4sdcztq5evakq23rdt33mfyfcddc5u3ndnw24ogqd.onion/inbox.comp.mail.public-inbox.meta
	nntp://4uok3hntl7oi7b4uf4rtfwefqeexfzil2w6kgk2jn5z2f764irre7byd.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general
 note: .onion URLs require Tor: https://www.torproject.org/

code repositories for project(s) associated with this inbox:

	https://80x24.org/public-inbox.git

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git