user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 08/13] EmlContentFoo: Email::MIME::ContentType replacement
Date: Thu,  7 May 2020 21:05:51 +0000	[thread overview]
Message-ID: <20200507210556.22995-9-e@yhbt.net> (raw)
In-Reply-To: <20200507210556.22995-1-e@yhbt.net>

Since we're getting rid of Email::MIME, get rid of
Email::MIME::ContentType, too; since we may introduce
speedups down the line specific to our codebase.
---
 MANIFEST                         |   3 +
 lib/PublicInbox/Eml.pm           |   7 +-
 lib/PublicInbox/EmlContentFoo.pm | 294 +++++++++++++++++++++++++++++++
 lib/PublicInbox/WwwAttach.pm     |   2 +-
 t/eml_content_disposition.t      | 102 +++++++++++
 t/eml_content_type.t             | 289 ++++++++++++++++++++++++++++++
 6 files changed, 692 insertions(+), 5 deletions(-)
 create mode 100644 lib/PublicInbox/EmlContentFoo.pm
 create mode 100644 t/eml_content_disposition.t
 create mode 100644 t/eml_content_type.t

diff --git a/MANIFEST b/MANIFEST
index 0906448e..055c8c9a 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -106,6 +106,7 @@ lib/PublicInbox/DSPoll.pm
 lib/PublicInbox/Daemon.pm
 lib/PublicInbox/Emergency.pm
 lib/PublicInbox/Eml.pm
+lib/PublicInbox/EmlContentFoo.pm
 lib/PublicInbox/ExtMsg.pm
 lib/PublicInbox/Feed.pm
 lib/PublicInbox/Filter/Base.pm
@@ -231,6 +232,8 @@ t/ds-poll.t
 t/edit.t
 t/emergency.t
 t/eml.t
+t/eml_content_disposition.t
+t/eml_content_type.t
 t/epoll.t
 t/fail-bin/spamc
 t/feed.t
diff --git a/lib/PublicInbox/Eml.pm b/lib/PublicInbox/Eml.pm
index 0c23bed0..1988bdb3 100644
--- a/lib/PublicInbox/Eml.pm
+++ b/lib/PublicInbox/Eml.pm
@@ -33,10 +33,9 @@ use Text::Wrap qw(wrap); # stdlib, we need Perl 5.6+ for $huge
 
 my $MIME_Header = find_encoding('MIME-Header');
 
-# TODO remove these dependencies
-use Email::MIME::ContentType;
+use PublicInbox::EmlContentFoo qw(parse_content_type parse_content_disposition);
 use Email::MIME::Encodings;
-$Email::MIME::ContentType::STRICT_PARAMS = 0;
+$PublicInbox::EmlContentFoo::STRICT_PARAMS = 0;
 
 our $MAXPARTS = 1000; # same as SpamAssassin
 our $MAXDEPTH = 20; # seems enough, Perl sucks, here
@@ -108,7 +107,7 @@ sub header_raw {
 # pick the first Content-Type header to match Email::MIME behavior.
 # It's usually the right one based on historical archives.
 sub ct ($) {
-	# Email::MIME::ContentType::content_type:
+	# PublicInbox::EmlContentFoo::content_type:
 	$_[0]->{ct} //= parse_content_type(header($_[0], 'Content-Type'));
 }
 
diff --git a/lib/PublicInbox/EmlContentFoo.pm b/lib/PublicInbox/EmlContentFoo.pm
new file mode 100644
index 00000000..f507d548
--- /dev/null
+++ b/lib/PublicInbox/EmlContentFoo.pm
@@ -0,0 +1,294 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES
+# This library is free software; you can redistribute it and/or modify
+# it under the same terms as Perl itself.
+#
+# License: GPL-1.0+ or Artistic-1.0-Perl
+#  <https://www.gnu.org/licenses/gpl-1.0.txt>
+#  <https://dev.perl.org/licenses/artistic.html>
+#
+# This license differs from the rest of public-inbox
+#
+# This is a fork of the Email::MIME::ContentType 1.022 with
+# minor improvements and incompatibilities; namely changes to
+# quiet warnings with legacy data.
+package PublicInbox::EmlContentFoo;
+use strict;
+use parent qw(Exporter);
+# ABSTRACT: Parse a MIME Content-Type or Content-Disposition Header
+
+use Encode 2.87 qw(find_mime_encoding);
+our @EXPORT_OK = qw(parse_content_type parse_content_disposition);
+
+our $STRICT_PARAMS = 1;
+
+my $ct_default = 'text/plain; charset=us-ascii';
+
+my $re_token = # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
+	qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/;
+
+my $re_token_non_strict = # allow CTLs and above ASCII
+	qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+|$re_token)/;
+
+my $re_qtext = # US-ASCII except CR, LF, white space, backslash and quote
+	qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/;
+my $re_quoted_pair = qr/\\[\x00-\x7F]/;
+my $re_quoted_string = qr/"((?:[ \t]*(?:$re_qtext|$re_quoted_pair))*[ \t]*)"/;
+
+my $re_qtext_non_strict = qr/[\x80-\xFF]|$re_qtext/;
+my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
+my $re_quoted_string_non_strict =
+qr/"((?:[ \t]*(?:$re_qtext_non_strict|$re_quoted_pair_non_strict))*[ \t]*)"/;
+
+my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
+my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
+my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;
+
+sub parse_content_type {
+	my ($ct) = @_;
+
+	# If the header isn't there or is empty, give default answer.
+	$ct = $ct_default unless defined($ct) && length($ct);
+
+	_unfold_lines($ct);
+	_clean_comments($ct);
+
+	# It is also recommend (sic.) that this default be assumed when a
+	# syntactically invalid Content-Type header field is encountered.
+	unless ($ct =~ s/^($re_token)\/($re_token)//) {
+		unless ($STRICT_PARAMS && $ct =~ s/^($re_token_non_strict)\/
+						($re_token_non_strict)//x) {
+			#carp "Invalid Content-Type '$ct'";
+			return parse_content_type($ct_default);
+		}
+	}
+
+	my ($type, $subtype) = (lc $1, lc $2);
+
+	_clean_comments($ct);
+	$ct =~ s/\s+$//;
+
+	my $attributes = {};
+	if ($STRICT_PARAMS && length($ct) && $ct !~ /^;/) {
+		# carp "Missing ';' before first Content-Type parameter '$ct'";
+	} else {
+		$attributes = _process_rfc2231(_parse_attributes($ct));
+	}
+
+	{
+		type	   => $type,
+		subtype	=> $subtype,
+		attributes => $attributes,
+
+		# This is dumb.  Really really dumb.  For backcompat. -- rjbs,
+		# 2013-08-10
+		discrete   => $type,
+		composite  => $subtype,
+	};
+}
+
+my $cd_default = 'attachment';
+
+sub parse_content_disposition {
+	my ($cd) = @_;
+
+	$cd = $cd_default unless defined($cd) && length($cd);
+
+	_unfold_lines($cd);
+	_clean_comments($cd);
+
+	unless ($cd =~ s/^($re_token)//) {
+		unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
+			#carp "Invalid Content-Disposition '$cd'";
+			return parse_content_disposition($cd_default);
+		}
+	}
+
+	my $type = lc $1;
+
+	_clean_comments($cd);
+	$cd =~ s/\s+$//;
+
+	my $attributes = {};
+	if ($STRICT_PARAMS && length($cd) && $cd !~ /^;/) {
+# carp "Missing ';' before first Content-Disposition parameter '$cd'";
+	} else {
+		$attributes = _process_rfc2231(_parse_attributes($cd));
+	}
+
+	{
+		type	   => $type,
+		attributes => $attributes,
+	};
+}
+
+sub _unfold_lines {
+	$_[0] =~ s/(?:\r\n|[\r\n])(?=[ \t])//g;
+}
+
+sub _clean_comments {
+	my $ret = ($_[0] =~ s/^\s+//);
+	while (length $_[0]) {
+		last unless $_[0] =~ s/^\(//;
+		my $level = 1;
+		while (length $_[0]) {
+			my $ch = substr $_[0], 0, 1, '';
+			if ($ch eq '(') {
+				$level++;
+			} elsif ($ch eq ')') {
+				$level--;
+				last if $level == 0;
+			} elsif ($ch eq '\\') {
+				substr $_[0], 0, 1, '';
+			}
+		}
+		# carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
+		$ret |= ($_[0] =~ s/^\s+//);
+	}
+	$ret;
+}
+
+sub _process_rfc2231 {
+	my ($attribs) = @_;
+	my %cont;
+	my %encoded;
+	foreach (keys %{$attribs}) {
+		next unless $_ =~ m/^(.*)\*([0-9])\*?$/;
+		my ($attr, $sec) = ($1, $2);
+		$cont{$attr}->[$sec] = $attribs->{$_};
+		$encoded{$attr}->[$sec] = 1 if $_ =~ m/\*$/;
+		delete $attribs->{$_};
+	}
+	foreach (keys %cont) {
+		my $key = $_;
+		$key .= '*' if $encoded{$_};
+		$attribs->{$key} = join '', @{$cont{$_}};
+	}
+	foreach (keys %{$attribs}) {
+		next unless $_ =~ m/^(.*)\*$/;
+		my $key = $1;
+		next unless $attribs->{$_} =~ m/^$re_exvalue$/;
+		my ($charset, $value) = ($1, $2);
+		$value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
+		if (length $charset) {
+			my $enc = find_mime_encoding($charset);
+			if (defined $enc) {
+				$value = $enc->decode($value);
+			# } else {
+				#carp "Unknown charset '$charset' in
+				#attribute '$key' value";
+			}
+		}
+		$attribs->{$key} = $value;
+		delete $attribs->{$_};
+	}
+	$attribs;
+}
+
+sub _parse_attributes {
+	local $_ = shift;
+	substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
+	my $attribs = {};
+	while (length $_) {
+		s/^;// or $STRICT_PARAMS and do {
+			#carp "Missing semicolon before parameter '$_'";
+			return $attribs;
+		};
+		_clean_comments($_);
+		unless (length $_) {
+			# Some mail software generates a Content-Type like this:
+			# "Content-Type: text/plain;"
+			# RFC 1521 section 3 says a parameter must exist if
+			# there is a semicolon.
+			#carp "Extra semicolon after last parameter" if
+			#$STRICT_PARAMS;
+			return $attribs;
+		}
+		my $attribute;
+		if (s/^($re_token)=//) {
+			$attribute = lc $1;
+		} else {
+			if ($STRICT_PARAMS) {
+				# carp "Illegal parameter '$_'";
+				return $attribs;
+			}
+			if (s/^($re_token_non_strict)=//) {
+				$attribute = lc $1;
+			} else {
+				unless (s/^([^;=\s]+)\s*=//) {
+					#carp "Cannot parse parameter '$_'";
+					return $attribs;
+				}
+				$attribute = lc $1;
+			}
+		}
+		_clean_comments($_);
+		my $value = _extract_attribute_value();
+		$attribs->{$attribute} = $value;
+		_clean_comments($_);
+	}
+	$attribs;
+}
+
+sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
+	my $value;
+	while (length $_) {
+		if (s/^($re_token)//) {
+			$value .= $1;
+		} elsif (s/^$re_quoted_string//) {
+			my $sub = $1;
+			$sub =~ s/\\(.)/$1/g;
+			$value .= $sub;
+		} elsif ($STRICT_PARAMS) {
+			#my $char = substr $_, 0, 1;
+			#carp "Unquoted '$char' not allowed";
+			return;
+		} elsif (s/^($re_token_non_strict)//) {
+			$value .= $1;
+		} elsif (s/^$re_quoted_string_non_strict//) {
+			my $sub = $1;
+			$sub =~ s/\\(.)/$1/g;
+			$value .= $sub;
+		}
+		my $erased = _clean_comments($_);
+		last if !length $_ or /^;/;
+		if ($STRICT_PARAMS) {
+			#my $char = substr $_, 0, 1;
+			#carp "Extra '$char' found after parameter";
+			return;
+		}
+		if ($erased) {
+			# Sometimes semicolon is missing, so check for = char
+			last if m/^$re_token_non_strict=/;
+			$value .= ' ';
+		}
+		$value .= substr $_, 0, 1, '';
+	}
+	$value;
+}
+
+1;
+__END__
+=func parse_content_type
+
+This routine is exported by default.
+
+This routine parses email content type headers according to section 5.1 of RFC
+2045 and also RFC 2231 (Character Set and Parameter Continuations).  It returns
+a hash as above, with entries for the C<type>, the C<subtype>, and a hash of
+C<attributes>.
+
+For backward compatibility with a really unfortunate misunderstanding of RFC
+2045 by the early implementors of this module, C<discrete> and C<composite> are
+also present in the returned hashref, with the values of C<type> and C<subtype>
+respectively.
+
+=func parse_content_disposition
+
+This routine is exported by default.
+
+This routine parses email Content-Disposition headers according to RFC 2183 and
+RFC 2231.  It returns a hash as above, with entries for the C<type>, and a hash
+of C<attributes>.
+
+=cut
diff --git a/lib/PublicInbox/WwwAttach.pm b/lib/PublicInbox/WwwAttach.pm
index 5b2914b3..754da13f 100644
--- a/lib/PublicInbox/WwwAttach.pm
+++ b/lib/PublicInbox/WwwAttach.pm
@@ -6,7 +6,7 @@ package PublicInbox::WwwAttach; # internal package
 use strict;
 use warnings;
 use bytes (); # only for bytes::length
-use Email::MIME::ContentType qw(parse_content_type);
+use PublicInbox::EmlContentFoo qw(parse_content_type);
 use PublicInbox::Eml;
 
 sub get_attach_i { # ->each_part callback
diff --git a/t/eml_content_disposition.t b/t/eml_content_disposition.t
new file mode 100644
index 00000000..9bdacc05
--- /dev/null
+++ b/t/eml_content_disposition.t
@@ -0,0 +1,102 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES
+# This library is free software; you can redistribute it and/or modify
+# it under the same terms as Perl itself.
+#
+# License: GPL-1.0+ or Artistic-1.0-Perl
+#  <https://www.gnu.org/licenses/gpl-1.0.txt>
+#  <https://dev.perl.org/licenses/artistic.html>
+use strict;
+use Test::More;
+use PublicInbox::EmlContentFoo qw(parse_content_disposition);
+
+my %cd_tests = (
+	'' => { type => 'attachment', attributes => {} },
+	'inline' => { type => 'inline', attributes => {} },
+	'attachment' => { type => 'attachment', attributes => {} },
+
+	'attachment; filename=genome.jpeg;' .
+	' modification-date="Wed, 12 Feb 1997 16:29:51 -0500"' => {
+		type => 'attachment',
+		attributes => {
+			filename => 'genome.jpeg',
+			'modification-date' => 'Wed, 12 Feb 1997 16:29:51 -0500'
+		}
+	},
+
+	q(attachment; filename*=UTF-8''genome.jpeg;) .
+	q( modification-date="Wed, 12 Feb 1997 16:29:51 -0500") => {
+		type => 'attachment',
+		attributes => {
+			filename => 'genome.jpeg',
+			'modification-date' => 'Wed, 12 Feb 1997 16:29:51 -0500'
+		}
+	},
+
+	q(attachment; filename*0*=us-ascii'en'This%20is%20even%20more%20;) .
+	q( filename*1*=%2A%2A%2Afun%2A%2A%2A%20; filename*2="isn't it!") => {
+		type => 'attachment',
+		attributes => {
+			filename => "This is even more ***fun*** isn't it!"
+		}
+	},
+
+	q(attachment; filename*0*='en'This%20is%20even%20more%20;) .
+	q( filename*1*=%2A%2A%2Afun%2A%2A%2A%20; filename*2="isn't it!") => {
+		type => 'attachment',
+		attributes => {
+			filename => "This is even more ***fun*** isn't it!"
+		}
+	},
+
+	q(attachment; filename*0*=''This%20is%20even%20more%20;) .
+	q( filename*1*=%2A%2A%2Afun%2A%2A%2A%20; filename*2="isn't it!") => {
+		type => 'attachment',
+		attributes => {
+			filename => "This is even more ***fun*** isn't it!"
+		}
+	},
+
+	q(attachment; filename*0*=us-ascii''This%20is%20even%20more%20;).
+	q( filename*1*=%2A%2A%2Afun%2A%2A%2A%20; filename*2="isn't it!") => {
+		type => 'attachment',
+		attributes => {
+			filename => "This is even more ***fun*** isn't it!"
+		}
+	},
+);
+
+my %non_strict_cd_tests = (
+	'attachment; filename=genome.jpeg;' .
+	' modification-date="Wed, 12 Feb 1997 16:29:51 -0500";' => {
+		type => 'attachment',
+		attributes => {
+			filename => 'genome.jpeg',
+			'modification-date' =>
+				'Wed, 12 Feb 1997 16:29:51 -0500'
+		}
+	},
+);
+
+sub test {
+	my ($string, $expect, $info) = @_;
+	local $_;
+	$info =~ s/\r/\\r/g;
+	$info =~ s/\n/\\n/g;
+	is_deeply(parse_content_disposition($string), $expect, $info);
+}
+
+for (sort keys %cd_tests) {
+	test($_, $cd_tests{$_}, "Can parse C-D <$_>");
+}
+
+local $PublicInbox::EmlContentFoo::STRICT_PARAMS = 0;
+for (sort keys %cd_tests) {
+	test($_, $cd_tests{$_}, "Can parse non-strict C-D <$_>");
+}
+for (sort keys %non_strict_cd_tests) {
+	test($_, $non_strict_cd_tests{$_}, "Can parse non-strict C-D <$_>");
+}
+
+done_testing;
diff --git a/t/eml_content_type.t b/t/eml_content_type.t
new file mode 100644
index 00000000..5fd7d1d9
--- /dev/null
+++ b/t/eml_content_type.t
@@ -0,0 +1,289 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES
+# This library is free software; you can redistribute it and/or modify
+# it under the same terms as Perl itself.
+#
+# License: GPL-1.0+ or Artistic-1.0-Perl
+#  <https://www.gnu.org/licenses/gpl-1.0.txt>
+#  <https://dev.perl.org/licenses/artistic.html>
+use strict;
+use Test::More;
+use PublicInbox::EmlContentFoo qw(parse_content_type);
+
+my %ct_tests = (
+	'' => {
+		type       => "text",
+		subtype    => "plain",
+		attributes => { charset => "us-ascii" }
+	},
+
+	"text/plain" => {
+		type => "text",
+		subtype => "plain",
+		attributes => {}
+	},
+	'text/plain; charset=us-ascii' => {
+		type       => "text",
+		subtype    => "plain",
+		attributes => { charset => "us-ascii" }
+	},
+	'text/plain; charset="us-ascii"' => {
+		type       => "text",
+		subtype    => "plain",
+		attributes => { charset => "us-ascii" }
+	},
+	"text/plain; charset=us-ascii (Plain text)" => {
+		type       => "text",
+		subtype    => "plain",
+		attributes => { charset => "us-ascii" }
+	},
+
+	'text/plain; charset=ISO-8859-1' => {
+		type       => "text",
+		subtype    => "plain",
+		attributes => { charset => "ISO-8859-1" }
+	},
+	'text/plain; charset="ISO-8859-1"' => {
+		type       => "text",
+		subtype    => "plain",
+		attributes => { charset => "ISO-8859-1" }
+	},
+	'text/plain; charset="ISO-8859-1" (comment)' => {
+		type       => "text",
+		subtype    => "plain",
+		attributes => { charset => "ISO-8859-1" }
+	},
+
+	'(c) text/plain (c); (c) charset=ISO-8859-1 (c)' => {
+		type       => "text",
+		subtype    => "plain",
+		attributes => { charset => "ISO-8859-1" }
+	},
+	'(c \( \\\\) (c) text/plain (c) (c) ; (c) (c) charset=utf-8 (c)' => {
+		type       => "text",
+		subtype    => "plain",
+		attributes => { charset => "utf-8" }
+	},
+	'text/plain; (c (nested ()c)another c)() charset=ISO-8859-1' => {
+		type       => "text",
+		subtype    => "plain",
+		attributes => { charset => "ISO-8859-1" }
+	},
+	'text/plain (c \(!nested ()c\)\)(nested\(c())); charset=utf-8' => {
+		type       => "text",
+		subtype    => "plain",
+		attributes => { charset => "utf-8" }
+	},
+
+	"application/foo" => {
+		type       => "application",
+		subtype    => "foo",
+		attributes => {}
+	},
+	"multipart/mixed; boundary=unique-boundary-1" => {
+		type       => "multipart",
+		subtype    => "mixed",
+		attributes => { boundary => "unique-boundary-1" }
+	},
+	'message/external-body; access-type=local-file; name="/u/n/m.jpg"' => {
+		type       => "message",
+		subtype    => "external-body",
+		attributes => {
+			"access-type" => "local-file",
+			"name"        => "/u/n/m.jpg"
+		}
+	},
+	'multipart/mixed; boundary="----------=_1026452699-10321-0" ' => {
+		'type'       => 'multipart',
+		'subtype'    => 'mixed',
+		'attributes' => {
+			'boundary' => '----------=_1026452699-10321-0'
+		}
+	},
+	'multipart/report; boundary= "=_0=73e476c3-cd5a-5ba3-b910-2="' => {
+		'type'       => 'multipart',
+		'subtype'    => 'report',
+		'attributes' => {
+			'boundary' => '=_0=73e476c3-cd5a-5ba3-b910-2='
+		}
+	},
+	'multipart/report; boundary=' . " \t" . '"=_0=7-c-5-b-2="' => {
+		'type'       => 'multipart',
+		'subtype'    => 'report',
+		'attributes' => {
+			'boundary' => '=_0=7-c-5-b-2='
+		}
+	},
+
+	'message/external-body; access-type=URL;' .
+	' URL*0="ftp://";' .
+	' URL*1="example.com/"' => {
+		'type'       => 'message',
+		'subtype'    => 'external-body',
+		'attributes' => {
+			'access-type' => 'URL',
+			'url' => 'ftp://example.com/'
+		}
+	},
+	'message/external-body; access-type=URL; URL="ftp://example.com/"' => {
+		'type'       => 'message',
+		'subtype'    => 'external-body',
+		'attributes' => {
+			'access-type' => 'URL',
+			'url' => 'ftp://example.com/',
+		}
+	},
+
+	"application/x-stuff; title*=us-ascii'en-us'This%20is%20f%2Ad" => {
+		'type'       => 'application',
+		'subtype'    => 'x-stuff',
+		'attributes' => {
+			'title' => 'This is f*d'
+		}
+	},
+	"application/x-stuff; title*=us-ascii''This%20is%20f%2Ad" => {
+		'type'       => 'application',
+		'subtype'    => 'x-stuff',
+		'attributes' => {
+			'title' => 'This is f*d'
+		}
+	},
+	"application/x-stuff; title*=''This%20is%20f%2Ad" => {
+		'type'       => 'application',
+		'subtype'    => 'x-stuff',
+		'attributes' => {
+			'title' => 'This is f*d'
+		}
+	},
+	"application/x-stuff; title*='en-us'This%20is%20f%2Ad" => {
+		'type'       => 'application',
+		'subtype'    => 'x-stuff',
+		'attributes' => {
+			'title' => 'This is f*d'
+		}
+	},
+	q(application/x-stuff;) .
+	q( title*0*=us-ascii'en'This%20is%20even%20more%20;) .
+	q(title*1*=%2A%2A%2Afun%2A%2A%2A%20; title*2="isn't it!") => {
+		'type'       => 'application',
+		'subtype'    => 'x-stuff',
+		'attributes' => {
+			'title' => "This is even more ***fun*** isn't it!"
+		}
+	},
+	q(application/x-stuff;) .
+	q( title*0*='en'This%20is%20even%20more%20;) .
+	q( title*1*=%2A%2A%2Afun%2A%2A%2A%20; title*2="isn't it!") => {
+		'type'       => 'application',
+		'subtype'    => 'x-stuff',
+		'attributes' => {
+			'title' => "This is even more ***fun*** isn't it!"
+		}
+	},
+	q(application/x-stuff;) .
+	q( title*0*=''This%20is%20even%20more%20;) .
+	q( title*1*=%2A%2A%2Afun%2A%2A%2A%20; title*2="isn't it!") => {
+		'type'       => 'application',
+		'subtype'    => 'x-stuff',
+		'attributes' => {
+			'title' => "This is even more ***fun*** isn't it!"
+		}
+	},
+	q(application/x-stuff;).
+	q( title*0*=us-ascii''This%20is%20even%20more%20;).
+	q( title*1*=%2A%2A%2Afun%2A%2A%2A%20; title*2="isn't it!")
+	  => {
+		'type'       => 'application',
+		'subtype'    => 'x-stuff',
+		'attributes' => {
+			'title' => "This is even more ***fun*** isn't it!"
+		}
+	},
+
+	'text/plain; attribute="v\"v\\\\v\(v\>\<\)\@\,\;\:\/\]\[\?\=v v";' .
+	' charset=us-ascii' => {
+		'type'       => 'text',
+		'subtype'    => 'plain',
+		'attributes' => {
+			'attribute' => 'v"v\\v(v><)@,;:/][?=v v',
+			'charset' => 'us-ascii',
+		},
+	},
+
+	qq(text/plain;\r
+	 charset=us-ascii;\r
+	 attribute="\r value1 \r value2\r\n value3\r\n value4\r\n "\r\n ) => {
+		'type'       => 'text',
+		'subtype'    => 'plain',
+		'attributes' => {
+			'attribute' => ' value1  value2 value3 value4 ',
+			'charset'   => 'us-ascii',
+		},
+	},
+);
+
+my %non_strict_ct_tests = (
+	"text/plain;" => { type => "text", subtype => "plain", attributes => {} },
+	"text/plain; " =>
+	  { type => "text", subtype => "plain", attributes => {} },
+	'image/jpeg;' .
+	' x-mac-type="3F3F3F3F";'.
+	' x-mac-creator="3F3F3F3F" name="file name.jpg";' => {
+		type       => "image",
+		subtype    => "jpeg",
+		attributes => {
+			'x-mac-type'    => "3F3F3F3F",
+			'x-mac-creator' => "3F3F3F3F",
+			'name'          => "file name.jpg"
+		}
+	},
+	"text/plain; key=very long value" => {
+		type       => "text",
+		subtype    => "plain",
+		attributes => { key => "very long value" }
+	},
+	"text/plain; key=very long value key2=value2" => {
+		type    => "text",
+		subtype => "plain",
+		attributes => { key => "very long value", key2 => "value2" }
+	},
+	'multipart/mixed; boundary = "--=_Next_Part_24_Nov_2016_08.09.21"' => {
+		type    => "multipart",
+		subtype => "mixed",
+		attributes => {
+			boundary => "--=_Next_Part_24_Nov_2016_08.09.21"
+		}
+	},
+);
+
+sub test {
+	my ($string, $expect, $info) = @_;
+
+	# So stupid. -- rjbs, 2013-08-10
+	$expect->{discrete}  = $expect->{type};
+	$expect->{composite} = $expect->{subtype};
+
+	local $_;
+	$info =~ s/\r/\\r/g;
+	$info =~ s/\n/\\n/g;
+	is_deeply(parse_content_type($string), $expect, $info);
+}
+
+for (sort keys %ct_tests) {
+	test($_, $ct_tests{$_}, "Can parse C-T <$_>");
+}
+
+local $PublicInbox::EmlContentFoo::STRICT_PARAMS = 0;
+for (sort keys %ct_tests) {
+	test($_, $ct_tests{$_}, "Can parse non-strict C-T <$_>");
+}
+for (sort keys %non_strict_ct_tests) {
+	test(
+		$_,
+		$non_strict_ct_tests{$_},
+		"Can parse non-strict C-T <$_>"
+	);
+}
+
+done_testing;

  parent reply	other threads:[~2020-05-07 21:05 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-05-07 21:05 [PATCH 00/13] eml: pure-Perl replacement for Email::MIME Eric Wong
2020-05-07 21:05 ` [PATCH 01/13] msg_iter: make ->each_part method for PublicInbox::MIME Eric Wong
2020-05-07 21:05 ` [PATCH 02/13] msg_iter: pass $idx as a scalar, not array Eric Wong
2020-05-07 21:05 ` [PATCH 03/13] filter/rubylang: avoid recursing subparts to strip trailers Eric Wong
2020-05-07 21:05 ` [PATCH 04/13] smsg: use capitalization for header retrieval Eric Wong
2020-05-07 21:05 ` [PATCH 05/13] eml: pure-Perl replacement for Email::MIME Eric Wong
2020-05-07 21:05 ` [PATCH 06/13] switch read-only Email::Simple users to Eml Eric Wong
2020-05-07 21:05 ` [PATCH 07/13] replace most uses of PublicInbox::MIME with Eml Eric Wong
2020-05-07 21:05 ` Eric Wong [this message]
2020-05-07 21:05 ` [PATCH 09/13] EmlContentFoo: relax Encode version requirement Eric Wong
2020-05-07 21:05 ` [PATCH 10/13] eml: remove dependency on Email::MIME::Encodings Eric Wong
2020-05-07 21:05 ` [PATCH 11/13] xt: eml comparison tests Eric Wong
2020-05-08  4:47   ` Eric Wong
2020-05-07 21:05 ` [PATCH 12/13] remove most internal Email::MIME usage Eric Wong
2020-05-07 21:05 ` [PATCH 13/13] eml: drop trailing blank line on missing epilogue Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200507210556.22995-9-e@yhbt.net \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).