public-inbox.git  about / heads / tags
an "archives first" approach to mailing lists
blob 80fc7364fc9a1943759585c30487ab0c5d36ca0c 8137 bytes (raw)
$ git show HEAD:lib/PublicInbox/EmlContentFoo.pm	# shows this blob on the CLI

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
 
# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
# Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES
# This library is free software; you can redistribute it and/or modify
# it under the same terms as Perl itself.
#
# License: GPL-1.0+ or Artistic-1.0-Perl
#  <https://www.gnu.org/licenses/gpl-1.0.txt>
#  <https://dev.perl.org/licenses/artistic.html>
#
# This license differs from the rest of public-inbox
#
# ABSTRACT: Parse a MIME Content-Type or Content-Disposition Header
#
# This is a fork of the Email::MIME::ContentType 1.022 with
# minor improvements and incompatibilities; namely changes to
# quiet warnings with legacy data.
package PublicInbox::EmlContentFoo;
use strict;
use parent qw(Exporter);
use v5.10.1;

# find_mime_encoding() only appeared in Encode 2.87+ (Perl 5.26+),
# while we support 2.35 shipped with Perl 5.10.1
use Encode 2.35 qw(find_encoding);
my %mime_name_map; # $enc->mime_name => $enc object
BEGIN {
	eval { Encode->import('find_mime_encoding') };
	if ($@) {
		*find_mime_encoding = sub { $mime_name_map{lc($_[0])} };
		%mime_name_map = map {;
			my $enc = find_encoding($_);
			my $m = lc($enc->mime_name // '');
			$m => $enc;
		} Encode->encodings(':all');

		# delete fallback for encodings w/o ->mime_name:
		delete $mime_name_map{''};

		# an extra alias see Encode::MIME::NAME
		$mime_name_map{'utf8'} = find_encoding('UTF-8');
	}
}

our @EXPORT_OK = qw(parse_content_type parse_content_disposition);

our $STRICT_PARAMS = 1;

my $ct_default = 'text/plain; charset=us-ascii';

my $re_token = # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
	qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/;

my $re_token_non_strict = # allow CTLs and above ASCII
	qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+|$re_token)/;

my $re_qtext = # US-ASCII except CR, LF, white space, backslash and quote
	qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/;
my $re_quoted_pair = qr/\\[\x00-\x7F]/;
my $re_quoted_string = qr/"((?:[ \t]*(?:$re_qtext|$re_quoted_pair))*[ \t]*)"/;

my $re_qtext_non_strict = qr/[\x80-\xFF]|$re_qtext/;
my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
my $re_quoted_string_non_strict =
qr/"((?:[ \t]*(?:$re_qtext_non_strict|$re_quoted_pair_non_strict))*[ \t]*)"/;

my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;

sub parse_content_type {
	my ($ct) = @_;

	# If the header isn't there or is empty, give default answer.
	$ct = $ct_default unless defined($ct) && length($ct);

	_unfold_lines($ct);
	_clean_comments($ct);

	# It is also recommend (sic.) that this default be assumed when a
	# syntactically invalid Content-Type header field is encountered.
	unless ($ct =~ s/^($re_token)\/($re_token)//) {
		unless ($STRICT_PARAMS && $ct =~ s/^($re_token_non_strict)\/
						($re_token_non_strict)//x) {
			#carp "Invalid Content-Type '$ct'";
			return parse_content_type($ct_default);
		}
	}

	my ($type, $subtype) = (lc $1, lc $2);

	_clean_comments($ct);
	$ct =~ s/\s+$//;

	my $attributes = {};
	if ($STRICT_PARAMS && length($ct) && $ct !~ /^;/) {
		# carp "Missing ';' before first Content-Type parameter '$ct'";
	} else {
		$attributes = _process_rfc2231(_parse_attributes($ct));
	}

	{
		type	   => $type,
		subtype	=> $subtype,
		attributes => $attributes,
	};
}

my $cd_default = 'attachment';

sub parse_content_disposition {
	my ($cd) = @_;

	$cd = $cd_default unless defined($cd) && length($cd);

	_unfold_lines($cd);
	_clean_comments($cd);

	unless ($cd =~ s/^($re_token)//) {
		unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
			#carp "Invalid Content-Disposition '$cd'";
			return parse_content_disposition($cd_default);
		}
	}

	my $type = lc $1;

	_clean_comments($cd);
	$cd =~ s/\s+$//;

	my $attributes = {};
	if ($STRICT_PARAMS && length($cd) && $cd !~ /^;/) {
# carp "Missing ';' before first Content-Disposition parameter '$cd'";
	} else {
		$attributes = _process_rfc2231(_parse_attributes($cd));
	}

	{
		type	   => $type,
		attributes => $attributes,
	};
}

sub _unfold_lines {
	$_[0] =~ s/(?:\r\n|[\r\n])(?=[ \t])//g;
}

sub _clean_comments {
	my $ret = ($_[0] =~ s/^\s+//);
	while (length $_[0]) {
		last unless $_[0] =~ s/^\(//;
		my $level = 1;
		while (length $_[0]) {
			my $ch = substr $_[0], 0, 1, '';
			if ($ch eq '(') {
				$level++;
			} elsif ($ch eq ')') {
				$level--;
				last if $level == 0;
			} elsif ($ch eq '\\') {
				substr $_[0], 0, 1, '';
			}
		}
		# carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
		$ret |= ($_[0] =~ s/^\s+//);
	}
	$ret;
}

sub _process_rfc2231 {
	my ($attribs) = @_;
	my %cont;
	my %encoded;
	foreach (keys %{$attribs}) {
		next unless $_ =~ m/^(.*)\*([0-9])\*?$/;
		my ($attr, $sec) = ($1, $2);
		$cont{$attr}->[$sec] = $attribs->{$_};
		$encoded{$attr}->[$sec] = 1 if $_ =~ m/\*$/;
		delete $attribs->{$_};
	}
	foreach (keys %cont) {
		my $key = $_;
		$key .= '*' if $encoded{$_};
		$attribs->{$key} = join '', @{$cont{$_}};
	}
	foreach (keys %{$attribs}) {
		next unless $_ =~ m/^(.*)\*$/;
		my $key = $1;
		next unless ($attribs->{$_} // '') =~ m/^$re_exvalue$/;
		my ($charset, $value) = ($1, $2);
		$value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
		if (length $charset) {
			my $enc = find_mime_encoding($charset);
			if (defined $enc) {
				$value = $enc->decode($value);
			# } else {
				#carp "Unknown charset '$charset' in
				#attribute '$key' value";
			}
		}
		$attribs->{$key} = $value;
		delete $attribs->{$_};
	}
	$attribs;
}

sub _parse_attributes {
	local $_ = shift;
	substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
	my $attribs = {};
	while (length $_) {
		s/^;// or $STRICT_PARAMS and do {
			#carp "Missing semicolon before parameter '$_'";
			return $attribs;
		};
		_clean_comments($_);
		unless (length $_) {
			# Some mail software generates a Content-Type like this:
			# "Content-Type: text/plain;"
			# RFC 1521 section 3 says a parameter must exist if
			# there is a semicolon.
			#carp "Extra semicolon after last parameter" if
			#$STRICT_PARAMS;
			return $attribs;
		}
		my $attribute;
		if (s/^($re_token)=//) {
			$attribute = lc $1;
		} else {
			if ($STRICT_PARAMS) {
				# carp "Illegal parameter '$_'";
				return $attribs;
			}
			if (s/^($re_token_non_strict)=//) {
				$attribute = lc $1;
			} else {
				unless (s/^([^;=\s]+)\s*=//) {
					#carp "Cannot parse parameter '$_'";
					return $attribs;
				}
				$attribute = lc $1;
			}
		}
		_clean_comments($_);
		my $value = _extract_attribute_value();
		$attribs->{$attribute} = $value;
		_clean_comments($_);
	}
	$attribs;
}

sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
	my $value;
	while (length $_) {
		if (s/^($re_token)//) {
			$value .= $1;
		} elsif (s/^$re_quoted_string//) {
			my $sub = $1;
			$sub =~ s/\\(.)/$1/g;
			$value .= $sub;
		} elsif ($STRICT_PARAMS) {
			#my $char = substr $_, 0, 1;
			#carp "Unquoted '$char' not allowed";
			return;
		} elsif (s/^($re_token_non_strict)//) {
			$value .= $1;
		} elsif (s/^$re_quoted_string_non_strict//) {
			my $sub = $1;
			$sub =~ s/\\(.)/$1/g;
			$value .= $sub;
		}
		my $erased = _clean_comments($_);
		last if !length $_ or /^;/;
		if ($STRICT_PARAMS) {
			#my $char = substr $_, 0, 1;
			#carp "Extra '$char' found after parameter";
			return;
		}
		if ($erased) {
			# Sometimes semicolon is missing, so check for = char
			last if m/^$re_token_non_strict=/;
			$value .= ' ';
		}
		$value .= substr $_, 0, 1, '';
	}
	$value;
}

1;
__END__
=func parse_content_type

This routine is exported by default.

This routine parses email content type headers according to section 5.1 of RFC
2045 and also RFC 2231 (Character Set and Parameter Continuations).  It returns
a hash as above, with entries for the C<type>, the C<subtype>, and a hash of
C<attributes>.

=func parse_content_disposition

This routine is exported by default.

This routine parses email Content-Disposition headers according to RFC 2183 and
RFC 2231.  It returns a hash as above, with entries for the C<type>, and a hash
of C<attributes>.

=cut

git clone https://public-inbox.org/public-inbox.git
git clone http://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/public-inbox.git