user/dev discussion of public-inbox itself
 help / color / Atom feed
e6005d290c933b1575bdc795123b6c1ae67ccc79 blob 8514 bytes (raw)

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
 
# Copyright (C) 2020 all contributors <meta@public-inbox.org>
# Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES
# This library is free software; you can redistribute it and/or modify
# it under the same terms as Perl itself.
#
# License: GPL-1.0+ or Artistic-1.0-Perl
#  <https://www.gnu.org/licenses/gpl-1.0.txt>
#  <https://dev.perl.org/licenses/artistic.html>
#
# This license differs from the rest of public-inbox
#
# ABSTRACT: Parse a MIME Content-Type or Content-Disposition Header
#
# This is a fork of the Email::MIME::ContentType 1.022 with
# minor improvements and incompatibilities; namely changes to
# quiet warnings with legacy data.
package PublicInbox::EmlContentFoo;
use strict;
use parent qw(Exporter);
use v5.10.1;

# find_mime_encoding() only appeared in Encode 2.87+ (Perl 5.26+),
# while we support 2.35 shipped with Perl 5.10.1
use Encode 2.35 qw(find_encoding);
my %mime_name_map; # $enc->mime_name => $enc object
BEGIN {
	eval { Encode->import('find_mime_encoding') };
	if ($@) {
		*find_mime_encoding = sub { $mime_name_map{lc($_[0])} };
		%mime_name_map = map {;
			my $enc = find_encoding($_);
			my $m = lc($enc->mime_name // '');
			$m => $enc;
		} Encode->encodings(':all');

		# delete fallback for encodings w/o ->mime_name:
		delete $mime_name_map{''};

		# an extra alias see Encode::MIME::NAME
		$mime_name_map{'utf8'} = find_encoding('UTF-8');
	}
}

our @EXPORT_OK = qw(parse_content_type parse_content_disposition);

our $STRICT_PARAMS = 1;

my $ct_default = 'text/plain; charset=us-ascii';

my $re_token = # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
	qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/;

my $re_token_non_strict = # allow CTLs and above ASCII
	qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+|$re_token)/;

my $re_qtext = # US-ASCII except CR, LF, white space, backslash and quote
	qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/;
my $re_quoted_pair = qr/\\[\x00-\x7F]/;
my $re_quoted_string = qr/"((?:[ \t]*(?:$re_qtext|$re_quoted_pair))*[ \t]*)"/;

my $re_qtext_non_strict = qr/[\x80-\xFF]|$re_qtext/;
my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
my $re_quoted_string_non_strict =
qr/"((?:[ \t]*(?:$re_qtext_non_strict|$re_quoted_pair_non_strict))*[ \t]*)"/;

my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;

sub parse_content_type {
	my ($ct) = @_;

	# If the header isn't there or is empty, give default answer.
	$ct = $ct_default unless defined($ct) && length($ct);

	_unfold_lines($ct);
	_clean_comments($ct);

	# It is also recommend (sic.) that this default be assumed when a
	# syntactically invalid Content-Type header field is encountered.
	unless ($ct =~ s/^($re_token)\/($re_token)//) {
		unless ($STRICT_PARAMS && $ct =~ s/^($re_token_non_strict)\/
						($re_token_non_strict)//x) {
			#carp "Invalid Content-Type '$ct'";
			return parse_content_type($ct_default);
		}
	}

	my ($type, $subtype) = (lc $1, lc $2);

	_clean_comments($ct);
	$ct =~ s/\s+$//;

	my $attributes = {};
	if ($STRICT_PARAMS && length($ct) && $ct !~ /^;/) {
		# carp "Missing ';' before first Content-Type parameter '$ct'";
	} else {
		$attributes = _process_rfc2231(_parse_attributes($ct));
	}

	{
		type	   => $type,
		subtype	=> $subtype,
		attributes => $attributes,

		# This is dumb.  Really really dumb.  For backcompat. -- rjbs,
		# 2013-08-10
		discrete   => $type,
		composite  => $subtype,
	};
}

my $cd_default = 'attachment';

sub parse_content_disposition {
	my ($cd) = @_;

	$cd = $cd_default unless defined($cd) && length($cd);

	_unfold_lines($cd);
	_clean_comments($cd);

	unless ($cd =~ s/^($re_token)//) {
		unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
			#carp "Invalid Content-Disposition '$cd'";
			return parse_content_disposition($cd_default);
		}
	}

	my $type = lc $1;

	_clean_comments($cd);
	$cd =~ s/\s+$//;

	my $attributes = {};
	if ($STRICT_PARAMS && length($cd) && $cd !~ /^;/) {
# carp "Missing ';' before first Content-Disposition parameter '$cd'";
	} else {
		$attributes = _process_rfc2231(_parse_attributes($cd));
	}

	{
		type	   => $type,
		attributes => $attributes,
	};
}

sub _unfold_lines {
	$_[0] =~ s/(?:\r\n|[\r\n])(?=[ \t])//g;
}

sub _clean_comments {
	my $ret = ($_[0] =~ s/^\s+//);
	while (length $_[0]) {
		last unless $_[0] =~ s/^\(//;
		my $level = 1;
		while (length $_[0]) {
			my $ch = substr $_[0], 0, 1, '';
			if ($ch eq '(') {
				$level++;
			} elsif ($ch eq ')') {
				$level--;
				last if $level == 0;
			} elsif ($ch eq '\\') {
				substr $_[0], 0, 1, '';
			}
		}
		# carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
		$ret |= ($_[0] =~ s/^\s+//);
	}
	$ret;
}

sub _process_rfc2231 {
	my ($attribs) = @_;
	my %cont;
	my %encoded;
	foreach (keys %{$attribs}) {
		next unless $_ =~ m/^(.*)\*([0-9])\*?$/;
		my ($attr, $sec) = ($1, $2);
		$cont{$attr}->[$sec] = $attribs->{$_};
		$encoded{$attr}->[$sec] = 1 if $_ =~ m/\*$/;
		delete $attribs->{$_};
	}
	foreach (keys %cont) {
		my $key = $_;
		$key .= '*' if $encoded{$_};
		$attribs->{$key} = join '', @{$cont{$_}};
	}
	foreach (keys %{$attribs}) {
		next unless $_ =~ m/^(.*)\*$/;
		my $key = $1;
		next unless ($attribs->{$_} // '') =~ m/^$re_exvalue$/;
		my ($charset, $value) = ($1, $2);
		$value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
		if (length $charset) {
			my $enc = find_mime_encoding($charset);
			if (defined $enc) {
				$value = $enc->decode($value);
			# } else {
				#carp "Unknown charset '$charset' in
				#attribute '$key' value";
			}
		}
		$attribs->{$key} = $value;
		delete $attribs->{$_};
	}
	$attribs;
}

sub _parse_attributes {
	local $_ = shift;
	substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
	my $attribs = {};
	while (length $_) {
		s/^;// or $STRICT_PARAMS and do {
			#carp "Missing semicolon before parameter '$_'";
			return $attribs;
		};
		_clean_comments($_);
		unless (length $_) {
			# Some mail software generates a Content-Type like this:
			# "Content-Type: text/plain;"
			# RFC 1521 section 3 says a parameter must exist if
			# there is a semicolon.
			#carp "Extra semicolon after last parameter" if
			#$STRICT_PARAMS;
			return $attribs;
		}
		my $attribute;
		if (s/^($re_token)=//) {
			$attribute = lc $1;
		} else {
			if ($STRICT_PARAMS) {
				# carp "Illegal parameter '$_'";
				return $attribs;
			}
			if (s/^($re_token_non_strict)=//) {
				$attribute = lc $1;
			} else {
				unless (s/^([^;=\s]+)\s*=//) {
					#carp "Cannot parse parameter '$_'";
					return $attribs;
				}
				$attribute = lc $1;
			}
		}
		_clean_comments($_);
		my $value = _extract_attribute_value();
		$attribs->{$attribute} = $value;
		_clean_comments($_);
	}
	$attribs;
}

sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
	my $value;
	while (length $_) {
		if (s/^($re_token)//) {
			$value .= $1;
		} elsif (s/^$re_quoted_string//) {
			my $sub = $1;
			$sub =~ s/\\(.)/$1/g;
			$value .= $sub;
		} elsif ($STRICT_PARAMS) {
			#my $char = substr $_, 0, 1;
			#carp "Unquoted '$char' not allowed";
			return;
		} elsif (s/^($re_token_non_strict)//) {
			$value .= $1;
		} elsif (s/^$re_quoted_string_non_strict//) {
			my $sub = $1;
			$sub =~ s/\\(.)/$1/g;
			$value .= $sub;
		}
		my $erased = _clean_comments($_);
		last if !length $_ or /^;/;
		if ($STRICT_PARAMS) {
			#my $char = substr $_, 0, 1;
			#carp "Extra '$char' found after parameter";
			return;
		}
		if ($erased) {
			# Sometimes semicolon is missing, so check for = char
			last if m/^$re_token_non_strict=/;
			$value .= ' ';
		}
		$value .= substr $_, 0, 1, '';
	}
	$value;
}

1;
__END__
=func parse_content_type

This routine is exported by default.

This routine parses email content type headers according to section 5.1 of RFC
2045 and also RFC 2231 (Character Set and Parameter Continuations).  It returns
a hash as above, with entries for the C<type>, the C<subtype>, and a hash of
C<attributes>.

For backward compatibility with a really unfortunate misunderstanding of RFC
2045 by the early implementors of this module, C<discrete> and C<composite> are
also present in the returned hashref, with the values of C<type> and C<subtype>
respectively.

=func parse_content_disposition

This routine is exported by default.

This routine parses email Content-Disposition headers according to RFC 2183 and
RFC 2231.  It returns a hash as above, with entries for the C<type>, and a hash
of C<attributes>.

=cut
debug log:

solving e6005d29 ...
found e6005d29 in https://80x24.org/public-inbox.git

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror http://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Example config snippet for mirrors

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git