public-inbox.git  about / heads / tags
an "archives first" approach to mailing lists
blob fa25564a5db1042f6623ae838270920414fd12f2 2436 bytes (raw)
$ git show v1.4.0:lib/PublicInbox/MsgIter.pm	# shows this blob on the CLI

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
 
# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>

# read-only utilities for Email::MIME
package PublicInbox::MsgIter;
use strict;
use warnings;
use base qw(Exporter);
our @EXPORT = qw(msg_iter msg_part_text);
use PublicInbox::MIME;

# Like Email::MIME::walk_parts, but this is:
# * non-recursive
# * passes depth and indices to the iterator callback
sub msg_iter ($$;$$) {
	my ($mime, $cb, $cb_arg, $do_undef) = @_;
	my @parts = $mime->subparts;
	if (@parts) {
		$mime = $_[0] = undef if $do_undef; # saves some memory
		my $i = 0;
		@parts = map { [ $_, 1, ++$i ] } @parts;
		while (my $p = shift @parts) {
			my ($part, $depth, @idx) = @$p;
			my @sub = $part->subparts;
			if (@sub) {
				$depth++;
				$i = 0;
				@sub = map { [ $_, $depth, @idx, ++$i ] } @sub;
				@parts = (@sub, @parts);
			} else {
				$cb->($p, $cb_arg);
			}
		}
	} else {
		$cb->([$mime, 0, 0], $cb_arg);
	}
}

sub msg_part_text ($$) {
	my ($part, $ct) = @_;

	# TODO: we may offer a separate sub for people who need to index
	# HTML-only mail, but the majority of HTML mail is multipart/alternative
	# with a text part which we don't have to waste cycles decoding
	return if $ct =~ m!\btext/x?html\b!;

	my $s = eval { $part->body_str };
	my $err = $@;

	# text/plain is the default, multipart/mixed happened a few
	# times when it should not have been:
	#   <87llgalspt.fsf@free.fr>
	#   <200308111450.h7BEoOu20077@mail.osdl.org>
	if ($err && ($ct =~ m!\btext/\b!i ||
			$ct =~ m!\bmultipart/mixed\b!i)) {
		my $cte = $part->header_raw('Content-Transfer-Encoding');
		if (defined($cte) && $cte =~ /\b7bit\b/i) {
			$s = $part->body;
			$err = undef if $s =~ /\A[[:ascii:]]+\z/s;
		} else {
			# Try to assume UTF-8 because Alpine seems to
			# do wacky things and set charset=X-UNKNOWN
			$part->charset_set('UTF-8');
			$s = eval { $part->body_str };
		}

		# If forcing charset=UTF-8 failed,
		# caller will warn further down...
		$s = $part->body if $@;
	}
	($s, $err);
}

# returns an array of quoted or unquoted sections
sub split_quotes {
	# Quiet "Complex regular subexpression recursion limit" warning
	# in case an inconsiderate sender quotes 32K of text at once.
	# The warning from Perl is harmless for us since our callers can
	# tolerate less-than-ideal matches which work within Perl limits.
	no warnings 'regexp';
	split(/((?:^>[^\n]*\n)+)/sm, shift);
}

1;

git clone https://public-inbox.org/public-inbox.git
git clone http://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/public-inbox.git