public-inbox.git  about / heads / tags
an "archives first" approach to mailing lists
blob dd28417b70c043b092f231147d82580ba94e31cf 3570 bytes (raw)
$ git show HEAD:lib/PublicInbox/MsgIter.pm	# shows this blob on the CLI

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
 
# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>

# read-only utilities for Email::MIME
package PublicInbox::MsgIter;
use strict;
use warnings;
use base qw(Exporter);
our @EXPORT = qw(msg_iter msg_part_text);

# This becomes PublicInbox::MIME->each_part:
# Like Email::MIME::walk_parts, but this is:
# * non-recursive
# * passes depth and indices to the iterator callback
sub em_each_part ($$;$$) {
	my ($mime, $cb, $cb_arg, $do_undef) = @_;
	my @parts = $mime->subparts;
	if (@parts) {
		$mime = $_[0] = undef if $do_undef; # saves some memory
		my $i = 0;
		@parts = map { [ $_, 1, ++$i ] } @parts;
		while (my $p = shift @parts) {
			my ($part, $depth, $idx) = @$p;
			my @sub = $part->subparts;
			if (@sub) {
				$depth++;
				$i = 0;
				@sub = map {
					[ $_, $depth, "$idx.".(++$i) ]
				} @sub;
				@parts = (@sub, @parts);
			} else {
				$cb->($p, $cb_arg);
			}
		}
	} else {
		$cb->([$mime, 0, 1], $cb_arg);
	}
}

# Use this when we may accept Email::MIME from user scripts
# (not just PublicInbox::MIME)
sub msg_iter ($$;$$) { # $_[0] = PublicInbox::MIME/Email::MIME-like obj
	my (undef, $cb, $cb_arg, $once) = @_;
	if (my $ep = $_[0]->can('each_part')) { # PublicInbox::{MIME,*}
		$ep->($_[0], $cb, $cb_arg, $once);
	} else { # for compatibility with existing Email::MIME users:
		em_each_part($_[0], $cb, $cb_arg, $once);
	}
}

sub msg_part_text ($$) {
	my ($part, $ct) = @_;

	# TODO: we may offer a separate sub for people who need to index
	# HTML-only mail, but the majority of HTML mail is multipart/alternative
	# with a text part which we don't have to waste cycles decoding
	return if $ct =~ m!\btext/x?html\b!;

	my $s = eval { $part->body_str };
	my $err = $@;

	# text/plain is the default, multipart/mixed happened a few
	# times when it should not have been:
	#   <87llgalspt.fsf@free.fr>
	#   <200308111450.h7BEoOu20077@mail.osdl.org>
	# But also do not try this with ->{is_submsg} (message/rfc822),
	# since a broken multipart/mixed inside a message/rfc822 part
	# has not been seen in the wild, yet...
	if ($err && ($ct =~ m!\btext/\b!i ||
			(!$part->{is_submsg} &&
				$ct =~ m!\bmultipart/mixed\b!i) ) ) {
		my $cte = $part->header_raw('Content-Transfer-Encoding');
		if (defined($cte) && $cte =~ /\b7bit\b/i) {
			$s = $part->body;
			$err = undef if $s =~ /\A[[:ascii:]]+\z/s;
		} else {
			# Try to assume UTF-8 because Alpine seems to
			# do wacky things and set charset=X-UNKNOWN
			$part->charset_set('UTF-8');
			$s = eval { $part->body_str };
		}

		# If forcing charset=UTF-8 failed,
		# caller will warn further down...
		$s = $part->body if $@;
	} elsif ($err && $ct =~ m!\bapplication/octet-stream\b!i) {
		# Some unconfigured/poorly-configured MUAs will set
		# application/octet-stream even for all text attachments.
		# Try to see if it's printable text that we can index
		# and display:
		$s = $part->body;
		utf8::decode($s);
		undef($s =~ /[^\p{XPosixPrint}\s]/s ? $s : $err);
	}
	($s, $err);
}

# returns an array of quoted or unquoted sections
sub split_quotes {
	# some editors don't put trailing newlines at the end,
	# make sure split_quotes can work:
	$_[0] .= "\n" if substr($_[0], -1) ne "\n";

	# Quiet "Complex regular subexpression recursion limit" warning
	# in case an inconsiderate sender quotes 32K of text at once.
	# The warning from Perl is harmless for us since our callers can
	# tolerate less-than-ideal matches which work within Perl limits.
	no warnings 'regexp';
	split(/((?:^>[^\n]*\n)+)/sm, $_[0]);
}

1;

git clone https://public-inbox.org/public-inbox.git
git clone http://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/public-inbox.git