From e5cc97f6a2ff53f072a5d692e56d0918b33c5081 Mon Sep 17 00:00:00 2001
From: Eric Wong
Date: Thu, 31 Oct 2019 03:12:19 +0000
Subject: msgiter: attempt to decode all text/* bodies

We want to index text/x-patch and text/x-diff, at least, since
"git format-patch" can generate a patch series as attachments
using --attach.
---
 lib/PublicInbox/MsgIter.pm | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'lib')

diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm
index f11ba223..d9df32ab 100644
--- a/lib/PublicInbox/MsgIter.pm
+++ b/lib/PublicInbox/MsgIter.pm
@@ -45,12 +45,18 @@ sub msg_part_text ($$) {
 		# times when it should not have been:
 		# <87llgalspt.fsf@free.fr>
 		# <200308111450.h7BEoOu20077@mail.osdl.org>
-		if ($err && ($ct =~ m!\btext/plain\b!i ||
+		if ($err && ($ct =~ m!\btext/\b!i ||
 			$ct =~ m!\bmultipart/mixed\b!i)) {
-			# Try to assume UTF-8 because Alpine seems to
-			# do wacky things and set charset=X-UNKNOWN
-			$part->charset_set('UTF-8');
-			$s = eval { $part->body_str };
+			my $cte = $part->header_raw('Content-Transfer-Encoding');
+			if (defined($cte) && $cte =~ /\b7bit\b/i) {
+				$s = $part->body;
+				$err = undef if $s =~ /\A[[:ascii:]]+\z/s;
+			} else {
+				# Try to assume UTF-8 because Alpine seems to
+				# do wacky things and set charset=X-UNKNOWN
+				$part->charset_set('UTF-8');
+				$s = eval { $part->body_str };
+			}
 
 			# If forcing charset=UTF-8 failed,
 			# caller will warn further down...
-- 
cgit v1.2.3-24-ge0c7