diff options
author | Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> | 2018-04-18 09:13:11 +0000 |
---|---|---|
committer | Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> | 2018-04-18 09:14:15 +0000 |
commit | f0ef0a56a8957d6f3095b1a24798e54b0b815d04 (patch) | |
tree | fcab14a29eaf1ec68564aa2163e31751f7e9936d /lib/PublicInbox/ContentId.pm | |
parent | 69329215485cf2ab9d8cd1fa7faf65d8ec42dc0b (diff) | |
download | public-inbox-f0ef0a56a8957d6f3095b1a24798e54b0b815d04.tar.gz |
First off, decode text portions of messages since some archived mail I got was converted from quoted-printable or base-64 to 8bit by the original recipient. Attempting to merge them with my own archives (which had no conversion done) led to unnecessary duplicates showing up. Then, normalize CRLF line endings in text portions to LF. In the headers, we relax the content_id hashing to ignore quotes and lower-case domain names in To, Cc, and From headers since some mail processors will alter them. Finally, I've discovered Email::MIME->new($mime->as_string) does not always round-trip reliably, so we calculate the content_id twice on user-supplied messages.
Diffstat (limited to 'lib/PublicInbox/ContentId.pm')
-rw-r--r-- | lib/PublicInbox/ContentId.pm | 67 |
1 files changed, 57 insertions, 10 deletions
diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentId.pm index 279eec0c..b1d27eb8 100644 --- a/lib/PublicInbox/ContentId.pm +++ b/lib/PublicInbox/ContentId.pm @@ -7,10 +7,19 @@ use warnings; use base qw/Exporter/; our @EXPORT_OK = qw/content_id content_digest/; use PublicInbox::MID qw(mids references); +use PublicInbox::MsgIter; # not sure if less-widely supported hash families are worth bothering with use Digest::SHA; +sub digest_addr ($$$) { + my ($dig, $h, $v) = @_; + $v =~ tr/"//d; + $v =~ s/@([a-z0-9\_\.\-\(\)]*([A-Z])\S*)/'@'.lc($1)/ge; + utf8::encode($v); + $dig->add("$h\0$v\0"); +} + sub content_digest ($) { my ($mime) = @_; my $dig = Digest::SHA->new(256); @@ -27,24 +36,62 @@ sub content_digest ($) { } foreach my $mid (@{references($hdr)}) { next if $seen{$mid}; - $dig->add('ref: '.$mid); + $dig->add("ref\0$mid\0"); } # Only use Sender: if From is not present foreach my $h (qw(From Sender)) { - my @v = $hdr->header_raw($h); + my @v = $hdr->header($h); if (@v) { - $dig->add("$h: $_") foreach @v; - last; + digest_addr($dig, $h, $_) foreach @v; } } - - # Content-* headers are often no-ops, so maybe we don't need them - foreach my $h (qw(Subject Date To Cc)) { - my @v = $hdr->header_raw($h); - $dig->add("$h: $_") foreach @v; + foreach my $h (qw(Subject Date)) { + my @v = $hdr->header($h); + foreach my $v (@v) { + utf8::encode($v); + $dig->add("$h\0$v\0"); + } + } + # Some mail processors will add " to unquoted names that were + # not in the original message. For the purposes of deduplication, + # do not take it into account: + foreach my $h (qw(To Cc)) { + my @v = $hdr->header($h); + digest_addr($dig, $h, $_) foreach @v; } - $dig->add($mime->body_raw); + msg_iter($mime, sub { + my ($part, $depth, @idx) = @{$_[0]}; + $dig->add("\0$depth:".join('.', @idx)."\0"); + my $fn = $part->filename; + if (defined $fn) { + utf8::encode($fn); + $dig->add("fn\0$fn\0"); + } + my @d = $part->header('Content-Description'); + foreach my $d (@d) { + utf8::encode($d); + $dig->add("d\0$d\0"); + } + $dig->add("b\0"); + my $ct = $part->content_type || 'text/plain'; + my $s = eval { $part->body_str }; + if ($@ && $ct =~ m!\btext/plain\b!i) { + # Try to assume UTF-8 because Alpine + # seems to do wacky things and set + # charset=X-UNKNOWN + $part->charset_set('UTF-8'); + $s = eval { $part->body_str }; + } + if (defined $s) { + $s =~ s/\r\n/\n/gs; + $s =~ s/\s*\z//s; + utf8::encode($s); + } else { + $s = $part->body; + } + $dig->add($s); + }); $dig; } |