diff options
author | Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> | 2018-04-18 09:13:11 +0000 |
---|---|---|
committer | Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> | 2018-04-18 09:14:15 +0000 |
commit | f0ef0a56a8957d6f3095b1a24798e54b0b815d04 (patch) | |
tree | fcab14a29eaf1ec68564aa2163e31751f7e9936d /lib/PublicInbox/V2Writable.pm | |
parent | 69329215485cf2ab9d8cd1fa7faf65d8ec42dc0b (diff) | |
download | public-inbox-f0ef0a56a8957d6f3095b1a24798e54b0b815d04.tar.gz |
First off, decode text portions of messages, since some archived mail I got was converted from quoted-printable or base64 to 8bit by the original recipient. Attempting to merge them with my own archives (which had no conversion done) led to unnecessary duplicates showing up. Then, normalize CRLF line endings in text portions to LF. In the headers, we relax the content_id hashing to ignore quotes and the letter case of domain names in the To, Cc, and From headers, since some mail processors will alter them. Finally, I've discovered that Email::MIME->new($mime->as_string) does not always round-trip reliably, so we calculate the content_id twice on user-supplied messages.
Diffstat (limited to 'lib/PublicInbox/V2Writable.pm')
-rw-r--r-- | lib/PublicInbox/V2Writable.pm | 35 |
1 files changed, 27 insertions, 8 deletions
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 0dcdedae..e9fd502e 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -259,12 +259,32 @@ sub purge_oids { $purges; } +sub content_ids ($) { + my ($mime) = @_; + my @cids = ( content_id($mime) ); + + # Email::MIME->as_string doesn't always round-trip, so we may + # use a second content_id + my $rt = content_id(PublicInbox::MIME->new(\($mime->as_string))); + push @cids, $rt if $cids[0] ne $rt; + \@cids; +} + +sub content_matches ($$) { + my ($cids, $existing) = @_; + my $cid = content_id($existing); + foreach (@$cids) { + return 1 if $_ eq $cid + } + 0 +} + sub remove_internal { my ($self, $mime, $cmt_msg, $purge) = @_; $self->idx_init; my $im = $self->importer unless $purge; my $over = $self->{over}; - my $cid = content_id($mime); + my $cids = content_ids($mime); my $parts = $self->{idx_parts}; my $mm = $self->{mm}; my $removed; @@ -287,7 +307,7 @@ sub remove_internal { } my $orig = $$msg; my $cur = PublicInbox::MIME->new($msg); - if (content_id($cur) eq $cid) { + if (content_matches($cids, $cur)) { $smsg->{mime} = $cur; $gone{$smsg->{num}} = [ $smsg, \$orig ]; } @@ -572,8 +592,7 @@ sub get_blob ($$) { sub lookup_content { my ($self, $mime, $mid) = @_; my $over = $self->{over}; - my $cid = content_id($mime); - my $found; + my $cids = content_ids($mime); my ($id, $prev); while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) { my $msg = get_blob($self, $smsg); @@ -582,16 +601,16 @@ sub lookup_content { next; } my $cur = PublicInbox::MIME->new($msg); - if (content_id($cur) eq $cid) { + if (content_matches($cids, $cur)) { $smsg->{mime} = $cur; - $found = $smsg; - last; + return $smsg; } + # XXX DEBUG_DIFF is experimental and may be removed diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF}; } - $found; + undef; } sub atfork_child { |