about summary refs log tree commit homepage
path: root/lib/PublicInbox/ContentId.pm
diff options
context:
space:
mode:
author: Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-03-02 03:08:33 +0000
committer: Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-03-02 03:23:12 +0000
commit: 6d448aae2a6624d37b291946b6666d5101c2957b (patch)
tree: 1c3afd607e545dde451a2bd15b3ef71e1041e98b /lib/PublicInbox/ContentId.pm
parent: 66642eb8e5f51a675d699c4fe9107202d61d2626 (diff)
download: public-inbox-6d448aae2a6624d37b291946b6666d5101c2957b.tar.gz
Some emails in LKML archives are identical with the only
difference being s/References:/In-Reply-To:/ in the headers.
Since this difference doesn't affect how we handle message
threading, we will treat them the same way for the purposes
of deduplication.

There may be more changes to how we do content_id along these
lines (e.g. using msg_iter to walk the message).
Diffstat (limited to 'lib/PublicInbox/ContentId.pm')
-rw-r--r-- lib/PublicInbox/ContentId.pm 16
1 files changed, 15 insertions, 1 deletions
diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentId.pm
index 65d5a76c..7ec638ca 100644
--- a/lib/PublicInbox/ContentId.pm
+++ b/lib/PublicInbox/ContentId.pm
@@ -11,7 +11,7 @@ our @EXPORT_OK = qw/content_id/;
 use Digest::SHA;
 
 # Content-* headers are often no-ops, so maybe we don't need them
-my @ID_HEADERS = qw(Subject From Date Message-ID References To Cc In-Reply-To);
+my @ID_HEADERS = qw(Subject From Date To Cc);
 
 sub content_id ($;$) {
         my ($mime, $alg) = @_;
@@ -19,6 +19,20 @@ sub content_id ($;$) {
         my $dig = Digest::SHA->new($alg);
         my $hdr = $mime->header_obj;
 
+        # References: and In-Reply-To: get used interchangeably
+        # in some "duplicates" in LKML.  We treat them the same
+        # in SearchIdx, so treat them the same for this:
+        my @mid = $hdr->header_raw('Message-ID');
+        @mid = (join(' ', @mid) =~ /<([^>]+)>/g);
+        my $refs = join(' ', $hdr->header_raw('References'),
+                        $hdr->header_raw('In-Reply-To'));
+        my @refs = ($refs =~ /<([^>]+)>/g);
+        my %seen;
+        foreach my $mid (@mid, @refs) {
+                next if $seen{$mid};
+                $dig->add($mid);
+                $seen{$mid} = 1;
+        }
         foreach my $h (@ID_HEADERS) {
                 my @v = $hdr->header_raw($h);
                 $dig->add($_) foreach @v;