From 6d448aae2a6624d37b291946b6666d5101c2957b Mon Sep 17 00:00:00 2001
From: "Eric Wong (Contractor, The Linux Foundation)"
Date: Fri, 2 Mar 2018 03:08:33 +0000
Subject: content_id: special treatment for Message-Id headers

Some emails in LKML archives are identical with the only
difference being s/References:/In-Reply-To:/ in the headers.

Since this difference doesn't affect how we handle message
threading, we will treat them the same way for the purposes
of deduplication.

There may be more changes to how we do content_id along these
lines (e.g. using msg_iter to walk the message).
---
 lib/PublicInbox/ContentId.pm | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'lib/PublicInbox/ContentId.pm')

diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentId.pm
index 65d5a76c..7ec638ca 100644
--- a/lib/PublicInbox/ContentId.pm
+++ b/lib/PublicInbox/ContentId.pm
@@ -11,7 +11,7 @@ our @EXPORT_OK = qw/content_id/;
 use Digest::SHA;
 
 # Content-* headers are often no-ops, so maybe we don't need them
-my @ID_HEADERS = qw(Subject From Date Message-ID References To Cc In-Reply-To);
+my @ID_HEADERS = qw(Subject From Date To Cc);
 
 sub content_id ($;$) {
 	my ($mime, $alg) = @_;
@@ -19,6 +19,20 @@ sub content_id ($;$) {
 	my $dig = Digest::SHA->new($alg);
 	my $hdr = $mime->header_obj;
 
+	# References: and In-Reply-To: get used interchangeably
+	# in some "duplicates" in LKML.  We treat them the same
+	# in SearchIdx, so treat them the same for this:
+	my @mid = $hdr->header_raw('Message-ID');
+	@mid = (join(' ', @mid) =~ /<([^>]+)>/g);
+	my $refs = join(' ', $hdr->header_raw('References'),
+			$hdr->header_raw('In-Reply-To'));
+	my @refs = ($refs =~ /<([^>]+)>/g);
+	my %seen;
+	foreach my $mid (@mid, @refs) {
+		next if $seen{$mid};
+		$dig->add($mid);
+		$seen{$mid} = 1;
+	}
 	foreach my $h (@ID_HEADERS) {
 		my @v = $hdr->header_raw($h);
 		$dig->add($_) foreach @v;
-- 
cgit v1.2.3-24-ge0c7