diff options
Diffstat (limited to 'lib/PublicInbox/ContentId.pm')
-rw-r--r-- | lib/PublicInbox/ContentId.pm | 16 |
1 files changed, 15 insertions, 1 deletions
diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentId.pm index 65d5a76c..7ec638ca 100644 --- a/lib/PublicInbox/ContentId.pm +++ b/lib/PublicInbox/ContentId.pm @@ -11,7 +11,7 @@ our @EXPORT_OK = qw/content_id/; use Digest::SHA; # Content-* headers are often no-ops, so maybe we don't need them -my @ID_HEADERS = qw(Subject From Date Message-ID References To Cc In-Reply-To); +my @ID_HEADERS = qw(Subject From Date To Cc); sub content_id ($;$) { my ($mime, $alg) = @_; @@ -19,6 +19,20 @@ sub content_id ($;$) { my $dig = Digest::SHA->new($alg); my $hdr = $mime->header_obj; + # References: and In-Reply-To: get used interchangeably + # in some "duplicates" in LKML. We treat them the same + # in SearchIdx, so treat them the same for this: + my @mid = $hdr->header_raw('Message-ID'); + @mid = (join(' ', @mid) =~ /<([^>]+)>/g); + my $refs = join(' ', $hdr->header_raw('References'), + $hdr->header_raw('In-Reply-To')); + my @refs = ($refs =~ /<([^>]+)>/g); + my %seen; + foreach my $mid (@mid, @refs) { + next if $seen{$mid}; + $dig->add($mid); + $seen{$mid} = 1; + } foreach my $h (@ID_HEADERS) { my @v = $hdr->header_raw($h); $dig->add($_) foreach @v; |