From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id DB59F1FAEB for ; Tue, 6 Mar 2018 08:42:42 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Subject: [PATCH 09/34] content_id: use `mids' and `references' for MID extraction Date: Tue, 6 Mar 2018 08:42:17 +0000 Message-Id: <20180306084242.19988-10-e@80x24.org> In-Reply-To: <20180306084242.19988-1-e@80x24.org> References: <20180306084242.19988-1-e@80x24.org> List-Id: These already take care of deduping internally, so we'll save ourselves at least some of the trouble while using a more consistent API. While we're at it, hash the header name as well, since we need to distinguish which header a certain value came from. --- lib/PublicInbox/ContentId.pm | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/lib/PublicInbox/ContentId.pm b/lib/PublicInbox/ContentId.pm index 7ec638c..d1a009e 100644 --- a/lib/PublicInbox/ContentId.pm +++ b/lib/PublicInbox/ContentId.pm @@ -6,6 +6,7 @@ use strict; use warnings; use base qw/Exporter/; our @EXPORT_OK = qw/content_id/; +use PublicInbox::MID qw(mids references); # not sure if less-widely supported hash families are worth bothering with use Digest::SHA; @@ -22,20 +23,18 @@ sub content_id ($;$) { # References: and In-Reply-To: get used interchangeably # in some "duplicates" in LKML. We treat them the same # in SearchIdx, so treat them the same for this: - my @mid = $hdr->header_raw('Message-ID'); - @mid = (join(' ', @mid) =~ /<([^>]+)>/g); - my $refs = join(' ', $hdr->header_raw('References'), - $hdr->header_raw('In-Reply-To')); - my @refs = ($refs =~ /<([^>]+)>/g); my %seen; - foreach my $mid (@mid, @refs) { - next if $seen{$mid}; - $dig->add($mid); + foreach my $mid (@{mids($hdr)}) { + $dig->add('mid: '.$mid); $seen{$mid} = 1; } + foreach my $mid (@{references($hdr)}) { + next if $seen{$mid}; + $dig->add('ref: '.$mid); + } foreach my $h (@ID_HEADERS) { my @v = $hdr->header_raw($h); - $dig->add($_) foreach @v; + $dig->add("$h: $_") foreach @v; } $dig->add($mime->body_raw); 'SHA-' . $dig->algorithm . ':' . $dig->hexdigest; -- EW