about summary refs log tree commit homepage
path: root/lib/PublicInbox/ContentHash.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/PublicInbox/ContentHash.pm')
-rw-r--r--lib/PublicInbox/ContentHash.pm41
1 files changed, 24 insertions, 17 deletions
diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm
index bacc9cdd..95ca2929 100644
--- a/lib/PublicInbox/ContentHash.pm
+++ b/lib/PublicInbox/ContentHash.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 
 # Unstable internal API.
@@ -15,7 +15,8 @@ use PublicInbox::MID qw(mids references);
 use PublicInbox::MsgIter;
 
 # not sure if less-widely supported hash families are worth bothering with
-use Digest::SHA;
+use PublicInbox::SHA; # faster, but no ->clone
+use Digest::SHA; # we still need this for ->clone
 
 sub digest_addr ($$$) {
         my ($dig, $h, $v) = @_;
@@ -44,7 +45,7 @@ sub content_dig_i {
         my $ct = $part->content_type || 'text/plain';
         my ($s, undef) = msg_part_text($part, $ct);
         if (defined $s) {
-                $s =~ s/\r\n/\n/gs;
+                $s =~ s/\r\n/\n/gs; # TODO: consider \r+\n to match View
                 $s =~ s/\s*\z//s;
                 utf8::encode($s);
         } else {
@@ -53,18 +54,26 @@ sub content_dig_i {
         $dig->add($s);
 }
 
-sub content_digest ($;$) {
-        my ($eml, $dig) = @_;
+sub content_digest ($;$$) {
+        my ($eml, $dig, $hash_mids) = @_;
         $dig //= Digest::SHA->new(256);
 
         # References: and In-Reply-To: get used interchangeably
         # in some "duplicates" in LKML.  We treat them the same
         # in SearchIdx, so treat them the same for this:
         # do NOT consider the Message-ID as part of the content_hash
-        # if we got here, we've already got Message-ID reuse
-        my %seen = map { $_ => 1 } @{mids($eml)};
-        foreach my $mid (@{references($eml)}) {
-                $dig->add("ref\0$mid\0") unless $seen{$mid}++;
+        # if we got here, we've already got Message-ID reuse for v2.
+        #
+        # However, `lei q --dedupe=content' does use $hash_mids since
+        # it doesn't have any other dedupe
+        my $mids = mids($eml);
+        if ($hash_mids) {
+                $dig->add("mid\0$_\0") for @$mids;
+        }
+        my %seen = map { $_ => 1 } @$mids;
+        for (grep { !$seen{$_}++ } @{references($eml)}) {
+                utf8::encode($_);
+                $dig->add("ref\0$_\0");
         }
 
         # Only use Sender: if From is not present
@@ -74,8 +83,7 @@ sub content_digest ($;$) {
                 last;
         }
         foreach my $h (qw(Subject Date)) {
-                my @v = $eml->header($h);
-                foreach my $v (@v) {
+                for my $v ($eml->header($h)) {
                         utf8::encode($v);
                         $dig->add("$h\0$v\0");
                 }
@@ -84,23 +92,22 @@ sub content_digest ($;$) {
         # not in the original message.  For the purposes of deduplication,
         # do not take it into account:
         foreach my $h (qw(To Cc)) {
-                my @v = $eml->header($h);
-                digest_addr($dig, $h, $_) foreach @v;
+                digest_addr($dig, $h, $_) for ($eml->header($h));
         }
         msg_iter($eml, \&content_dig_i, $dig);
         $dig;
 }
 
 sub content_hash ($) {
-        content_digest($_[0])->digest;
+        content_digest($_[0], PublicInbox::SHA->new(256))->digest;
 }
 
+# don't clone the result of this
 sub git_sha ($$) {
         my ($n, $eml) = @_;
-        my $dig = Digest::SHA->new($n);
+        my $dig = PublicInbox::SHA->new($n);
         my $bref = ref($eml) eq 'SCALAR' ? $eml : \($eml->as_string);
-        $dig->add('blob '.length($$bref)."\0");
-        $dig->add($$bref);
+        $dig->add('blob '.length($$bref)."\0", $$bref);
         $dig;
 }