1 files changed, 24 insertions, 17 deletions
diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm
index bacc9cdd..95ca2929 100644
--- a/lib/PublicInbox/ContentHash.pm
+++ b/lib/PublicInbox/ContentHash.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
  # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
  
  # Unstable internal API.
@@ -15,7 +15,8 @@ use PublicInbox::MID qw(mids references);
  use PublicInbox::MsgIter;
  
  # not sure if less-widely supported hash families are worth bothering with
-use Digest::SHA;
+use PublicInbox::SHA; # faster, but no ->clone
+use Digest::SHA; # we still need this for ->clone
  
  sub digest_addr ($$$) {
          my ($dig, $h, $v) = @_;
@@ -44,7 +45,7 @@ sub content_dig_i {
          my $ct = $part->content_type || 'text/plain';
          my ($s, undef) = msg_part_text($part, $ct);
          if (defined $s) {
-                $s =~ s/\r\n/\n/gs;
+                $s =~ s/\r\n/\n/gs; # TODO: consider \r+\n to match View
                  $s =~ s/\s*\z//s;
                  utf8::encode($s);
          } else {
@@ -53,18 +54,26 @@ sub content_dig_i {
          $dig->add($s);
  }
  
-sub content_digest ($;$) {
-        my ($eml, $dig) = @_;
+sub content_digest ($;$$) {
+        my ($eml, $dig, $hash_mids) = @_;
          $dig //= Digest::SHA->new(256);
  
          # References: and In-Reply-To: get used interchangeably
          # in some "duplicates" in LKML.  We treat them the same
          # in SearchIdx, so treat them the same for this:
          # do NOT consider the Message-ID as part of the content_hash
-        # if we got here, we've already got Message-ID reuse
-        my %seen = map { $_ => 1 } @{mids($eml)};
-        foreach my $mid (@{references($eml)}) {
-                $dig->add("ref\0$mid\0") unless $seen{$mid}++;
+        # if we got here, we've already got Message-ID reuse for v2.
+        #
+        # However, `lei q --dedupe=content' does use $hash_mids since
+        # it doesn't have any other dedupe
+        my $mids = mids($eml);
+        if ($hash_mids) {
+                $dig->add("mid\0$_\0") for @$mids;
+        }
+        my %seen = map { $_ => 1 } @$mids;
+        for (grep { !$seen{$_}++ } @{references($eml)}) {
+                utf8::encode($_);
+                $dig->add("ref\0$_\0");
          }
  
          # Only use Sender: if From is not present
@@ -74,8 +83,7 @@ sub content_digest ($;$) {
                  last;
          }
          foreach my $h (qw(Subject Date)) {
-                my @v = $eml->header($h);
-                foreach my $v (@v) {
+                for my $v ($eml->header($h)) {
                          utf8::encode($v);
                          $dig->add("$h\0$v\0");
                  }
@@ -84,23 +92,22 @@ sub content_digest ($;$) {
          # not in the original message.  For the purposes of deduplication,
          # do not take it into account:
          foreach my $h (qw(To Cc)) {
-                my @v = $eml->header($h);
-                digest_addr($dig, $h, $_) foreach @v;
+                digest_addr($dig, $h, $_) for ($eml->header($h));
          }
          msg_iter($eml, \&content_dig_i, $dig);
          $dig;
  }
  
  sub content_hash ($) {
-        content_digest($_[0])->digest;
+        content_digest($_[0], PublicInbox::SHA->new(256))->digest;
  }
  
+# don't clone the result of this; PublicInbox::SHA has no ->clone
  sub git_sha ($$) {
          my ($n, $eml) = @_;
-        my $dig = Digest::SHA->new($n);
+        my $dig = PublicInbox::SHA->new($n);
          my $bref = ref($eml) eq 'SCALAR' ? $eml : \($eml->as_string);
-        $dig->add('blob '.length($$bref)."\0");
-        $dig->add($$bref);
+        $dig->add('blob '.length($$bref)."\0", $$bref);
          $dig;
  }