From f28fdcd6d8d6ac36c7b6adf6670238426f3cc067 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 2 Oct 2021 11:18:34 +0000 Subject: content_hash: normalize whitespace before hashing addresses This should prevent some false duplicates. I only noticed this while implementing "lei mail-diff", once I had implemented the ContentDigestDbg wrapper for it. --- lib/PublicInbox/ContentHash.pm | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm index f6ae9011..bacc9cdd 100644 --- a/lib/PublicInbox/ContentHash.pm +++ b/lib/PublicInbox/ContentHash.pm @@ -20,6 +20,7 @@ use Digest::SHA; sub digest_addr ($$$) { my ($dig, $h, $v) = @_; $v =~ tr/"//d; + $v =~ tr/\r\n\t / /s; $v =~ s/@([a-z0-9\_\.\-\(\)]*([A-Z])\S*)/'@'.lc($1)/ge; utf8::encode($v); $dig->add("$h\0$v\0"); -- cgit v1.2.3-24-ge0c7