about summary refs log tree commit homepage
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2022-11-27 09:15:47 +0000
committer Eric Wong <e@80x24.org> 2022-11-27 09:19:13 +0000
commit 9174f534ccc51054895bdb198c8bc1a765abd9e9 (patch)
tree ed68c3770eab136f88d5ad745dc5ab361d046a70
parent 86cb9010c49523b1968c29ef592bc1afacc77894 (diff)
download public-inbox-master.tar.gz
The alsa-devel archives on lore have some UTF-8 References:
headers, so we need to treat them as octets again; otherwise,
(re)indexing triggers cascading failures.

Fixes: 5198c976ce8b "eml: header_raw converts octets to Perl UTF-8"
-rw-r--r-- lib/PublicInbox/ContentHash.pm 7
-rw-r--r-- t/v2writable.t 16
2 files changed, 20 insertions, 3 deletions
diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm
index bacc9cdd..1afbb413 100644
--- a/lib/PublicInbox/ContentHash.pm
+++ b/lib/PublicInbox/ContentHash.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 
 # Unstable internal API.
@@ -63,8 +63,9 @@ sub content_digest ($;$) {
         # do NOT consider the Message-ID as part of the content_hash
         # if we got here, we've already got Message-ID reuse
         my %seen = map { $_ => 1 } @{mids($eml)};
-        foreach my $mid (@{references($eml)}) {
-                $dig->add("ref\0$mid\0") unless $seen{$mid}++;
+        for (grep { !$seen{$_}++ } @{references($eml)}) {
+                utf8::encode($_);
+                $dig->add("ref\0$_\0");
         }
 
         # Only use Sender: if From is not present
diff --git a/t/v2writable.t b/t/v2writable.t
index ad946338..0d102204 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -283,6 +283,22 @@ EOF
         is($msgs->[1]->{mid}, 'y'x244, 'stored truncated mid(2)');
 }
 
+if ('UTF-8 References') {
+        my @w;
+        local $SIG{__WARN__} = sub { push @w, @_ };
+        my $msg = <<EOM;
+From: a\@example.com
+Subject: b
+Message-ID: <horrible\@example>
+References: <\xc4\x80\@example>
+
+EOM
+        ok($im->add(PublicInbox::Eml->new($msg."a\n")), 'UTF-8 References 1');
+        ok($im->add(PublicInbox::Eml->new($msg."b\n")), 'UTF-8 References 2');
+        $im->done;
+        ok(!grep(/Wide character/, @w), 'no wide characters') or xbail(\@w);
+}
+
 my $tmp = {
         inboxdir => "$inboxdir/non-existent/subdir",
         name => 'nope',