diff options
author | Eric Wong <e@80x24.org> | 2022-11-27 09:15:47 +0000 |
---|---|---|
committer | Eric Wong <e@80x24.org> | 2022-11-27 09:19:13 +0000 |
commit | 9174f534ccc51054895bdb198c8bc1a765abd9e9 (patch) | |
tree | ed68c3770eab136f88d5ad745dc5ab361d046a70 | |
parent | 86cb9010c49523b1968c29ef592bc1afacc77894 (diff) | |
download | public-inbox-9174f534ccc51054895bdb198c8bc1a765abd9e9.tar.gz |
The alsa-devel archives on lore have some UTF-8 References: headers, so we need to treat them as octets again; otherwise, (re)indexing triggers cascading failures. Fixes: 5198c976ce8b "eml: header_raw converts octets to Perl UTF-8"
-rw-r--r-- | lib/PublicInbox/ContentHash.pm | 7 | ||||
-rw-r--r-- | t/v2writable.t | 16 |
2 files changed, 20 insertions, 3 deletions
diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm index bacc9cdd..1afbb413 100644 --- a/lib/PublicInbox/ContentHash.pm +++ b/lib/PublicInbox/ContentHash.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Unstable internal API. @@ -63,8 +63,9 @@ sub content_digest ($;$) { # do NOT consider the Message-ID as part of the content_hash # if we got here, we've already got Message-ID reuse my %seen = map { $_ => 1 } @{mids($eml)}; - foreach my $mid (@{references($eml)}) { - $dig->add("ref\0$mid\0") unless $seen{$mid}++; + for (grep { !$seen{$_}++ } @{references($eml)}) { + utf8::encode($_); + $dig->add("ref\0$_\0"); } # Only use Sender: if From is not present diff --git a/t/v2writable.t b/t/v2writable.t index ad946338..0d102204 100644 --- a/t/v2writable.t +++ b/t/v2writable.t @@ -283,6 +283,22 @@ EOF is($msgs->[1]->{mid}, 'y'x244, 'stored truncated mid(2)'); } +if ('UTF-8 References') { + my @w; + local $SIG{__WARN__} = sub { push @w, @_ }; + my $msg = <<EOM; +From: a\@example.com +Subject: b +Message-ID: <horrible\@example> +References: <\xc4\x80\@example> + +EOM + ok($im->add(PublicInbox::Eml->new($msg."a\n")), 'UTF-8 References 1'); + ok($im->add(PublicInbox::Eml->new($msg."b\n")), 'UTF-8 References 2'); + $im->done; + ok(!grep(/Wide character/, @w), 'no wide characters') or xbail(\@w); +} + my $tmp = { inboxdir => "$inboxdir/non-existent/subdir", name => 'nope', |