From 9174f534ccc51054895bdb198c8bc1a765abd9e9 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Nov 2022 09:15:47 +0000 Subject: content_hash: handle References as octets The alsa-devel archives on lore have some UTF-8 References: headers, so we need to treat them as octets again; otherwise (re)indexing triggers cascading failures. Fixes: 5198c976ce8b "eml: header_raw converts octets to Perl UTF-8" --- lib/PublicInbox/ContentHash.pm | 7 ++++--- t/v2writable.t | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm index bacc9cdd..1afbb413 100644 --- a/lib/PublicInbox/ContentHash.pm +++ b/lib/PublicInbox/ContentHash.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ # Unstable internal API. @@ -63,8 +63,9 @@ sub content_digest ($;$) { # do NOT consider the Message-ID as part of the content_hash # if we got here, we've already got Message-ID reuse my %seen = map { $_ => 1 } @{mids($eml)}; - foreach my $mid (@{references($eml)}) { - $dig->add("ref\0$mid\0") unless $seen{$mid}++; + for (grep { !$seen{$_}++ } @{references($eml)}) { + utf8::encode($_); + $dig->add("ref\0$_\0"); } # Only use Sender: if From is not present diff --git a/t/v2writable.t b/t/v2writable.t index ad946338..0d102204 100644 --- a/t/v2writable.t +++ b/t/v2writable.t @@ -283,6 +283,22 @@ EOF is($msgs->[1]->{mid}, 'y'x244, 'stored truncated mid(2)'); } +if ('UTF-8 References') { + my @w; + local $SIG{__WARN__} = sub { push @w, @_ }; + my $msg = < +References: <\xc4\x80\@example> + +EOM + ok($im->add(PublicInbox::Eml->new($msg."a\n")), 'UTF-8 References 1'); + ok($im->add(PublicInbox::Eml->new($msg."b\n")), 'UTF-8 References 2'); + $im->done; + ok(!grep(/Wide character/, @w), 'no wide characters') or xbail(\@w); +} + my $tmp = { inboxdir => "$inboxdir/non-existent/subdir", name => 'nope', -- cgit v1.2.3-24-ge0c7