From 5754faeb3fa1c9aaeff8922b449127cfbc86236d Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 9 Oct 2023 17:56:23 +0000 Subject: www_coderepo: fix handling of non-UTF-8 git data We can't assume git output is UTF-8, and we'll always have legacy data in git coderepos. So attempt to display some some garbled text rather than nothing at all if Perl croaks on it. sox commit c38987e8d20505621b8d872863afa7d233ed1096 (Added raw inverse-bit u-law and A-law support. Updated *.txt files., 2001-12-13) is an example of a commit which caused problems for me. --- lib/PublicInbox/Hval.pm | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'lib/PublicInbox/Hval.pm') diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm index 0677865e..e9b9ae64 100644 --- a/lib/PublicInbox/Hval.pm +++ b/lib/PublicInbox/Hval.pm @@ -4,13 +4,13 @@ # represents a header value in various forms. Used for HTML generation # in our web interface(s) package PublicInbox::Hval; +use v5.10.1; # be careful about unicode_strings in v5.12; use strict; -use warnings; use Encode qw(find_encoding); use PublicInbox::MID qw/mid_clean mid_escape/; use base qw/Exporter/; our @EXPORT_OK = qw/ascii_html obfuscate_addrs to_filename src_escape - to_attr prurl mid_href fmt_ts ts2str/; + to_attr prurl mid_href fmt_ts ts2str utf8_maybe/; use POSIX qw(strftime); my $enc_ascii = find_encoding('us-ascii'); @@ -137,4 +137,9 @@ sub ts2str ($) { strftime('%Y%m%d%H%M%S', gmtime($_[0])) }; # human-friendly format sub fmt_ts ($) { strftime('%Y-%m-%d %k:%M', gmtime($_[0])) } +sub utf8_maybe ($) { + utf8::decode($_[0]); + utf8::valid($_[0]) or utf8::encode($_[0]); # non-UTF-8 data exists +} + 1; -- cgit v1.2.3-24-ge0c7