user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH] www_coderepo: fix handling of non-UTF-8 git data
Date: Mon,  9 Oct 2023 17:56:23 +0000	[thread overview]
Message-ID: <20231009175623.2792414-1-e@80x24.org> (raw)

We can't assume git output is UTF-8, and we'll always have
legacy data in git coderepos.  So attempt to display some
some garbled text rather than nothing at all if Perl croaks
on it.

sox commit c38987e8d20505621b8d872863afa7d233ed1096
(Added raw inverse-bit u-law and A-law support.  Updated *.txt files., 2001-12-13)
is an example of a commit which caused problems for me.
---
 lib/PublicInbox/Hval.pm        |  9 +++++++--
 lib/PublicInbox/RepoAtom.pm    |  4 ++--
 lib/PublicInbox/RepoTree.pm    |  4 ++--
 lib/PublicInbox/ViewDiff.pm    |  4 ++--
 lib/PublicInbox/ViewVCS.pm     | 19 +++++++++----------
 lib/PublicInbox/WwwCoderepo.pm |  8 ++++----
 xt/solver.t                    | 31 ++++++++++++++++++-------------
 7 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm
index 0677865e..e9b9ae64 100644
--- a/lib/PublicInbox/Hval.pm
+++ b/lib/PublicInbox/Hval.pm
@@ -4,13 +4,13 @@
 # represents a header value in various forms.  Used for HTML generation
 # in our web interface(s)
 package PublicInbox::Hval;
+use v5.10.1; # be careful about unicode_strings in v5.12;
 use strict;
-use warnings;
 use Encode qw(find_encoding);
 use PublicInbox::MID qw/mid_clean mid_escape/;
 use base qw/Exporter/;
 our @EXPORT_OK = qw/ascii_html obfuscate_addrs to_filename src_escape
-		to_attr prurl mid_href fmt_ts ts2str/;
+		to_attr prurl mid_href fmt_ts ts2str utf8_maybe/;
 use POSIX qw(strftime);
 my $enc_ascii = find_encoding('us-ascii');
 
@@ -137,4 +137,9 @@ sub ts2str ($) { strftime('%Y%m%d%H%M%S', gmtime($_[0])) };
 # human-friendly format
 sub fmt_ts ($) { strftime('%Y-%m-%d %k:%M', gmtime($_[0])) }
 
+sub utf8_maybe ($) {
+	utf8::decode($_[0]);
+	utf8::valid($_[0]) or utf8::encode($_[0]); # non-UTF-8 data exists
+}
+
 1;
diff --git a/lib/PublicInbox/RepoAtom.pm b/lib/PublicInbox/RepoAtom.pm
index c89d4551..79b76c12 100644
--- a/lib/PublicInbox/RepoAtom.pm
+++ b/lib/PublicInbox/RepoAtom.pm
@@ -8,7 +8,7 @@ use parent qw(PublicInbox::GzipFilter);
 use POSIX qw(strftime);
 use URI::Escape qw(uri_escape);
 use Scalar::Util ();
-use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::Hval qw(ascii_html utf8_maybe);
 
 # git for-each-ref and log use different format fields :<
 my $ATOM_FMT = '--pretty=tformat:'.join('%n',
@@ -50,7 +50,7 @@ sub translate {
 	my $is_tag = $self->{-is_tag};
 	my ($H, $ct, $an, $ae, $at, $s, $bdy);
 	while ($lbuf =~ s/\A([^\0]+)\0\n//s) {
-		utf8::decode($bdy = $1);
+		utf8_maybe($bdy = $1);
 		if ($is_tag) {
 			my %r;
 			eval "$bdy";
diff --git a/lib/PublicInbox/RepoTree.pm b/lib/PublicInbox/RepoTree.pm
index 9c7b86b3..5c73531a 100644
--- a/lib/PublicInbox/RepoTree.pm
+++ b/lib/PublicInbox/RepoTree.pm
@@ -8,7 +8,7 @@ use PublicInbox::ViewDiff qw(uri_escape_path);
 use PublicInbox::WwwStatic qw(r);
 use PublicInbox::Qspawn;
 use PublicInbox::WwwStream qw(html_oneshot);
-use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::Hval qw(ascii_html utf8_maybe);
 
 sub rd_404_log {
 	my ($bref, $ctx) = @_;
@@ -26,7 +26,7 @@ sub rd_404_log {
 		$code = 404;
 	} else {
 		my ($H, $h, $s_as) = split(/ /, $$bref, 3);
-		utf8::decode($s_as);
+		utf8_maybe($s_as);
 		my $x = uri_escape_path($ctx->{-path});
 		$s_as = ascii_html($s_as);
 		print $zfh <<EOM;
diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm
index 124a723a..d078c5f9 100644
--- a/lib/PublicInbox/ViewDiff.pm
+++ b/lib/PublicInbox/ViewDiff.pm
@@ -11,7 +11,7 @@ use v5.12;
 use parent qw(Exporter);
 our @EXPORT_OK = qw(flush_diff uri_escape_path);
 use URI::Escape qw(uri_escape_utf8);
-use PublicInbox::Hval qw(ascii_html to_attr);
+use PublicInbox::Hval qw(ascii_html to_attr utf8_maybe);
 use PublicInbox::Git qw(git_unquote);
 
 my $OID_NULL = '0{7,}';
@@ -236,7 +236,7 @@ sub flush_diff ($$) {
 				}
 			}
 			if (!$dctx) {
-				utf8::decode($after);
+				utf8_maybe($after);
 				diff_before_or_after($ctx, \$after);
 			}
 		} else {
diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm
index e402d557..53e98c71 100644
--- a/lib/PublicInbox/ViewVCS.pm
+++ b/lib/PublicInbox/ViewVCS.pm
@@ -25,7 +25,7 @@ use PublicInbox::ViewDiff qw(flush_diff uri_escape_path);
 use PublicInbox::View;
 use PublicInbox::Eml;
 use Text::Wrap qw(wrap);
-use PublicInbox::Hval qw(ascii_html to_filename prurl);
+use PublicInbox::Hval qw(ascii_html to_filename prurl utf8_maybe);
 use POSIX qw(strftime);
 my $hl = eval {
 	require PublicInbox::HlMod;
@@ -115,14 +115,14 @@ sub show_other_result ($$) { # future-proofing
 				"git show error:$qsp_err");
 	}
 	my $l = PublicInbox::Linkify->new;
-	utf8::decode($$bref);
+	utf8_maybe($$bref);
 	html_page($ctx, 200, '<pre>', $l->to_html($$bref), '</pre><hr>',
 		dbg_log($ctx));
 }
 
 sub cmt_title { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $ctx) = @_;
-	utf8::decode($$bref);
+	utf8_maybe($$bref);
 	my $title = $$bref =~ /\r?\n\r?\n([^\r\n]+)\r?\n?/ ? $1 : '';
 	push(@{$ctx->{-cmt_pt}} , ascii_html($title)) == @{$ctx->{-cmt_P}} and
 		cmt_finalize($ctx);
@@ -160,8 +160,7 @@ sub show_commit_start { # ->psgi_qx callback
 	open my $fh, '<', "$ctx->{-tmp}/h" or
 		die "open $ctx->{-tmp}/h: $!";
 	chop(my $buf = do { local $/ = "\0"; <$fh> });
-	utf8::decode($buf);
-	utf8::valid($buf) or utf8::encode($buf); # non-UTF-8 commits exist
+	utf8_maybe($buf); # non-UTF-8 commits exist
 	chomp $buf;
 	my ($P, $p);
 	($P, $p, @{$ctx->{cmt_info}}) = split(/\n/, $buf, 9);
@@ -248,12 +247,12 @@ committer $co
 EOM
 	print $zfh "\n", $ctx->{-linkify}->to_html($bdy) if length($bdy);
 	$bdy = '';
-	open my $fh, '<:utf8', "$ctx->{-tmp}/p" or
-		die "open $ctx->{-tmp}/p: $!";
+	open my $fh, '<', "$ctx->{-tmp}/p" or die "open $ctx->{-tmp}/p: $!";
 	if (-s $fh > $MAX_SIZE) {
 		print $zfh "---\n patch is too large to show\n";
 	} else { # prepare flush_diff:
 		read($fh, $x, -s _);
+		utf8_maybe($x);
 		$ctx->{-apfx} = $ctx->{-spfx} = $upfx;
 		$x =~ s/\r?\n/\n/gs;
 		$ctx->{-anchors} = {} if $x =~ /^diff --git /sm;
@@ -418,7 +417,7 @@ EOM
 		undef $_;
 		($m, $t, $oid, $sz) = split(/ +/, $x, 4);
 		$m = $GIT_MODE{$m} // '?';
-		utf8::decode($f);
+		utf8_maybe($f);
 		$n = ascii_html($f);
 		if ($m eq 'g') { # gitlink submodule commit
 			$$bref .= "\ng\t\t$n @ <a\nhref=#g>commit</a>$oid";
@@ -480,7 +479,7 @@ sub tz_adj ($) {
 
 sub show_tag_result { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $ctx) = @_;
-	utf8::decode($$bref);
+	utf8_maybe($$bref);
 	my $l = PublicInbox::Linkify->new;
 	$$bref = $l->to_html($$bref);
 	$$bref =~ s!^object ([a-f0-9]+)!object <a
@@ -569,7 +568,7 @@ sub show_blob { # git->cat_async callback
 				" $raw_more</pre>".dbg_log($ctx));
 
 	# TODO: detect + convert to ensure validity
-	utf8::decode($$blob);
+	utf8_maybe($$blob);
 	my $nl = ($$blob =~ s/\r?\n/\n/sg);
 	my $pad = length($nl);
 
diff --git a/lib/PublicInbox/WwwCoderepo.pm b/lib/PublicInbox/WwwCoderepo.pm
index 834145e9..e8c340b5 100644
--- a/lib/PublicInbox/WwwCoderepo.pm
+++ b/lib/PublicInbox/WwwCoderepo.pm
@@ -14,7 +14,7 @@ use PublicInbox::ViewVCS;
 use PublicInbox::WwwStatic qw(r);
 use PublicInbox::GitHTTPBackend;
 use PublicInbox::WwwStream;
-use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::Hval qw(ascii_html utf8_maybe);
 use PublicInbox::ViewDiff qw(uri_escape_path);
 use PublicInbox::RepoSnapshot;
 use PublicInbox::RepoAtom;
@@ -179,7 +179,7 @@ EOM
 
 sub capture { # psgi_qx callback to capture git-for-each-ref
 	my ($bref, $arg) = @_; # arg = [ctx, key, OnDestroy(summary_END)]
-	utf8::decode($$bref);
+	utf8_maybe($$bref);
 	$arg->[0]->{qx_res}->{$arg->[1]} = $$bref;
 	# summary_END may be called via OnDestroy $arg->[2]
 }
@@ -241,13 +241,13 @@ sub translate {
 	$fbuf .= shift while @_;
 	if ($ctx->{-heads}) {
 		while ($fbuf =~ s/\A([^\n]+)\n//s) {
-			utf8::decode(my $x = $1);
+			utf8_maybe(my $x = $1);
 			push @out, _refs_heads_link($x, '../../');
 		}
 	} else {
 		my ($snap_pfx, @snap_fmt) = _snapshot_link_prep($ctx);
 		while ($fbuf =~ s/\A([^\n]+)\n//s) {
-			utf8::decode(my $x = $1);
+			utf8_maybe(my $x = $1);
 			push @out, _refs_tags_link($x, '../../',
 						$snap_pfx, @snap_fmt);
 		}
diff --git a/xt/solver.t b/xt/solver.t
index 06f5a493..51b4144c 100644
--- a/xt/solver.t
+++ b/xt/solver.t
@@ -32,23 +32,30 @@ my $todo = {
 		'c2f3bf071ee90b01f2d629921bb04c4f798f02fa/s/', # tag
 		'7eb93c89651c47c8095d476251f2e4314656b292/s/', # non-UTF-8
 	],
+	'sox-devel' => [
+		'c38987e8d20505621b8d872863afa7d233ed1096/s/', # non-UTF-8
+	]
 };
 
-my ($ibx_name, $urls, @gone);
+my @gone;
 my $client = sub {
 	my ($cb) = @_;
-	for my $u (@$urls) {
-		my $url = "/$ibx_name/$u";
-		my $res = $cb->(GET($url));
-		is($res->code, 200, $url);
-		next if $res->code == 200;
-		diag "$url failed";
-		diag $res->content;
+	for my $ibx_name (sort keys %$todo) {
+		diag "testing $ibx_name";
+		my $urls = $todo->{$ibx_name};
+		for my $u (@$urls) {
+			my $url = "/$ibx_name/$u";
+			my $res = $cb->(GET($url));
+			is($res->code, 200, $url);
+			next if $res->code == 200;
+			diag "$url failed";
+			diag $res->content;
+		}
 	}
 };
 
 my $nr = 0;
-while (($ibx_name, $urls) = each %$todo) {
+while (my ($ibx_name, $urls) = each %$todo) {
 	SKIP: {
 		my $ibx = $cfg->lookup_name($ibx_name);
 		if (!$ibx) {
@@ -61,15 +68,13 @@ while (($ibx_name, $urls) = each %$todo) {
 			skip(qq{publicinbox.$ibx_name.coderepo not configured},
 				scalar(@$urls));
 		}
-		test_psgi($app, $client);
 		$nr++;
 	}
 }
 
 delete @$todo{@gone};
+test_psgi($app, $client);
 my $env = { PI_CONFIG => PublicInbox::Config->default_file };
-while (($ibx_name, $urls) = each %$todo) {
-	test_httpd($env, $client, $nr);
-}
+test_httpd($env, $client, $nr);
 
 done_testing();

                 reply	other threads:[~2023-10-09 17:56 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231009175623.2792414-1-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).