From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 2D8E0211B5 for ; Tue, 5 Feb 2019 11:10:54 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 1/6] viewvcs: cleanup utf8 handling Date: Tue, 5 Feb 2019 11:10:48 +0000 Message-Id: <20190205111053.7155-2-e@80x24.org> In-Reply-To: <20190205111053.7155-1-e@80x24.org> References: <20190205111053.7155-1-e@80x24.org> List-Id: Favor in-place utf8::decode since it's a bit faster without method dispatch overhead; and don't care about validity just yet. HlMod->do_hl itself should return "utf8" strings, since other parts of our code can use it, so it's not the job of ViewVCS to post-process HlMod output. --- lib/PublicInbox/HlMod.pm | 7 ++++++- lib/PublicInbox/ViewVCS.pm | 6 ++---- t/hl_mod.t | 1 + 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/lib/PublicInbox/HlMod.pm b/lib/PublicInbox/HlMod.pm index 237ffac..decfd71 100644 --- a/lib/PublicInbox/HlMod.pm +++ b/lib/PublicInbox/HlMod.pm @@ -107,7 +107,12 @@ sub do_hl { $g->setEncoding('utf-8'); $g; }; - \($gen->generateString($$str)) + + # we assume $$str is valid UTF-8, but the SWIG binding doesn't + # know that, so ensure it's marked as UTF-8 even if it isn't... 
+ my $out = $gen->generateString($$str); + utf8::decode($out); + \$out; } # SWIG instances aren't reference-counted, but $self is; diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm index d67b5eb..acdd822 100644 --- a/lib/PublicInbox/ViewVCS.pm +++ b/lib/PublicInbox/ViewVCS.pm @@ -16,7 +16,6 @@ package PublicInbox::ViewVCS; use strict; use warnings; -use Encode qw(find_encoding); use PublicInbox::SolverGit; use PublicInbox::WwwStream; use PublicInbox::Linkify; @@ -33,7 +32,6 @@ END { $hl = undef }; my %QP_MAP = ( A => 'oid_a', B => 'oid_b', a => 'path_a', b => 'path_b' ); my $max_size = 1024 * 1024; # TODO: configurable -my $enc_utf8 = find_encoding('UTF-8'); my $BIN_DETECT = 8000; # same as git sub html_page ($$$) { @@ -122,14 +120,14 @@ sub solve_result { return html_page($ctx, 200, \$log); } - $$blob = $enc_utf8->decode($$blob); + # TODO: detect + convert to ensure validity + utf8::decode($$blob); my $nl = ($$blob =~ tr/\n/\n/); my $pad = length($nl); $l->linkify_1($$blob); my $ok = $hl->do_hl($blob, $path) if $hl; if ($ok) { - $$ok = $enc_utf8->decode($$ok); src_escape($$ok); $blob = $ok; } else { diff --git a/t/hl_mod.t b/t/hl_mod.t index 80f8890..c402f1f 100644 --- a/t/hl_mod.t +++ b/t/hl_mod.t @@ -19,6 +19,7 @@ my $orig = $str; { my $ref = $hls->do_hl(\$str, 'foo.perl'); is(ref($ref), 'SCALAR', 'got a scalar reference back'); + ok(utf8::valid($$ref), 'resulting string is utf8::valid'); like($$ref, qr/I can see you!/, 'we can see ourselves in output'); like($$ref, qr/&&/, 'escaped'); -- EW