From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id D46801FD69 for ; Sat, 25 Jan 2020 04:45:14 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 22/22] viewdiff: rewrite and simplify Date: Sat, 25 Jan 2020 04:45:10 +0000 Message-Id: <20200125044510.13769-23-e@yhbt.net> In-Reply-To: <20200125044510.13769-1-e@yhbt.net> References: <20200125044510.13769-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Instead of going line-by-line, use split() with a giant regexp to capture groups of contiguous lines. This offloads state management to the regexp itself and makes it FAR easier to keep track of and pairings. Performance seems roughly on par after this change for the meta@public-inbox archives. It seems a tiny bit faster for git@vger with xt/perf-msgview.t, likely due to the longer messages and larger contiguous groups of lines having the same prefix (or no prefix at all) and drastically reduces the number of subroutine calls and Perl ops executed. --- lib/PublicInbox/View.pm | 8 +- lib/PublicInbox/ViewDiff.pm | 297 +++++++++++++++++------------------- 2 files changed, 139 insertions(+), 166 deletions(-) diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index f613afb5..9a5e3997 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -562,7 +562,7 @@ sub add_text_body { # callback for msg_iter $idx[0] = $upfx . $idx[0] if $upfx ne ''; $ctx->{-apfx} = join('/', @idx); $ctx->{-anchors} = {}; # attr => filename - $ctx->{-diff} = $diff = []; + $diff = 1; delete $ctx->{-long_path}; my $spfx; if ($ibx->{-repo_objs}) { @@ -595,14 +595,12 @@ sub add_text_body { # callback for msg_iter attach_link($ctx, $ct, $p, $fn, $err); $$rv .= "\n"; } - my $l = PublicInbox::Linkify->new; + my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new; foreach my $cur (@sections) { if ($cur =~ /\A>/) { flush_quote($rv, $l, \$cur); } elsif ($diff) { - @$diff = split(/^/m, $cur); - $cur = undef; - flush_diff($rv, $ctx, $l); + flush_diff($rv, $ctx, \$cur); } else { # regular lines, OK $$rv .= $l->to_html($cur); diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm index ece95f4c..8c37c139 100644 --- a/lib/PublicInbox/ViewDiff.pm +++ b/lib/PublicInbox/ViewDiff.pm @@ -7,6 +7,7 @@ # (or reconstruct) blobs. package PublicInbox::ViewDiff; +use 5.010_001; use strict; use warnings; use base qw(Exporter); @@ -15,52 +16,31 @@ use URI::Escape qw(uri_escape_utf8); use PublicInbox::Hval qw(ascii_html to_attr); use PublicInbox::Git qw(git_unquote); -# keep track of state so we can avoid redundant HTML tags for -# identically-classed lines -sub DSTATE_INIT () { 0 } -sub DSTATE_STAT () { 1 } -sub DSTATE_HEAD () { 2 } # /^diff --git /, /^index /, /^--- /, /^\+\+\+ / -sub DSTATE_CTX () { 3 } # /^ / -sub DSTATE_ADD () { 4 } # /^\+/ -sub DSTATE_DEL () { 5 } # /^\-/ - -# maps the DSTATE_* to CSS class names compatible with what cgit uses: -my @state2class = ( - '', # init - '', # stat - 'head', - '', # ctx - 'add', - 'del' -); - sub UNSAFE () { "^A-Za-z0-9\-\._~/" } my $OID_NULL = '0{7,40}'; my $OID_BLOB = '[a-f0-9]{7,40}'; -my $PATH_X = '"?[^/]+/.+|/dev/null'; # cf. git diff.c :: get_compact_summary my $DIFFSTAT_COMMENT = qr/\((?:new|gone|(?:(?:new|mode) [\+\-][lx]))\)/; # link to line numbers in blobs sub diff_hunk ($$$$) { - my ($dctx, $spfx, $ca, $cb) = @_; - my $oid_a = $dctx->{oid_a}; - my $oid_b = $dctx->{oid_b}; - - (defined($spfx) && defined($oid_a) && defined($oid_b)) or - return "@@ $ca $cb @@"; + my ($dst, $dctx, $ca, $cb) = @_; + my ($oid_a, $oid_b, $spfx) = @$dctx{qw(oid_a oid_b spfx)}; - my ($n) = ($ca =~ /^-([0-9]+)/); - $n = defined($n) ? do { ++$n; "#n$n" } : ''; + if (defined($spfx) && defined($oid_a) && defined($oid_b)) { + my ($n) = ($ca =~ /^-([0-9]+)/); + $n = defined($n) ? do { ++$n; "#n$n" } : ''; - my $rv = qq(@@ {Q}$n">$ca); + $$dst .= qq(@@ {Q}$n">$ca); - ($n) = ($cb =~ /^\+([0-9]+)/); - $n = defined($n) ? do { ++$n; "#n$n" } : ''; - - $rv .= qq( {Q}$n">$cb @@); + ($n) = ($cb =~ /^\+([0-9]+)/); + $n = defined($n) ? do { ++$n; "#n$n" } : ''; + $$dst .= qq( {Q}$n">$cb @@); + } else { + $$dst .= "@@ $ca $cb @@"; + } } sub oid ($$$) { @@ -68,16 +48,9 @@ sub oid ($$$) { defined($spfx) ? qq({Q}">$oid) : $oid; } -sub to_state ($$$) { - my ($dst, $state, $new_state) = @_; - $$dst .= '' if $state2class[$state]; - $_[1] = $new_state; - my $class = $state2class[$new_state] or return; - $$dst .= qq(); -} - -sub anchor0 ($$$$$) { - my ($dst, $ctx, $linkify, $fn, $rest) = @_; +# returns true if diffstat anchor written, false otherwise +sub anchor0 ($$$$) { + my ($dst, $ctx, $fn, $rest) = @_; my $orig = $fn; @@ -100,16 +73,16 @@ sub anchor0 ($$$$$) { my $spaces = ($orig =~ s/( +)\z//) ? $1 : ''; $$dst .= " " . ascii_html($orig) . '' . $spaces . - $linkify->to_html($rest); + $ctx->{-linkify}->to_html($rest); return 1; } undef; } -sub anchor1 ($$$$$) { - my ($dst, $ctx, $linkify, $pb, $s) = @_; +# returns "diff --git" anchor destination, undef otherwise +sub anchor1 ($$) { + my ($ctx, $pb) = @_; my $attr = to_attr($ctx->{-apfx}.$pb) or return; - my $line = $linkify->to_html($s); my $ok = delete $ctx->{-anchors}->{$attr}; @@ -125,131 +98,133 @@ sub anchor1 ($$$$$) { last; } } - if ($ok && $line =~ s/^diff //) { - $$dst .= "diff ".$line; - return 1; - } - undef + $ok ? "diff --git" : undef } -sub missing_diff_git_line ($$) { - my ($dctx, $pb) = @_; - # missing "diff --git ..." - $dctx->{path_b} = $pb; - $dctx->{Q} = '?b='.uri_escape_utf8($pb, UNSAFE); - my $pa = $dctx->{path_a}; - if (defined($pa) && $pa ne $pb) { - $dctx->{Q} .= '&a='. uri_escape_utf8($pa, UNSAFE); - } -} - -sub flush_diff ($$$) { - my ($dst, $ctx, $linkify) = @_; - my $diff = $ctx->{-diff}; +sub diff_header ($$$$) { + my ($dst, $x, $ctx, $top) = @_; + my (undef, undef, $pa, $pb) = splice(@$top, 0, 4); # ignore oid_{a,b} my $spfx = $ctx->{-spfx}; - my $state = DSTATE_INIT; - my $dctx = { Q => '' }; # {}, keys: oid_a, oid_b, path_a, path_b - - foreach my $s (@$diff) { - if ($s =~ /^---$/) { - to_state($dst, $state, DSTATE_STAT); - $$dst .= $s; - } elsif ($s =~ /^ / || ($s =~ /^$/ && $state >= DSTATE_CTX)) { - # works for common cases, but not weird/long filenames - if ($state == DSTATE_STAT && - $s =~ /^ (.+)( +\| .*\z)/s) { - anchor0($dst, $ctx, $linkify, $1, $2) and next; - } elsif ($state2class[$state]) { - to_state($dst, $state, DSTATE_CTX); - } - $$dst .= $linkify->to_html($s); - } elsif ($s =~ /^-- $/) { # email signature begins - $state == DSTATE_INIT or - to_state($dst, $state, DSTATE_INIT); - $$dst .= $s; - } elsif ($s =~ m!^diff --git ($PATH_X) ($PATH_X)$!o) { - my ($pa, $pb) = ($1, $2); - if ($state != DSTATE_HEAD) { - to_state($dst, $state, DSTATE_HEAD); - } - $pa = (split('/', git_unquote($pa), 2))[1]; + my $dctx = { spfx => $spfx }; + if ($pa eq $pb && $pb ne '/dev/null') { + $pa = $pb = (split('/', git_unquote($pb), 2))[1]; + $dctx->{Q} = "?b=".uri_escape_utf8($pb, UNSAFE); + } else { + my @q; + if ($pb ne '/dev/null') { $pb = (split('/', git_unquote($pb), 2))[1]; - $dctx = { - Q => "?b=".uri_escape_utf8($pb, UNSAFE), - }; - if ($pa ne $pb) { - $dctx->{Q} .= '&a='. - uri_escape_utf8($pa, UNSAFE); - } - anchor1($dst, $ctx, $linkify, $pb, $s) and next; - $$dst .= $linkify->to_html($s); - } elsif ($s =~ s/^(index $OID_NULL\.\.)($OID_BLOB)\b//o) { - $$dst .= $1 . oid($dctx, $spfx, $2); - $dctx = { Q => '' }; - $$dst .= $linkify->to_html($s) ; - } elsif ($s =~ s/^index ($OID_BLOB)(\.\.$OID_NULL)\b//o) { - $$dst .= 'index ' . oid($dctx, $spfx, $1) . $2; - $dctx = { Q => '' }; - $$dst .= $linkify->to_html($s); - } elsif ($s =~ /^index ($OID_BLOB)\.\.($OID_BLOB)/o) { - $dctx->{oid_a} = $1; - $dctx->{oid_b} = $2; - $$dst .= $linkify->to_html($s); - } elsif ($s =~ s/^@@ (\S+) (\S+) @@//) { - $$dst .= '' if $state2class[$state]; - $$dst .= qq(); - $$dst .= diff_hunk($dctx, $spfx, $1, $2); - $$dst .= ''; - $state = DSTATE_CTX; - $$dst .= $linkify->to_html($s); - } elsif ($s =~ m!^--- ($PATH_X)!o) { - my $pa = $1; + push @q, 'b='.uri_escape_utf8($pb, UNSAFE); + } + if ($pa ne '/dev/null') { $pa = (split('/', git_unquote($pa), 2))[1]; - if (($dctx->{path_a} // '') ne $pa) { - # missing "diff --git ..." ? - $dctx->{path_a} = $pa; - } - # color only (no oid link) if missing dctx->{oid_*} - $state <= DSTATE_STAT and - to_state($dst, $state, DSTATE_HEAD); - $$dst .= $linkify->to_html($s); - } elsif ($s =~ m!^\+{3} ($PATH_X)!o) { - my $pb = $1; - $pb = (split('/', git_unquote($pb), 2))[1]; - if (($dctx->{path_b} // '') ne $pb) { - missing_diff_git_line($dctx, $pb); - } + push @q, 'a='.uri_escape_utf8($pa, UNSAFE); + } + $dctx->{Q} = '?'.join('&', @q); + } - # color only (no oid link) if missing dctx->{oid_*} - $state <= DSTATE_STAT and - to_state($dst, $state, DSTATE_HEAD); - $$dst .= $linkify->to_html($s); - } elsif ($s =~ /^\+/) { - if ($state != DSTATE_ADD && $state > DSTATE_STAT) { - to_state($dst, $state, DSTATE_ADD); + # linkify early and all at once, since we know the following + # subst ops on $$x won't need further escaping: + $$x = $ctx->{-linkify}->to_html($$x); + + # no need to capture oid_a and oid_b on add/delete, + # we just linkify OIDs directly via s///e in conditional + if (($$x =~ s/^(index $OID_NULL\.\.)($OID_BLOB)\b/ + $1 . oid($dctx, $spfx, $2)/emos) || + ($$x =~ s/^index ($OID_BLOB)(\.\.$OID_NULL)\b/ + 'index ' . oid($dctx, $spfx, $1) . $2/emos)) { + } elsif ($$x =~ /^index ($OID_BLOB)\.\.($OID_BLOB)/mos) { + # modification-only, not add/delete: + # linkify hunk headers later using oid_a and oid_b + @$dctx{qw(oid_a oid_b)} = ($1, $2); + } else { + warn "BUG? <$$x> had no ^index line"; + } + $$x =~ s!^diff --git!anchor1($ctx, $pb) // 'diff --git'!emos; + $$dst .= qq(); + $$dst .= $$x; + $$dst .= ''; + $dctx; +} + +sub diff_before_or_after ($$$) { + my ($dst, $ctx, $x) = @_; + my $linkify = $ctx->{-linkify}; + for my $y (split(/(^---\r?\n)/sm, $$x)) { + if ($y =~ /\A---\r?\n\z/s) { + $$dst .= "---\n"; # all HTML is "\r\n" => "\n" + } elsif ($y =~ /^ [0-9]+ files? changed, /sm) { + # ok, looks like a diffstat, go line-by-line: + for my $l (split(/^/m, $y)) { + if ($l =~ /^ (.+)( +\| .*\z)/s) { + anchor0($dst, $ctx, $1, $2) and next; + } + $$dst .= $linkify->to_html($l); } - $$dst .= $linkify->to_html($s); - } elsif ($s =~ /^-/) { - if ($state != DSTATE_DEL && $state > DSTATE_STAT) { - to_state($dst, $state, DSTATE_DEL); + } else { # commit message, notes, etc + $$dst .= $linkify->to_html($y); + } + } +} + +sub flush_diff ($$$) { + my ($dst, $ctx, $cur) = @_; + state $LF = qr!\r?\n!; + state $ANY = qr![^\r\n]!; + state $FN = qr!(?:"?[^/\n]+/[^\r\n]+|/dev/null)!; + + my @top = split(/( + (?: # begin header stuff, don't capture filenames, here, + # but instead wait for the --- and +++ lines. + (?:^diff\x20--git\x20$FN\x20$FN$LF) + + # old mode || new mode || copy|rename|deleted|... + (?:^[a-z]$ANY+$LF)* + )? # end of optional stuff, everything below is required + ^index\x20($OID_BLOB)\.\.($OID_BLOB)$ANY*$LF + ^---\x20($FN)$LF + ^\+{3}\x20($FN)$LF)/smxo, $$cur); + $$cur = undef; + + my $linkify = $ctx->{-linkify}; + my $dctx; # {}, keys: Q, oid_a, oid_b + + while (defined(my $x = shift @top)) { + if (scalar(@top) >= 4 && + $top[1] =~ /\A$OID_BLOB\z/os && + $top[0] =~ /\A$OID_BLOB\z/os) { + $dctx = diff_header($dst, \$x, $ctx, \@top); + } elsif ($dctx) { + my $after = ''; + for my $s (split(/((?:(?:^\+[^\n]*\n)+)| + (?:(?:^-[^\n]*\n)+)| + (?:^@@ [^\n]+\n))/xsm, $x)) { + if (!defined($dctx)) { + $after .= $s; + } elsif ($s =~ s/\A@@ (\S+) (\S+) @@//) { + $$dst .= qq(); + diff_hunk($dst, $dctx, $1, $2); + $$dst .= $linkify->to_html($s); + $$dst .= ''; + } elsif ($s =~ /\A\+/) { + $$dst .= qq(); + $$dst .= $linkify->to_html($s); + $$dst .= ''; + } elsif ($s =~ /\A-- $/sm) { # email sig starts + $dctx = undef; + $after .= $s; + } elsif ($s =~ /\A-/) { + $$dst .= qq(); + $$dst .= $linkify->to_html($s); + $$dst .= ''; + } else { + $$dst .= $linkify->to_html($s); + } } - $$dst .= $linkify->to_html($s); - # ignore the following lines in headers: - } elsif ($s =~ /^(?:dis)similarity index/ || - $s =~ /^(?:old|new) mode/ || - $s =~ /^(?:deleted|new) file mode/ || - $s =~ /^(?:copy|rename) (?:from|to) / || - $s =~ /^(?:dis)?similarity index /) { - $$dst .= $linkify->to_html($s); + diff_before_or_after($dst, $ctx, \$after) unless $dctx; } else { - $state <= DSTATE_STAT or - to_state($dst, $state, DSTATE_INIT); - $$dst .= $linkify->to_html($s); + diff_before_or_after($dst, $ctx, \$x); } } - @$diff = (); - $$dst .= '' if $state2class[$state]; - undef; } 1;