about summary refs log tree commit
diff options
context:
space:
mode:
authorEric Wong <e@yhbt.net>2020-01-25 04:45:10 +0000
committerEric Wong <e@yhbt.net>2020-01-27 02:59:09 +0000
commitddec19694cbf0e1d94fb56c0fc4bd90eae540884 (patch)
tree67b174dd71519fd5b14f57f2740e41fd3645ffe6
parent19671d5736639c9f5d063a87a075fd309e41d203 (diff)
downloadpublic-inbox-ddec19694cbf0e1d94fb56c0fc4bd90eae540884.tar.gz
Instead of going line-by-line, use split() with a giant regexp
to capture groups of contiguous lines.  This offloads state
management to the regexp itself and makes it FAR easier to
keep track of <span> and </span> pairings.

Performance seems roughly on par after this change for the
meta@public-inbox archives.  It seems a tiny bit faster for
git@vger with xt/perf-msgview.t, likely due to the longer
messages and larger contiguous groups of lines having the same
prefix (or no prefix at all) and drastically reduces the number
of subroutine calls and Perl ops executed.
-rw-r--r--lib/PublicInbox/View.pm8
-rw-r--r--lib/PublicInbox/ViewDiff.pm297
2 files changed, 139 insertions, 166 deletions
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index e604aee0..ff36777c 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -562,7 +562,7 @@ sub add_text_body { # callback for msg_iter
                 $idx[0] = $upfx . $idx[0] if $upfx ne '';
                 $ctx->{-apfx} = join('/', @idx);
                 $ctx->{-anchors} = {}; # attr => filename
-                $ctx->{-diff} = $diff = [];
+                $diff = 1;
                 delete $ctx->{-long_path};
                 my $spfx;
                 if ($ibx->{-repo_objs}) {
@@ -595,14 +595,12 @@ sub add_text_body { # callback for msg_iter
                 attach_link($ctx, $ct, $p, $fn, $err);
                 $$rv .= "\n";
         }
-        my $l = PublicInbox::Linkify->new;
+        my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new;
         foreach my $cur (@sections) {
                 if ($cur =~ /\A>/) {
                         flush_quote($rv, $l, \$cur);
                 } elsif ($diff) {
-                        @$diff = split(/^/m, $cur);
-                        $cur = undef;
-                        flush_diff($rv, $ctx, $l);
+                        flush_diff($rv, $ctx, \$cur);
                 } else {
                         # regular lines, OK
                         $$rv .= $l->to_html($cur);
diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm
index ece95f4c..8c37c139 100644
--- a/lib/PublicInbox/ViewDiff.pm
+++ b/lib/PublicInbox/ViewDiff.pm
@@ -7,6 +7,7 @@
 # (or reconstruct) blobs.
 
 package PublicInbox::ViewDiff;
+use 5.010_001;
 use strict;
 use warnings;
 use base qw(Exporter);
@@ -15,52 +16,31 @@ use URI::Escape qw(uri_escape_utf8);
 use PublicInbox::Hval qw(ascii_html to_attr);
 use PublicInbox::Git qw(git_unquote);
 
-# keep track of state so we can avoid redundant HTML tags for
-# identically-classed lines
-sub DSTATE_INIT () { 0 }
-sub DSTATE_STAT () { 1 }
-sub DSTATE_HEAD () { 2 } # /^diff --git /, /^index /, /^--- /, /^\+\+\+ /
-sub DSTATE_CTX () { 3 } # /^ /
-sub DSTATE_ADD () { 4 } # /^\+/
-sub DSTATE_DEL () { 5 } # /^\-/
-
-# maps the DSTATE_* to CSS class names compatible with what cgit uses:
-my @state2class = (
-        '', # init
-        '', # stat
-        'head',
-        '', # ctx
-        'add',
-        'del'
-);
-
 sub UNSAFE () { "^A-Za-z0-9\-\._~/" }
 
 my $OID_NULL = '0{7,40}';
 my $OID_BLOB = '[a-f0-9]{7,40}';
-my $PATH_X = '"?[^/]+/.+|/dev/null';
 
 # cf. git diff.c :: get_compact_summary
 my $DIFFSTAT_COMMENT = qr/\((?:new|gone|(?:(?:new|mode) [\+\-][lx]))\)/;
 
 # link to line numbers in blobs
 sub diff_hunk ($$$$) {
-        my ($dctx, $spfx, $ca, $cb) = @_;
-        my $oid_a = $dctx->{oid_a};
-        my $oid_b = $dctx->{oid_b};
-
-        (defined($spfx) && defined($oid_a) && defined($oid_b)) or
-                return "@@ $ca $cb @@";
+        my ($dst, $dctx, $ca, $cb) = @_;
+        my ($oid_a, $oid_b, $spfx) = @$dctx{qw(oid_a oid_b spfx)};
 
-        my ($n) = ($ca =~ /^-([0-9]+)/);
-        $n = defined($n) ? do { ++$n; "#n$n" } : '';
+        if (defined($spfx) && defined($oid_a) && defined($oid_b)) {
+                my ($n) = ($ca =~ /^-([0-9]+)/);
+                $n = defined($n) ? do { ++$n; "#n$n" } : '';
 
-        my $rv = qq(@@ <a\nhref="$spfx$oid_a/s/$dctx->{Q}$n">$ca</a>);
+                $$dst .= qq(@@ <a\nhref="$spfx$oid_a/s/$dctx->{Q}$n">$ca</a>);
 
-        ($n) = ($cb =~ /^\+([0-9]+)/);
-        $n = defined($n) ? do { ++$n; "#n$n" } : '';
-
-        $rv .= qq( <a\nhref="$spfx$oid_b/s/$dctx->{Q}$n">$cb</a> @@);
+                ($n) = ($cb =~ /^\+([0-9]+)/);
+                $n = defined($n) ? do { ++$n; "#n$n" } : '';
+                $$dst .= qq( <a\nhref="$spfx$oid_b/s/$dctx->{Q}$n">$cb</a> @@);
+        } else {
+                $$dst .= "@@ $ca $cb @@";
+        }
 }
 
 sub oid ($$$) {
@@ -68,16 +48,9 @@ sub oid ($$$) {
         defined($spfx) ? qq(<a\nhref="$spfx$oid/s/$dctx->{Q}">$oid</a>) : $oid;
 }
 
-sub to_state ($$$) {
-        my ($dst, $state, $new_state) = @_;
-        $$dst .= '</span>' if $state2class[$state];
-        $_[1] = $new_state;
-        my $class = $state2class[$new_state] or return;
-        $$dst .= qq(<span\nclass="$class">);
-}
-
-sub anchor0 ($$$$$) {
-        my ($dst, $ctx, $linkify, $fn, $rest) = @_;
+# returns true if diffstat anchor written, false otherwise
+sub anchor0 ($$$$) {
+        my ($dst, $ctx, $fn, $rest) = @_;
 
         my $orig = $fn;
 
@@ -100,16 +73,16 @@ sub anchor0 ($$$$$) {
                 my $spaces = ($orig =~ s/( +)\z//) ? $1 : '';
                 $$dst .= " <a\nid=i$attr\nhref=#$attr>" .
                         ascii_html($orig) . '</a>' . $spaces .
-                        $linkify->to_html($rest);
+                        $ctx->{-linkify}->to_html($rest);
                 return 1;
         }
         undef;
 }
 
-sub anchor1 ($$$$$) {
-        my ($dst, $ctx, $linkify, $pb, $s) = @_;
+# returns "diff --git" anchor destination, undef otherwise
+sub anchor1 ($$) {
+        my ($ctx, $pb) = @_;
         my $attr = to_attr($ctx->{-apfx}.$pb) or return;
-        my $line = $linkify->to_html($s);
 
         my $ok = delete $ctx->{-anchors}->{$attr};
 
@@ -125,131 +98,133 @@ sub anchor1 ($$$$$) {
                         last;
                 }
         }
-        if ($ok && $line =~ s/^diff //) {
-                $$dst .= "<a\nhref=#i$attr\nid=$attr>diff</a> ".$line;
-                return 1;
-        }
-        undef
+        $ok ? "<a\nhref=#i$attr\nid=$attr>diff</a> --git" : undef
 }
 
-sub missing_diff_git_line ($$) {
-        my ($dctx, $pb) = @_;
-        # missing "diff --git ..."
-        $dctx->{path_b} = $pb;
-        $dctx->{Q} = '?b='.uri_escape_utf8($pb, UNSAFE);
-        my $pa = $dctx->{path_a};
-        if (defined($pa) && $pa ne $pb) {
-                $dctx->{Q} .= '&amp;a='. uri_escape_utf8($pa, UNSAFE);
-        }
-}
-
-sub flush_diff ($$$) {
-        my ($dst, $ctx, $linkify) = @_;
-        my $diff = $ctx->{-diff};
+sub diff_header ($$$$) {
+        my ($dst, $x, $ctx, $top) = @_;
+        my (undef, undef, $pa, $pb) = splice(@$top, 0, 4); # ignore oid_{a,b}
         my $spfx = $ctx->{-spfx};
-        my $state = DSTATE_INIT;
-        my $dctx = { Q => '' }; # {}, keys: oid_a, oid_b, path_a, path_b
-
-        foreach my $s (@$diff) {
-                if ($s =~ /^---$/) {
-                        to_state($dst, $state, DSTATE_STAT);
-                        $$dst .= $s;
-                } elsif ($s =~ /^ / || ($s =~ /^$/ && $state >= DSTATE_CTX)) {
-                        # works for common cases, but not weird/long filenames
-                        if ($state == DSTATE_STAT &&
-                                        $s =~ /^ (.+)( +\| .*\z)/s) {
-                                anchor0($dst, $ctx, $linkify, $1, $2) and next;
-                        } elsif ($state2class[$state]) {
-                                to_state($dst, $state, DSTATE_CTX);
-                        }
-                        $$dst .= $linkify->to_html($s);
-                } elsif ($s =~ /^-- $/) { # email signature begins
-                        $state == DSTATE_INIT or
-                                to_state($dst, $state, DSTATE_INIT);
-                        $$dst .= $s;
-                } elsif ($s =~ m!^diff --git ($PATH_X) ($PATH_X)$!o) {
-                        my ($pa, $pb) = ($1, $2);
-                        if ($state != DSTATE_HEAD) {
-                                to_state($dst, $state, DSTATE_HEAD);
-                        }
-                        $pa = (split('/', git_unquote($pa), 2))[1];
+        my $dctx = { spfx => $spfx };
+        if ($pa eq $pb && $pb ne '/dev/null') {
+                $pa = $pb = (split('/', git_unquote($pb), 2))[1];
+                $dctx->{Q} = "?b=".uri_escape_utf8($pb, UNSAFE);
+        } else {
+                my @q;
+                if ($pb ne '/dev/null') {
                         $pb = (split('/', git_unquote($pb), 2))[1];
-                        $dctx = {
-                                Q => "?b=".uri_escape_utf8($pb, UNSAFE),
-                        };
-                        if ($pa ne $pb) {
-                                $dctx->{Q} .= '&amp;a='.
-                                        uri_escape_utf8($pa, UNSAFE);
-                        }
-                        anchor1($dst, $ctx, $linkify, $pb, $s) and next;
-                        $$dst .= $linkify->to_html($s);
-                } elsif ($s =~ s/^(index $OID_NULL\.\.)($OID_BLOB)\b//o) {
-                        $$dst .= $1 . oid($dctx, $spfx, $2);
-                        $dctx = { Q => '' };
-                        $$dst .= $linkify->to_html($s) ;
-                } elsif ($s =~ s/^index ($OID_BLOB)(\.\.$OID_NULL)\b//o) {
-                        $$dst .= 'index ' . oid($dctx, $spfx, $1) . $2;
-                        $dctx = { Q => '' };
-                        $$dst .= $linkify->to_html($s);
-                } elsif ($s =~ /^index ($OID_BLOB)\.\.($OID_BLOB)/o) {
-                        $dctx->{oid_a} = $1;
-                        $dctx->{oid_b} = $2;
-                        $$dst .= $linkify->to_html($s);
-                } elsif ($s =~ s/^@@ (\S+) (\S+) @@//) {
-                        $$dst .= '</span>' if $state2class[$state];
-                        $$dst .= qq(<span\nclass="hunk">);
-                        $$dst .= diff_hunk($dctx, $spfx, $1, $2);
-                        $$dst .= '</span>';
-                        $state = DSTATE_CTX;
-                        $$dst .= $linkify->to_html($s);
-                } elsif ($s =~ m!^--- ($PATH_X)!o) {
-                        my $pa = $1;
+                        push @q, 'b='.uri_escape_utf8($pb, UNSAFE);
+                }
+                if ($pa ne '/dev/null') {
                         $pa = (split('/', git_unquote($pa), 2))[1];
-                        if (($dctx->{path_a} // '') ne $pa) {
-                                # missing "diff --git ..." ?
-                                $dctx->{path_a} = $pa;
-                        }
-                        # color only (no oid link) if missing dctx->{oid_*}
-                        $state <= DSTATE_STAT and
-                                to_state($dst, $state, DSTATE_HEAD);
-                        $$dst .= $linkify->to_html($s);
-                } elsif ($s =~ m!^\+{3} ($PATH_X)!o) {
-                        my $pb = $1;
-                        $pb = (split('/', git_unquote($pb), 2))[1];
-                        if (($dctx->{path_b} // '') ne $pb) {
-                                missing_diff_git_line($dctx, $pb);
-                        }
+                        push @q, 'a='.uri_escape_utf8($pa, UNSAFE);
+                }
+                $dctx->{Q} = '?'.join('&amp;', @q);
+        }
 
-                        # color only (no oid link) if missing dctx->{oid_*}
-                        $state <= DSTATE_STAT and
-                                to_state($dst, $state, DSTATE_HEAD);
-                        $$dst .= $linkify->to_html($s);
-                } elsif ($s =~ /^\+/) {
-                        if ($state != DSTATE_ADD && $state > DSTATE_STAT) {
-                                to_state($dst, $state, DSTATE_ADD);
+        # linkify early and all at once, since we know the following
+        # subst ops on $$x won't need further escaping:
+        $$x = $ctx->{-linkify}->to_html($$x);
+
+        # no need to capture oid_a and oid_b on add/delete,
+        # we just linkify OIDs directly via s///e in conditional
+        if (($$x =~ s/^(index $OID_NULL\.\.)($OID_BLOB)\b/
+                        $1 . oid($dctx, $spfx, $2)/emos) ||
+                ($$x =~ s/^index ($OID_BLOB)(\.\.$OID_NULL)\b/
+                        'index ' . oid($dctx, $spfx, $1) . $2/emos)) {
+        } elsif ($$x =~ /^index ($OID_BLOB)\.\.($OID_BLOB)/mos) {
+                # modification-only, not add/delete:
+                # linkify hunk headers later using oid_a and oid_b
+                @$dctx{qw(oid_a oid_b)} = ($1, $2);
+        } else {
+                warn "BUG? <$$x> had no ^index line";
+        }
+        $$x =~ s!^diff --git!anchor1($ctx, $pb) // 'diff --git'!emos;
+        $$dst .= qq(<span\nclass="head">);
+        $$dst .= $$x;
+        $$dst .= '</span>';
+        $dctx;
+}
+
+sub diff_before_or_after ($$$) {
+        my ($dst, $ctx, $x) = @_;
+        my $linkify = $ctx->{-linkify};
+        for my $y (split(/(^---\r?\n)/sm, $$x)) {
+                if ($y =~ /\A---\r?\n\z/s) {
+                        $$dst .= "---\n"; # all HTML is "\r\n" => "\n"
+                } elsif ($y =~ /^ [0-9]+ files? changed, /sm) {
+                        # ok, looks like a diffstat, go line-by-line:
+                        for my $l (split(/^/m, $y)) {
+                                if ($l =~ /^ (.+)( +\| .*\z)/s) {
+                                        anchor0($dst, $ctx, $1, $2) and next;
+                                }
+                                $$dst .= $linkify->to_html($l);
                         }
-                        $$dst .= $linkify->to_html($s);
-                } elsif ($s =~ /^-/) {
-                        if ($state != DSTATE_DEL && $state > DSTATE_STAT) {
-                                to_state($dst, $state, DSTATE_DEL);
+                } else { # commit message, notes, etc
+                        $$dst .= $linkify->to_html($y);
+                }
+        }
+}
+
+sub flush_diff ($$$) {
+        my ($dst, $ctx, $cur) = @_;
+        state $LF = qr!\r?\n!;
+        state $ANY = qr![^\r\n]!;
+        state $FN = qr!(?:"?[^/\n]+/[^\r\n]+|/dev/null)!;
+
+        my @top = split(/(
+                (?:        # begin header stuff, don't capture filenames, here,
+                        # but instead wait for the --- and +++ lines.
+                        (?:^diff\x20--git\x20$FN\x20$FN$LF)
+
+                        # old mode || new mode || copy|rename|deleted|...
+                        (?:^[a-z]$ANY+$LF)*
+                )? # end of optional stuff, everything below is required
+                ^index\x20($OID_BLOB)\.\.($OID_BLOB)$ANY*$LF
+                ^---\x20($FN)$LF
+                ^\+{3}\x20($FN)$LF)/smxo, $$cur);
+        $$cur = undef;
+
+        my $linkify = $ctx->{-linkify};
+        my $dctx; # {}, keys: Q, oid_a, oid_b
+
+        while (defined(my $x = shift @top)) {
+                if (scalar(@top) >= 4 &&
+                                $top[1] =~ /\A$OID_BLOB\z/os &&
+                                $top[0] =~ /\A$OID_BLOB\z/os) {
+                        $dctx = diff_header($dst, \$x, $ctx, \@top);
+                } elsif ($dctx) {
+                        my $after = '';
+                        for my $s (split(/((?:(?:^\+[^\n]*\n)+)|
+                                        (?:(?:^-[^\n]*\n)+)|
+                                        (?:^@@ [^\n]+\n))/xsm, $x)) {
+                                if (!defined($dctx)) {
+                                        $after .= $s;
+                                } elsif ($s =~ s/\A@@ (\S+) (\S+) @@//) {
+                                        $$dst .= qq(<span\nclass="hunk">);
+                                        diff_hunk($dst, $dctx, $1, $2);
+                                        $$dst .= $linkify->to_html($s);
+                                        $$dst .= '</span>';
+                                } elsif ($s =~ /\A\+/) {
+                                        $$dst .= qq(<span\nclass="add">);
+                                        $$dst .= $linkify->to_html($s);
+                                        $$dst .= '</span>';
+                                } elsif ($s =~ /\A-- $/sm) { # email sig starts
+                                        $dctx = undef;
+                                        $after .= $s;
+                                } elsif ($s =~ /\A-/) {
+                                        $$dst .= qq(<span\nclass="del">);
+                                        $$dst .= $linkify->to_html($s);
+                                        $$dst .= '</span>';
+                                } else {
+                                        $$dst .= $linkify->to_html($s);
+                                }
                         }
-                        $$dst .= $linkify->to_html($s);
-                # ignore the following lines in headers:
-                } elsif ($s =~ /^(?:dis)similarity index/ ||
-                         $s =~ /^(?:old|new) mode/ ||
-                         $s =~ /^(?:deleted|new) file mode/ ||
-                         $s =~ /^(?:copy|rename) (?:from|to) / ||
-                         $s =~ /^(?:dis)?similarity index /) {
-                        $$dst .= $linkify->to_html($s);
+                        diff_before_or_after($dst, $ctx, \$after) unless $dctx;
                 } else {
-                        $state <= DSTATE_STAT or
-                                to_state($dst, $state, DSTATE_INIT);
-                        $$dst .= $linkify->to_html($s);
+                        diff_before_or_after($dst, $ctx, \$x);
                 }
         }
-        @$diff = ();
-        $$dst .= '</span>' if $state2class[$state];
-        undef;
 }
 
 1;