user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 22/22] viewdiff: rewrite and simplify
Date: Sat, 25 Jan 2020 04:45:10 +0000	[thread overview]
Message-ID: <20200125044510.13769-23-e@yhbt.net> (raw)
In-Reply-To: <20200125044510.13769-1-e@yhbt.net>

Instead of going line-by-line, use split() with a giant regexp
to capture groups of contiguous lines.  This offloads state
management to the regexp itself and makes it FAR easier to
keep track of <span> and </span> pairings.

Performance seems roughly on par after this change for the
meta@public-inbox archives.  It seems a tiny bit faster for
git@vger with xt/perf-msgview.t, likely due to the longer
messages and larger contiguous groups of lines having the same
prefix (or no prefix at all) and drastically reduces the number
of subroutine calls and Perl ops executed.
---
 lib/PublicInbox/View.pm     |   8 +-
 lib/PublicInbox/ViewDiff.pm | 297 +++++++++++++++++-------------------
 2 files changed, 139 insertions(+), 166 deletions(-)

diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index f613afb5..9a5e3997 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -562,7 +562,7 @@ sub add_text_body { # callback for msg_iter
 		$idx[0] = $upfx . $idx[0] if $upfx ne '';
 		$ctx->{-apfx} = join('/', @idx);
 		$ctx->{-anchors} = {}; # attr => filename
-		$ctx->{-diff} = $diff = [];
+		$diff = 1;
 		delete $ctx->{-long_path};
 		my $spfx;
 		if ($ibx->{-repo_objs}) {
@@ -595,14 +595,12 @@ sub add_text_body { # callback for msg_iter
 		attach_link($ctx, $ct, $p, $fn, $err);
 		$$rv .= "\n";
 	}
-	my $l = PublicInbox::Linkify->new;
+	my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new;
 	foreach my $cur (@sections) {
 		if ($cur =~ /\A>/) {
 			flush_quote($rv, $l, \$cur);
 		} elsif ($diff) {
-			@$diff = split(/^/m, $cur);
-			$cur = undef;
-			flush_diff($rv, $ctx, $l);
+			flush_diff($rv, $ctx, \$cur);
 		} else {
 			# regular lines, OK
 			$$rv .= $l->to_html($cur);
diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm
index ece95f4c..8c37c139 100644
--- a/lib/PublicInbox/ViewDiff.pm
+++ b/lib/PublicInbox/ViewDiff.pm
@@ -7,6 +7,7 @@
 # (or reconstruct) blobs.
 
 package PublicInbox::ViewDiff;
+use 5.010_001;
 use strict;
 use warnings;
 use base qw(Exporter);
@@ -15,52 +16,31 @@ use URI::Escape qw(uri_escape_utf8);
 use PublicInbox::Hval qw(ascii_html to_attr);
 use PublicInbox::Git qw(git_unquote);
 
-# keep track of state so we can avoid redundant HTML tags for
-# identically-classed lines
-sub DSTATE_INIT () { 0 }
-sub DSTATE_STAT () { 1 }
-sub DSTATE_HEAD () { 2 } # /^diff --git /, /^index /, /^--- /, /^\+\+\+ /
-sub DSTATE_CTX () { 3 } # /^ /
-sub DSTATE_ADD () { 4 } # /^\+/
-sub DSTATE_DEL () { 5 } # /^\-/
-
-# maps the DSTATE_* to CSS class names compatible with what cgit uses:
-my @state2class = (
-	'', # init
-	'', # stat
-	'head',
-	'', # ctx
-	'add',
-	'del'
-);
-
 sub UNSAFE () { "^A-Za-z0-9\-\._~/" }
 
 my $OID_NULL = '0{7,40}';
 my $OID_BLOB = '[a-f0-9]{7,40}';
-my $PATH_X = '"?[^/]+/.+|/dev/null';
 
 # cf. git diff.c :: get_compact_summary
 my $DIFFSTAT_COMMENT = qr/\((?:new|gone|(?:(?:new|mode) [\+\-][lx]))\)/;
 
 # link to line numbers in blobs
 sub diff_hunk ($$$$) {
-	my ($dctx, $spfx, $ca, $cb) = @_;
-	my $oid_a = $dctx->{oid_a};
-	my $oid_b = $dctx->{oid_b};
-
-	(defined($spfx) && defined($oid_a) && defined($oid_b)) or
-		return "@@ $ca $cb @@";
+	my ($dst, $dctx, $ca, $cb) = @_;
+	my ($oid_a, $oid_b, $spfx) = @$dctx{qw(oid_a oid_b spfx)};
 
-	my ($n) = ($ca =~ /^-([0-9]+)/);
-	$n = defined($n) ? do { ++$n; "#n$n" } : '';
+	if (defined($spfx) && defined($oid_a) && defined($oid_b)) {
+		my ($n) = ($ca =~ /^-([0-9]+)/);
+		$n = defined($n) ? do { ++$n; "#n$n" } : '';
 
-	my $rv = qq(@@ <a\nhref="$spfx$oid_a/s/$dctx->{Q}$n">$ca</a>);
+		$$dst .= qq(@@ <a\nhref="$spfx$oid_a/s/$dctx->{Q}$n">$ca</a>);
 
-	($n) = ($cb =~ /^\+([0-9]+)/);
-	$n = defined($n) ? do { ++$n; "#n$n" } : '';
-
-	$rv .= qq( <a\nhref="$spfx$oid_b/s/$dctx->{Q}$n">$cb</a> @@);
+		($n) = ($cb =~ /^\+([0-9]+)/);
+		$n = defined($n) ? do { ++$n; "#n$n" } : '';
+		$$dst .= qq( <a\nhref="$spfx$oid_b/s/$dctx->{Q}$n">$cb</a> @@);
+	} else {
+		$$dst .= "@@ $ca $cb @@";
+	}
 }
 
 sub oid ($$$) {
@@ -68,16 +48,9 @@ sub oid ($$$) {
 	defined($spfx) ? qq(<a\nhref="$spfx$oid/s/$dctx->{Q}">$oid</a>) : $oid;
 }
 
-sub to_state ($$$) {
-	my ($dst, $state, $new_state) = @_;
-	$$dst .= '</span>' if $state2class[$state];
-	$_[1] = $new_state;
-	my $class = $state2class[$new_state] or return;
-	$$dst .= qq(<span\nclass="$class">);
-}
-
-sub anchor0 ($$$$$) {
-	my ($dst, $ctx, $linkify, $fn, $rest) = @_;
+# returns true if diffstat anchor written, false otherwise
+sub anchor0 ($$$$) {
+	my ($dst, $ctx, $fn, $rest) = @_;
 
 	my $orig = $fn;
 
@@ -100,16 +73,16 @@ sub anchor0 ($$$$$) {
 		my $spaces = ($orig =~ s/( +)\z//) ? $1 : '';
 		$$dst .= " <a\nid=i$attr\nhref=#$attr>" .
 			ascii_html($orig) . '</a>' . $spaces .
-			$linkify->to_html($rest);
+			$ctx->{-linkify}->to_html($rest);
 		return 1;
 	}
 	undef;
 }
 
-sub anchor1 ($$$$$) {
-	my ($dst, $ctx, $linkify, $pb, $s) = @_;
+# returns "diff --git" anchor destination, undef otherwise
+sub anchor1 ($$) {
+	my ($ctx, $pb) = @_;
 	my $attr = to_attr($ctx->{-apfx}.$pb) or return;
-	my $line = $linkify->to_html($s);
 
 	my $ok = delete $ctx->{-anchors}->{$attr};
 
@@ -125,131 +98,133 @@ sub anchor1 ($$$$$) {
 			last;
 		}
 	}
-	if ($ok && $line =~ s/^diff //) {
-		$$dst .= "<a\nhref=#i$attr\nid=$attr>diff</a> ".$line;
-		return 1;
-	}
-	undef
+	$ok ? "<a\nhref=#i$attr\nid=$attr>diff</a> --git" : undef
 }
 
-sub missing_diff_git_line ($$) {
-	my ($dctx, $pb) = @_;
-	# missing "diff --git ..."
-	$dctx->{path_b} = $pb;
-	$dctx->{Q} = '?b='.uri_escape_utf8($pb, UNSAFE);
-	my $pa = $dctx->{path_a};
-	if (defined($pa) && $pa ne $pb) {
-		$dctx->{Q} .= '&amp;a='. uri_escape_utf8($pa, UNSAFE);
-	}
-}
-
-sub flush_diff ($$$) {
-	my ($dst, $ctx, $linkify) = @_;
-	my $diff = $ctx->{-diff};
+sub diff_header ($$$$) {
+	my ($dst, $x, $ctx, $top) = @_;
+	my (undef, undef, $pa, $pb) = splice(@$top, 0, 4); # ignore oid_{a,b}
 	my $spfx = $ctx->{-spfx};
-	my $state = DSTATE_INIT;
-	my $dctx = { Q => '' }; # {}, keys: oid_a, oid_b, path_a, path_b
-
-	foreach my $s (@$diff) {
-		if ($s =~ /^---$/) {
-			to_state($dst, $state, DSTATE_STAT);
-			$$dst .= $s;
-		} elsif ($s =~ /^ / || ($s =~ /^$/ && $state >= DSTATE_CTX)) {
-			# works for common cases, but not weird/long filenames
-			if ($state == DSTATE_STAT &&
-					$s =~ /^ (.+)( +\| .*\z)/s) {
-				anchor0($dst, $ctx, $linkify, $1, $2) and next;
-			} elsif ($state2class[$state]) {
-				to_state($dst, $state, DSTATE_CTX);
-			}
-			$$dst .= $linkify->to_html($s);
-		} elsif ($s =~ /^-- $/) { # email signature begins
-			$state == DSTATE_INIT or
-				to_state($dst, $state, DSTATE_INIT);
-			$$dst .= $s;
-		} elsif ($s =~ m!^diff --git ($PATH_X) ($PATH_X)$!o) {
-			my ($pa, $pb) = ($1, $2);
-			if ($state != DSTATE_HEAD) {
-				to_state($dst, $state, DSTATE_HEAD);
-			}
-			$pa = (split('/', git_unquote($pa), 2))[1];
+	my $dctx = { spfx => $spfx };
+	if ($pa eq $pb && $pb ne '/dev/null') {
+		$pa = $pb = (split('/', git_unquote($pb), 2))[1];
+		$dctx->{Q} = "?b=".uri_escape_utf8($pb, UNSAFE);
+	} else {
+		my @q;
+		if ($pb ne '/dev/null') {
 			$pb = (split('/', git_unquote($pb), 2))[1];
-			$dctx = {
-				Q => "?b=".uri_escape_utf8($pb, UNSAFE),
-			};
-			if ($pa ne $pb) {
-				$dctx->{Q} .= '&amp;a='.
-					uri_escape_utf8($pa, UNSAFE);
-			}
-			anchor1($dst, $ctx, $linkify, $pb, $s) and next;
-			$$dst .= $linkify->to_html($s);
-		} elsif ($s =~ s/^(index $OID_NULL\.\.)($OID_BLOB)\b//o) {
-			$$dst .= $1 . oid($dctx, $spfx, $2);
-			$dctx = { Q => '' };
-			$$dst .= $linkify->to_html($s) ;
-		} elsif ($s =~ s/^index ($OID_BLOB)(\.\.$OID_NULL)\b//o) {
-			$$dst .= 'index ' . oid($dctx, $spfx, $1) . $2;
-			$dctx = { Q => '' };
-			$$dst .= $linkify->to_html($s);
-		} elsif ($s =~ /^index ($OID_BLOB)\.\.($OID_BLOB)/o) {
-			$dctx->{oid_a} = $1;
-			$dctx->{oid_b} = $2;
-			$$dst .= $linkify->to_html($s);
-		} elsif ($s =~ s/^@@ (\S+) (\S+) @@//) {
-			$$dst .= '</span>' if $state2class[$state];
-			$$dst .= qq(<span\nclass="hunk">);
-			$$dst .= diff_hunk($dctx, $spfx, $1, $2);
-			$$dst .= '</span>';
-			$state = DSTATE_CTX;
-			$$dst .= $linkify->to_html($s);
-		} elsif ($s =~ m!^--- ($PATH_X)!o) {
-			my $pa = $1;
+			push @q, 'b='.uri_escape_utf8($pb, UNSAFE);
+		}
+		if ($pa ne '/dev/null') {
 			$pa = (split('/', git_unquote($pa), 2))[1];
-			if (($dctx->{path_a} // '') ne $pa) {
-				# missing "diff --git ..." ?
-				$dctx->{path_a} = $pa;
-			}
-			# color only (no oid link) if missing dctx->{oid_*}
-			$state <= DSTATE_STAT and
-				to_state($dst, $state, DSTATE_HEAD);
-			$$dst .= $linkify->to_html($s);
-		} elsif ($s =~ m!^\+{3} ($PATH_X)!o) {
-			my $pb = $1;
-			$pb = (split('/', git_unquote($pb), 2))[1];
-			if (($dctx->{path_b} // '') ne $pb) {
-				missing_diff_git_line($dctx, $pb);
-			}
+			push @q, 'a='.uri_escape_utf8($pa, UNSAFE);
+		}
+		$dctx->{Q} = '?'.join('&amp;', @q);
+	}
 
-			# color only (no oid link) if missing dctx->{oid_*}
-			$state <= DSTATE_STAT and
-				to_state($dst, $state, DSTATE_HEAD);
-			$$dst .= $linkify->to_html($s);
-		} elsif ($s =~ /^\+/) {
-			if ($state != DSTATE_ADD && $state > DSTATE_STAT) {
-				to_state($dst, $state, DSTATE_ADD);
+	# linkify early and all at once, since we know the following
+	# subst ops on $$x won't need further escaping:
+	$$x = $ctx->{-linkify}->to_html($$x);
+
+	# no need to capture oid_a and oid_b on add/delete,
+	# we just linkify OIDs directly via s///e in conditional
+	if (($$x =~ s/^(index $OID_NULL\.\.)($OID_BLOB)\b/
+			$1 . oid($dctx, $spfx, $2)/emos) ||
+		($$x =~ s/^index ($OID_BLOB)(\.\.$OID_NULL)\b/
+			'index ' . oid($dctx, $spfx, $1) . $2/emos)) {
+	} elsif ($$x =~ /^index ($OID_BLOB)\.\.($OID_BLOB)/mos) {
+		# modification-only, not add/delete:
+		# linkify hunk headers later using oid_a and oid_b
+		@$dctx{qw(oid_a oid_b)} = ($1, $2);
+	} else {
+		warn "BUG? <$$x> had no ^index line";
+	}
+	$$x =~ s!^diff --git!anchor1($ctx, $pb) // 'diff --git'!emos;
+	$$dst .= qq(<span\nclass="head">);
+	$$dst .= $$x;
+	$$dst .= '</span>';
+	$dctx;
+}
+
+sub diff_before_or_after ($$$) {
+	my ($dst, $ctx, $x) = @_;
+	my $linkify = $ctx->{-linkify};
+	for my $y (split(/(^---\r?\n)/sm, $$x)) {
+		if ($y =~ /\A---\r?\n\z/s) {
+			$$dst .= "---\n"; # all HTML is "\r\n" => "\n"
+		} elsif ($y =~ /^ [0-9]+ files? changed, /sm) {
+			# ok, looks like a diffstat, go line-by-line:
+			for my $l (split(/^/m, $y)) {
+				if ($l =~ /^ (.+)( +\| .*\z)/s) {
+					anchor0($dst, $ctx, $1, $2) and next;
+				}
+				$$dst .= $linkify->to_html($l);
 			}
-			$$dst .= $linkify->to_html($s);
-		} elsif ($s =~ /^-/) {
-			if ($state != DSTATE_DEL && $state > DSTATE_STAT) {
-				to_state($dst, $state, DSTATE_DEL);
+		} else { # commit message, notes, etc
+			$$dst .= $linkify->to_html($y);
+		}
+	}
+}
+
+sub flush_diff ($$$) {
+	my ($dst, $ctx, $cur) = @_;
+	state $LF = qr!\r?\n!;
+	state $ANY = qr![^\r\n]!;
+	state $FN = qr!(?:"?[^/\n]+/[^\r\n]+|/dev/null)!;
+
+	my @top = split(/(
+		(?:	# begin header stuff, don't capture filenames, here,
+			# but instead wait for the --- and +++ lines.
+			(?:^diff\x20--git\x20$FN\x20$FN$LF)
+
+			# old mode || new mode || copy|rename|deleted|...
+			(?:^[a-z]$ANY+$LF)*
+		)? # end of optional stuff, everything below is required
+		^index\x20($OID_BLOB)\.\.($OID_BLOB)$ANY*$LF
+		^---\x20($FN)$LF
+		^\+{3}\x20($FN)$LF)/smxo, $$cur);
+	$$cur = undef;
+
+	my $linkify = $ctx->{-linkify};
+	my $dctx; # {}, keys: Q, oid_a, oid_b
+
+	while (defined(my $x = shift @top)) {
+		if (scalar(@top) >= 4 &&
+				$top[1] =~ /\A$OID_BLOB\z/os &&
+				$top[0] =~ /\A$OID_BLOB\z/os) {
+			$dctx = diff_header($dst, \$x, $ctx, \@top);
+		} elsif ($dctx) {
+			my $after = '';
+			for my $s (split(/((?:(?:^\+[^\n]*\n)+)|
+					(?:(?:^-[^\n]*\n)+)|
+					(?:^@@ [^\n]+\n))/xsm, $x)) {
+				if (!defined($dctx)) {
+					$after .= $s;
+				} elsif ($s =~ s/\A@@ (\S+) (\S+) @@//) {
+					$$dst .= qq(<span\nclass="hunk">);
+					diff_hunk($dst, $dctx, $1, $2);
+					$$dst .= $linkify->to_html($s);
+					$$dst .= '</span>';
+				} elsif ($s =~ /\A\+/) {
+					$$dst .= qq(<span\nclass="add">);
+					$$dst .= $linkify->to_html($s);
+					$$dst .= '</span>';
+				} elsif ($s =~ /\A-- $/sm) { # email sig starts
+					$dctx = undef;
+					$after .= $s;
+				} elsif ($s =~ /\A-/) {
+					$$dst .= qq(<span\nclass="del">);
+					$$dst .= $linkify->to_html($s);
+					$$dst .= '</span>';
+				} else {
+					$$dst .= $linkify->to_html($s);
+				}
 			}
-			$$dst .= $linkify->to_html($s);
-		# ignore the following lines in headers:
-		} elsif ($s =~ /^(?:dis)similarity index/ ||
-			 $s =~ /^(?:old|new) mode/ ||
-			 $s =~ /^(?:deleted|new) file mode/ ||
-			 $s =~ /^(?:copy|rename) (?:from|to) / ||
-			 $s =~ /^(?:dis)?similarity index /) {
-			$$dst .= $linkify->to_html($s);
+			diff_before_or_after($dst, $ctx, \$after) unless $dctx;
 		} else {
-			$state <= DSTATE_STAT or
-				to_state($dst, $state, DSTATE_INIT);
-			$$dst .= $linkify->to_html($s);
+			diff_before_or_after($dst, $ctx, \$x);
 		}
 	}
-	@$diff = ();
-	$$dst .= '</span>' if $state2class[$state];
-	undef;
 }
 
 1;

      parent reply	other threads:[~2020-01-25  4:45 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-01-25  4:44 [PATCH 00/22] HTML display cleanups, fixes, speedups Eric Wong
2020-01-25  4:44 ` [PATCH 01/22] www*stream: favor \&close instead of *close Eric Wong
2020-01-25  4:44 ` [PATCH 02/22] www: use "skel" terminology consistently Eric Wong
2020-01-25  4:44 ` [PATCH 03/22] view: improve readability around walk_thread Eric Wong
2020-01-25  4:44 ` [PATCH 04/22] searchview: keep $noop sub private to the package Eric Wong
2020-01-25  4:44 ` [PATCH 05/22] view: reduce parameters for html_footer Eric Wong
2020-01-25  4:44 ` [PATCH 06/22] view: thread_skel: drop constant tpfx parameter Eric Wong
2020-01-25  4:44 ` [PATCH 07/22] view: simplify duplicate Message-ID handling Eric Wong
2020-01-25  4:44 ` [PATCH 08/22] wwwstream: discard single-use $ctx fields after use Eric Wong
2020-01-25  4:44 ` [PATCH 09/22] view: start performing buffering into {obuf} Eric Wong
2020-01-25  4:44 ` [PATCH 10/22] t/plack.t: modernize and unindent Eric Wong
2020-01-25  4:44 ` [PATCH 11/22] init: use Import::run_die instead of system() Eric Wong
2020-01-25  4:45 ` [PATCH 12/22] tests: move the majority of t/view.t into t/plack.t Eric Wong
2020-01-25  4:45 ` [PATCH 13/22] xt/perf-msgview: switch to multipart_text_as_html Eric Wong
2020-01-25  4:45 ` [PATCH 14/22] view: inline and eliminate msg_html Eric Wong
2020-01-25  4:45 ` [PATCH 15/22] linkify: compile $LINK_RE once Eric Wong
2020-01-25  4:45 ` [PATCH 16/22] linkify: move to_html over from ViewDiff Eric Wong
2020-01-25  4:45 ` [PATCH 17/22] searchidx: skip filenames on "diff --git ..." Eric Wong
2020-01-25  4:45 ` [PATCH 18/22] searchidx: don't assume "a/" and "b/" as prefixes Eric Wong
2020-01-25  4:45 ` [PATCH 19/22] viewdiff: add "b=" param with non-standard diff prefix Eric Wong
2020-01-25  4:45 ` [PATCH 20/22] viewdiff: add "b=" param when missing "diff --git" line Eric Wong
2020-01-25  4:45 ` [PATCH 21/22] viewdiff: use autovivification for long_path hash Eric Wong
2020-01-25  4:45 ` Eric Wong [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200125044510.13769-23-e@yhbt.net \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).