user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: Re: [PATCH 2/3] solver: extract_diff: deal with missing "diff --git" line
Date: Fri, 3 Jan 2020 00:43:57 +0000	[thread overview]
Message-ID: <20200103004357.GA18883@dcvr> (raw)
In-Reply-To: <20200102092459.17612-3-e@80x24.org>

Eric Wong <e@80x24.org> wrote:
> Rewrite the patch extraction loop using a single regexp which
> accounts for missing "diff --git ..." lines and is capable of
> extracting pathnames off the "+++ b/foo" line.
> 
> This fixes the solving of blob "96f1c7f" off
> <2841d2de-32ad-eae8-6039-9251a40bb00e@tngtech.com>
> in git@vger archives.
> ---
>  lib/PublicInbox/SolverGit.pm | 165 ++++++++++++++++++-----------------
>  1 file changed, 85 insertions(+), 80 deletions(-)
> 
> diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm
> index 3e3a5899..e1b82f3e 100644
> --- a/lib/PublicInbox/SolverGit.pm
> +++ b/lib/PublicInbox/SolverGit.pm
> @@ -10,6 +10,7 @@
>  package PublicInbox::SolverGit;
>  use strict;
>  use warnings;
> +use 5.010_001;
>  use File::Temp 0.19 ();
>  use Fcntl qw(SEEK_SET);
>  use PublicInbox::Git qw(git_unquote git_quote);
> @@ -38,7 +39,7 @@ my $MAX_PATCH = 9999;
>  #	oid_a => abbreviated pre-image oid,
>  #	oid_b => abbreviated post-image oid,
>  #	tmp => anonymous file handle with the diff,
> -#	hdr_lines => arrayref of various header lines for mode information
> +#	hdr_lines => string of various header lines for mode information
>  #	mode_a => original mode of oid_a (string, not integer),
>  #	ibx => PublicInbox::Inbox object containing the diff
>  #	smsg => PublicInbox::SearchMsg object containing diff
> @@ -47,10 +48,6 @@ my $MAX_PATCH = 9999;
>  #	n => numeric path of the patch (relative to worktree)
>  # }
>  
> -# don't bother if somebody sends us a patch with these path components,
> -# it's junk at best, an attack attempt at worse:
> -my %bad_component = map { $_ => 1 } ('', '.', '..');
> -
>  sub dbg ($$) {
>  	print { $_[0]->{out} } $_[1], "\n" or ERR($_[0], "print(dbg): $!");
>  }
> @@ -100,10 +97,8 @@ sub solve_existing ($$) {
>  
>  sub extract_diff ($$) {
>  	my ($p, $arg) = @_;
> -	my ($self, $diffs, $re, $ibx, $smsg) = @$arg;
> +	my ($self, $diffs, $pre, $post, $ibx, $smsg) = @$arg;
>  	my ($part) = @$p; # ignore $depth and @idx;
> -	my $hdr_lines; # diff --git a/... b/...
> -	my $tmp;
>  	my $ct = $part->content_type || 'text/plain';
>  	my ($s, undef) = msg_part_text($part, $ct);
>  	defined $s or return;
> @@ -116,66 +111,85 @@ sub extract_diff ($$) {
>  		$s =~ s/\r\n/\n/sg;
>  	}
>  
> -	foreach my $l (split(/^/m, $s)) {
> -		if ($l =~ $re) {
> -			$di->{oid_a} = $1;
> -			$di->{oid_b} = $2;
> -			if (defined($3)) {
> -				my $mode_a = $3;
> -				if ($mode_a =~ /\A(?:100644|120000|100755)\z/) {
> -					$di->{mode_a} = $mode_a;
> -				}
> -			}
> -
> -
> -			# start writing the diff out to a tempfile
> -			my $path = ++$self->{tot};
> -			$di->{n} = $path;
> -			open($tmp, '>', $self->{tmp}->dirname . "/$path") or
> -							die "open(tmp): $!";
> -
> -			push @$hdr_lines, $l;
> -			$di->{hdr_lines} = $hdr_lines;
> -			utf8::encode($_) for @$hdr_lines;
> -			print $tmp @$hdr_lines or die "print(tmp): $!";
> -
> -			# for debugging/diagnostics:
> -			$di->{ibx} = $ibx;
> -			$di->{smsg} = $smsg;
> -		} elsif ($l =~ m!\Adiff --git ("?[^/]+/.+) ("?[^/]+/.+)$!) {
> -			last if $tmp; # got our blob, done!
> -
> -			my ($path_a, $path_b) = ($1, $2);
> -
> -			# diff header lines won't have \r because git
> -			# will quote them, but Email::MIME gives CRLF
> -			# for quoted-printable:
> -			$path_b =~ tr/\r//d;
> -
> -			# don't care for leading 'a/' and 'b/'
> -			my (undef, @a) = split(m{/}, git_unquote($path_a));
> -			my (undef, @b) = split(m{/}, git_unquote($path_b));
> -
> -			# get rid of path-traversal attempts and junk patches:
> -			foreach (@a, @b) {
> -				return if $bad_component{$_};
> -			}
> -
> -			$di->{path_a} = join('/', @a);
> -			$di->{path_b} = join('/', @b);
> -			$hdr_lines = [ $l ];
> -		} elsif ($tmp) {
> -			utf8::encode($l);
> -			print $tmp $l or die "print(tmp): $!";
> -		} elsif ($hdr_lines) {
> -			push @$hdr_lines, $l;
> -			if ($l =~ /\Anew file mode (100644|120000|100755)$/) {
> -				$di->{mode_a} = $1;
> -			}
> -		}
> -	}
> -	return undef unless $tmp;
> +	state $LF = qr!\r?\n!;
> +	state $FN = qr!(?:("?[^/\n]+/[^\r\n]+)|/dev/null)!;
> +
> +	$s =~ m!( # $1 start header lines we save for debugging:
> +
> +		# everything before ^index is optional, but we don't
> +		# want to match ^(old|copy|rename|deleted|...) unless
> +		# we match /^diff --git/ first:
> +		(?: # begin optional stuff:
> +
> +		# try to get the pre-and-post filenames as $2 and $3
> +		(?:^diff\x20--git\x20$FN\x20$FN$LF)
> +
> +		# old mode $4
> +		(?:^old mode\x20(100644|120000|100755)$LF)?
> +
> +		# ignore other info
> +		(?:^(?:copy|rename|deleted|dissimilarity|similarity).*$LF)?
> +
> +		# new mode (possibly new file) ($5)
> +		(?:^new\x20(?:file\x20)?mode\x20(100644|120000|100755)$LF)?
> +
> +		# ignore other info
> +		(?:^(?:copy|rename|deleted|dissimilarity|similarity).*$LF)?
> +
> +		)? # end of optional stuff, everything below is required
> +
> +		# match the pre and post-image OIDs as $6 $7
> +		^index\x20(${pre}[a-f0-9]*)\.\.(${post}[a-f0-9]*)
> +			# mode if unchanged $8
> +			(?:\x20(100644|120000|100755))?$LF
> +	) # end of header lines ($1)
> +	( # $9 is the patch body
> +		# "--- a/foo.c" sets pre-filename ($10) in case
> +		# $2 is missing
> +		(?:^---\x20$FN$LF)
> +
> +		# "+++ b/foo.c" sets post-filename ($11) in case
> +		# $3 is missing
> +		(?:^\+{3}\x20$FN$LF)
> +
> +		# the meat of the diff, including "^\\No newline ..."
> +		(?:^[\@\+\x20\-\\][^\r\n]*$LF)+
> +	)!smx or return;

...And we need to support context lines w/o leading space

a) because it happens in the real world because of MUAs

b) it was apparently proposed for GNU diff in the past
   (thankfully, it's not the default)

cf. https://public-inbox.org/git/b507b465f7831612b9d9fc643e3e5218b64e5bfa/s/
(oddly, I can't find the mail for that patch specifically)

This fixes a regression on git/5cd8845/s/?b=submodule.c

diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm
index ab7d5c19..a78360fd 100644
--- a/lib/PublicInbox/SolverGit.pm
+++ b/lib/PublicInbox/SolverGit.pm
@@ -153,7 +153,9 @@ sub extract_diff ($$) {
 		(?:^\+{3}\x20$FN$LF)
 
 		# the meat of the diff, including "^\\No newline ..."
-		(?:^[\@\+\x20\-\\][^\r\n]*$LF)+
+		# We also allow for totally blank lines w/o leading spaces,
+		# because git-apply(1) handles that case, too
+		(?:^(?:[\@\+\x20\-\\][^\r\n]*|)$LF)+
 	)!smx or return;
 
 	my $hdr_lines = $1;

  parent reply	other threads:[~2020-01-03  0:43 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-01-02  9:24 [PATCH 0/3] solver bugfixes and tweaks Eric Wong
2020-01-02  9:24 ` [PATCH 1/3] solver: try the next patch on apply failures Eric Wong
2020-01-02  9:24 ` [PATCH 2/3] solver: extract_diff: deal with missing "diff --git" line Eric Wong
2020-01-02  9:36   ` Eric Wong
2020-01-02 19:04     ` Eric Wong
2020-01-02 21:12   ` Eric Wong
2020-01-03  0:43   ` Eric Wong [this message]
2020-01-02  9:24 ` [PATCH 3/3] qspawn: use per-call quiet flag for solver Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200103004357.GA18883@dcvr \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).