From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id A16771F463 for ; Fri, 3 Jan 2020 00:43:57 +0000 (UTC) Date: Fri, 3 Jan 2020 00:43:57 +0000 From: Eric Wong To: meta@public-inbox.org Subject: Re: [PATCH 2/3] solver: extract_diff: deal with missing "diff --git" line Message-ID: <20200103004357.GA18883@dcvr> References: <20200102092459.17612-1-e@80x24.org> <20200102092459.17612-3-e@80x24.org> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: <20200102092459.17612-3-e@80x24.org> List-Id: Eric Wong wrote: > Rewrite the patch extraction loop using a single regexp which > accounts for missing "diff --git ..." lines and is capable of > extracting pathnames off the "+++ b/foo" line. > > This fixes the solving of blob "96f1c7f" off > <2841d2de-32ad-eae8-6039-9251a40bb00e@tngtech.com> > in git@vger archives. > --- > lib/PublicInbox/SolverGit.pm | 165 ++++++++++++++++++----------------- > 1 file changed, 85 insertions(+), 80 deletions(-) > > diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm > index 3e3a5899..e1b82f3e 100644 > --- a/lib/PublicInbox/SolverGit.pm > +++ b/lib/PublicInbox/SolverGit.pm > @@ -10,6 +10,7 @@ > package PublicInbox::SolverGit; > use strict; > use warnings; > +use 5.010_001; > use File::Temp 0.19 (); > use Fcntl qw(SEEK_SET); > use PublicInbox::Git qw(git_unquote git_quote); > @@ -38,7 +39,7 @@ my $MAX_PATCH = 9999; > # oid_a => abbreviated pre-image oid, > # oid_b => abbreviated post-image oid, > # tmp => anonymous file handle with the diff, > -# hdr_lines => arrayref of various header lines for mode information > +# hdr_lines => string of various header lines for mode information > # mode_a => original mode of oid_a (string, not integer), > # ibx => PublicInbox::Inbox object containing the diff > # smsg => PublicInbox::SearchMsg object containing diff > @@ -47,10 +48,6 @@ my $MAX_PATCH = 9999; > # n => numeric path of the patch (relative to worktree) > # } > > -# don't bother if somebody sends us a patch with these path components, > -# it's junk at best, an attack attempt at worse: > -my %bad_component = map { $_ => 1 } ('', '.', '..'); > - > sub dbg ($$) { > print { $_[0]->{out} } $_[1], "\n" or ERR($_[0], "print(dbg): $!"); > } > @@ -100,10 +97,8 @@ sub solve_existing ($$) { > > sub extract_diff ($$) { > my ($p, $arg) = @_; > - my ($self, $diffs, $re, $ibx, $smsg) = @$arg; > + my ($self, $diffs, $pre, $post, $ibx, $smsg) = @$arg; > my ($part) = @$p; # ignore $depth and @idx; > - my $hdr_lines; # diff --git a/... b/... > - my $tmp; > my $ct = $part->content_type || 'text/plain'; > my ($s, undef) = msg_part_text($part, $ct); > defined $s or return; > @@ -116,66 +111,85 @@ sub extract_diff ($$) { > $s =~ s/\r\n/\n/sg; > } > > - foreach my $l (split(/^/m, $s)) { > - if ($l =~ $re) { > - $di->{oid_a} = $1; > - $di->{oid_b} = $2; > - if (defined($3)) { > - my $mode_a = $3; > - if ($mode_a =~ /\A(?:100644|120000|100755)\z/) { > - $di->{mode_a} = $mode_a; > - } > - } > - > - > - # start writing the diff out to a tempfile > - my $path = ++$self->{tot}; > - $di->{n} = $path; > - open($tmp, '>', $self->{tmp}->dirname . "/$path") or > - die "open(tmp): $!"; > - > - push @$hdr_lines, $l; > - $di->{hdr_lines} = $hdr_lines; > - utf8::encode($_) for @$hdr_lines; > - print $tmp @$hdr_lines or die "print(tmp): $!"; > - > - # for debugging/diagnostics: > - $di->{ibx} = $ibx; > - $di->{smsg} = $smsg; > - } elsif ($l =~ m!\Adiff --git ("?[^/]+/.+) ("?[^/]+/.+)$!) { > - last if $tmp; # got our blob, done! > - > - my ($path_a, $path_b) = ($1, $2); > - > - # diff header lines won't have \r because git > - # will quote them, but Email::MIME gives CRLF > - # for quoted-printable: > - $path_b =~ tr/\r//d; > - > - # don't care for leading 'a/' and 'b/' > - my (undef, @a) = split(m{/}, git_unquote($path_a)); > - my (undef, @b) = split(m{/}, git_unquote($path_b)); > - > - # get rid of path-traversal attempts and junk patches: > - foreach (@a, @b) { > - return if $bad_component{$_}; > - } > - > - $di->{path_a} = join('/', @a); > - $di->{path_b} = join('/', @b); > - $hdr_lines = [ $l ]; > - } elsif ($tmp) { > - utf8::encode($l); > - print $tmp $l or die "print(tmp): $!"; > - } elsif ($hdr_lines) { > - push @$hdr_lines, $l; > - if ($l =~ /\Anew file mode (100644|120000|100755)$/) { > - $di->{mode_a} = $1; > - } > - } > - } > - return undef unless $tmp; > + state $LF = qr!\r?\n!; > + state $FN = qr!(?:("?[^/\n]+/[^\r\n]+)|/dev/null)!; > + > + $s =~ m!( # $1 start header lines we save for debugging: > + > + # everything before ^index is optional, but we don't > + # want to match ^(old|copy|rename|deleted|...) unless > + # we match /^diff --git/ first: > + (?: # begin optional stuff: > + > + # try to get the pre-and-post filenames as $2 and $3 > + (?:^diff\x20--git\x20$FN\x20$FN$LF) > + > + # old mode $4 > + (?:^old mode\x20(100644|120000|100755)$LF)? > + > + # ignore other info > + (?:^(?:copy|rename|deleted|dissimilarity|similarity).*$LF)? > + > + # new mode (possibly new file) ($5) > + (?:^new\x20(?:file\x20)?mode\x20(100644|120000|100755)$LF)? > + > + # ignore other info > + (?:^(?:copy|rename|deleted|dissimilarity|similarity).*$LF)? > + > + )? # end of optional stuff, everything below is required > + > + # match the pre and post-image OIDs as $6 $7 > + ^index\x20(${pre}[a-f0-9]*)\.\.(${post}[a-f0-9]*) > + # mode if unchanged $8 > + (?:\x20(100644|120000|100755))?$LF > + ) # end of header lines ($1) > + ( # $9 is the patch body > + # "--- a/foo.c" sets pre-filename ($10) in case > + # $2 is missing > + (?:^---\x20$FN$LF) > + > + # "+++ b/foo.c" sets post-filename ($11) in case > + # $3 is missing > + (?:^\+{3}\x20$FN$LF) > + > + # the meat of the diff, including "^\\No newline ..." > + (?:^[\@\+\x20\-\\][^\r\n]*$LF)+ > + )!smx or return; ...And we need to support context lines w/o leading space a) because it happens in the real world because of MUAs b) it was apparently proposed for GNU diff in the past (thankfully, it's not the default) cf. https://public-inbox.org/git/b507b465f7831612b9d9fc643e3e5218b64e5bfa/s/ (oddly, I can't find the mail for that patch specifically) This fixes a regression on git/5cd8845/s/?b=submodule.c diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm index ab7d5c19..a78360fd 100644 --- a/lib/PublicInbox/SolverGit.pm +++ b/lib/PublicInbox/SolverGit.pm @@ -153,7 +153,9 @@ sub extract_diff ($$) { (?:^\+{3}\x20$FN$LF) # the meat of the diff, including "^\\No newline ..." - (?:^[\@\+\x20\-\\][^\r\n]*$LF)+ + # We also allow for totally blank lines w/o leading spaces, + # because git-apply(1) handles that case, too + (?:^(?:[\@\+\x20\-\\][^\r\n]*|)$LF)+ )!smx or return; my $hdr_lines = $1;