From c34a83286234ea1e876ebdf92a33744272bb6f4e Mon Sep 17 00:00:00 2001 From: "Eric Wong (Contractor, The Linux Foundation)" Date: Sun, 1 Apr 2018 23:15:04 +0000 Subject: v2: one file, really We need to ensure there is only one file in the top-level tree at any commit so the "add; remove; add;" sequence on the same message is detected properly. Otherwise, git will not detect the second "add" unless a second message is added to history. Deletes are now stored in "d" (and not "D" or "_/D") at the top-level. There's no need to have a "_" to reduce churn as "m" and "d" should never co-exist. It's now lowercased to make it easier to distinguish from "D" in git-log output. --- lib/PublicInbox/Import.pm | 61 +++++++++++++++++++++++++++++-------------- lib/PublicInbox/V2Writable.pm | 12 ++++++--- 2 files changed, 50 insertions(+), 23 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 463b44e2..b2aae9a7 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -49,7 +49,14 @@ sub gfi_start { $self->lock_acquire; local $/ = "\n"; - chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $self->{ref})); + my $ref = $self->{ref}; + chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $ref)); + if ($self->{path_type} ne '2/38' && $self->{tip}) { + local $/ = "\0"; + my @tree = $git->qx(qw(ls-tree -r -z --name-only), $ref); + chomp @tree; + $self->{-tree} = { map { $_ => 1 } @tree }; + } my $git_dir = $git->{git_dir}; my @cmd = ('git', "--git-dir=$git_dir", qw(fast-import @@ -238,7 +245,8 @@ sub remove { if (defined $path) { print $w "D $path\n\n" or wfail; } else { - print $w "M 100644 :$blob _/D\n\n" or wfail; + clean_tree_v2($self, $w, 'd'); + print $w "M 100644 :$blob d\n\n" or wfail; } $self->{nchg}++; (($self->{tip} = ":$commit"), $cur); @@ -317,6 +325,15 @@ sub v1_mid0 ($) { } $mids->[0]; } +sub clean_tree_v2 ($$$) { + my ($self, $w, $keep) = @_; + my $tree = $self->{-tree} or return; #v2 only 
+ delete $tree->{$keep}; + foreach (keys %$tree) { + print $w "D $_\n" or wfail; + } + %$tree = ($keep => 1); +} # returns undef on duplicate # returns the :MARK of the most recent commit @@ -382,6 +399,7 @@ sub add { if ($tip ne '') { print $w 'from ', ($parent ? $parent : $tip), "\n" or wfail; } + clean_tree_v2($self, $w, $path); print $w "M 100644 :$blob $path\n\n" or wfail; $self->{nchg}++; $self->{tip} = ":$commit"; @@ -431,8 +449,9 @@ sub digest2mid ($) { } sub clean_purge_buffer { - my ($oid, $buf) = @_; - my $cmt_msg = "purged $oid\n"; + my ($oids, $buf) = @_; + my $cmt_msg = 'purged '.join(' ',@$oids)."\n"; + @$oids = (); foreach my $i (0..$#$buf) { my $l = $buf->[$i]; @@ -456,6 +475,8 @@ sub purge_oids { my ($r, $w) = $self->gfi_start; my @buf; my $npurge = 0; + my @oids; + my $tree = $self->{-tree}; while (<$rd>) { if (/^reset (?:.+)/) { push @buf, "reset $tmp\n"; @@ -472,25 +493,27 @@ sub purge_oids { my $n = read($rd, my $buf, $len) or die "read: $!"; $len == $n or die "short read ($n < $len)"; push @buf, $buf; - } elsif (/^M 100644 ([a-f0-9]+) /) { - my $oid = $1; + } elsif (/^M 100644 ([a-f0-9]+) (\w+)/) { + my ($oid, $path) = ($1, $2); if ($purge->{$oid}) { - my $lf = <$rd>; - if ($lf eq "\n") { - my $out = join('', @buf); - $out =~ s/^/# /sgm; - warn "purge rewriting\n", $out, "\n"; - clean_purge_buffer($oid, \@buf); - $out = join('', @buf); - $w->print(@buf, "\n") or wfail; - @buf = (); - $npurge++; - } else { - die "expected LF: $lf\n"; - } + push @oids, $oid; + delete $tree->{$path}; } else { + $tree->{$path} = 1; push @buf, $_; } + } elsif (/^D (\w+)/) { + my $path = $1; + push @buf, $_ if $tree->{$path}; + } elsif ($_ eq "\n") { + my $out = join('', @buf); + $out =~ s/^/# /sgm; + warn "purge rewriting\n", $out, "\n"; + clean_purge_buffer(\@oids, \@buf); + $out = join('', @buf); + $w->print(@buf, "\n") or wfail; + @buf = (); + $npurge++; } else { push @buf, $_; } diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 
c4368ccc..c8869bda 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -634,15 +634,19 @@ sub reindex { -d $git->{git_dir} or next; # missing parts are fine chomp($tip = $git->qx('rev-parse', $head)) unless $tip; my $h = $cur == $max_git ? $tip : $head; - my @count = ('rev-list', '--count', $h, '--', 'm'); - $regen_max += $git->qx(@count); + + # can't use 'rev-list --count' if we use --diff-filter + my $fh = $git->popen(qw(log --pretty=tformat:%h + --no-notes --no-color --no-renames + --diff-filter=AM), $h, '--', 'm'); + ++$regen_max while <$fh>; } die "No messages found in $pfx/*.git, bug?\n" unless $regen_max; $regen = \$regen_max; } my $D = {}; my @cmd = qw(log --raw -r --pretty=tformat:%h - --no-notes --no-color --no-abbrev); + --no-notes --no-color --no-abbrev --no-renames); # if we are regenerating, we must not use a newer tip commit than what # the regeneration counter used: @@ -663,7 +667,7 @@ sub reindex { } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) { $self->reindex_oid($mm_tmp, $D, $git, $1, $regen); - } elsif (m!\A:\d{6} 100644 $x40 ($x40) [AM]\t_/D$!o) { + } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) { $self->mark_deleted($D, $git, $1); } } -- cgit v1.2.3-24-ge0c7