From c34a83286234ea1e876ebdf92a33744272bb6f4e Mon Sep 17 00:00:00 2001 From: "Eric Wong (Contractor, The Linux Foundation)" Date: Sun, 1 Apr 2018 23:15:04 +0000 Subject: v2: one file, really We need to ensure there is only one file in the top-level tree at any commit so the "add; remove; add;" sequence on the same message is detected properly. Otherwise, git will not detect the second "add" unless a second message is added to history. Deletes are now stored in "d" (and not "D" or "_/D") at the top-level, now. There's no need to have a "_" to reduce churn as "m" and "d" should never co-exist. It's now lowercased to make it easier-to-distinguish from "D" in git-log output. --- MANIFEST | 1 + lib/PublicInbox/Import.pm | 61 +++++++++++++++++++++++++++++-------------- lib/PublicInbox/V2Writable.pm | 12 ++++++--- script/public-inbox-convert | 11 +++++++- t/convert-compact.t | 6 +++++ t/v2-add-remove-add.t | 42 +++++++++++++++++++++++++++++ t/v2writable.t | 7 ++--- 7 files changed, 113 insertions(+), 27 deletions(-) create mode 100644 t/v2-add-remove-add.t diff --git a/MANIFEST b/MANIFEST index 4a1096d6..60e15f28 100644 --- a/MANIFEST +++ b/MANIFEST @@ -187,6 +187,7 @@ t/thread-all.t t/thread-cycle.t t/time.t t/utf8.mbox +t/v2-add-remove-add.t t/v2mda.t t/v2reindex.t t/v2writable.t diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 463b44e2..b2aae9a7 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -49,7 +49,14 @@ sub gfi_start { $self->lock_acquire; local $/ = "\n"; - chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $self->{ref})); + my $ref = $self->{ref}; + chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $ref)); + if ($self->{path_type} ne '2/38' && $self->{tip}) { + local $/ = "\0"; + my @tree = $git->qx(qw(ls-tree -r -z --name-only), $ref); + chomp @tree; + $self->{-tree} = { map { $_ => 1 } @tree }; + } my $git_dir = $git->{git_dir}; my @cmd = ('git', "--git-dir=$git_dir", qw(fast-import @@ -238,7 +245,8 @@ sub remove { if (defined $path) { print $w "D $path\n\n" or wfail; } else { - print $w "M 100644 :$blob _/D\n\n" or wfail; + clean_tree_v2($self, $w, 'd'); + print $w "M 100644 :$blob d\n\n" or wfail; } $self->{nchg}++; (($self->{tip} = ":$commit"), $cur); @@ -317,6 +325,15 @@ sub v1_mid0 ($) { } $mids->[0]; } +sub clean_tree_v2 ($$$) { + my ($self, $w, $keep) = @_; + my $tree = $self->{-tree} or return; #v2 only + delete $tree->{$keep}; + foreach (keys %$tree) { + print $w "D $_\n" or wfail; + } + %$tree = ($keep => 1); +} # returns undef on duplicate # returns the :MARK of the most recent commit @@ -382,6 +399,7 @@ sub add { if ($tip ne '') { print $w 'from ', ($parent ? $parent : $tip), "\n" or wfail; } + clean_tree_v2($self, $w, $path); print $w "M 100644 :$blob $path\n\n" or wfail; $self->{nchg}++; $self->{tip} = ":$commit"; @@ -431,8 +449,9 @@ sub digest2mid ($) { } sub clean_purge_buffer { - my ($oid, $buf) = @_; - my $cmt_msg = "purged $oid\n"; + my ($oids, $buf) = @_; + my $cmt_msg = 'purged '.join(' ',@$oids)."\n"; + @$oids = (); foreach my $i (0..$#$buf) { my $l = $buf->[$i]; @@ -456,6 +475,8 @@ sub purge_oids { my ($r, $w) = $self->gfi_start; my @buf; my $npurge = 0; + my @oids; + my $tree = $self->{-tree}; while (<$rd>) { if (/^reset (?:.+)/) { push @buf, "reset $tmp\n"; @@ -472,25 +493,27 @@ sub purge_oids { my $n = read($rd, my $buf, $len) or die "read: $!"; $len == $n or die "short read ($n < $len)"; push @buf, $buf; - } elsif (/^M 100644 ([a-f0-9]+) /) { - my $oid = $1; + } elsif (/^M 100644 ([a-f0-9]+) (\w+)/) { + my ($oid, $path) = ($1, $2); if ($purge->{$oid}) { - my $lf = <$rd>; - if ($lf eq "\n") { - my $out = join('', @buf); - $out =~ s/^/# /sgm; - warn "purge rewriting\n", $out, "\n"; - clean_purge_buffer($oid, \@buf); - $out = join('', @buf); - $w->print(@buf, "\n") or wfail; - @buf = (); - $npurge++; - } else { - die "expected LF: $lf\n"; - } + push @oids, $oid; + delete $tree->{$path}; } else { + $tree->{$path} = 1; push @buf, $_; } + } elsif (/^D (\w+)/) { + my $path = $1; + push @buf, $_ if $tree->{$path}; + } elsif ($_ eq "\n") { + my $out = join('', @buf); + $out =~ s/^/# /sgm; + warn "purge rewriting\n", $out, "\n"; + clean_purge_buffer(\@oids, \@buf); + $out = join('', @buf); + $w->print(@buf, "\n") or wfail; + @buf = (); + $npurge++; } else { push @buf, $_; } diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index c4368ccc..c8869bda 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -634,15 +634,19 @@ sub reindex { -d $git->{git_dir} or next; # missing parts are fine chomp($tip = $git->qx('rev-parse', $head)) unless $tip; my $h = $cur == $max_git ? $tip : $head; - my @count = ('rev-list', '--count', $h, '--', 'm'); - $regen_max += $git->qx(@count); + + # can't use 'rev-list --count' if we use --diff-filter + my $fh = $git->popen(qw(log --pretty=tformat:%h + --no-notes --no-color --no-renames + --diff-filter=AM), $h, '--', 'm'); + ++$regen_max while <$fh>; } die "No messages found in $pfx/*.git, bug?\n" unless $regen_max; $regen = \$regen_max; } my $D = {}; my @cmd = qw(log --raw -r --pretty=tformat:%h - --no-notes --no-color --no-abbrev); + --no-notes --no-color --no-abbrev --no-renames); # if we are regenerating, we must not use a newer tip commit than what # the regeneration counter used: @@ -663,7 +667,7 @@ sub reindex { } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) { $self->reindex_oid($mm_tmp, $D, $git, $1, $regen); - } elsif (m!\A:\d{6} 100644 $x40 ($x40) [AM]\t_/D$!o) { + } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) { $self->mark_deleted($D, $git, $1); } } diff --git a/script/public-inbox-convert b/script/public-inbox-convert index f58bf27b..56ac44f5 100755 --- a/script/public-inbox-convert +++ b/script/public-inbox-convert @@ -71,6 +71,7 @@ my $im = $v2w->importer; my ($r, $w) = $im->gfi_start; my $h = '[0-9a-f]'; my %D; +my $last; while (<$rd>) { if ($_ eq "blob\n") { $state = 'blob'; @@ -90,13 +91,21 @@ while (<$rd>) { if (m{^M 100644 :(\d+) (${h}{2}/${h}{38})}o) { my ($mark, $path) = ($1, $2); $D{$path} = $mark; + if ($last && $last ne 'm') { + $w->print("D $last\n") or $im->wfail; + } $w->print("M 100644 :$mark m\n") or $im->wfail; + $last = 'm'; next; } if (m{^D (${h}{2}/${h}{38})}o) { my $mark = delete $D{$1}; defined $mark or die "undeleted path: $1\n"; - $w->print("M 100644 :$mark _/D\n") or $im->wfail; + if ($last && $last ne 'd') { + $w->print("D $last\n") or $im->wfail; + } + $w->print("M 100644 :$mark d\n") or $im->wfail; + $last = 'd'; next; } if (m{^from (:\d+)}) { diff --git a/t/convert-compact.t b/t/convert-compact.t index e51eadcf..92a6a9c5 100644 --- a/t/convert-compact.t +++ b/t/convert-compact.t @@ -37,6 +37,8 @@ my $mime = PublicInbox::MIME->create( body => "hello world\n", ); ok($im->add($mime), 'added one message'); +ok($im->remove($mime), 'remove message'); +ok($im->add($mime), 'added message again'); $im->done; PublicInbox::SearchIdx->new($ibx, 1)->index_sync; @@ -77,6 +79,7 @@ $cmd = [ 'public-inbox-compact', "$tmpdir/v2" ]; my $env = { NPROC => 2 }; ok(PublicInbox::Import::run_die($cmd, $env, $rdr), 'v2 compact works'); $ibx->{mainrepo} = "$tmpdir/v2"; +$ibx->{version} = 2; my $v2w = PublicInbox::V2Writable->new($ibx); is($v2w->{partitions}, 1, "only one partition in compacted repo"); @@ -96,5 +99,8 @@ foreach (@xdir) { is($st[2] & 07777, -f _ ? 0444 : 0755, 'sharedRepository respected after v2 compact'); } +my $res = $ibx->recent({limit => 1000}); +is($res->{msgs}->[0]->{mid}, 'a-mid@b', 'message exists in history'); +is(scalar @{$res->{msgs}}, 1, 'only one message in history'); done_testing(); diff --git a/t/v2-add-remove-add.t b/t/v2-add-remove-add.t new file mode 100644 index 00000000..b6c58872 --- /dev/null +++ b/t/v2-add-remove-add.t @@ -0,0 +1,42 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use PublicInbox::MIME; +use File::Temp qw/tempdir/; + +foreach my $mod (qw(DBD::SQLite Search::Xapian)) { + eval "require $mod"; + plan skip_all => "$mod missing for v2-add-remove-add.t" if $@; +} +use_ok 'PublicInbox::V2Writable'; +my $mainrepo = tempdir('pi-add-remove-add-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $ibx = { + mainrepo => "$mainrepo/v2", + name => 'test-v2writable', + version => 2, + -primary_address => 'test@example.com', +}; +$ibx = PublicInbox::Inbox->new($ibx); +my $mime = PublicInbox::MIME->create( + header => [ + From => 'a@example.com', + To => 'test@example.com', + Subject => 'this is a subject', + Date => 'Fri, 02 Oct 1993 00:00:00 +0000', + 'Message-ID' => '', + ], + body => "hello world\n", +); +my $im = PublicInbox::V2Writable->new($ibx, 1); +$im->{parallel} = 0; +ok($im->add($mime), 'message added'); +ok($im->remove($mime), 'message added'); +ok($im->add($mime), 'message added again'); +$im->done; +my $res = $ibx->recent({limit => 1000}); +is($res->{msgs}->[0]->{mid}, 'a-mid@b', 'message exists in history'); +is(scalar @{$res->{msgs}}, 1, 'only one message in history'); + +done_testing(); diff --git a/t/v2writable.t b/t/v2writable.t index 7abb14f6..4a7cfb90 100644 --- a/t/v2writable.t +++ b/t/v2writable.t @@ -210,9 +210,10 @@ EOF my @found = (); $srch->each_smsg_by_mid($smsg->mid, sub { push @found, @_; 1 }); is(scalar(@found), 0, 'no longer found in Xapian skeleton'); + my @log1 = qw(log -1 --pretty=raw --raw -r --no-abbrev --no-renames); - my $after = $git0->qx(qw(log -1 --pretty=raw --raw -r --no-abbrev)); - if ($after =~ m!( [a-f0-9]+ )A\t_/D$!) { + my $after = $git0->qx(@log1); + if ($after =~ m!( [a-f0-9]+ )A\td$!m) { my $oid = $1; ok(index($before, $oid) > 0, 'no new blob introduced'); } else { @@ -221,7 +222,7 @@ EOF is($im->remove($mime, 'test removal'), undef, 'remove is idempotent'); $im->done; - is($git0->qx(qw(log -1 --pretty=raw --raw -r --no-abbrev)), + is($git0->qx(@log1), $after, 'no git history made with idempotent remove'); eval { $im->done }; ok(!$@, '->done is idempotent'); -- cgit v1.2.3-24-ge0c7