user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH] v2: one file, really
Date: Sun,  1 Apr 2018 23:15:04 +0000	[thread overview]
Message-ID: <20180401231504.3763-1-e@80x24.org> (raw)

We need to ensure there is only one file in the top-level tree
at any commit so the "add; remove; add;" sequence on the same
message is detected properly.

Otherwise, git will not detect the second "add" unless
a second message is added to history.

Deletes are now stored in "d" (and not "D" or "_/D") at the
top-level, now.  There's no need to have a "_" to reduce churn
as "m" and "d" should never co-exist.  It's now lowercased to
make it easier-to-distinguish from "D" in git-log output.
---
 MANIFEST                      |  1 +
 lib/PublicInbox/Import.pm     | 61 +++++++++++++++++++++++++++++--------------
 lib/PublicInbox/V2Writable.pm | 12 ++++++---
 script/public-inbox-convert   | 11 +++++++-
 t/convert-compact.t           |  6 +++++
 t/v2-add-remove-add.t         | 42 +++++++++++++++++++++++++++++
 t/v2writable.t                |  7 ++---
 7 files changed, 113 insertions(+), 27 deletions(-)
 create mode 100644 t/v2-add-remove-add.t

diff --git a/MANIFEST b/MANIFEST
index 4a1096d..60e15f2 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -187,6 +187,7 @@ t/thread-all.t
 t/thread-cycle.t
 t/time.t
 t/utf8.mbox
+t/v2-add-remove-add.t
 t/v2mda.t
 t/v2reindex.t
 t/v2writable.t
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 463b44e..b2aae9a 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -49,7 +49,14 @@ sub gfi_start {
 	$self->lock_acquire;
 
 	local $/ = "\n";
-	chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $self->{ref}));
+	my $ref = $self->{ref};
+	chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $ref));
+	if ($self->{path_type} ne '2/38' && $self->{tip}) {
+		local $/ = "\0";
+		my @tree = $git->qx(qw(ls-tree -r -z --name-only), $ref);
+		chomp @tree;
+		$self->{-tree} = { map { $_ => 1 } @tree };
+	}
 
 	my $git_dir = $git->{git_dir};
 	my @cmd = ('git', "--git-dir=$git_dir", qw(fast-import
@@ -238,7 +245,8 @@ sub remove {
 	if (defined $path) {
 		print $w "D $path\n\n" or wfail;
 	} else {
-		print $w "M 100644 :$blob _/D\n\n" or wfail;
+		clean_tree_v2($self, $w, 'd');
+		print $w "M 100644 :$blob d\n\n" or wfail;
 	}
 	$self->{nchg}++;
 	(($self->{tip} = ":$commit"), $cur);
@@ -317,6 +325,15 @@ sub v1_mid0 ($) {
 	}
 	$mids->[0];
 }
+sub clean_tree_v2 ($$$) {
+	my ($self, $w, $keep) = @_;
+	my $tree = $self->{-tree} or return; #v2 only
+	delete $tree->{$keep};
+	foreach (keys %$tree) {
+		print $w "D $_\n" or wfail;
+	}
+	%$tree = ($keep => 1);
+}
 
 # returns undef on duplicate
 # returns the :MARK of the most recent commit
@@ -382,6 +399,7 @@ sub add {
 	if ($tip ne '') {
 		print $w 'from ', ($parent ? $parent : $tip), "\n" or wfail;
 	}
+	clean_tree_v2($self, $w, $path);
 	print $w "M 100644 :$blob $path\n\n" or wfail;
 	$self->{nchg}++;
 	$self->{tip} = ":$commit";
@@ -431,8 +449,9 @@ sub digest2mid ($) {
 }
 
 sub clean_purge_buffer {
-	my ($oid, $buf) = @_;
-	my $cmt_msg = "purged $oid\n";
+	my ($oids, $buf) = @_;
+	my $cmt_msg = 'purged '.join(' ',@$oids)."\n";
+	@$oids = ();
 
 	foreach my $i (0..$#$buf) {
 		my $l = $buf->[$i];
@@ -456,6 +475,8 @@ sub purge_oids {
 	my ($r, $w) = $self->gfi_start;
 	my @buf;
 	my $npurge = 0;
+	my @oids;
+	my $tree = $self->{-tree};
 	while (<$rd>) {
 		if (/^reset (?:.+)/) {
 			push @buf, "reset $tmp\n";
@@ -472,25 +493,27 @@ sub purge_oids {
 			my $n = read($rd, my $buf, $len) or die "read: $!";
 			$len == $n or die "short read ($n < $len)";
 			push @buf, $buf;
-		} elsif (/^M 100644 ([a-f0-9]+) /) {
-			my $oid = $1;
+		} elsif (/^M 100644 ([a-f0-9]+) (\w+)/) {
+			my ($oid, $path) = ($1, $2);
 			if ($purge->{$oid}) {
-				my $lf = <$rd>;
-				if ($lf eq "\n") {
-					my $out = join('', @buf);
-					$out =~ s/^/# /sgm;
-					warn "purge rewriting\n", $out, "\n";
-					clean_purge_buffer($oid, \@buf);
-					$out = join('', @buf);
-					$w->print(@buf, "\n") or wfail;
-					@buf = ();
-					$npurge++;
-				} else {
-					die "expected LF: $lf\n";
-				}
+				push @oids, $oid;
+				delete $tree->{$path};
 			} else {
+				$tree->{$path} = 1;
 				push @buf, $_;
 			}
+		} elsif (/^D (\w+)/) {
+			my $path = $1;
+			push @buf, $_ if $tree->{$path};
+		} elsif ($_ eq "\n") {
+			my $out = join('', @buf);
+			$out =~ s/^/# /sgm;
+			warn "purge rewriting\n", $out, "\n";
+			clean_purge_buffer(\@oids, \@buf);
+			$out = join('', @buf);
+			$w->print(@buf, "\n") or wfail;
+			@buf = ();
+			$npurge++;
 		} else {
 			push @buf, $_;
 		}
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index c4368cc..c8869bd 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -634,15 +634,19 @@ sub reindex {
 			-d $git->{git_dir} or next; # missing parts are fine
 			chomp($tip = $git->qx('rev-parse', $head)) unless $tip;
 			my $h = $cur == $max_git ? $tip : $head;
-			my @count = ('rev-list', '--count', $h, '--', 'm');
-			$regen_max += $git->qx(@count);
+
+			# can't use 'rev-list --count' if we use --diff-filter
+			my $fh = $git->popen(qw(log --pretty=tformat:%h
+					--no-notes --no-color --no-renames
+					--diff-filter=AM), $h, '--', 'm');
+			++$regen_max while <$fh>;
 		}
 		die "No messages found in $pfx/*.git, bug?\n" unless $regen_max;
 		$regen = \$regen_max;
 	}
 	my $D = {};
 	my @cmd = qw(log --raw -r --pretty=tformat:%h
-			--no-notes --no-color --no-abbrev);
+			--no-notes --no-color --no-abbrev --no-renames);
 
 	# if we are regenerating, we must not use a newer tip commit than what
 	# the regeneration counter used:
@@ -663,7 +667,7 @@ sub reindex {
 			} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) {
 				$self->reindex_oid($mm_tmp, $D, $git, $1,
 						$regen);
-			} elsif (m!\A:\d{6} 100644 $x40 ($x40) [AM]\t_/D$!o) {
+			} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) {
 				$self->mark_deleted($D, $git, $1);
 			}
 		}
diff --git a/script/public-inbox-convert b/script/public-inbox-convert
index f58bf27..56ac44f 100755
--- a/script/public-inbox-convert
+++ b/script/public-inbox-convert
@@ -71,6 +71,7 @@ my $im = $v2w->importer;
 my ($r, $w) = $im->gfi_start;
 my $h = '[0-9a-f]';
 my %D;
+my $last;
 while (<$rd>) {
 	if ($_ eq "blob\n") {
 		$state = 'blob';
@@ -90,13 +91,21 @@ while (<$rd>) {
 		if (m{^M 100644 :(\d+) (${h}{2}/${h}{38})}o) {
 			my ($mark, $path) = ($1, $2);
 			$D{$path} = $mark;
+			if ($last && $last ne 'm') {
+				$w->print("D $last\n") or $im->wfail;
+			}
 			$w->print("M 100644 :$mark m\n") or $im->wfail;
+			$last = 'm';
 			next;
 		}
 		if (m{^D (${h}{2}/${h}{38})}o) {
 			my $mark = delete $D{$1};
 			defined $mark or die "undeleted path: $1\n";
-			$w->print("M 100644 :$mark _/D\n") or $im->wfail;
+			if ($last && $last ne 'd') {
+				$w->print("D $last\n") or $im->wfail;
+			}
+			$w->print("M 100644 :$mark d\n") or $im->wfail;
+			$last = 'd';
 			next;
 		}
 		if (m{^from (:\d+)}) {
diff --git a/t/convert-compact.t b/t/convert-compact.t
index e51eadc..92a6a9c 100644
--- a/t/convert-compact.t
+++ b/t/convert-compact.t
@@ -37,6 +37,8 @@ my $mime = PublicInbox::MIME->create(
 	body => "hello world\n",
 );
 ok($im->add($mime), 'added one message');
+ok($im->remove($mime), 'remove message');
+ok($im->add($mime), 'added message again');
 $im->done;
 PublicInbox::SearchIdx->new($ibx, 1)->index_sync;
 
@@ -77,6 +79,7 @@ $cmd = [ 'public-inbox-compact', "$tmpdir/v2" ];
 my $env = { NPROC => 2 };
 ok(PublicInbox::Import::run_die($cmd, $env, $rdr), 'v2 compact works');
 $ibx->{mainrepo} = "$tmpdir/v2";
+$ibx->{version} = 2;
 my $v2w = PublicInbox::V2Writable->new($ibx);
 is($v2w->{partitions}, 1, "only one partition in compacted repo");
 
@@ -96,5 +99,8 @@ foreach (@xdir) {
 	is($st[2] & 07777, -f _ ? 0444 : 0755,
 		'sharedRepository respected after v2 compact');
 }
+my $res = $ibx->recent({limit => 1000});
+is($res->{msgs}->[0]->{mid}, 'a-mid@b', 'message exists in history');
+is(scalar @{$res->{msgs}}, 1, 'only one message in history');
 
 done_testing();
diff --git a/t/v2-add-remove-add.t b/t/v2-add-remove-add.t
new file mode 100644
index 0000000..b6c5887
--- /dev/null
+++ b/t/v2-add-remove-add.t
@@ -0,0 +1,42 @@
+# Copyright (C) 2018 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use warnings;
+use Test::More;
+use PublicInbox::MIME;
+use File::Temp qw/tempdir/;
+
+foreach my $mod (qw(DBD::SQLite Search::Xapian)) {
+	eval "require $mod";
+	plan skip_all => "$mod missing for v2-add-remove-add.t" if $@;
+}
+use_ok 'PublicInbox::V2Writable';
+my $mainrepo = tempdir('pi-add-remove-add-XXXXXX', TMPDIR => 1, CLEANUP => 1);
+my $ibx = {
+	mainrepo => "$mainrepo/v2",
+	name => 'test-v2writable',
+	version => 2,
+	-primary_address => 'test@example.com',
+};
+$ibx = PublicInbox::Inbox->new($ibx);
+my $mime = PublicInbox::MIME->create(
+	header => [
+		From => 'a@example.com',
+		To => 'test@example.com',
+		Subject => 'this is a subject',
+		Date => 'Fri, 02 Oct 1993 00:00:00 +0000',
+		'Message-ID' => '<a-mid@b>',
+	],
+	body => "hello world\n",
+);
+my $im = PublicInbox::V2Writable->new($ibx, 1);
+$im->{parallel} = 0;
+ok($im->add($mime), 'message added');
+ok($im->remove($mime), 'message added');
+ok($im->add($mime), 'message added again');
+$im->done;
+my $res = $ibx->recent({limit => 1000});
+is($res->{msgs}->[0]->{mid}, 'a-mid@b', 'message exists in history');
+is(scalar @{$res->{msgs}}, 1, 'only one message in history');
+
+done_testing();
diff --git a/t/v2writable.t b/t/v2writable.t
index 7abb14f..4a7cfb9 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -210,9 +210,10 @@ EOF
 	my @found = ();
 	$srch->each_smsg_by_mid($smsg->mid, sub { push @found, @_; 1 });
 	is(scalar(@found), 0, 'no longer found in Xapian skeleton');
+	my @log1 = qw(log -1 --pretty=raw --raw -r --no-abbrev --no-renames);
 
-	my $after = $git0->qx(qw(log -1 --pretty=raw --raw -r --no-abbrev));
-	if ($after =~ m!( [a-f0-9]+ )A\t_/D$!) {
+	my $after = $git0->qx(@log1);
+	if ($after =~ m!( [a-f0-9]+ )A\td$!m) {
 		my $oid = $1;
 		ok(index($before, $oid) > 0, 'no new blob introduced');
 	} else {
@@ -221,7 +222,7 @@ EOF
 	is($im->remove($mime, 'test removal'), undef,
 		'remove is idempotent');
 	$im->done;
-	is($git0->qx(qw(log -1 --pretty=raw --raw -r --no-abbrev)),
+	is($git0->qx(@log1),
 		$after, 'no git history made with idempotent remove');
 	eval { $im->done };
 	ok(!$@, '->done is idempotent');
-- 
EW


                 reply	other threads:[~2018-04-01 23:15 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180401231504.3763-1-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).