user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH 0/4] v2writable: speedup no-op -index invocation
@ 2019-05-30  6:52 Eric Wong
  2019-05-30  6:52 ` [PATCH 1/4] v2writable: split off unindex_range mapping Eric Wong
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Eric Wong @ 2019-05-30  6:52 UTC (permalink / raw)
  To: meta

`public-inbox-index' was doing unnecessary work in no-op
situations (after "git fetch") and takes several seconds
on v2 repos.

On my dinky laptop, this allows a no-op -index on
lore.kernel.org/lkml to go from ~8s to ~0.4s

Eric Wong (4):
  v2writable: split off unindex_range mapping
  v2writable: hoist out index_epoch sub
  v2writable: avoid mm_tmp creation without regen
  v2writable: short-circuit is_ancestor check on equality

 lib/PublicInbox/V2Writable.pm | 92 +++++++++++++++++++++++------------
 1 file changed, 61 insertions(+), 31 deletions(-)

-- 
EW


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 1/4] v2writable: split off unindex_range mapping
  2019-05-30  6:52 [PATCH 0/4] v2writable: speedup no-op -index invocation Eric Wong
@ 2019-05-30  6:52 ` Eric Wong
  2019-05-30  6:52 ` [PATCH 2/4] v2writable: hoist out index_epoch sub Eric Wong
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2019-05-30  6:52 UTC (permalink / raw)
  To: meta

It'll make it easier to detect if we have anything to
unindex and run git-log on, at all.
---
 lib/PublicInbox/V2Writable.pm | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 6b01171..df8cfb4 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -861,7 +861,7 @@ Rewritten history? (in $git->{git_dir})
 reindexing $git->{git_dir} starting at
 $range
 
-		$sync->{"unindex-range.$i"} = "$base..$cur";
+		$sync->{unindex_range}->{$i} = "$base..$cur";
 	}
 	$range;
 }
@@ -993,6 +993,7 @@ sub index_sync {
 	my $sync = {
 		mm_tmp => $self->{mm}->tmp_clone,
 		D => {}, # "$mid\0$cid" => $oid
+		unindex_range => {}, # EPOCH => oid_old..oid_new
 		reindex => $opt->{reindex},
 		-opt => $opt
 	};
@@ -1009,7 +1010,7 @@ sub index_sync {
 		-d $git_dir or next; # missing parts are fine
 		fill_alternates($self, $i);
 		my $git = PublicInbox::Git->new($git_dir);
-		my $unindex_range = delete $sync->{"unindex-range.$i"};
+		my $unindex_range = delete $sync->{unindex_range}->{$i};
 		unindex($self, $sync, $git, $unindex_range) if $unindex_range;
 		defined(my $range = $sync->{ranges}->[$i]) or next;
 		$pr->("$i.git indexing $range\n") if $pr;
-- 
EW


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/4] v2writable: hoist out index_epoch sub
  2019-05-30  6:52 [PATCH 0/4] v2writable: speedup no-op -index invocation Eric Wong
  2019-05-30  6:52 ` [PATCH 1/4] v2writable: split off unindex_range mapping Eric Wong
@ 2019-05-30  6:52 ` Eric Wong
  2019-05-30  6:52 ` [PATCH 3/4] v2writable: avoid mm_tmp creation without regen Eric Wong
  2019-05-30  6:52 ` [PATCH 4/4] v2writable: short-circuit is_ancestor check on equality Eric Wong
  3 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2019-05-30  6:52 UTC (permalink / raw)
  To: meta

This will make future changes easier-to-follow.
---
 lib/PublicInbox/V2Writable.pm | 65 ++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 28 deletions(-)

diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index df8cfb4..375f12f 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -981,6 +981,42 @@ sub sync_ranges ($$$) {
 	$ranges;
 }
 
+sub index_epoch ($$$) {
+	my ($self, $sync, $i) = @_;
+
+	my $git_dir = git_dir_n($self, $i);
+	die 'BUG: already reindexing!' if $self->{reindex_pipe};
+	-d $git_dir or return; # missing parts are fine
+	fill_alternates($self, $i);
+	my $git = PublicInbox::Git->new($git_dir);
+	if (my $unindex_range = delete $sync->{unindex_range}->{$i}) {
+		unindex($self, $sync, $git, $unindex_range);
+	}
+	defined(my $range = $sync->{ranges}->[$i]) or return;
+	if (my $pr = $sync->{-opt}->{-progress}) {
+		$pr->("$i.git indexing $range\n");
+	}
+
+	my @cmd = qw(log --raw -r --pretty=tformat:%H
+			--no-notes --no-color --no-abbrev --no-renames);
+	my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $range);
+	my $cmt;
+	while (<$fh>) {
+		chomp;
+		$self->{current_info} = "$i.git $_";
+		if (/\A$x40$/o && !defined($cmt)) {
+			$cmt = $_;
+		} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) {
+			reindex_oid($self, $sync, $git, $1);
+		} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) {
+			mark_deleted($self, $sync, $git, $1);
+		}
+	}
+	$fh = undef;
+	delete $self->{reindex_pipe};
+	update_last_commit($self, $git, $i, $cmt) if defined $cmt;
+}
+
 # public, called by public-inbox-index
 sub index_sync {
 	my ($self, $opt) = @_;
@@ -1000,36 +1036,9 @@ sub index_sync {
 	$sync->{ranges} = sync_ranges($self, $sync, $epoch_max);
 	$sync->{regen} = sync_prepare($self, $sync, $epoch_max);
 
-	my @cmd = qw(log --raw -r --pretty=tformat:%H
-			--no-notes --no-color --no-abbrev --no-renames);
-
 	# work backwards through history
 	for (my $i = $epoch_max; $i >= 0; $i--) {
-		my $git_dir = git_dir_n($self, $i);
-		die 'BUG: already reindexing!' if $self->{reindex_pipe};
-		-d $git_dir or next; # missing parts are fine
-		fill_alternates($self, $i);
-		my $git = PublicInbox::Git->new($git_dir);
-		my $unindex_range = delete $sync->{unindex_range}->{$i};
-		unindex($self, $sync, $git, $unindex_range) if $unindex_range;
-		defined(my $range = $sync->{ranges}->[$i]) or next;
-		$pr->("$i.git indexing $range\n") if $pr;
-		my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $range);
-		my $cmt;
-		while (<$fh>) {
-			chomp;
-			$self->{current_info} = "$i.git $_";
-			if (/\A$x40$/o && !defined($cmt)) {
-				$cmt = $_;
-			} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) {
-				reindex_oid($self, $sync, $git, $1);
-			} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) {
-				mark_deleted($self, $sync, $git, $1);
-			}
-		}
-		$fh = undef;
-		delete $self->{reindex_pipe};
-		update_last_commit($self, $git, $i, $cmt) if defined $cmt;
+		index_epoch($self, $sync, $i);
 	}
 
 	# unindex is required for leftovers if "deletes" affect messages
-- 
EW


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/4] v2writable: avoid mm_tmp creation without regen
  2019-05-30  6:52 [PATCH 0/4] v2writable: speedup no-op -index invocation Eric Wong
  2019-05-30  6:52 ` [PATCH 1/4] v2writable: split off unindex_range mapping Eric Wong
  2019-05-30  6:52 ` [PATCH 2/4] v2writable: hoist out index_epoch sub Eric Wong
@ 2019-05-30  6:52 ` Eric Wong
  2019-05-30  6:52 ` [PATCH 4/4] v2writable: short-circuit is_ancestor check on equality Eric Wong
  3 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2019-05-30  6:52 UTC (permalink / raw)
  To: meta

Creating mm_tmp is an expensive operation with large inboxes
and can be avoided if there are no new messages to process.

Since git-fetch(1) currently lacks an --exit-code option(*),
mirrors will run `public-inbox-index' unconditionally after
fetch, which is an expensive op if it needs to duplicate
a large SQLite DB.

This speeds up the mirror case of:

	git --git-dir=git/$EPOCH.git fetch && public-inbox-index

This reduces the no-op `public-inbox-index' time from over 8s to
~0.5s on a (currently) 7-epoch clone of https://lore.kernel.org/lkml/
on my system.

(*) WIP --exit-code for git-fetch:
    https://public-inbox.org/git/87ftphw7mv.fsf@evledraar.gmail.com/
---
 lib/PublicInbox/V2Writable.pm | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 375f12f..fd93ac2 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -900,6 +900,9 @@ sub sync_prepare ($$$) {
 		$pr->("$n\n") if $pr;
 		$regen_max += $n;
 	}
+
+	return 0 if (!$regen_max && !keys(%{$self->{unindex_range}}));
+
 	# reindex should NOT see new commits anymore, if we do,
 	# it's a problem and we need to notice it via die()
 	my $pad = length($regen_max) + 1;
@@ -1027,7 +1030,6 @@ sub index_sync {
 	return unless defined $latest;
 	$self->idx_init($opt); # acquire lock
 	my $sync = {
-		mm_tmp => $self->{mm}->tmp_clone,
 		D => {}, # "$mid\0$cid" => $oid
 		unindex_range => {}, # EPOCH => oid_old..oid_new
 		reindex => $opt->{reindex},
@@ -1036,6 +1038,16 @@ sub index_sync {
 	$sync->{ranges} = sync_ranges($self, $sync, $epoch_max);
 	$sync->{regen} = sync_prepare($self, $sync, $epoch_max);
 
+	if ($sync->{regen}) {
+		# tmp_clone seems to fail if inside a transaction, so
+		# we rollback here (because we opened {mm} for reading)
+		# Note: we do NOT rely on DBI transactions for atomicity;
+		# only for batch performance.
+		$self->{mm}->{dbh}->rollback;
+		$self->{mm}->{dbh}->begin_work;
+		$sync->{mm_tmp} = $self->{mm}->tmp_clone;
+	}
+
 	# work backwards through history
 	for (my $i = $epoch_max; $i >= 0; $i--) {
 		index_epoch($self, $sync, $i);
@@ -1049,8 +1061,10 @@ sub index_sync {
 		$git->cleanup;
 	}
 	$self->done;
-	if (my $pr = $sync->{-opt}->{-progress}) {
-		$pr->('all.git '.sprintf($sync->{-regen_fmt}, $sync->{nr}));
+
+	if (my $nr = $sync->{nr}) {
+		my $pr = $sync->{-opt}->{-progress};
+		$pr->('all.git '.sprintf($sync->{-regen_fmt}, $nr)) if $pr;
 	}
 
 	# reindex does not pick up new changes, so we rerun w/o it:
-- 
EW


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 4/4] v2writable: short-circuit is_ancestor check on equality
  2019-05-30  6:52 [PATCH 0/4] v2writable: speedup no-op -index invocation Eric Wong
                   ` (2 preceding siblings ...)
  2019-05-30  6:52 ` [PATCH 3/4] v2writable: avoid mm_tmp creation without regen Eric Wong
@ 2019-05-30  6:52 ` Eric Wong
  3 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2019-05-30  6:52 UTC (permalink / raw)
  To: meta

We don't need to use git to check ancestry if object IDs
match on a string comparison.

This saves 100ms or so and brings down the ~0.5s no-op time on
lore.kernel.org/lkml down to ~0.4s.
---
 lib/PublicInbox/V2Writable.pm | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index fd93ac2..76844cd 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -831,6 +831,12 @@ sub log_range ($$$$$) {
 		return $tip; # all of it
 	};
 
+	# fast equality check to avoid (v)fork+execve overhead
+	if ($cur eq $tip) {
+		$sync->{ranges}->[$i] = undef;
+		return;
+	}
+
 	my $range = "$cur..$tip";
 	$pr->("$i.git checking contiguity... ") if $pr;
 	if (is_ancestor($git, $cur, $tip)) { # common case
-- 
EW


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-05-30  6:52 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-30  6:52 [PATCH 0/4] v2writable: speedup no-op -index invocation Eric Wong
2019-05-30  6:52 ` [PATCH 1/4] v2writable: split off unindex_range mapping Eric Wong
2019-05-30  6:52 ` [PATCH 2/4] v2writable: hoist out index_epoch sub Eric Wong
2019-05-30  6:52 ` [PATCH 3/4] v2writable: avoid mm_tmp creation without regen Eric Wong
2019-05-30  6:52 ` [PATCH 4/4] v2writable: short-circuit is_ancestor check on equality Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).