user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 10/15] cindex: speed up initial scan setup phase
Date: Thu, 30 Nov 2023 11:41:03 +0000	[thread overview]
Message-ID: <20231130114109.2577708-11-e@80x24.org> (raw)
In-Reply-To: <20231130114109.2577708-1-e@80x24.org>

This brings a no-op -cindex scan of a git.kernel.org mirror
down from 70s to 10s with a hot cache on a busy machine.

CPU-intensive SHA-256 fingerprinting of the `git show-ref'
result can be parallelized on shard workers.  Future changes can
move more of the initial scan setup phase into shard workers for
more parallelism.

However, most of the speedup when skipping unchanged repos comes
from delaying the reading of commit times until we've seen that
the fingerprint is out-of-date, since reading commit times
requires a large amount of I/O compared to only reading refs
for fingerprints.
---
 lib/PublicInbox/CodeSearchIdx.pm | 99 +++++++++++++++++---------------
 1 file changed, 53 insertions(+), 46 deletions(-)

diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 555a1efe..ec0fc6e3 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -105,6 +105,7 @@ our (
 	@JOIN_DT, # YYYYmmddHHMMSS for dt:
 	$QRY_STR, # common query string for both code and inbox associations
 	$DUMP_IBX_WPIPE, # goes to sort(1)
+	$ANY_SHARD, # shard round-robin for scan fingerprinting
 	@OFF2ROOT,
 );
 
@@ -416,51 +417,42 @@ sub run_git {
 
 # this is different from the grokmirror-compatible fingerprint since we
 # only care about --heads (branches) and --tags, and not even their names
-sub fp_start ($$$) {
-	my ($self, $git, $prep_repo) = @_;
+sub fp_start ($$) {
+	my ($self, $git) = @_;
 	return if $DO_QUIT;
 	open my $refs, '+>', undef;
 	$git->{-repo}->{refs} = $refs;
-	run_git([qw(show-ref --heads --tags --hash)], { 1 => $refs },
-		\&fp_fini, $self, $git, $prep_repo);
-}
-
-sub fp_fini { # run_git cb
-	my (undef, $self, $git, $prep_repo) = @_;
-	my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}';
-	sysseek($refs, 0, SEEK_SET);
-	$git->{-repo}->{fp} = sha_all(256, $refs)->hexdigest;
+	my ($c, $p) = PublicInbox::PktOp->pair;
+	my $next_on_err = PublicInbox::OnDestroy->new(\&index_next, $self);
+	$c->{ops}->{fp_done} = [ $self, $git, $next_on_err ];
+	$IDX_SHARDS[++$ANY_SHARD % scalar(@IDX_SHARDS)]->wq_io_do('fp_async',
+					[ $p->{op_p}, $refs ], $git->{git_dir})
 }
 
-sub ct_start ($$$) {
-	my ($self, $git, $prep_repo) = @_;
-	return if $DO_QUIT;
-	run_git([ qw[for-each-ref --sort=-committerdate
-		--format=%(committerdate:raw) --count=1
-		refs/heads/ refs/tags/] ], undef, # capture like qx
-		\&ct_fini, $self, $git, $prep_repo);
+sub fp_async { # via wq_io_do in worker
+	my ($self, $git_dir) = @_;
+	my $op_p = delete $self->{0} // die 'BUG: no {0} op_p';
+	my $refs = delete $self->{1} // die 'BUG: no {1} refs';
+	my $git = PublicInbox::Git->new($git_dir);
+	run_git([qw(show-ref --heads --tags --hash)], { 1 => $refs },
+		\&fp_async_done, $self, $git, $op_p);
 }
 
-sub ct_fini { # run_git cb
-	my ($opt, $self, $git, $prep_repo) = @_;
-	my ($ct) = split(/\s+/, ${$opt->{1}}); # drop TZ + LF
-	$git->{-repo}->{ct} = $ct + 0;
+sub fp_async_done { # run_git cb from worker
+	my ($opt, $self, $git, $op_p) = @_;
+	my $refs = delete $opt->{1} // 'BUG: no {-repo}->{refs}';
+	sysseek($refs, 0, SEEK_SET);
+	send($op_p, 'fp_done '.sha_all(256, $refs)->hexdigest, 0);
 }
 
-# TODO: also index gitweb.owner and the full fingerprint for grokmirror?
-sub prep_repo ($$) {
-	my ($self, $git) = @_;
+sub fp_done { # called parent via PktOp by fp_async_done
+	my ($self, $git, $next_on_err, $hex) = @_;
+	$next_on_err->cancel;
 	return if $DO_QUIT;
-	return index_next($self) if $git->{-cidx_err};
-	my $repo = $git->{-repo} // die 'BUG: no {-repo}';
-	if (!defined($repo->{ct})) {
-		warn "W: $git->{git_dir} has no commits, skipping\n";
-		delete $git->{-repo};
-		return index_next($self);
-	}
+	$git->{-repo}->{fp} = $hex;
 	my $n = git_dir_hash($git->{git_dir}) % scalar(@RDONLY_XDB);
 	my $shard = bless { %$self, shard => $n }, ref($self);
-	$repo->{shard_n} = $n;
+	$git->{-repo}->{shard_n} = $n;
 	delete @$shard{qw(lockfh lock_path)};
 	local $shard->{xdb} = $RDONLY_XDB[$n] // die "BUG: shard[$n] undef";
 	$shard->retry_reopen(\&check_existing, $self, $git);
@@ -469,7 +461,7 @@ sub prep_repo ($$) {
 sub check_existing { # retry_reopen callback
 	my ($shard, $self, $git) = @_;
 	my @docids = $shard->docids_of_git_dir($git->{git_dir});
-	my $docid = shift(@docids) // return get_roots($self, $git);
+	my $docid = shift(@docids) // return prep_repo($self, $git); # new repo
 	my $doc = $shard->get_doc($docid) //
 			die "BUG: no #$docid ($git->{git_dir})";
 	my $old_fp = $REINDEX ? "\0invalid" : $doc->get_data;
@@ -482,7 +474,7 @@ sub check_existing { # retry_reopen callback
 		warn "BUG: $git->{git_dir} indexed multiple times, culling\n";
 		$git->{-repo}->{to_delete} = \@docids; # XXX needed?
 	}
-	get_roots($self, $git);
+	prep_repo($self, $git);
 }
 
 sub partition_refs ($$$) {
@@ -604,13 +596,9 @@ sub index_next ($) {
 	my ($self) = @_;
 	return if $DO_QUIT;
 	if ($IDXQ && @$IDXQ) {
-		index_repo(undef, $self, shift @$IDXQ);
+		index_repo($self, shift @$IDXQ);
 	} elsif ($SCANQ && @$SCANQ) {
-		my $git = shift @$SCANQ;
-		my $prep_repo = PublicInbox::OnDestroy->new(\&prep_repo,
-								$self, $git);
-		fp_start($self, $git, $prep_repo);
-		ct_start($self, $git, $prep_repo);
+		fp_start $self, shift @$SCANQ;
 	} elsif ($TMPDIR) {
 		delete $TODO{dump_roots_start};
 		delete $TODO{dump_ibx_start}; # runs OnDestroy once
@@ -649,12 +637,17 @@ sub index_done { # OnDestroy cb called when done indexing each code repo
 	# repo_stored will fire once store_repo is done
 }
 
-sub index_repo { # run_git cb
-	my (undef, $self, $git) = @_;
+sub index_repo {
+	my ($self, $git) = @_;
 	return if $DO_QUIT;
+	my $repo = $git->{-repo} // die 'BUG: no {-repo}';
 	return index_next($self) if $git->{-cidx_err};
+	if (!defined($repo->{ct})) {
+		warn "W: $git->{git_dir} has no commits, skipping\n";
+		return index_next($self);
+	}
 	return push(@$IDXQ, $git) if $REPO_CTX; # busy
-	my $repo = delete $git->{-repo} or return index_next($self);
+	delete $git->{-repo};
 	my $roots_fh = delete $repo->{roots_fh} // die 'BUG: no {roots_fh}';
 	seek($roots_fh, 0, SEEK_SET);
 	chomp(my @roots = PublicInbox::IO::read_all $roots_fh);
@@ -685,15 +678,28 @@ sub index_repo { # run_git cb
 	# shard_done fires when shard_index is done
 }
 
-sub get_roots ($$) {
+sub ct_fini { # run_git cb
+	my ($opt, $self, $git, $index_repo) = @_;
+	my ($ct) = split(/\s+/, ${$opt->{1}}); # drop TZ + LF
+	$git->{-repo}->{ct} = $ct + 0;
+}
+
+# TODO: also index gitweb.owner and the full fingerprint for grokmirror?
+sub prep_repo ($$) {
 	my ($self, $git) = @_;
 	return if $DO_QUIT;
+	my $index_repo = PublicInbox::OnDestroy->new(\&index_repo, $self, $git);
 	my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}';
 	sysseek($refs, 0, SEEK_SET);
 	open my $roots_fh, '+>', undef;
 	$git->{-repo}->{roots_fh} = $roots_fh;
 	run_git([ qw(rev-list --stdin --max-parents=0) ],
-		{ 0 => $refs, 1 => $roots_fh }, \&index_repo, $self, $git)
+		{ 0 => $refs, 1 => $roots_fh }, \&PublicInbox::Config::noop,
+		$self, $git, $index_repo);
+	run_git([ qw[for-each-ref --sort=-committerdate
+		--format=%(committerdate:raw) --count=1
+		refs/heads/ refs/tags/] ], undef, # capture like qx
+		\&ct_fini, $self, $git, $index_repo);
 }
 
 # for PublicInbox::SearchIdx `git patch-id' call and with_umask
@@ -1295,6 +1301,7 @@ sub cidx_run { # main entry point
 		init_join_prefork($self)
 	}
 	local @IDX_SHARDS = cidx_init($self); # forks workers
+	local $ANY_SHARD = -1;
 	local $self->{current_info} = '';
 	local $MY_SIG = {
 		CHLD => \&PublicInbox::DS::enqueue_reap,

  parent reply	other threads:[~2023-11-30 11:41 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-11-30 11:40 [PATCH 00/15] various cindex fixes + speedups Eric Wong
2023-11-30 11:40 ` [PATCH 01/15] cindex: fix store_repo+repo_stored on no-op Eric Wong
2023-11-30 11:40 ` [PATCH 02/15] codesearch: allow inbox count to exceed matches Eric Wong
2023-11-30 11:40 ` [PATCH 03/15] config: reject newlines consistently in dir names Eric Wong
2023-11-30 11:40 ` [PATCH 04/15] cindex: only create {-cidx_err} field on failures Eric Wong
2023-11-30 11:40 ` [PATCH 05/15] cindex: keep batch pipe for pruning SHA-256 repos Eric Wong
2023-11-30 11:40 ` [PATCH 06/15] cindex: store extensions.objectFormat with repo data Eric Wong
2023-11-30 21:36   ` Eric Wong
2023-11-30 11:41 ` [PATCH 07/15] git: share unlinked pack checking code with gcf2 Eric Wong
2023-11-30 11:41 ` [PATCH 08/15] cindex: skip getpid guard for most OnDestroy use Eric Wong
2023-11-30 11:41 ` [PATCH 09/15] spawn: drop IO layer support from redirects Eric Wong
2023-11-30 11:41 ` Eric Wong [this message]
2023-11-30 11:41 ` [PATCH 11/15] inbox: expire resources more aggressively Eric Wong
2023-11-30 11:41 ` [PATCH 12/15] git_async_cat: use git from "all" extindex if possible Eric Wong
2023-11-30 11:41 ` [PATCH 13/15] www_listing: support publicInbox.nameIsUrl Eric Wong
2023-12-01  1:29   ` Kyle Meyer
2023-12-01  2:01     ` [PATCH] doc: config: fix grammar for nameIsUrl Eric Wong
2023-11-30 11:41 ` [PATCH 14/15] inbox: shrink data structures for publicinbox.*.hide Eric Wong
2023-11-30 11:41 ` [PATCH 15/15] codesearch: use retry_reopen for WWW Eric Wong
2023-11-30 21:40   ` [PATCH v2] " Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231130114109.2577708-11-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).