user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 10/15] cindex: speed up initial scan setup phase
  2023-11-30 11:40  7% [PATCH 00/15] various cindex fixes + speedups Eric Wong
@ 2023-11-30 11:41  5% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2023-11-30 11:41 UTC (permalink / raw)
  To: meta

This brings a no-op -cindex scan of a git.kernel.org mirror
down from 70s to 10s with a hot cache on a busy machine.

CPU-intensive SHA-256 fingerprinting of the `git show-ref'
result is now parallelized across shard workers.  Future changes
can move more of the initial scan setup phase into shard workers
for more parallelism.

But most of the speedup from skipping unchanged repos comes from
delaying the reading of commit times until we've seen that the
fingerprint is out-of-date, since reading commit times requires
a large amount of I/O compared to only reading refs for
fingerprints.
---
 lib/PublicInbox/CodeSearchIdx.pm | 99 +++++++++++++++++---------------
 1 file changed, 53 insertions(+), 46 deletions(-)

diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 555a1efe..ec0fc6e3 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -105,6 +105,7 @@ our (
 	@JOIN_DT, # YYYYmmddHHMMSS for dt:
 	$QRY_STR, # common query string for both code and inbox associations
 	$DUMP_IBX_WPIPE, # goes to sort(1)
+	$ANY_SHARD, # shard round-robin for scan fingerprinting
 	@OFF2ROOT,
 );
 
@@ -416,51 +417,42 @@ sub run_git {
 
 # this is different from the grokmirror-compatible fingerprint since we
 # only care about --heads (branches) and --tags, and not even their names
-sub fp_start ($$$) {
-	my ($self, $git, $prep_repo) = @_;
+sub fp_start ($$) {
+	my ($self, $git) = @_;
 	return if $DO_QUIT;
 	open my $refs, '+>', undef;
 	$git->{-repo}->{refs} = $refs;
-	run_git([qw(show-ref --heads --tags --hash)], { 1 => $refs },
-		\&fp_fini, $self, $git, $prep_repo);
-}
-
-sub fp_fini { # run_git cb
-	my (undef, $self, $git, $prep_repo) = @_;
-	my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}';
-	sysseek($refs, 0, SEEK_SET);
-	$git->{-repo}->{fp} = sha_all(256, $refs)->hexdigest;
+	my ($c, $p) = PublicInbox::PktOp->pair;
+	my $next_on_err = PublicInbox::OnDestroy->new(\&index_next, $self);
+	$c->{ops}->{fp_done} = [ $self, $git, $next_on_err ];
+	$IDX_SHARDS[++$ANY_SHARD % scalar(@IDX_SHARDS)]->wq_io_do('fp_async',
+					[ $p->{op_p}, $refs ], $git->{git_dir})
 }
 
-sub ct_start ($$$) {
-	my ($self, $git, $prep_repo) = @_;
-	return if $DO_QUIT;
-	run_git([ qw[for-each-ref --sort=-committerdate
-		--format=%(committerdate:raw) --count=1
-		refs/heads/ refs/tags/] ], undef, # capture like qx
-		\&ct_fini, $self, $git, $prep_repo);
+sub fp_async { # via wq_io_do in worker
+	my ($self, $git_dir) = @_;
+	my $op_p = delete $self->{0} // die 'BUG: no {0} op_p';
+	my $refs = delete $self->{1} // die 'BUG: no {1} refs';
+	my $git = PublicInbox::Git->new($git_dir);
+	run_git([qw(show-ref --heads --tags --hash)], { 1 => $refs },
+		\&fp_async_done, $self, $git, $op_p);
 }
 
-sub ct_fini { # run_git cb
-	my ($opt, $self, $git, $prep_repo) = @_;
-	my ($ct) = split(/\s+/, ${$opt->{1}}); # drop TZ + LF
-	$git->{-repo}->{ct} = $ct + 0;
+sub fp_async_done { # run_git cb from worker
+	my ($opt, $self, $git, $op_p) = @_;
+	my $refs = delete $opt->{1} // 'BUG: no {-repo}->{refs}';
+	sysseek($refs, 0, SEEK_SET);
+	send($op_p, 'fp_done '.sha_all(256, $refs)->hexdigest, 0);
 }
 
-# TODO: also index gitweb.owner and the full fingerprint for grokmirror?
-sub prep_repo ($$) {
-	my ($self, $git) = @_;
+sub fp_done { # called parent via PktOp by fp_async_done
+	my ($self, $git, $next_on_err, $hex) = @_;
+	$next_on_err->cancel;
 	return if $DO_QUIT;
-	return index_next($self) if $git->{-cidx_err};
-	my $repo = $git->{-repo} // die 'BUG: no {-repo}';
-	if (!defined($repo->{ct})) {
-		warn "W: $git->{git_dir} has no commits, skipping\n";
-		delete $git->{-repo};
-		return index_next($self);
-	}
+	$git->{-repo}->{fp} = $hex;
 	my $n = git_dir_hash($git->{git_dir}) % scalar(@RDONLY_XDB);
 	my $shard = bless { %$self, shard => $n }, ref($self);
-	$repo->{shard_n} = $n;
+	$git->{-repo}->{shard_n} = $n;
 	delete @$shard{qw(lockfh lock_path)};
 	local $shard->{xdb} = $RDONLY_XDB[$n] // die "BUG: shard[$n] undef";
 	$shard->retry_reopen(\&check_existing, $self, $git);
@@ -469,7 +461,7 @@ sub prep_repo ($$) {
 sub check_existing { # retry_reopen callback
 	my ($shard, $self, $git) = @_;
 	my @docids = $shard->docids_of_git_dir($git->{git_dir});
-	my $docid = shift(@docids) // return get_roots($self, $git);
+	my $docid = shift(@docids) // return prep_repo($self, $git); # new repo
 	my $doc = $shard->get_doc($docid) //
 			die "BUG: no #$docid ($git->{git_dir})";
 	my $old_fp = $REINDEX ? "\0invalid" : $doc->get_data;
@@ -482,7 +474,7 @@ sub check_existing { # retry_reopen callback
 		warn "BUG: $git->{git_dir} indexed multiple times, culling\n";
 		$git->{-repo}->{to_delete} = \@docids; # XXX needed?
 	}
-	get_roots($self, $git);
+	prep_repo($self, $git);
 }
 
 sub partition_refs ($$$) {
@@ -604,13 +596,9 @@ sub index_next ($) {
 	my ($self) = @_;
 	return if $DO_QUIT;
 	if ($IDXQ && @$IDXQ) {
-		index_repo(undef, $self, shift @$IDXQ);
+		index_repo($self, shift @$IDXQ);
 	} elsif ($SCANQ && @$SCANQ) {
-		my $git = shift @$SCANQ;
-		my $prep_repo = PublicInbox::OnDestroy->new(\&prep_repo,
-								$self, $git);
-		fp_start($self, $git, $prep_repo);
-		ct_start($self, $git, $prep_repo);
+		fp_start $self, shift @$SCANQ;
 	} elsif ($TMPDIR) {
 		delete $TODO{dump_roots_start};
 		delete $TODO{dump_ibx_start}; # runs OnDestroy once
@@ -649,12 +637,17 @@ sub index_done { # OnDestroy cb called when done indexing each code repo
 	# repo_stored will fire once store_repo is done
 }
 
-sub index_repo { # run_git cb
-	my (undef, $self, $git) = @_;
+sub index_repo {
+	my ($self, $git) = @_;
 	return if $DO_QUIT;
+	my $repo = $git->{-repo} // die 'BUG: no {-repo}';
 	return index_next($self) if $git->{-cidx_err};
+	if (!defined($repo->{ct})) {
+		warn "W: $git->{git_dir} has no commits, skipping\n";
+		return index_next($self);
+	}
 	return push(@$IDXQ, $git) if $REPO_CTX; # busy
-	my $repo = delete $git->{-repo} or return index_next($self);
+	delete $git->{-repo};
 	my $roots_fh = delete $repo->{roots_fh} // die 'BUG: no {roots_fh}';
 	seek($roots_fh, 0, SEEK_SET);
 	chomp(my @roots = PublicInbox::IO::read_all $roots_fh);
@@ -685,15 +678,28 @@ sub index_repo { # run_git cb
 	# shard_done fires when shard_index is done
 }
 
-sub get_roots ($$) {
+sub ct_fini { # run_git cb
+	my ($opt, $self, $git, $index_repo) = @_;
+	my ($ct) = split(/\s+/, ${$opt->{1}}); # drop TZ + LF
+	$git->{-repo}->{ct} = $ct + 0;
+}
+
+# TODO: also index gitweb.owner and the full fingerprint for grokmirror?
+sub prep_repo ($$) {
 	my ($self, $git) = @_;
 	return if $DO_QUIT;
+	my $index_repo = PublicInbox::OnDestroy->new(\&index_repo, $self, $git);
 	my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}';
 	sysseek($refs, 0, SEEK_SET);
 	open my $roots_fh, '+>', undef;
 	$git->{-repo}->{roots_fh} = $roots_fh;
 	run_git([ qw(rev-list --stdin --max-parents=0) ],
-		{ 0 => $refs, 1 => $roots_fh }, \&index_repo, $self, $git)
+		{ 0 => $refs, 1 => $roots_fh }, \&PublicInbox::Config::noop,
+		$self, $git, $index_repo);
+	run_git([ qw[for-each-ref --sort=-committerdate
+		--format=%(committerdate:raw) --count=1
+		refs/heads/ refs/tags/] ], undef, # capture like qx
+		\&ct_fini, $self, $git, $index_repo);
 }
 
 # for PublicInbox::SearchIdx `git patch-id' call and with_umask
@@ -1295,6 +1301,7 @@ sub cidx_run { # main entry point
 		init_join_prefork($self)
 	}
 	local @IDX_SHARDS = cidx_init($self); # forks workers
+	local $ANY_SHARD = -1;
 	local $self->{current_info} = '';
 	local $MY_SIG = {
 		CHLD => \&PublicInbox::DS::enqueue_reap,

^ permalink raw reply related	[relevance 5%]

* [PATCH 00/15] various cindex fixes + speedups
@ 2023-11-30 11:40  7% Eric Wong
  2023-11-30 11:41  5% ` [PATCH 10/15] cindex: speed up initial scan setup phase Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2023-11-30 11:40 UTC (permalink / raw)
  To: meta

Notable changes:

10/15 provides a huge speedup which will hopefully make
future development faster.

12/15 probably obsoletes libgit2 for extindex "all" users.

13/15 can save some memory with many inboxes while making
configuration easier.

Eric Wong (15):
  cindex: fix store_repo+repo_stored on no-op
  codesearch: allow inbox count to exceed matches
  config: reject newlines consistently in dir names
  cindex: only create {-cidx_err} field on failures
  cindex: keep batch pipe for pruning SHA-256 repos
  cindex: store extensions.objectFormat with repo data
  git: share unlinked pack checking code with gcf2
  cindex: skip getpid guard for most OnDestroy use
  spawn: drop IO layer support from redirects
  cindex: speed up initial scan setup phase
  inbox: expire resources more aggressively
  git_async_cat: use git from "all" extindex if possible
  www_listing: support publicInbox.nameIsUrl
  inbox: shrink data structures for publicinbox.*.hide
  codesearch: use retry_reopen for WWW

 Documentation/public-inbox-config.pod |  19 +-
 lib/PublicInbox/CodeSearch.pm         |  54 +++--
 lib/PublicInbox/CodeSearchIdx.pm      | 286 ++++++++++++++++----------
 lib/PublicInbox/Config.pm             |  32 ++-
 lib/PublicInbox/Gcf2.pm               |  16 +-
 lib/PublicInbox/Git.pm                |  27 +--
 lib/PublicInbox/GitAsyncCat.pm        |   8 +-
 lib/PublicInbox/Inbox.pm              |  32 +--
 lib/PublicInbox/MailDiff.pm           |   3 +-
 lib/PublicInbox/SearchIdx.pm          |   5 +-
 lib/PublicInbox/Spawn.pm              |  32 +--
 lib/PublicInbox/WwwListing.pm         |  21 +-
 12 files changed, 303 insertions(+), 232 deletions(-)


^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2023-11-30 11:40  7% [PATCH 00/15] various cindex fixes + speedups Eric Wong
2023-11-30 11:41  5% ` [PATCH 10/15] cindex: speed up initial scan setup phase Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).