* [PATCH 10/15] cindex: speed up initial scan setup phase
2023-11-30 11:40 7% [PATCH 00/15] various cindex fixes + speedups Eric Wong
@ 2023-11-30 11:41 5% ` Eric Wong
0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2023-11-30 11:41 UTC (permalink / raw)
To: meta
This brings a no-op -cindex scan of a git.kernel.org mirror
down from 70s to 10s with a hot cache on a busy machine.
CPU-intensive SHA-256 fingerprinting of the `git show-ref'
result can be parallelized on shard workers. Future changes can
move more of the initial scan setup phase into shard workers for
more parallelism.
But most of the performance gain for skipping unchanged repos
comes from delaying the commit time read until we've seen that
the fingerprint is out-of-date, since reading commit times
requires a large amount of I/O compared to only reading refs
for fingerprints.
---
lib/PublicInbox/CodeSearchIdx.pm | 99 +++++++++++++++++---------------
1 file changed, 53 insertions(+), 46 deletions(-)
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 555a1efe..ec0fc6e3 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -105,6 +105,7 @@ our (
@JOIN_DT, # YYYYmmddHHMMSS for dt:
$QRY_STR, # common query string for both code and inbox associations
$DUMP_IBX_WPIPE, # goes to sort(1)
+ $ANY_SHARD, # shard round-robin for scan fingerprinting
@OFF2ROOT,
);
@@ -416,51 +417,42 @@ sub run_git {
# this is different from the grokmirror-compatible fingerprint since we
# only care about --heads (branches) and --tags, and not even their names
-sub fp_start ($$$) {
- my ($self, $git, $prep_repo) = @_;
+sub fp_start ($$) {
+ my ($self, $git) = @_;
return if $DO_QUIT;
open my $refs, '+>', undef;
$git->{-repo}->{refs} = $refs;
- run_git([qw(show-ref --heads --tags --hash)], { 1 => $refs },
- \&fp_fini, $self, $git, $prep_repo);
-}
-
-sub fp_fini { # run_git cb
- my (undef, $self, $git, $prep_repo) = @_;
- my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}';
- sysseek($refs, 0, SEEK_SET);
- $git->{-repo}->{fp} = sha_all(256, $refs)->hexdigest;
+ my ($c, $p) = PublicInbox::PktOp->pair;
+ my $next_on_err = PublicInbox::OnDestroy->new(\&index_next, $self);
+ $c->{ops}->{fp_done} = [ $self, $git, $next_on_err ];
+ $IDX_SHARDS[++$ANY_SHARD % scalar(@IDX_SHARDS)]->wq_io_do('fp_async',
+ [ $p->{op_p}, $refs ], $git->{git_dir})
}
-sub ct_start ($$$) {
- my ($self, $git, $prep_repo) = @_;
- return if $DO_QUIT;
- run_git([ qw[for-each-ref --sort=-committerdate
- --format=%(committerdate:raw) --count=1
- refs/heads/ refs/tags/] ], undef, # capture like qx
- \&ct_fini, $self, $git, $prep_repo);
+sub fp_async { # via wq_io_do in worker: starts `git show-ref' for $git_dir to feed the fingerprint
+ my ($self, $git_dir) = @_;
+ my $op_p = delete $self->{0} // die 'BUG: no {0} op_p'; # presumably the $p->{op_p} handle passed by fp_start -- confirm
+ my $refs = delete $self->{1} // die 'BUG: no {1} refs'; # presumably the anonymous temp file opened by fp_start -- confirm
+ my $git = PublicInbox::Git->new($git_dir);
+ run_git([qw(show-ref --heads --tags --hash)], { 1 => $refs }, # { 1 => $refs }: looks like a stdout redirect into $refs -- confirm
+ \&fp_async_done, $self, $git, $op_p); # fp_async_done is the run_git callback; it gets $op_p to report the result
}
-sub ct_fini { # run_git cb
- my ($opt, $self, $git, $prep_repo) = @_;
- my ($ct) = split(/\s+/, ${$opt->{1}}); # drop TZ + LF
- $git->{-repo}->{ct} = $ct + 0;
+# run_git callback, run in the shard worker for the `git show-ref' started
+# by fp_async.  Takes the refs handle out of $opt->{1}, rewinds it,
+# computes the SHA-256 hex digest of its contents and sends
+# "fp_done <hexdigest>" over $op_p.  $self and $git are not used here.
+sub fp_async_done { # run_git cb from worker
+ my ($opt, $self, $git, $op_p) = @_;
+ # a missing handle is a bug: die with the message rather than letting
+ # `//' fall back to the message string as if it were the handle
+ my $refs = delete $opt->{1} // die 'BUG: no {-repo}->{refs}';
+ sysseek($refs, 0, SEEK_SET); # rewind before hashing what git wrote
+ send($op_p, 'fp_done '.sha_all(256, $refs)->hexdigest, 0);
}
-# TODO: also index gitweb.owner and the full fingerprint for grokmirror?
-sub prep_repo ($$) {
- my ($self, $git) = @_;
+sub fp_done { # called in parent via PktOp by fp_async_done
+ my ($self, $git, $next_on_err, $hex) = @_;
+ $next_on_err->cancel;
return if $DO_QUIT;
- return index_next($self) if $git->{-cidx_err};
- my $repo = $git->{-repo} // die 'BUG: no {-repo}';
- if (!defined($repo->{ct})) {
- warn "W: $git->{git_dir} has no commits, skipping\n";
- delete $git->{-repo};
- return index_next($self);
- }
+ $git->{-repo}->{fp} = $hex;
my $n = git_dir_hash($git->{git_dir}) % scalar(@RDONLY_XDB);
my $shard = bless { %$self, shard => $n }, ref($self);
- $repo->{shard_n} = $n;
+ $git->{-repo}->{shard_n} = $n;
delete @$shard{qw(lockfh lock_path)};
local $shard->{xdb} = $RDONLY_XDB[$n] // die "BUG: shard[$n] undef";
$shard->retry_reopen(\&check_existing, $self, $git);
@@ -469,7 +461,7 @@ sub prep_repo ($$) {
sub check_existing { # retry_reopen callback
my ($shard, $self, $git) = @_;
my @docids = $shard->docids_of_git_dir($git->{git_dir});
- my $docid = shift(@docids) // return get_roots($self, $git);
+ my $docid = shift(@docids) // return prep_repo($self, $git); # new repo
my $doc = $shard->get_doc($docid) //
die "BUG: no #$docid ($git->{git_dir})";
my $old_fp = $REINDEX ? "\0invalid" : $doc->get_data;
@@ -482,7 +474,7 @@ sub check_existing { # retry_reopen callback
warn "BUG: $git->{git_dir} indexed multiple times, culling\n";
$git->{-repo}->{to_delete} = \@docids; # XXX needed?
}
- get_roots($self, $git);
+ prep_repo($self, $git);
}
sub partition_refs ($$$) {
@@ -604,13 +596,9 @@ sub index_next ($) {
my ($self) = @_;
return if $DO_QUIT;
if ($IDXQ && @$IDXQ) {
- index_repo(undef, $self, shift @$IDXQ);
+ index_repo($self, shift @$IDXQ);
} elsif ($SCANQ && @$SCANQ) {
- my $git = shift @$SCANQ;
- my $prep_repo = PublicInbox::OnDestroy->new(\&prep_repo,
- $self, $git);
- fp_start($self, $git, $prep_repo);
- ct_start($self, $git, $prep_repo);
+ fp_start $self, shift @$SCANQ;
} elsif ($TMPDIR) {
delete $TODO{dump_roots_start};
delete $TODO{dump_ibx_start}; # runs OnDestroy once
@@ -649,12 +637,17 @@ sub index_done { # OnDestroy cb called when done indexing each code repo
# repo_stored will fire once store_repo is done
}
-sub index_repo { # run_git cb
- my (undef, $self, $git) = @_;
+sub index_repo {
+ my ($self, $git) = @_;
return if $DO_QUIT;
+ my $repo = $git->{-repo} // die 'BUG: no {-repo}';
return index_next($self) if $git->{-cidx_err};
+ if (!defined($repo->{ct})) {
+ warn "W: $git->{git_dir} has no commits, skipping\n";
+ return index_next($self);
+ }
return push(@$IDXQ, $git) if $REPO_CTX; # busy
- my $repo = delete $git->{-repo} or return index_next($self);
+ delete $git->{-repo};
my $roots_fh = delete $repo->{roots_fh} // die 'BUG: no {roots_fh}';
seek($roots_fh, 0, SEEK_SET);
chomp(my @roots = PublicInbox::IO::read_all $roots_fh);
@@ -685,15 +678,28 @@ sub index_repo { # run_git cb
# shard_done fires when shard_index is done
}
-sub get_roots ($$) {
+sub ct_fini { # run_git cb: stores a commit time in $git->{-repo}->{ct}
+ my ($opt, $self, $git, $index_repo) = @_; # $index_repo is unused here; presumably passed so its OnDestroy fires only after this cb -- confirm
+ my ($ct) = split(/\s+/, ${$opt->{1}}); # drop TZ + LF; keeps only the first whitespace-separated field of the captured output
+ $git->{-repo}->{ct} = $ct + 0; # "+ 0" numifies; NOTE(review): an undef $ct (empty output) would be stored as 0, not undef -- confirm the "no commits" check in index_repo can still trigger
+}
+
+# TODO: also index gitweb.owner and the full fingerprint for grokmirror?
+sub prep_repo ($$) {
my ($self, $git) = @_;
return if $DO_QUIT;
+ my $index_repo = PublicInbox::OnDestroy->new(\&index_repo, $self, $git);
my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}';
sysseek($refs, 0, SEEK_SET);
open my $roots_fh, '+>', undef;
$git->{-repo}->{roots_fh} = $roots_fh;
run_git([ qw(rev-list --stdin --max-parents=0) ],
- { 0 => $refs, 1 => $roots_fh }, \&index_repo, $self, $git)
+ { 0 => $refs, 1 => $roots_fh }, \&PublicInbox::Config::noop,
+ $self, $git, $index_repo);
+ run_git([ qw[for-each-ref --sort=-committerdate
+ --format=%(committerdate:raw) --count=1
+ refs/heads/ refs/tags/] ], undef, # capture like qx
+ \&ct_fini, $self, $git, $index_repo);
}
# for PublicInbox::SearchIdx `git patch-id' call and with_umask
@@ -1295,6 +1301,7 @@ sub cidx_run { # main entry point
init_join_prefork($self)
}
local @IDX_SHARDS = cidx_init($self); # forks workers
+ local $ANY_SHARD = -1;
local $self->{current_info} = '';
local $MY_SIG = {
CHLD => \&PublicInbox::DS::enqueue_reap,
^ permalink raw reply related [relevance 5%]
* [PATCH 00/15] various cindex fixes + speedups
@ 2023-11-30 11:40 7% Eric Wong
2023-11-30 11:41 5% ` [PATCH 10/15] cindex: speed up initial scan setup phase Eric Wong
0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2023-11-30 11:40 UTC (permalink / raw)
To: meta
Notable changes:
10/15 provides a huge speedup which will hopefully make
future developments faster.
12/15 probably obsoletes libgit2 for extindex "all" users.
13/15 can save some memory with many inboxes while making
configuration easier.
Eric Wong (15):
cindex: fix store_repo+repo_stored on no-op
codesearch: allow inbox count to exceed matches
config: reject newlines consistently in dir names
cindex: only create {-cidx_err} field on failures
cindex: keep batch pipe for pruning SHA-256 repos
cindex: store extensions.objectFormat with repo data
git: share unlinked pack checking code with gcf2
cindex: skip getpid guard for most OnDestroy use
spawn: drop IO layer support from redirects
cindex: speed up initial scan setup phase
inbox: expire resources more aggressively
git_async_cat: use git from "all" extindex if possible
www_listing: support publicInbox.nameIsUrl
inbox: shrink data structures for publicinbox.*.hide
codesearch: use retry_reopen for WWW
Documentation/public-inbox-config.pod | 19 +-
lib/PublicInbox/CodeSearch.pm | 54 +++--
lib/PublicInbox/CodeSearchIdx.pm | 286 ++++++++++++++++----------
lib/PublicInbox/Config.pm | 32 ++-
lib/PublicInbox/Gcf2.pm | 16 +-
lib/PublicInbox/Git.pm | 27 +--
lib/PublicInbox/GitAsyncCat.pm | 8 +-
lib/PublicInbox/Inbox.pm | 32 +--
lib/PublicInbox/MailDiff.pm | 3 +-
lib/PublicInbox/SearchIdx.pm | 5 +-
lib/PublicInbox/Spawn.pm | 32 +--
lib/PublicInbox/WwwListing.pm | 21 +-
12 files changed, 303 insertions(+), 232 deletions(-)
^ permalink raw reply [relevance 7%]
Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2023-11-30 11:40 7% [PATCH 00/15] various cindex fixes + speedups Eric Wong
2023-11-30 11:41 5% ` [PATCH 10/15] cindex: speed up initial scan setup phase Eric Wong
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).