From: Eric Wong <e@yhbt.net> To: meta@public-inbox.org Subject: [PATCH 18/20] searchidx: support async git check Date: Fri, 24 Jul 2020 05:56:04 +0000 [thread overview] Message-ID: <20200724055606.27332-19-e@yhbt.net> (raw) In-Reply-To: <20200724055606.27332-1-e@yhbt.net> This allows v1 indexing to run while the `cat-file --batch-check' process is waiting on high-latency storage. --- lib/PublicInbox/Git.pm | 72 +++++++++++++++++++++++++++++------- lib/PublicInbox/SearchIdx.pm | 23 ++++++++++-- 2 files changed, 78 insertions(+), 17 deletions(-) diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index 265524ffa..ffc464eb3 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -231,26 +231,71 @@ sub cat_file { $result->[0]; } -sub check { - my ($self, $obj) = @_; - _bidi_pipe($self, qw(--batch-check in_c out_c pid_c err_c)); - print { $self->{out_c} } $obj, "\n" or fail($self, "write error: $!"); - my $rbuf = ''; # TODO: async + {chk_rbuf} - chomp(my $line = my_readline($self->{in_c}, \$rbuf)); - my ($hex, $type, $size) = split(' ', $line); - - # Future versions of git.git may show 'ambiguous', but for now, +sub check_async_step ($$) { + my ($self, $inflight_c) = @_; + die 'BUG: inflight empty or odd' if scalar(@$inflight_c) < 3; + my ($req, $cb, $arg) = splice(@$inflight_c, 0, 3); + my $rbuf = delete($self->{rbuf_c}) // \(my $new = ''); + chomp(my $line = my_readline($self->{in_c}, $rbuf)); + my ($hex, $type, $size) = split(/ /, $line); + + # Future versions of git.git may have type=ambiguous, but for now, # we must handle 'dangling' below (and maybe some other oddball # stuff): # https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/ - return if $type eq 'missing' || $type eq 'ambiguous'; - if ($hex eq 'dangling' || $hex eq 'notdir' || $hex eq 'loop') { - my $ret = my_read($self->{in_c}, \$rbuf, $type + 1); + my $ret = my_read($self->{in_c}, $rbuf, $type + 1); fail($self, defined($ret) ? 
'read EOF' : "read: $!") if !$ret; - return; } + eval { $cb->($hex, $type, $size, $arg, $self) }; + warn "E: check($req) $@\n" if $@; + $self->{rbuf_c} = $rbuf if $$rbuf ne ''; +} + +sub check_async_wait ($) { + my ($self) = @_; + my $inflight_c = delete $self->{inflight_c} or return; + while (scalar(@$inflight_c)) { + check_async_step($self, $inflight_c); + } +} +sub check_async_begin ($) { + my ($self) = @_; + cleanup($self) if alternates_changed($self); + _bidi_pipe($self, qw(--batch-check in_c out_c pid_c err_c)); + die 'BUG: already in async check' if $self->{inflight_c}; + $self->{inflight_c} = []; +} + +sub check_async ($$$$) { + my ($self, $oid, $cb, $arg) = @_; + my $inflight_c = $self->{inflight_c} // check_async_begin($self); + if (scalar(@$inflight_c) >= MAX_INFLIGHT) { + check_async_step($self, $inflight_c); + } + print { $self->{out_c} } $oid, "\n" or fail($self, "write error: $!"); + push(@$inflight_c, $oid, $cb, $arg); +} + +sub _check_cb { # check_async callback + my ($hex, $type, $size, $result) = @_; + @$result = ($hex, $type, $size); +} + +sub check { + my ($self, $oid) = @_; + my $result = []; + check_async($self, $oid, \&_check_cb, $result); + check_async_wait($self); + my ($hex, $type, $size) = @$result; + + # Future versions of git.git may show 'ambiguous', but for now, + # we must handle 'dangling' below (and maybe some other oddball + # stuff): + # https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/ + return if $type eq 'missing' || $type eq 'ambiguous'; + return if $hex eq 'dangling' || $hex eq 'notdir' || $hex eq 'loop'; ($hex, $type, $size); } @@ -297,6 +342,7 @@ sub cleanup { my ($self) = @_; local $in_cleanup = 1; delete $self->{async_cat}; + check_async_wait($self); cat_async_wait($self); _destroy($self, qw(cat_rbuf in out pid)); _destroy($self, qw(chk_rbuf in_c out_c pid_c err_c)); diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 4d2e0da92..39dc1f874 100644 --- 
a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -563,6 +563,16 @@ sub too_big ($$) { 1; } +sub ck_size { # check_async cb for -index --max-size=... + my ($oid, $type, $size, $arg, $git) = @_; + (($type // '') eq 'blob') or die "E: bad $oid in $git->{git_dir}"; + if ($size <= $arg->{index_max_size}) { + $git->cat_async($oid, \&index_both, $arg); + } else { + warn "W: skipping $oid ($size > $arg->{index_max_size})\n"; + } +} + # only for v1 sub process_stack { my ($self, $stk, $sync, $batch_cb) = @_; @@ -580,13 +590,17 @@ sub process_stack { $git->cat_async($oid, \&unindex_both, $self); } } + $sync->{index_max_size} = $self->{ibx}->{index_max_size}; while (my ($f, $at, $ct, $oid) = $stk->pop_rec) { if ($f eq 'm') { - $sync->{autime} = $at; - $sync->{cotime} = $ct; - next if too_big($self, $oid); - $git->cat_async($oid, \&index_both, { %$sync }); + my $arg = { %$sync, autime => $at, cotime => $ct }; + if ($sync->{index_max_size}) { + $git->check_async($oid, \&ck_size, $arg); + } else { + $git->cat_async($oid, \&index_both, $arg); + } if ($max <= 0) { + $git->check_async_wait; $git->cat_async_wait; $max = $BATCH_BYTES; $batch_cb->($nr); @@ -595,6 +609,7 @@ sub process_stack { $git->cat_async($oid, \&unindex_both, $self); } } + $git->check_async_wait; $git->cat_async_wait; $batch_cb->($nr, $stk); }
next prev parent reply other threads:[~2020-07-24 5:56 UTC|newest] Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top 2020-07-24 5:55 [PATCH 00/20] indexing changes and new features Eric Wong 2020-07-24 5:55 ` [PATCH 01/20] index: support --rethread switch to fix old indices Eric Wong 2020-07-24 5:55 ` [PATCH 02/20] v2: index forwards (via `git log --reverse') Eric Wong 2020-07-24 5:55 ` [PATCH 03/20] v2writable: introduce idx_stack Eric Wong 2020-07-24 5:55 ` [PATCH 04/20] v2writable: index_sync: reduce fill_alternates calls Eric Wong 2020-07-24 5:55 ` [PATCH 05/20] v2writable: move {autime} and {cotime} into $sync state Eric Wong 2020-07-24 5:55 ` [PATCH 06/20] v2writable: allow >= 40 byte git object IDs Eric Wong 2020-07-24 5:55 ` [PATCH 07/20] v2writable: drop "EPOCH.git indexing $RANGE" progress Eric Wong 2020-07-24 5:55 ` [PATCH 08/20] use consistent {ibx} field for writable code paths Eric Wong 2020-07-24 5:55 ` [PATCH 09/20] search: avoid copying {inboxdir} Eric Wong 2020-07-24 5:55 ` [PATCH 10/20] v2writable: use read-only PublicInbox::Git for cat_file Eric Wong 2020-07-24 5:55 ` [PATCH 11/20] v2writable: get rid of {reindex_pipe} field Eric Wong 2020-07-24 5:55 ` [PATCH 12/20] v2writable: clarify "epoch" comment Eric Wong 2020-07-24 5:55 ` [PATCH 13/20] xapcmd: set {from} properly for v1 inboxes Eric Wong 2020-07-24 5:56 ` [PATCH 14/20] searchidx: rename _xdb_{acquire,release} => idx_ Eric Wong 2020-07-24 5:56 ` [PATCH 15/20] searchidx: make v1 indexing closer to v2 Eric Wong 2020-07-24 5:56 ` [PATCH 16/20] index+xcpdb: support --no-sync flag Eric Wong 2020-07-24 5:56 ` [PATCH 17/20] v2writable: share log2stack code with v1 Eric Wong 2020-07-24 5:56 ` Eric Wong [this message] 2020-07-24 5:56 ` [PATCH 19/20] searchidx: $batch_cb => v1_checkpoint Eric Wong 2020-07-24 5:56 ` [PATCH 20/20] v2writable: {unindexed} belongs in $sync state Eric Wong
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style List information: https://public-inbox.org/README * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20200724055606.27332-19-e@yhbt.net \ --to=e@yhbt.net \ --cc=meta@public-inbox.org \ --subject='Re: [PATCH 18/20] searchidx: support async git check' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Code repositories for project(s) associated with this inbox: https://80x24.org/public-inbox.git This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as the URLs for read-only IMAP folder(s) and NNTP newsgroup(s).