* [PATCH 18/20] searchidx: support async git check
2020-07-24 5:55 7% [PATCH 00/20] indexing changes and new features Eric Wong
@ 2020-07-24 5:56 7% ` Eric Wong
0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-07-24 5:56 UTC (permalink / raw)
To: meta
This allows v1 indexing to run while the `cat-file --batch-check'
process is waiting on high-latency storage.
---
lib/PublicInbox/Git.pm | 72 +++++++++++++++++++++++++++++-------
lib/PublicInbox/SearchIdx.pm | 23 ++++++++++--
2 files changed, 78 insertions(+), 17 deletions(-)
diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm
index 265524ffa..ffc464eb3 100644
--- a/lib/PublicInbox/Git.pm
+++ b/lib/PublicInbox/Git.pm
@@ -231,26 +231,71 @@ sub cat_file {
$result->[0];
}
-sub check {
- my ($self, $obj) = @_;
- _bidi_pipe($self, qw(--batch-check in_c out_c pid_c err_c));
- print { $self->{out_c} } $obj, "\n" or fail($self, "write error: $!");
- my $rbuf = ''; # TODO: async + {chk_rbuf}
- chomp(my $line = my_readline($self->{in_c}, \$rbuf));
- my ($hex, $type, $size) = split(' ', $line);
-
- # Future versions of git.git may show 'ambiguous', but for now,
+sub check_async_step ($$) {
+ my ($self, $inflight_c) = @_;
+ die 'BUG: inflight empty or odd' if scalar(@$inflight_c) < 3;
+ my ($req, $cb, $arg) = splice(@$inflight_c, 0, 3);
+ my $rbuf = delete($self->{rbuf_c}) // \(my $new = '');
+ chomp(my $line = my_readline($self->{in_c}, $rbuf));
+ my ($hex, $type, $size) = split(/ /, $line);
+
+ # Future versions of git.git may have type=ambiguous, but for now,
# we must handle 'dangling' below (and maybe some other oddball
# stuff):
# https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/
- return if $type eq 'missing' || $type eq 'ambiguous';
-
if ($hex eq 'dangling' || $hex eq 'notdir' || $hex eq 'loop') {
- my $ret = my_read($self->{in_c}, \$rbuf, $type + 1);
+ my $ret = my_read($self->{in_c}, $rbuf, $type + 1);
fail($self, defined($ret) ? 'read EOF' : "read: $!") if !$ret;
- return;
}
+ eval { $cb->($hex, $type, $size, $arg, $self) };
+ warn "E: check($req) $@\n" if $@;
+ $self->{rbuf_c} = $rbuf if $$rbuf ne '';
+}
+
+sub check_async_wait ($) {
+ my ($self) = @_;
+ my $inflight_c = delete $self->{inflight_c} or return;
+ while (scalar(@$inflight_c)) {
+ check_async_step($self, $inflight_c);
+ }
+}
+sub check_async_begin ($) {
+ my ($self) = @_;
+ cleanup($self) if alternates_changed($self);
+ _bidi_pipe($self, qw(--batch-check in_c out_c pid_c err_c));
+ die 'BUG: already in async check' if $self->{inflight_c};
+ $self->{inflight_c} = [];
+}
+
+sub check_async ($$$$) {
+ my ($self, $oid, $cb, $arg) = @_;
+ my $inflight_c = $self->{inflight_c} // check_async_begin($self);
+ if (scalar(@$inflight_c) >= MAX_INFLIGHT) {
+ check_async_step($self, $inflight_c);
+ }
+ print { $self->{out_c} } $oid, "\n" or fail($self, "write error: $!");
+ push(@$inflight_c, $oid, $cb, $arg);
+}
+
+sub _check_cb { # check_async callback
+ my ($hex, $type, $size, $result) = @_;
+ @$result = ($hex, $type, $size);
+}
+
+sub check {
+ my ($self, $oid) = @_;
+ my $result = [];
+ check_async($self, $oid, \&_check_cb, $result);
+ check_async_wait($self);
+ my ($hex, $type, $size) = @$result;
+
+ # Future versions of git.git may show 'ambiguous', but for now,
+ # we must handle 'dangling' below (and maybe some other oddball
+ # stuff):
+ # https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/
+ return if $type eq 'missing' || $type eq 'ambiguous';
+ return if $hex eq 'dangling' || $hex eq 'notdir' || $hex eq 'loop';
($hex, $type, $size);
}
@@ -297,6 +342,7 @@ sub cleanup {
my ($self) = @_;
local $in_cleanup = 1;
delete $self->{async_cat};
+ check_async_wait($self);
cat_async_wait($self);
_destroy($self, qw(cat_rbuf in out pid));
_destroy($self, qw(chk_rbuf in_c out_c pid_c err_c));
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 4d2e0da92..39dc1f874 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -563,6 +563,16 @@ sub too_big ($$) {
1;
}
+sub ck_size { # check_async cb for -index --max-size=...
+ my ($oid, $type, $size, $arg, $git) = @_;
+ (($type // '') eq 'blob') or die "E: bad $oid in $git->{git_dir}";
+ if ($size <= $arg->{index_max_size}) {
+ $git->cat_async($oid, \&index_both, $arg);
+ } else {
+ warn "W: skipping $oid ($size > $arg->{index_max_size})\n";
+ }
+}
+
# only for v1
sub process_stack {
my ($self, $stk, $sync, $batch_cb) = @_;
@@ -580,13 +590,17 @@ sub process_stack {
$git->cat_async($oid, \&unindex_both, $self);
}
}
+ $sync->{index_max_size} = $self->{ibx}->{index_max_size};
while (my ($f, $at, $ct, $oid) = $stk->pop_rec) {
if ($f eq 'm') {
- $sync->{autime} = $at;
- $sync->{cotime} = $ct;
- next if too_big($self, $oid);
- $git->cat_async($oid, \&index_both, { %$sync });
+ my $arg = { %$sync, autime => $at, cotime => $ct };
+ if ($sync->{index_max_size}) {
+ $git->check_async($oid, \&ck_size, $arg);
+ } else {
+ $git->cat_async($oid, \&index_both, $arg);
+ }
if ($max <= 0) {
+ $git->check_async_wait;
$git->cat_async_wait;
$max = $BATCH_BYTES;
$batch_cb->($nr);
@@ -595,6 +609,7 @@ sub process_stack {
$git->cat_async($oid, \&unindex_both, $self);
}
}
+ $git->check_async_wait;
$git->cat_async_wait;
$batch_cb->($nr, $stk);
}
^ permalink raw reply related [relevance 7%]
* [PATCH 00/20] indexing changes and new features
@ 2020-07-24 5:55 7% Eric Wong
2020-07-24 5:56 7% ` [PATCH 18/20] searchidx: support async git check Eric Wong
0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-07-24 5:55 UTC (permalink / raw)
To: meta
--rethread and --no-sync options are now supported in
public-inbox-index. --no-sync should be nice for users
of FSes with poor fsync(2) performance.
Now: I also wonder if --no-sync is a bad name since we
also use "sync" to mean synchronising indices. Perhaps
--no-fsync would be a better name, though technically
SQLite and Xapian use fdatasync(2), nowadays.
Some of this is prep work for exposing THREADID via IMAP (and
JMAP) to aid in searching.
Since THREADID (`over.tid') will be exposed in a user-visible
way, I'm finally giving up on using the default (reverse
chronological) log order for indexing to ensure THREADID
ascends for newer threads.
This also simplifies the indexing code significantly.
To avoid pinning huge amounts of RAM, the working space is held
in an IdxStack temporary file. This further simplifies our code
since we no longer have to worry about old versions of Xapian
that did not use FD_CLOEXEC.
There's still more work on the horizon, here...
Eric Wong (20):
index: support --rethread switch to fix old indices
v2: index forwards (via `git log --reverse')
v2writable: introduce idx_stack
v2writable: index_sync: reduce fill_alternates calls
v2writable: move {autime} and {cotime} into $sync state
v2writable: allow >= 40 byte git object IDs
v2writable: drop "EPOCH.git indexing $RANGE" progress message
use consistent {ibx} field for writable code paths
search: avoid copying {inboxdir}
v2writable: use read-only PublicInbox::Git for cat_file
v2writable: get rid of {reindex_pipe} field
v2writable: clarify "epoch" for {last_commits}
xapcmd: set {from} properly for v1 inboxes
searchidx: rename _xdb_{acquire,release} => idx_
searchidx: make v1 indexing closer to v2
index+xcpdb: support --no-sync flag
v2writable: share log2stack code with v1
searchidx: support async git check
searchidx: $batch_cb => v1_checkpoint
v2writable: {unindexed} belongs in $sync state
Documentation/public-inbox-index.pod | 30 +-
Documentation/public-inbox-xcpdb.pod | 6 +
MANIFEST | 3 +-
lib/PublicInbox/Git.pm | 72 ++++-
lib/PublicInbox/IdxStack.pm | 52 ++++
lib/PublicInbox/Import.pm | 6 +-
lib/PublicInbox/Msgmap.pm | 21 +-
lib/PublicInbox/MultiMidQueue.pm | 62 ----
lib/PublicInbox/Over.pm | 1 +
lib/PublicInbox/OverIdx.pm | 78 ++++-
lib/PublicInbox/Search.pm | 25 +-
lib/PublicInbox/SearchIdx.pm | 384 ++++++++++++------------
lib/PublicInbox/SearchIdxShard.pm | 12 +-
lib/PublicInbox/Smsg.pm | 8 +-
lib/PublicInbox/V2Writable.pm | 427 +++++++++------------------
lib/PublicInbox/Xapcmd.pm | 10 +-
script/public-inbox-index | 5 +-
script/public-inbox-xcpdb | 4 +-
t/idx_stack.t | 56 ++++
t/inbox_idle.t | 4 +-
t/search.t | 4 +-
t/v1reindex.t | 36 ++-
t/v2reindex.t | 45 +++
23 files changed, 744 insertions(+), 607 deletions(-)
create mode 100644 lib/PublicInbox/IdxStack.pm
delete mode 100644 lib/PublicInbox/MultiMidQueue.pm
create mode 100644 t/idx_stack.t
^ permalink raw reply [relevance 7%]
Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-07-24 5:55 7% [PATCH 00/20] indexing changes and new features Eric Wong
2020-07-24 5:56 7% ` [PATCH 18/20] searchidx: support async git check Eric Wong
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).