user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 18/20] searchidx: support async git check
Date: Fri, 24 Jul 2020 05:56:04 +0000	[thread overview]
Message-ID: <20200724055606.27332-19-e@yhbt.net> (raw)
In-Reply-To: <20200724055606.27332-1-e@yhbt.net>

This allows v1 indexing to run while the `cat-file --batch-check'
process is waiting on high-latency storage.
---
 lib/PublicInbox/Git.pm       | 72 +++++++++++++++++++++++++++++-------
 lib/PublicInbox/SearchIdx.pm | 23 ++++++++++--
 2 files changed, 78 insertions(+), 17 deletions(-)

diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm
index 265524ffa..ffc464eb3 100644
--- a/lib/PublicInbox/Git.pm
+++ b/lib/PublicInbox/Git.pm
@@ -231,26 +231,71 @@ sub cat_file {
 	$result->[0];
 }
 
-sub check {
-	my ($self, $obj) = @_;
-	_bidi_pipe($self, qw(--batch-check in_c out_c pid_c err_c));
-	print { $self->{out_c} } $obj, "\n" or fail($self, "write error: $!");
-	my $rbuf = ''; # TODO: async + {chk_rbuf}
-	chomp(my $line = my_readline($self->{in_c}, \$rbuf));
-	my ($hex, $type, $size) = split(' ', $line);
-
-	# Future versions of git.git may show 'ambiguous', but for now,
+sub check_async_step ($$) {
+	my ($self, $inflight_c) = @_;
+	die 'BUG: inflight empty or odd' if scalar(@$inflight_c) < 3;
+	my ($req, $cb, $arg) = splice(@$inflight_c, 0, 3);
+	my $rbuf = delete($self->{rbuf_c}) // \(my $new = '');
+	chomp(my $line = my_readline($self->{in_c}, $rbuf));
+	my ($hex, $type, $size) = split(/ /, $line);
+
+	# Future versions of git.git may have type=ambiguous, but for now,
 	# we must handle 'dangling' below (and maybe some other oddball
 	# stuff):
 	# https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/
-	return if $type eq 'missing' || $type eq 'ambiguous';
-
 	if ($hex eq 'dangling' || $hex eq 'notdir' || $hex eq 'loop') {
-		my $ret = my_read($self->{in_c}, \$rbuf, $type + 1);
+		my $ret = my_read($self->{in_c}, $rbuf, $type + 1);
 		fail($self, defined($ret) ? 'read EOF' : "read: $!") if !$ret;
-		return;
 	}
+	eval { $cb->($hex, $type, $size, $arg, $self) };
+	warn "E: check($req) $@\n" if $@;
+	$self->{rbuf_c} = $rbuf if $$rbuf ne '';
+}
+
+sub check_async_wait ($) {
+	my ($self) = @_;
+	my $inflight_c = delete $self->{inflight_c} or return;
+	while (scalar(@$inflight_c)) {
+		check_async_step($self, $inflight_c);
+	}
+}
 
+sub check_async_begin ($) {
+	my ($self) = @_;
+	cleanup($self) if alternates_changed($self);
+	_bidi_pipe($self, qw(--batch-check in_c out_c pid_c err_c));
+	die 'BUG: already in async check' if $self->{inflight_c};
+	$self->{inflight_c} = [];
+}
+
+sub check_async ($$$$) {
+	my ($self, $oid, $cb, $arg) = @_;
+	my $inflight_c = $self->{inflight_c} // check_async_begin($self);
+	if (scalar(@$inflight_c) >= MAX_INFLIGHT) {
+		check_async_step($self, $inflight_c);
+	}
+	print { $self->{out_c} } $oid, "\n" or fail($self, "write error: $!");
+	push(@$inflight_c, $oid, $cb, $arg);
+}
+
+sub _check_cb { # check_async callback
+	my ($hex, $type, $size, $result) = @_;
+	@$result = ($hex, $type, $size);
+}
+
+sub check {
+	my ($self, $oid) = @_;
+	my $result = [];
+	check_async($self, $oid, \&_check_cb, $result);
+	check_async_wait($self);
+	my ($hex, $type, $size) = @$result;
+
+	# Future versions of git.git may show 'ambiguous', but for now,
+	# we must handle 'dangling' below (and maybe some other oddball
+	# stuff):
+	# https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/
+	return if $type eq 'missing' || $type eq 'ambiguous';
+	return if $hex eq 'dangling' || $hex eq 'notdir' || $hex eq 'loop';
 	($hex, $type, $size);
 }
 
@@ -297,6 +342,7 @@ sub cleanup {
 	my ($self) = @_;
 	local $in_cleanup = 1;
 	delete $self->{async_cat};
+	check_async_wait($self);
 	cat_async_wait($self);
 	_destroy($self, qw(cat_rbuf in out pid));
 	_destroy($self, qw(chk_rbuf in_c out_c pid_c err_c));
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 4d2e0da92..39dc1f874 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -563,6 +563,16 @@ sub too_big ($$) {
 	1;
 }
 
+sub ck_size { # check_async cb for -index --max-size=...
+	my ($oid, $type, $size, $arg, $git) = @_;
+	(($type // '') eq 'blob') or die "E: bad $oid in $git->{git_dir}";
+	if ($size <= $arg->{index_max_size}) {
+		$git->cat_async($oid, \&index_both, $arg);
+	} else {
+		warn "W: skipping $oid ($size > $arg->{index_max_size})\n";
+	}
+}
+
 # only for v1
 sub process_stack {
 	my ($self, $stk, $sync, $batch_cb) = @_;
@@ -580,13 +590,17 @@ sub process_stack {
 			$git->cat_async($oid, \&unindex_both, $self);
 		}
 	}
+	$sync->{index_max_size} = $self->{ibx}->{index_max_size};
 	while (my ($f, $at, $ct, $oid) = $stk->pop_rec) {
 		if ($f eq 'm') {
-			$sync->{autime} = $at;
-			$sync->{cotime} = $ct;
-			next if too_big($self, $oid);
-			$git->cat_async($oid, \&index_both, { %$sync });
+			my $arg = { %$sync, autime => $at, cotime => $ct };
+			if ($sync->{index_max_size}) {
+				$git->check_async($oid, \&ck_size, $arg);
+			} else {
+				$git->cat_async($oid, \&index_both, $arg);
+			}
 			if ($max <= 0) {
+				$git->check_async_wait;
 				$git->cat_async_wait;
 				$max = $BATCH_BYTES;
 				$batch_cb->($nr);
@@ -595,6 +609,7 @@ sub process_stack {
 			$git->cat_async($oid, \&unindex_both, $self);
 		}
 	}
+	$git->check_async_wait;
 	$git->cat_async_wait;
 	$batch_cb->($nr, $stk);
 }

  parent reply	other threads:[~2020-07-24  5:56 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-24  5:55 [PATCH 00/20] indexing changes and new features Eric Wong
2020-07-24  5:55 ` [PATCH 01/20] index: support --rethread switch to fix old indices Eric Wong
2020-07-24  5:55 ` [PATCH 02/20] v2: index forwards (via `git log --reverse') Eric Wong
2020-07-24  5:55 ` [PATCH 03/20] v2writable: introduce idx_stack Eric Wong
2020-07-24  5:55 ` [PATCH 04/20] v2writable: index_sync: reduce fill_alternates calls Eric Wong
2020-07-24  5:55 ` [PATCH 05/20] v2writable: move {autime} and {cotime} into $sync state Eric Wong
2020-07-24  5:55 ` [PATCH 06/20] v2writable: allow >= 40 byte git object IDs Eric Wong
2020-07-24  5:55 ` [PATCH 07/20] v2writable: drop "EPOCH.git indexing $RANGE" progress Eric Wong
2020-07-24  5:55 ` [PATCH 08/20] use consistent {ibx} field for writable code paths Eric Wong
2020-07-24  5:55 ` [PATCH 09/20] search: avoid copying {inboxdir} Eric Wong
2020-07-24  5:55 ` [PATCH 10/20] v2writable: use read-only PublicInbox::Git for cat_file Eric Wong
2020-07-24  5:55 ` [PATCH 11/20] v2writable: get rid of {reindex_pipe} field Eric Wong
2020-07-24  5:55 ` [PATCH 12/20] v2writable: clarify "epoch" comment Eric Wong
2020-07-24  5:55 ` [PATCH 13/20] xapcmd: set {from} properly for v1 inboxes Eric Wong
2020-07-24  5:56 ` [PATCH 14/20] searchidx: rename _xdb_{acquire,release} => idx_ Eric Wong
2020-07-24  5:56 ` [PATCH 15/20] searchidx: make v1 indexing closer to v2 Eric Wong
2020-07-24  5:56 ` [PATCH 16/20] index+xcpdb: support --no-sync flag Eric Wong
2020-07-24  5:56 ` [PATCH 17/20] v2writable: share log2stack code with v1 Eric Wong
2020-07-24  5:56 ` Eric Wong [this message]
2020-07-24  5:56 ` [PATCH 19/20] searchidx: $batch_cb => v1_checkpoint Eric Wong
2020-07-24  5:56 ` [PATCH 20/20] v2writable: {unindexed} belongs in $sync state Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200724055606.27332-19-e@yhbt.net \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    --subject='Re: [PATCH 18/20] searchidx: support async git check' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

user/dev discussion of public-inbox itself

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V1 meta meta/ https://public-inbox.org/meta \
		meta@public-inbox.org
	public-inbox-index meta

Example config snippet for mirrors.
Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/inbox.comp.mail.public-inbox.meta
	nntp://ie5yzdi7fg72h7s4sdcztq5evakq23rdt33mfyfcddc5u3ndnw24ogqd.onion/inbox.comp.mail.public-inbox.meta
	nntp://4uok3hntl7oi7b4uf4rtfwefqeexfzil2w6kgk2jn5z2f764irre7byd.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general
 note: .onion URLs require Tor: https://www.torproject.org/

code repositories for project(s) associated with this inbox:

	https://80x24.org/public-inbox.git

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git