user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 4/7] gcf2: transparently retry on missing OID
  2020-09-19  9:37  7% [PATCH 0/7] gcf2: libgit2-based cat-file alternative Eric Wong
@ 2020-09-19  9:37  4% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-09-19  9:37 UTC (permalink / raw)
  To: meta

Since we only get OIDs from trusted local data sources
(over.sqlite3), we can safely retry within the -gcf2 process
without worry about clients spamming us with requests for
invalid OIDs and triggering reopens.
---
 lib/PublicInbox/Gcf2Client.pm  | 11 +++++--
 lib/PublicInbox/Git.pm         |  5 ++--
 lib/PublicInbox/gcf2_libgit2.h | 17 ++++++-----
 script/public-inbox-gcf2       | 25 ++++++++++++++--
 t/gcf2.t                       | 34 ++++++++++-----------
 t/gcf2_client.t                | 54 ++++++++++++++++++++++++++++++----
 6 files changed, 109 insertions(+), 37 deletions(-)

diff --git a/lib/PublicInbox/Gcf2Client.pm b/lib/PublicInbox/Gcf2Client.pm
index 71fbb1d1..5120698f 100644
--- a/lib/PublicInbox/Gcf2Client.pm
+++ b/lib/PublicInbox/Gcf2Client.pm
@@ -7,11 +7,13 @@ use PublicInbox::Spawn qw(popen_rd);
 use IO::Handle ();
 
 sub new {
-	my $self = shift->SUPER::new('/nonexistent');
+	my ($rdr) = @_;
+	my $self = bless {}, __PACKAGE__;
 	my ($out_r, $out_w);
 	pipe($out_r, $out_w) or $self->fail("pipe failed: $!");
-	my $cmd = [ 'public-inbox-gcf2' ];
-	@$self{qw(in pid)} = popen_rd($cmd, undef, { 0 => $out_r });
+	$rdr //= {};
+	$rdr->{0} = $out_r;
+	@$self{qw(in pid)} = popen_rd(['public-inbox-gcf2'], undef, $rdr);
 	$self->{inflight} = [];
 	$self->{out} = $out_w;
 	fcntl($out_w, 1031, 4096) if $^O eq 'linux'; # 1031: F_SETPIPE_SZ
@@ -32,4 +34,7 @@ sub add_git_dir {
 				$self->fail("write error: $!");
 }
 
+# always false, since -gcf2 retries internally
+sub alternates_changed {}
+
 1;
diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm
index a7ba57f9..b49b5bd3 100644
--- a/lib/PublicInbox/Git.pm
+++ b/lib/PublicInbox/Git.pm
@@ -192,7 +192,8 @@ sub cat_async_step ($$) {
 		chop($$bref) eq "\n" or fail($self, 'LF missing after blob');
 	} elsif ($head =~ / missing$/) {
 		# ref($req) indicates it's already been retried
-		if (!ref($req) && !$in_cleanup && alternates_changed($self)) {
+		# -gcf2 retries internally, so it never hits this path:
+		if (!ref($req) && !$in_cleanup && $self->alternates_changed) {
 			return cat_async_retry($self, $inflight,
 						$req, $cb, $arg);
 		}
@@ -394,7 +395,7 @@ sub pub_urls {
 
 sub cat_async_begin {
 	my ($self) = @_;
-	cleanup($self) if alternates_changed($self);
+	cleanup($self) if $self->alternates_changed;
 	batch_prepare($self);
 	die 'BUG: already in async' if $self->{inflight};
 	$self->{inflight} = [];
diff --git a/lib/PublicInbox/gcf2_libgit2.h b/lib/PublicInbox/gcf2_libgit2.h
index d9c79cf9..800c6bad 100644
--- a/lib/PublicInbox/gcf2_libgit2.h
+++ b/lib/PublicInbox/gcf2_libgit2.h
@@ -52,9 +52,13 @@ void add_alternate(SV *self, const char *objects_path)
 	croak_if_err(rc, "git_odb_add_disk_alternate");
 }
 
-/* this requires an unabbreviated git OID */
 #define CAPA(v) (sizeof(v) / sizeof((v)[0]))
-void cat_oid(SV *self, int fd, SV *oidsv)
+
+/*
+ * returns true on success, false on failure
+ * this requires an unabbreviated git OID
+ */
+int cat_oid(SV *self, int fd, SV *oidsv)
 {
 	/*
 	 * adjust when libgit2 gets SHA-256 support, we return the
@@ -89,11 +93,8 @@ void cat_oid(SV *self, int fd, SV *oidsv)
 					git_object_type2string(
 						git_odb_object_type(object)),
 					vec[1].iov_len);
-	} else {
-		vec[0].iov_base = oidptr;
-		vec[0].iov_len = oidlen;
-		vec[1].iov_base = " missing";
-		vec[1].iov_len = strlen(vec[1].iov_base);
+	} else { /* caller retries */
+		nvec = 0;
 	}
 	while (nvec && !err) {
 		ssize_t w = writev(fd, vec + CAPA(vec) - nvec, nvec);
@@ -136,4 +137,6 @@ void cat_oid(SV *self, int fd, SV *oidsv)
 		git_odb_object_free(object);
 	if (err)
 		croak("writev error: %s", strerror(err));
+
+	return rc == GIT_OK;
 }
diff --git a/script/public-inbox-gcf2 b/script/public-inbox-gcf2
index 51811698..d2d2ac8b 100755
--- a/script/public-inbox-gcf2
+++ b/script/public-inbox-gcf2
@@ -3,12 +3,33 @@
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 eval { require PublicInbox::Gcf2 };
 die "libgit2 development package or Inline::C missing for $0: $@\n" if $@;
+my @dirs; # may get big (30K-100K)
 my $gcf2 = PublicInbox::Gcf2::new();
+use IO::Handle; # autoflush
+STDERR->autoflush(1);
+STDOUT->autoflush(1);
+
 while (<STDIN>) {
 	chomp;
 	if (m!\A/!) { # +/path/to/git-dir
+		push @dirs, $_;
 		$gcf2->add_alternate("$_/objects");
-	} else {
-		$gcf2->cat_oid(1, $_);
+	} elsif (!$gcf2->cat_oid(1, $_)) {
+		# retry once if missing.  We only get unabbreviated OIDs
+		# from SQLite or Xapian DBs, here, so malicious clients
+		# can't trigger excessive retries:
+		my $oid = $_;
+		warn "I: $$ $oid missing, retrying...\n";
+
+		# clients may need to wait a bit for this:
+		$gcf2 = PublicInbox::Gcf2::new();
+		$gcf2->add_alternate("$_/objects") for @dirs;
+
+		if ($gcf2->cat_oid(1, $oid)) {
+			warn "I: $$ $oid found after retry\n";
+		} else {
+			warn "W: $$ $oid missing after retry\n";
+			print "$oid missing\n"; # mimic git-cat-file
+		}
 	}
 }
diff --git a/t/gcf2.t b/t/gcf2.t
index 9056b340..35b2f113 100644
--- a/t/gcf2.t
+++ b/t/gcf2.t
@@ -76,43 +76,41 @@ SKIP: {
 	}
 
 	open my $fh, '+>', undef or BAIL_OUT "open: $!";
-	my $fd = fileno($fh);
 	$fh->autoflush(1);
 
-	$gcf2->cat_oid($fd, 'invalid');
+	ok(!$gcf2->cat_oid(fileno($fh), 'invalid'), 'invalid fails');
 	seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
-	is(do { local $/; <$fh> }, "invalid missing\n", 'got missing message');
+	is(do { local $/; <$fh> }, '', 'nothing written');
 
+	open $fh, '+>', undef or BAIL_OUT "open: $!";
+	ok(!$gcf2->cat_oid(fileno($fh), '0'x40), 'z40 fails');
 	seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
-	$gcf2->cat_oid($fd, '0'x40);
-	seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
-	is(do { local $/; <$fh> }, ('0'x40)." missing\n",
-		'got missing message for 0x40');
+	is(do { local $/; <$fh> }, '', 'nothing written for z40');
 
-	seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
-	$gcf2->cat_oid($fd, $COPYING);
-	my $buf;
+	open $fh, '+>', undef or BAIL_OUT "open: $!";
 	my $ck_copying = sub {
 		my ($desc) = @_;
 		seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
-		is(<$fh>, "$COPYING blob 34520\n", 'got expected header');
-		$buf = do { local $/; <$fh> };
+		is(<$fh>, "$COPYING blob 34520\n", "got expected header $desc");
+		my $buf = do { local $/; <$fh> };
 		is(chop($buf), "\n", 'got trailing \\n');
 		is($buf, $agpl, "AGPL matches ($desc)");
 	};
+	ok($gcf2->cat_oid(fileno($fh), $COPYING), 'cat_oid normal');
 	$ck_copying->('regular file');
 
 	$gcf2 = PublicInbox::Gcf2::new();
 	$gcf2->add_alternate("$tmpdir/objects");
-	$ck_copying->('alternates respected');
+	open $fh, '+>', undef or BAIL_OUT "open: $!";
+	ok($gcf2->cat_oid(fileno($fh), $COPYING), 'cat_oid alternate');
+	$ck_copying->('alternates after reopen');
 
-	$^O eq 'linux' or skip('pipe tests are Linux-only', 12);
-	my $size = -s $fh;
+	$^O eq 'linux' or skip('pipe tests are Linux-only', 14);
 	for my $blk (1, 0) {
 		my ($r, $w);
 		pipe($r, $w) or BAIL_OUT $!;
 		fcntl($w, 1031, 4096) or
-			skip('Linux too old for F_SETPIPE_SZ', 12);
+			skip('Linux too old for F_SETPIPE_SZ', 14);
 		$w->blocking($blk);
 		seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
 		truncate($fh, 0) or BAIL_OUT "truncate: $!";
@@ -120,11 +118,11 @@ SKIP: {
 		if ($pid == 0) {
 			close $w;
 			tick; # wait for parent to block on writev
-			$buf = do { local $/; <$r> };
+			my $buf = do { local $/; <$r> };
 			print $fh $buf or _exit(1);
 			_exit(0);
 		}
-		$gcf2->cat_oid(fileno($w), $COPYING);
+		ok($gcf2->cat_oid(fileno($w), $COPYING), "cat blocking=$blk");
 		close $w or BAIL_OUT "close: $!";
 		is(waitpid($pid, 0), $pid, 'child exited');
 		is($?, 0, 'no error in child');
diff --git a/t/gcf2_client.t b/t/gcf2_client.t
index 39f9f296..0f7e7203 100644
--- a/t/gcf2_client.t
+++ b/t/gcf2_client.t
@@ -10,19 +10,25 @@ use PublicInbox::Import;
 require_mods('PublicInbox::Gcf2');
 use_ok 'PublicInbox::Gcf2Client';
 my ($tmpdir, $for_destroy) = tmpdir();
-PublicInbox::Import::init_bare($tmpdir);
+my $git_a = "$tmpdir/a.git";
+my $git_b = "$tmpdir/b.git";
+PublicInbox::Import::init_bare($git_a);
+PublicInbox::Import::init_bare($git_b);
 my $fi_data = './t/git.fast-import-data';
 my $rdr = {};
 open $rdr->{0}, '<', $fi_data or BAIL_OUT $!;
-xsys([qw(git fast-import --quiet)], { GIT_DIR => $tmpdir }, $rdr);
+xsys([qw(git fast-import --quiet)], { GIT_DIR => $git_a }, $rdr);
 is($?, 0, 'fast-import succeeded');
 
 my $tree = 'fdbc43725f21f485051c17463b50185f4c3cf88c';
 my $called = 0;
+my $err_f = "$tmpdir/err";
 {
 	local $ENV{PATH} = getcwd()."/blib/script:$ENV{PATH}";
-	my $gcf2c = PublicInbox::Gcf2Client->new;
-	$gcf2c->add_git_dir($tmpdir);
+	open my $err, '>', $err_f or BAIL_OUT $!;
+	my $gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err });
+	$gcf2c->add_git_dir($git_a);
+
 	$gcf2c->cat_async($tree, sub {
 		my ($bref, $oid, $type, $size, $arg) = @_;
 		is($oid, $tree, 'got expected OID');
@@ -32,6 +38,12 @@ my $called = 0;
 		is($arg, 'hi', 'arg passed');
 		$called++;
 	}, 'hi');
+	$gcf2c->cat_async_wait;
+
+	open $err, '<', $err_f or BAIL_OUT $!;
+	my $estr = do { local $/; <$err> };
+	is($estr, '', 'nothing in stderr');
+
 	my $trunc = substr($tree, 0, 39);
 	$gcf2c->cat_async($trunc, sub {
 		my ($bref, $oid, $type, $size, $arg) = @_;
@@ -42,6 +54,38 @@ my $called = 0;
 		is($arg, 'bye', 'arg passed when missing');
 		$called++;
 	}, 'bye');
+	$gcf2c->cat_async_wait;
+
+	open $err, '<', $err_f or BAIL_OUT $!;
+	$estr = do { local $/; <$err> };
+	like($estr, qr/retrying/, 'warned about retry');
+
+	# try failed alternates lookup
+	open $err, '>', $err_f or BAIL_OUT $!;
+	$gcf2c = PublicInbox::Gcf2Client::new({ 2 => $err });
+	$gcf2c->add_git_dir($git_b);
+	$gcf2c->cat_async($tree, sub {
+		my ($bref, $oid, $type, $size, $arg) = @_;
+		is(undef, $bref, 'missing bref from alt is undef');
+		$called++;
+	});
+	$gcf2c->cat_async_wait;
+	open $err, '<', $err_f or BAIL_OUT $!;
+	$estr = do { local $/; <$err> };
+	like($estr, qr/retrying/, 'warned about retry before alt update');
+
+	# now try successful alternates lookup
+	open my $alt, '>>', "$git_b/objects/info/alternates" or BAIL_OUT $!;
+	print $alt "$git_a/objects\n" or BAIL_OUT $!;
+	close $alt or BAIL_OUT;
+	my $expect = xqx(['git', "--git-dir=$git_a", qw(cat-file tree), $tree]);
+	$gcf2c->cat_async($tree, sub {
+		my ($bref, $oid, $type, $size, $arg) = @_;
+		is($oid, $tree, 'oid match on alternates retry');
+		is($$bref, $expect, 'tree content matched');
+		$called++;
+	});
+	$gcf2c->cat_async_wait;
 }
-is($called, 2, 'cat_async callbacks hit');
+is($called, 4, 'cat_async callbacks hit');
 done_testing;

^ permalink raw reply related	[relevance 4%]

* [PATCH 0/7] gcf2: libgit2-based cat-file alternative
@ 2020-09-19  9:37  7% Eric Wong
  2020-09-19  9:37  4% ` [PATCH 4/7] gcf2: transparently retry on missing OID Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-09-19  9:37 UTC (permalink / raw)
  To: meta

This allows a single cat-file-like process to handle multiple
inboxes; instead of having a "git cat-file --batch" process for
every inbox; saving pipes and process table space.

The preliminary code was done months ago, but I struggled to put
all the pieces together in a coherent way.  My brain has been
scattered :x

I finally decided to make the gcf2 process a global singleton
(per-worker) to avoid complexity elsewhere in the config...

It doesn't detect or release unlinked packs + indices, yet,
so "git gc" may not free disk space until restarted.

Otherwise it does detect new epochs and seems mostly working
otherwise...

Eric Wong (7):
  gcf2: libgit2-based git cat-file alternative
  t/gcf2: test changes to alternates
  add gcf2 client and executable script
  gcf2: transparently retry on missing OID
  gcf2*: more descriptive package descriptions
  gcf2: require git dir with OID
  gcf2: wire up read-only daemons and rm -gcf2 script

 MANIFEST                       |   5 +
 lib/PublicInbox/Daemon.pm      |  11 +++
 lib/PublicInbox/Gcf2.pm        |  89 ++++++++++++++++++
 lib/PublicInbox/Gcf2Client.pm  |  62 +++++++++++++
 lib/PublicInbox/Git.pm         |  41 ++++++---
 lib/PublicInbox/GitAsyncCat.pm |  73 +++++++++++++--
 lib/PublicInbox/IMAP.pm        |   2 +-
 lib/PublicInbox/gcf2_libgit2.h | 142 +++++++++++++++++++++++++++++
 script/public-inbox-httpd      |   1 +
 t/gcf2.t                       | 162 +++++++++++++++++++++++++++++++++
 t/gcf2_client.t                |  90 ++++++++++++++++++
 11 files changed, 652 insertions(+), 26 deletions(-)
 create mode 100644 lib/PublicInbox/Gcf2.pm
 create mode 100644 lib/PublicInbox/Gcf2Client.pm
 create mode 100644 lib/PublicInbox/gcf2_libgit2.h
 create mode 100644 t/gcf2.t
 create mode 100644 t/gcf2_client.t

^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-09-19  9:37  7% [PATCH 0/7] gcf2: libgit2-based cat-file alternative Eric Wong
2020-09-19  9:37  4% ` [PATCH 4/7] gcf2: transparently retry on missing OID Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).