From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 17ACE1F8C6 for ; Thu, 24 Jun 2021 05:50:04 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] favor git(1) rather than libgit2 for ExtSearch Date: Thu, 24 Jun 2021 05:50:04 +0000 Message-Id: <20210624055004.27955-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: While both git and libgit2 take around 16 minutes to load 100K alternates, there's already a proposed patch to make git faster. It's also easier to patch and install git locally since the git.git build system defaults to prefix=$HOME and dealing with dynamic linking with libgit2 is more difficult for end users relying on Inline::C. libgit2 remains in use for the non-ALL.git case, but maybe it's not necessary (libgit2 is significantly slower than git in Debian 10 due to SHA-1 collision checking). --- Documentation/technical/ds.txt | 2 +- lib/PublicInbox/GitAsyncCat.pm | 21 +++++++++++++-------- lib/PublicInbox/GzipFilter.pm | 3 +-- lib/PublicInbox/HTTPD.pm | 2 +- lib/PublicInbox/IMAP.pm | 10 +++++----- lib/PublicInbox/NNTP.pm | 4 ++-- lib/PublicInbox/SolverGit.pm | 3 +-- 7 files changed, 24 insertions(+), 21 deletions(-) diff --git a/Documentation/technical/ds.txt b/Documentation/technical/ds.txt index a0793ca2..7bc1ad79 100644 --- a/Documentation/technical/ds.txt +++ b/Documentation/technical/ds.txt @@ -64,7 +64,7 @@ Augmented features: * ->requeue support. An optimization of the AddTimer(0, ...) idiom for immediately dispatching code at the next event loop iteration. 
public-inbox uses this for fairly generating large responses - iteratively (see PublicInbox::NNTP::long_response or git_async_cat + iteratively (see PublicInbox::NNTP::long_response or ibx_async_cat for blob retrievals). New features diff --git a/lib/PublicInbox/GitAsyncCat.pm b/lib/PublicInbox/GitAsyncCat.pm index 7d1a13db..57c194d9 100644 --- a/lib/PublicInbox/GitAsyncCat.pm +++ b/lib/PublicInbox/GitAsyncCat.pm @@ -8,7 +8,7 @@ use strict; use parent qw(PublicInbox::DS Exporter); use POSIX qw(WNOHANG); use PublicInbox::Syscall qw(EPOLLIN EPOLLET); -our @EXPORT = qw(git_async_cat git_async_prefetch); +our @EXPORT = qw(ibx_async_cat ibx_async_prefetch); use PublicInbox::Git (); our $GCF2C; # singleton PublicInbox::Gcf2Client @@ -45,12 +45,16 @@ sub event_step { } } -sub git_async_cat ($$$$) { - my ($git, $oid, $cb, $arg) = @_; - if ($GCF2C //= eval { +sub ibx_async_cat ($$$$) { + my ($ibx, $oid, $cb, $arg) = @_; + my $git = $ibx->git; + # {topdir} means ExtSearch (likely [extindex "all"]) with potentially + # 100K alternates. git(1) has a proposed patch for 100K alternates: + # + if (!defined($ibx->{topdir}) && ($GCF2C //= eval { require PublicInbox::Gcf2Client; PublicInbox::Gcf2Client::new(); - } // 0) { # 0: do not retry if libgit2 or Inline::C are missing + } // 0)) { # 0: do not retry if libgit2 or Inline::C are missing $GCF2C->gcf2_async(\"$oid $git->{git_dir}\n", $cb, $arg); \undef; } else { # read-only end of git-cat-file pipe @@ -66,9 +70,10 @@ sub git_async_cat ($$$$) { # this is safe to call inside $cb, but not guaranteed to enqueue # returns true if successful, undef if not. 
-sub git_async_prefetch { - my ($git, $oid, $cb, $arg) = @_; - if ($GCF2C) { +sub ibx_async_prefetch { + my ($ibx, $oid, $cb, $arg) = @_; + my $git = $ibx->git; + if (!defined($ibx->{topdir}) && $GCF2C) { if (!$GCF2C->{wbuf}) { $oid .= " $git->{git_dir}\n"; return $GCF2C->gcf2_async(\$oid, $cb, $arg); # true diff --git a/lib/PublicInbox/GzipFilter.pm b/lib/PublicInbox/GzipFilter.pm index 48ed11a5..334d6581 100644 --- a/lib/PublicInbox/GzipFilter.pm +++ b/lib/PublicInbox/GzipFilter.pm @@ -180,8 +180,7 @@ sub async_blob_cb { # git->cat_async callback sub smsg_blob { my ($self, $smsg) = @_; - git_async_cat($self->{ibx}->git, $smsg->{blob}, - \&async_blob_cb, $self); + ibx_async_cat($self->{ibx}, $smsg->{blob}, \&async_blob_cb, $self); } 1; diff --git a/lib/PublicInbox/HTTPD.pm b/lib/PublicInbox/HTTPD.pm index b193c9ae..fb683f74 100644 --- a/lib/PublicInbox/HTTPD.pm +++ b/lib/PublicInbox/HTTPD.pm @@ -37,7 +37,7 @@ sub new { # XXX unstable API!, only GitHTTPBackend needs # this to limit git-http-backend(1) parallelism. 
# We also check for the truthiness of this to - # detect when to use git_async_cat for slow blobs + # detect when to use async paths for slow blobs 'pi-httpd.async' => \&pi_httpd_async ); bless { diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm index af8ce72b..9402aa41 100644 --- a/lib/PublicInbox/IMAP.pm +++ b/lib/PublicInbox/IMAP.pm @@ -612,7 +612,7 @@ sub fetch_run_ops { $self->msg_more(")\r\n"); } -sub fetch_blob_cb { # called by git->cat_async via git_async_cat +sub fetch_blob_cb { # called by git->cat_async via ibx_async_cat my ($bref, $oid, $type, $size, $fetch_arg) = @_; my ($self, undef, $msgs, $range_info, $ops, $partial) = @$fetch_arg; my $ibx = $self->{ibx} or return $self->close; # client disconnected @@ -627,8 +627,8 @@ sub fetch_blob_cb { # called by git->cat_async via git_async_cat } my $pre; if (!$self->{wbuf} && (my $nxt = $msgs->[0])) { - $pre = git_async_prefetch($ibx->git, $nxt->{blob}, - \&fetch_blob_cb, $fetch_arg); + $pre = ibx_async_prefetch($ibx, $nxt->{blob}, + \&fetch_blob_cb, $fetch_arg); } fetch_run_ops($self, $smsg, $bref, $ops, $partial); $pre ? $self->zflush : requeue_once($self); @@ -760,7 +760,7 @@ sub fetch_blob { # long_response } } uo2m_extend($self, $msgs->[-1]->{num}); - git_async_cat($self->{ibx}->git, $msgs->[0]->{blob}, + ibx_async_cat($self->{ibx}, $msgs->[0]->{blob}, \&fetch_blob_cb, \@_); } @@ -1228,7 +1228,7 @@ sub long_step { } elsif ($more) { # $self->{wbuf}: $self->update_idle_time; - # control passed to git_async_cat if $more == \undef + # control passed to ibx_async_cat if $more == \undef requeue_once($self) if !ref($more); } else { # all done! 
delete $self->{long_cb}; diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index f7d99913..9df47133 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -515,7 +515,7 @@ found: $smsg->{nntp_code} = $code; set_art($self, $art); # this dereferences to `undef' - ${git_async_cat($ibx->git, $smsg->{blob}, \&blob_cb, $smsg)}; + ${ibx_async_cat($ibx, $smsg->{blob}, \&blob_cb, $smsg)}; } } @@ -549,7 +549,7 @@ sub msg_hdr_write ($$) { $smsg->{nntp}->msg_more($$hdr); } -sub blob_cb { # called by git->cat_async via git_async_cat +sub blob_cb { # called by git->cat_async via ibx_async_cat my ($bref, $oid, $type, $size, $smsg) = @_; my $self = $smsg->{nntp}; my $code = $smsg->{nntp_code}; diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm index 92106e75..b0cd0f2c 100644 --- a/lib/PublicInbox/SolverGit.pm +++ b/lib/PublicInbox/SolverGit.pm @@ -593,8 +593,7 @@ sub resolve_patch ($$) { if (my $msgs = $want->{try_smsgs}) { my $smsg = shift @$msgs; if ($self->{psgi_env}->{'pi-httpd.async'}) { - return git_async_cat($want->{cur_ibx}->git, - $smsg->{blob}, + return ibx_async_cat($want->{cur_ibx}, $smsg->{blob}, \&extract_diff_async, [$self, $want, $smsg]); } else {