user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 27/26] lei_mirror+fetch: don't slurp `git show-ref' output
Date: Wed, 25 Oct 2023 06:33:55 +0000	[thread overview]
Message-ID: <20231025063355.M844764@dcvr> (raw)
In-Reply-To: <20231025002949.3092193-27-e@80x24.org>

While uncommon, some git repos have hundreds of thousands of
refs, and slurping the `git show-ref' output into memory can
bloat the heap.  Introduce a sha_all sub in PublicInbox::SHA
which feeds the digest in 64K chunks until EOF, and rely on
autodie for checking sysread errors.
---
 lib/PublicInbox/CodeSearchIdx.pm |  7 ++-----
 lib/PublicInbox/Fetch.pm         |  4 ++--
 lib/PublicInbox/Git.pm           |  6 ++----
 lib/PublicInbox/LeiMirror.pm     | 14 +++++++-------
 lib/PublicInbox/SHA.pm           | 11 ++++++++++-
 5 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index e31432b9..aeee37c0 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -45,7 +45,7 @@ use POSIX qw(WNOHANG SEEK_SET);
 use File::Path ();
 use File::Spec ();
 use List::Util qw(max);
-use PublicInbox::SHA qw(sha256_hex);
+use PublicInbox::SHA qw(sha256_hex sha_all);
 use PublicInbox::Search qw(xap_terms);
 use PublicInbox::SearchIdx qw(add_val);
 use PublicInbox::Config qw(glob2re rel2abs_collapsed);
@@ -386,10 +386,7 @@ sub fp_fini { # run_git cb
 	my (undef, $self, $git, $prep_repo) = @_;
 	my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}';
 	sysseek($refs, 0, SEEK_SET);
-	my $buf;
-	my $dig = PublicInbox::SHA->new(256);
-	while (sysread($refs, $buf, 65536)) { $dig->add($buf) }
-	$git->{-repo}->{fp} = $dig->hexdigest;
+	$git->{-repo}->{fp} = sha_all(256, $refs)->hexdigest;
 }
 
 sub ct_start ($$$) {
diff --git a/lib/PublicInbox/Fetch.pm b/lib/PublicInbox/Fetch.pm
index 6e9b1e94..e41dd448 100644
--- a/lib/PublicInbox/Fetch.pm
+++ b/lib/PublicInbox/Fetch.pm
@@ -10,6 +10,7 @@ use PublicInbox::Admin;
 use PublicInbox::LEI;
 use PublicInbox::LeiCurl;
 use PublicInbox::LeiMirror;
+use PublicInbox::SHA qw(sha_all);
 use File::Temp ();
 
 sub new { bless {}, __PACKAGE__ }
@@ -92,9 +93,8 @@ sub do_manifest ($$$) {
 
 sub get_fingerprint2 {
 	my ($git_dir) = @_;
-	require PublicInbox::SHA;
 	my $rd = popen_rd([qw(git show-ref)], undef, { -C => $git_dir });
-	PublicInbox::SHA::sha256(do { local $/; <$rd> });
+	sha_all(256, $rd)->digest; # ignore show-ref errors
 }
 
 sub writable_dir ($) {
diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm
index 476dcf30..9c26d8bf 100644
--- a/lib/PublicInbox/Git.pm
+++ b/lib/PublicInbox/Git.pm
@@ -23,7 +23,7 @@ use PublicInbox::ProcessIONBF;
 use PublicInbox::Tmpfile;
 use IO::Poll qw(POLLIN);
 use Carp qw(croak carp);
-use PublicInbox::SHA ();
+use PublicInbox::SHA qw(sha_all);
 our %HEXLEN2SHA = (40 => 1, 64 => 256);
 our %OFMT2HEXLEN = (sha1 => 40, sha256 => 64);
 our @EXPORT_OK = qw(git_unquote git_quote %HEXLEN2SHA %OFMT2HEXLEN read_all);
@@ -620,10 +620,8 @@ sub manifest_entry {
 			$ent->{reference} = $buf;
 		}
 	}
-	my $dig = PublicInbox::SHA->new(1);
-	while (CORE::read($sr, $buf, 65536)) { $dig->add($buf) }
+	$ent->{fingerprint} = sha_all(1, $sr)->hexdigest;
 	CORE::close $sr or return; # empty, uninitialized git repo
-	$ent->{fingerprint} = $dig->hexdigest;
 	$ent->{modified} = modified(undef, $mod);
 	chomp($buf = <$own> // '');
 	utf8::decode($buf);
diff --git a/lib/PublicInbox/LeiMirror.pm b/lib/PublicInbox/LeiMirror.pm
index 47fb767b..43e59e6c 100644
--- a/lib/PublicInbox/LeiMirror.pm
+++ b/lib/PublicInbox/LeiMirror.pm
@@ -19,10 +19,10 @@ use PublicInbox::Inbox;
 use PublicInbox::Git qw(read_all);
 use PublicInbox::LeiCurl;
 use PublicInbox::OnDestroy;
-use PublicInbox::SHA qw(sha256_hex sha1_hex);
+use PublicInbox::SHA qw(sha256_hex sha_all);
 use POSIX qw(strftime);
-use autodie qw(chdir chmod close open pipe readlink seek symlink sysopen
-		truncate unlink);
+use autodie qw(chdir chmod close open pipe readlink
+		seek symlink sysopen sysseek truncate unlink);
 
 our $LIVE; # pid => callback
 our $FGRP_TODO; # objstore -> [[ to resume ], [ to clone ]]
@@ -533,10 +533,10 @@ sub fp_done {
 	}
 	return if !keep_going($self);
 	my $fh = delete $self->{-show_ref} // die 'BUG: no show-ref output';
-	seek($fh, SEEK_SET, 0);
+	sysseek($fh, SEEK_SET, 0);
 	$self->{-ent} // die 'BUG: no -ent';
 	my $A = $self->{-ent}->{fingerprint} // die 'BUG: no fingerprint';
-	my $B = sha1_hex(read_all($fh));
+	my $B = sha_all(1, $fh)->hexdigest;
 	return $cb->($self, @arg) if $A ne $B;
 	$self->{lei}->qerr("# $self->{-key} up-to-date");
 }
@@ -730,10 +730,10 @@ sub up_fp_done {
 	my ($self) = @_;
 	return if !keep_going($self);
 	my $fh = delete $self->{-show_ref_up} // die 'BUG: no show-ref output';
-	seek($fh, SEEK_SET, 0);
+	sysseek($fh, SEEK_SET, 0);
 	$self->{-ent} // die 'BUG: no -ent';
 	my $A = $self->{-ent}->{fingerprint} // die 'BUG: no fingerprint';
-	my $B = sha1_hex(read_all($fh));
+	my $B = sha_all(1, $fh)->hexdigest;
 	return if $A eq $B;
 	$self->{-ent}->{fingerprint} = $B;
 	push @{$self->{chg}->{fp_mismatch}}, $self->{-key};
diff --git a/lib/PublicInbox/SHA.pm b/lib/PublicInbox/SHA.pm
index 81f62618..3fa8530e 100644
--- a/lib/PublicInbox/SHA.pm
+++ b/lib/PublicInbox/SHA.pm
@@ -12,7 +12,8 @@
 package PublicInbox::SHA;
 use v5.12;
 require Exporter;
-our @EXPORT_OK = qw(sha1_hex sha256_hex sha256);
+our @EXPORT_OK = qw(sha1_hex sha256_hex sha256 sha_all);
+use autodie qw(sysread);
 our @ISA;
 
 BEGIN {
@@ -55,4 +56,12 @@ EOM
 }
 
 } # /BEGIN
+
+sub sha_all ($$) {
+	my ($n, $fh) = @_;
+	my ($dig, $buf) = (PublicInbox::SHA->new($n));
+	while (sysread($fh, $buf, 65536)) { $dig->add($buf) }
+	$dig
+}
+
 1;

      reply	other threads:[~2023-10-25  6:33 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-10-25  0:29 [PATCH 00/26] process management simplifications Eric Wong
2023-10-25  0:29 ` [PATCH 01/26] limiter: split out from qspawn Eric Wong
2023-10-25  0:29 ` [PATCH 02/26] spawn: support synchronous run_qx Eric Wong
2023-10-25  0:29 ` [PATCH 03/26] psgi_qx: use a temporary file rather than pipe Eric Wong
2023-10-25  0:29 ` [PATCH 04/26] www_coderepo: capture uses a flattened list Eric Wong
2023-10-25  0:29 ` [PATCH 05/26] qspawn: psgi_return allows list for callback args Eric Wong
2023-10-25  0:29 ` [PATCH 06/26] qspawn: drop unused err arg for ->event_step Eric Wong
2023-10-25  0:29 ` [PATCH 07/26] httpd/async: require IO arg Eric Wong
2023-10-25  0:29 ` [PATCH 08/26] xt/check-run: call DS->Reset after all tests Eric Wong
2023-10-25  0:29 ` [PATCH 09/26] qspawn: introduce new psgi_yield API Eric Wong
2023-10-25  0:29 ` [PATCH 10/26] repo_atom: switch to psgi_yield Eric Wong
2023-10-25  0:29 ` [PATCH 11/26] repo_snapshot: psgi_yield Eric Wong
2023-10-25  0:29 ` [PATCH 12/26] viewvcs: psgi_yield Eric Wong
2023-10-25  0:29 ` [PATCH 13/26] www_altid: switch to psgi_yield Eric Wong
2023-10-25  0:29 ` [PATCH 14/26] cgit: " Eric Wong
2023-10-25  0:29 ` [PATCH 15/26] www_coderepo: use psgi_yield Eric Wong
2023-10-25  0:29 ` [PATCH 16/26] drop psgi_return, httpd/async and GetlineBody Eric Wong
2023-10-25  0:29 ` [PATCH 17/26] qspawn: use WwwStatic for fallbacks and error code Eric Wong
2023-10-25  0:29 ` [PATCH 18/26] qspawn: simplify internal argument passing Eric Wong
2023-10-25  0:29 ` [PATCH 19/26] cidx_log_p: don't bother with F_SETPIPE_SZ Eric Wong
2023-10-25  0:29 ` [PATCH 20/26] cindex: avoid awaitpid for popen Eric Wong
2023-10-25  0:29 ` [PATCH 21/26] cindex: use timer for inits Eric Wong
2023-10-25  0:29 ` [PATCH 22/26] cindex: start using run_await to simplify code Eric Wong
2023-10-25  0:29 ` [PATCH 23/26] cindex: use run_await to read extensions.objectFormat Eric Wong
2023-10-25  0:29 ` [PATCH 24/26] cindex: drop XH_PID global Eric Wong
2023-10-25  0:29 ` [PATCH 25/26] cindex: use run_await wrapper for git commands Eric Wong
2023-10-25  0:29 ` [PATCH 26/26] cindex: use sysread for generating fingerprint Eric Wong
2023-10-25  6:33   ` Eric Wong [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231025063355.M844764@dcvr \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.
  Be sure your reply has a Subject: header at the top and a
  blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).