user/dev discussion of public-inbox itself
 help / color / mirror / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 04/11] www: manifest.js.gz generation no longer hogs event loop
Date: Wed,  9 Sep 2020 06:26:11 +0000
Message-ID: <20200909062618.5940-5-e@80x24.org> (raw)
In-Reply-To: <20200909062618.5940-1-e@80x24.org>

It's still as slow as before with hundreds/thousands of inboxes,
but at least it's fair.  Future changes will allow it to be
cached and memoized with persistent HTTP servers.
---
 MANIFEST                        |   1 +
 lib/PublicInbox/ManifestJsGz.pm | 153 ++++++++++++++++++++++++++++++++
 lib/PublicInbox/WWW.pm          |   4 +-
 lib/PublicInbox/WwwListing.pm   | 117 +-----------------------
 t/www_listing.t                 |   7 +-
 5 files changed, 162 insertions(+), 120 deletions(-)
 create mode 100644 lib/PublicInbox/ManifestJsGz.pm

diff --git a/MANIFEST b/MANIFEST
index 44670c7e..0e225b6a 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -157,6 +157,7 @@ lib/PublicInbox/Lock.pm
 lib/PublicInbox/MDA.pm
 lib/PublicInbox/MID.pm
 lib/PublicInbox/MIME.pm
+lib/PublicInbox/ManifestJsGz.pm
 lib/PublicInbox/Mbox.pm
 lib/PublicInbox/MboxGz.pm
 lib/PublicInbox/MsgIter.pm
diff --git a/lib/PublicInbox/ManifestJsGz.pm b/lib/PublicInbox/ManifestJsGz.pm
new file mode 100644
index 00000000..328303ce
--- /dev/null
+++ b/lib/PublicInbox/ManifestJsGz.pm
@@ -0,0 +1,153 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+package PublicInbox::ManifestJsGz;
+use strict;
+use v5.10.1;
+use Digest::SHA ();
+use File::Spec ();
+use bytes (); # length
+use PublicInbox::Inbox;
+use PublicInbox::Git;
+use IO::Compress::Gzip qw(gzip);
+use HTTP::Date qw(time2str);
+*try_cat = \&PublicInbox::Inbox::try_cat;
+
+our $json;
+for my $mod (qw(JSON::MaybeXS JSON JSON::PP)) {
+	eval "require $mod" or next;
+	# ->ascii encodes non-ASCII to "\uXXXX"
+	$json = $mod->new->ascii(1) and last;
+}
+
+sub response {
+	my ($env, $list) = @_;
+	$json or return [ 404, [], [] ];
+	my $self = bless {
+		-abs2urlpath => {},
+		-mtime => 0,
+		manifest => {},
+		-list => $list,
+		psgi_env => $env,
+	}, __PACKAGE__;
+
+	# PSGI server will call this immediately and give us a callback (-wcb)
+	sub {
+		$self->{-wcb} = $_[0]; # HTTP write callback
+		iterate_start($self);
+	};
+}
+
+sub fingerprint ($) {
+	my ($git) = @_;
+	# TODO: convert to qspawn for fairness when there's
+	# thousands of repos
+	my ($fh, $pid) = $git->popen('show-ref');
+	my $dig = Digest::SHA->new(1);
+	while (read($fh, my $buf, 65536)) {
+		$dig->add($buf);
+	}
+	close $fh;
+	waitpid($pid, 0);
+	return if $?; # empty, uninitialized git repo
+	$dig->hexdigest;
+}
+
+sub manifest_add ($$;$$) {
+	my ($self, $ibx, $epoch, $default_desc) = @_;
+	my $url_path = "/$ibx->{name}";
+	my $git_dir = $ibx->{inboxdir};
+	if (defined $epoch) {
+		$git_dir .= "/git/$epoch.git";
+		$url_path .= "/git/$epoch.git";
+	}
+	return unless -d $git_dir;
+	my $git = PublicInbox::Git->new($git_dir);
+	my $fingerprint = fingerprint($git) or return; # no empty repos
+
+	chomp(my $owner = $git->qx('config', 'gitweb.owner'));
+	chomp(my $desc = try_cat("$git_dir/description"));
+	utf8::decode($owner);
+	utf8::decode($desc);
+	$owner = undef if $owner eq '';
+	$desc = 'Unnamed repository' if $desc eq '';
+
+	# templates/hooks--update.sample and git-multimail in git.git
+	# only match "Unnamed repository", not the full contents of
+	# templates/this--description in git.git
+	if ($desc =~ /\AUnnamed repository/) {
+		$desc = "$default_desc [epoch $epoch]" if defined($epoch);
+	}
+
+	my $reference;
+	chomp(my $alt = try_cat("$git_dir/objects/info/alternates"));
+	if ($alt) {
+		# n.b.: GitPython doesn't seem to handle comments or C-quoted
+		# strings like native git does; and we don't for now, either.
+		my @alt = split(/\n+/, $alt);
+
+		# grokmirror only supports 1 alternate for "reference",
+		if (scalar(@alt) == 1) {
+			my $objdir = "$git_dir/objects";
+			$reference = File::Spec->rel2abs($alt[0], $objdir);
+			$reference =~ s!/[^/]+/?\z!!; # basename
+		}
+	}
+	$self->{-abs2urlpath}->{$git_dir} = $url_path;
+	my $modified = $git->modified;
+	if ($modified > $self->{-mtime}) {
+		$self->{-mtime} = $modified;
+	}
+	$self->{manifest}->{$url_path} = {
+		owner => $owner,
+		reference => $reference,
+		description => $desc,
+		modified => $modified,
+		fingerprint => $fingerprint,
+	};
+}
+
+sub iterate_start {
+	my ($self) = @_;
+	if (my $async = $self->{psgi_env}->{'pi-httpd.async'}) {
+		# PublicInbox::HTTPD::Async->new
+		$async->(undef, undef, $self);
+	} else {
+		event_step($self) while $self->{-wcb};
+	}
+}
+
+sub event_step {
+	my ($self) = @_;
+	while (my $ibx = shift(@{$self->{-list}})) {
+		eval {
+			if (defined(my $max = $ibx->max_git_epoch)) {
+				my $desc = $ibx->description;
+				for my $epoch (0..$max) {
+					manifest_add($self, $ibx, $epoch, $desc)
+				}
+			} else {
+				manifest_add($self, $ibx);
+			}
+		};
+		warn "E: $@" if $@;
+		if (my $async = $self->{psgi_env}->{'pi-httpd.async'}) {
+			# PublicInbox::HTTPD::Async->new
+			$async->(undef, undef, $self);
+		}
+		return; # more steps needed
+	}
+	my $abs2urlpath = delete $self->{-abs2urlpath};
+	my $wcb = delete $self->{-wcb};
+	my $manifest = delete $self->{manifest};
+	while (my ($url_path, $repo) = each %$manifest) {
+		defined(my $abs = $repo->{reference}) or next;
+		$repo->{reference} = $abs2urlpath->{$abs};
+	}
+	$manifest = $json->encode($manifest);
+	gzip(\$manifest => \(my $out));
+	$wcb->([ 200, [ qw(Content-Type application/gzip),
+		 'Last-Modified', time2str($self->{-mtime}),
+		 'Content-Length', bytes::length($out) ], [ $out ] ]);
+}
+
+1;
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 2ea5d80d..93ab3c9d 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -509,8 +509,8 @@ sub get_inbox_manifest ($$$) {
 	my ($ctx, $inbox, $key) = @_;
 	my $r404 = invalid_inbox($ctx, $inbox);
 	return $r404 if $r404;
-	require PublicInbox::WwwListing;
-	PublicInbox::WwwListing::js($ctx->{env}, [$ctx->{-inbox}]);
+	require PublicInbox::ManifestJsGz;
+	PublicInbox::ManifestJsGz::response($ctx->{env}, [$ctx->{-inbox}]);
 }
 
 sub get_attach {
diff --git a/lib/PublicInbox/WwwListing.pm b/lib/PublicInbox/WwwListing.pm
index 0be3764c..12b0d9ad 100644
--- a/lib/PublicInbox/WwwListing.pm
+++ b/lib/PublicInbox/WwwListing.pm
@@ -5,24 +5,11 @@
 # Used by PublicInbox::WWW
 package PublicInbox::WwwListing;
 use strict;
-use warnings;
 use PublicInbox::Hval qw(ascii_html prurl fmt_ts);
 use PublicInbox::Linkify;
-use PublicInbox::View;
-use PublicInbox::Inbox;
 use PublicInbox::GzipFilter qw(gzf_maybe);
+use PublicInbox::ManifestJsGz;
 use bytes (); # bytes::length
-use HTTP::Date qw(time2str);
-use Digest::SHA ();
-use File::Spec ();
-use IO::Compress::Gzip qw(gzip);
-*try_cat = \&PublicInbox::Inbox::try_cat;
-our $json;
-for my $mod (qw(JSON::MaybeXS JSON JSON::PP)) {
-	eval "require $mod" or next;
-	# ->ascii encodes non-ASCII to "\uXXXX"
-	$json = $mod->new->ascii(1) and last;
-}
 
 sub list_all_i {
 	my ($ibx, $arg) = @_;
@@ -132,106 +119,6 @@ sub html ($$) {
 	[ $code, $h, [ $out ] ];
 }
 
-sub fingerprint ($) {
-	my ($git) = @_;
-	# TODO: convert to qspawn for fairness when there's
-	# thousands of repos
-	my ($fh, $pid) = $git->popen('show-ref');
-	my $dig = Digest::SHA->new(1);
-	while (read($fh, my $buf, 65536)) {
-		$dig->add($buf);
-	}
-	close $fh;
-	waitpid($pid, 0);
-	return if $?; # empty, uninitialized git repo
-	$dig->hexdigest;
-}
-
-sub manifest_add ($$;$$) {
-	my ($manifest, $ibx, $epoch, $default_desc) = @_;
-	my $url_path = "/$ibx->{name}";
-	my $git_dir = $ibx->{inboxdir};
-	if (defined $epoch) {
-		$git_dir .= "/git/$epoch.git";
-		$url_path .= "/git/$epoch.git";
-	}
-	return unless -d $git_dir;
-	my $git = PublicInbox::Git->new($git_dir);
-	my $fingerprint = fingerprint($git) or return; # no empty repos
-
-	chomp(my $owner = $git->qx('config', 'gitweb.owner'));
-	chomp(my $desc = try_cat("$git_dir/description"));
-	utf8::decode($owner);
-	utf8::decode($desc);
-	$owner = undef if $owner eq '';
-	$desc = 'Unnamed repository' if $desc eq '';
-
-	# templates/hooks--update.sample and git-multimail in git.git
-	# only match "Unnamed repository", not the full contents of
-	# templates/this--description in git.git
-	if ($desc =~ /\AUnnamed repository/) {
-		$desc = "$default_desc [epoch $epoch]" if defined($epoch);
-	}
-
-	my $reference;
-	chomp(my $alt = try_cat("$git_dir/objects/info/alternates"));
-	if ($alt) {
-		# n.b.: GitPython doesn't seem to handle comments or C-quoted
-		# strings like native git does; and we don't for now, either.
-		my @alt = split(/\n+/, $alt);
-
-		# grokmirror only supports 1 alternate for "reference",
-		if (scalar(@alt) == 1) {
-			my $objdir = "$git_dir/objects";
-			$reference = File::Spec->rel2abs($alt[0], $objdir);
-			$reference =~ s!/[^/]+/?\z!!; # basename
-		}
-	}
-	$manifest->{-abs2urlpath}->{$git_dir} = $url_path;
-	my $modified = $git->modified;
-	if ($modified > $manifest->{-mtime}) {
-		$manifest->{-mtime} = $modified;
-	}
-	$manifest->{$url_path} = {
-		owner => $owner,
-		reference => $reference,
-		description => $desc,
-		modified => $modified,
-		fingerprint => $fingerprint,
-	};
-}
-
-# manifest.js.gz
-sub js ($$) {
-	my ($env, $list) = @_;
-	# $json won't be defined if IO::Compress::Gzip is missing
-	$json or return [ 404, [], [] ];
-
-	my $manifest = { -abs2urlpath => {}, -mtime => 0 };
-	for my $ibx (@$list) {
-		if (defined(my $max = $ibx->max_git_epoch)) {
-			my $desc = $ibx->description;
-			for my $epoch (0..$max) {
-				manifest_add($manifest, $ibx, $epoch, $desc);
-			}
-		} else {
-			manifest_add($manifest, $ibx);
-		}
-	}
-	my $abs2urlpath = delete $manifest->{-abs2urlpath};
-	my $mtime = delete $manifest->{-mtime};
-	while (my ($url_path, $repo) = each %$manifest) {
-		defined(my $abs = $repo->{reference}) or next;
-		$repo->{reference} = $abs2urlpath->{$abs};
-	}
-	my $out;
-	gzip(\($json->encode($manifest)) => \$out);
-	$manifest = undef;
-	[ 200, [ qw(Content-Type application/gzip),
-		 'Last-Modified', time2str($mtime),
-		 'Content-Length', bytes::length($out) ], [ $out ] ];
-}
-
 # not really a stand-alone PSGI app, but maybe it could be...
 sub call {
 	my ($self, $env) = @_;
@@ -239,7 +126,7 @@ sub call {
 	if ($env->{PATH_INFO} eq '/manifest.js.gz') {
 		# grokmirror uses relative paths, so it's domain-dependent
 		my $list = $self->{manifest_cb}->($self, $env, 'manifest');
-		js($env, $list);
+		PublicInbox::ManifestJsGz::response($env, $list);
 	} else { # /
 		my $list = $self->{www_cb}->($self, $env, 'www');
 		html($env, $list);
diff --git a/t/www_listing.t b/t/www_listing.t
index c4511cd1..4309a5e1 100644
--- a/t/www_listing.t
+++ b/t/www_listing.t
@@ -10,9 +10,10 @@ use PublicInbox::Import;
 require_mods(qw(URI::Escape Plack::Builder Digest::SHA
 		IO::Compress::Gzip IO::Uncompress::Gunzip HTTP::Tiny));
 require PublicInbox::WwwListing;
+require PublicInbox::ManifestJsGz;
 my $json = do {
 	no warnings 'once';
-	$PublicInbox::WwwListing::json;
+	$PublicInbox::ManifestJsGz::json;
 } or plan skip_all => "JSON module missing";
 
 use_ok 'PublicInbox::Git';
@@ -20,7 +21,7 @@ use_ok 'PublicInbox::Git';
 my ($tmpdir, $for_destroy) = tmpdir();
 my $bare = PublicInbox::Git->new("$tmpdir/bare.git");
 PublicInbox::Import::init_bare($bare->{git_dir});
-is(PublicInbox::WwwListing::fingerprint($bare), undef,
+is(PublicInbox::ManifestJsGz::fingerprint($bare), undef,
 	'empty repo has no fingerprint');
 {
 	my $fi_data = './t/git.fast-import-data';
@@ -30,7 +31,7 @@ is(PublicInbox::WwwListing::fingerprint($bare), undef,
 		'fast-import');
 }
 
-like(PublicInbox::WwwListing::fingerprint($bare), qr/\A[a-f0-9]{40}\z/,
+like(PublicInbox::ManifestJsGz::fingerprint($bare), qr/\A[a-f0-9]{40}\z/,
 	'got fingerprint with non-empty repo');
 
 sub tiny_test {

  parent reply	other threads:[~2020-09-09  6:26 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-09  6:26 [PATCH 00/11] httpd: further reduce event loop monopolization Eric Wong
2020-09-09  6:26 ` [PATCH 01/11] xt/solver: test with public-inbox-httpd, too Eric Wong
2020-09-09  6:26 ` [PATCH 02/11] solver: drop warnings, modernize use v5.10.1, use SEEK_SET Eric Wong
2020-09-09  6:26 ` [PATCH 03/11] use "\&" where possible when referring to subroutines Eric Wong
2020-09-09  6:26 ` Eric Wong [this message]
2020-09-09  6:26 ` [PATCH 05/11] config: flatten each_inbox and iterate_start args Eric Wong
2020-09-09  6:26 ` [PATCH 06/11] config: split out iterator into separate object Eric Wong
2020-09-09  6:26 ` [PATCH 07/11] t/cgi.t: show stderr on failures Eric Wong
2020-09-09  6:26 ` [PATCH 08/11] extmsg: prevent cross-inbox matches from hogging event loop Eric Wong
2020-09-09  6:26 ` [PATCH 09/11] wwwlisting: avoid " Eric Wong
2020-09-09  6:26 ` [PATCH 10/11] solver: check one git coderepo and inbox at a time Eric Wong
2020-09-09  6:26 ` [PATCH 11/11] solver: break apart inbox blob retrieval Eric Wong
2020-09-10  1:51 ` [PATCH 12/11] solver: async blob retrieval for diff extraction Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200909062618.5940-5-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

This inbox may be cloned and mirrored by anyone:

	git clone --mirror http://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V1 meta meta/ http://public-inbox.org/meta \
		meta@public-inbox.org
	public-inbox-index meta

Example config snippet for mirrors.
Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general
 note: .onion URLs require Tor: https://www.torproject.org/

code repositories for the project(s) associated with this inbox:

	https://80x24.org/public-inbox.git

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git