From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00, URIBL_BLOCKED shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 2F6001FA10 for ; Tue, 1 Sep 2020 20:36:56 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 3/3] www: manifest.js.gz generation no longer hogs event loop Date: Tue, 1 Sep 2020 20:36:55 +0000 Message-Id: <20200901203655.29694-4-e@80x24.org> In-Reply-To: <20200901203655.29694-1-e@80x24.org> References: <20200901203655.29694-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: It's still as slow as before with hundreds/thousands of inboxes, but at least it's fair. Future changes will allow it to be cached and memoized with persistent HTTP servers. --- MANIFEST | 1 + lib/PublicInbox/ManifestJsGz.pm | 153 ++++++++++++++++++++++++++++++++ lib/PublicInbox/WWW.pm | 4 +- lib/PublicInbox/WwwListing.pm | 117 +----------------------- t/www_listing.t | 7 +- 5 files changed, 162 insertions(+), 120 deletions(-) create mode 100644 lib/PublicInbox/ManifestJsGz.pm diff --git a/MANIFEST b/MANIFEST index 44670c7e..0e225b6a 100644 --- a/MANIFEST +++ b/MANIFEST @@ -157,6 +157,7 @@ lib/PublicInbox/Lock.pm lib/PublicInbox/MDA.pm lib/PublicInbox/MID.pm lib/PublicInbox/MIME.pm +lib/PublicInbox/ManifestJsGz.pm lib/PublicInbox/Mbox.pm lib/PublicInbox/MboxGz.pm lib/PublicInbox/MsgIter.pm diff --git a/lib/PublicInbox/ManifestJsGz.pm b/lib/PublicInbox/ManifestJsGz.pm new file mode 100644 index 00000000..328303ce --- /dev/null +++ b/lib/PublicInbox/ManifestJsGz.pm @@ -0,0 +1,153 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ +package PublicInbox::ManifestJsGz; +use strict; +use v5.10.1; +use Digest::SHA (); +use File::Spec (); +use bytes (); # length +use PublicInbox::Inbox; +use PublicInbox::Git; +use IO::Compress::Gzip qw(gzip); +use HTTP::Date qw(time2str); +*try_cat = \&PublicInbox::Inbox::try_cat; + +our $json; +for my $mod (qw(JSON::MaybeXS JSON JSON::PP)) { + eval "require $mod" or next; + # ->ascii encodes non-ASCII to "\uXXXX" + $json = $mod->new->ascii(1) and last; +} + +sub response { + my ($env, $list) = @_; + $json or return [ 404, [], [] ]; + my $self = bless { + -abs2urlpath => {}, + -mtime => 0, + manifest => {}, + -list => $list, + psgi_env => $env, + }, __PACKAGE__; + + # PSGI server will call this immediately and give us a callback (-wcb) + sub { + $self->{-wcb} = $_[0]; # HTTP write callback + iterate_start($self); + }; +} + +sub fingerprint ($) { + my ($git) = @_; + # TODO: convert to qspawn for fairness when there's + # thousands of repos + my ($fh, $pid) = $git->popen('show-ref'); + my $dig = Digest::SHA->new(1); + while (read($fh, my $buf, 65536)) { + $dig->add($buf); + } + close $fh; + waitpid($pid, 0); + return if $?; # empty, uninitialized git repo + $dig->hexdigest; +} + +sub manifest_add ($$;$$) { + my ($self, $ibx, $epoch, $default_desc) = @_; + my $url_path = "/$ibx->{name}"; + my $git_dir = $ibx->{inboxdir}; + if (defined $epoch) { + $git_dir .= "/git/$epoch.git"; + $url_path .= "/git/$epoch.git"; + } + return unless -d $git_dir; + my $git = PublicInbox::Git->new($git_dir); + my $fingerprint = fingerprint($git) or return; # no empty repos + + chomp(my $owner = $git->qx('config', 'gitweb.owner')); + chomp(my $desc = try_cat("$git_dir/description")); + utf8::decode($owner); + utf8::decode($desc); + $owner = undef if $owner eq ''; + $desc = 'Unnamed repository' if $desc eq ''; + + # templates/hooks--update.sample and git-multimail in git.git + # only match "Unnamed repository", not the full contents of + # templates/this--description in git.git + if ($desc =~ /\AUnnamed repository/) { + $desc = "$default_desc [epoch $epoch]" if defined($epoch); + } + + my $reference; + chomp(my $alt = try_cat("$git_dir/objects/info/alternates")); + if ($alt) { + # n.b.: GitPython doesn't seem to handle comments or C-quoted + # strings like native git does; and we don't for now, either. + my @alt = split(/\n+/, $alt); + + # grokmirror only supports 1 alternate for "reference", + if (scalar(@alt) == 1) { + my $objdir = "$git_dir/objects"; + $reference = File::Spec->rel2abs($alt[0], $objdir); + $reference =~ s!/[^/]+/?\z!!; # basename + } + } + $self->{-abs2urlpath}->{$git_dir} = $url_path; + my $modified = $git->modified; + if ($modified > $self->{-mtime}) { + $self->{-mtime} = $modified; + } + $self->{manifest}->{$url_path} = { + owner => $owner, + reference => $reference, + description => $desc, + modified => $modified, + fingerprint => $fingerprint, + }; +} + +sub iterate_start { + my ($self) = @_; + if (my $async = $self->{psgi_env}->{'pi-httpd.async'}) { + # PublicInbox::HTTPD::Async->new + $async->(undef, undef, $self); + } else { + event_step($self) while $self->{-wcb}; + } +} + +sub event_step { + my ($self) = @_; + while (my $ibx = shift(@{$self->{-list}})) { + eval { + if (defined(my $max = $ibx->max_git_epoch)) { + my $desc = $ibx->description; + for my $epoch (0..$max) { + manifest_add($self, $ibx, $epoch, $desc) + } + } else { + manifest_add($self, $ibx); + } + }; + warn "E: $@" if $@; + if (my $async = $self->{psgi_env}->{'pi-httpd.async'}) { + # PublicInbox::HTTPD::Async->new + $async->(undef, undef, $self); + } + return; # more steps needed + } + my $abs2urlpath = delete $self->{-abs2urlpath}; + my $wcb = delete $self->{-wcb}; + my $manifest = delete $self->{manifest}; + while (my ($url_path, $repo) = each %$manifest) { + defined(my $abs = $repo->{reference}) or next; + $repo->{reference} = $abs2urlpath->{$abs}; + } + $manifest = $json->encode($manifest); + gzip(\$manifest => \(my $out)); + $wcb->([ 200, [ qw(Content-Type application/gzip), + 'Last-Modified', time2str($self->{-mtime}), + 'Content-Length', bytes::length($out) ], [ $out ] ]); +} + +1; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 2ea5d80d..93ab3c9d 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -509,8 +509,8 @@ sub get_inbox_manifest ($$$) { my ($ctx, $inbox, $key) = @_; my $r404 = invalid_inbox($ctx, $inbox); return $r404 if $r404; - require PublicInbox::WwwListing; - PublicInbox::WwwListing::js($ctx->{env}, [$ctx->{-inbox}]); + require PublicInbox::ManifestJsGz; + PublicInbox::ManifestJsGz::response($ctx->{env}, [$ctx->{-inbox}]); } sub get_attach { diff --git a/lib/PublicInbox/WwwListing.pm b/lib/PublicInbox/WwwListing.pm index 0be3764c..12b0d9ad 100644 --- a/lib/PublicInbox/WwwListing.pm +++ b/lib/PublicInbox/WwwListing.pm @@ -5,24 +5,11 @@ # Used by PublicInbox::WWW package PublicInbox::WwwListing; use strict; -use warnings; use PublicInbox::Hval qw(ascii_html prurl fmt_ts); use PublicInbox::Linkify; -use PublicInbox::View; -use PublicInbox::Inbox; use PublicInbox::GzipFilter qw(gzf_maybe); +use PublicInbox::ManifestJsGz; use bytes (); # bytes::length -use HTTP::Date qw(time2str); -use Digest::SHA (); -use File::Spec (); -use IO::Compress::Gzip qw(gzip); -*try_cat = \&PublicInbox::Inbox::try_cat; -our $json; -for my $mod (qw(JSON::MaybeXS JSON JSON::PP)) { - eval "require $mod" or next; - # ->ascii encodes non-ASCII to "\uXXXX" - $json = $mod->new->ascii(1) and last; -} sub list_all_i { my ($ibx, $arg) = @_; @@ -132,106 +119,6 @@ sub html ($$) { [ $code, $h, [ $out ] ]; } -sub fingerprint ($) { - my ($git) = @_; - # TODO: convert to qspawn for fairness when there's - # thousands of repos - my ($fh, $pid) = $git->popen('show-ref'); - my $dig = Digest::SHA->new(1); - while (read($fh, my $buf, 65536)) { - $dig->add($buf); - } - close $fh; - waitpid($pid, 0); - return if $?; # empty, uninitialized git repo - $dig->hexdigest; -} - -sub manifest_add ($$;$$) { - my ($manifest, $ibx, $epoch, $default_desc) = @_; - my $url_path = "/$ibx->{name}"; - my $git_dir = $ibx->{inboxdir}; - if (defined $epoch) { - $git_dir .= "/git/$epoch.git"; - $url_path .= "/git/$epoch.git"; - } - return unless -d $git_dir; - my $git = PublicInbox::Git->new($git_dir); - my $fingerprint = fingerprint($git) or return; # no empty repos - - chomp(my $owner = $git->qx('config', 'gitweb.owner')); - chomp(my $desc = try_cat("$git_dir/description")); - utf8::decode($owner); - utf8::decode($desc); - $owner = undef if $owner eq ''; - $desc = 'Unnamed repository' if $desc eq ''; - - # templates/hooks--update.sample and git-multimail in git.git - # only match "Unnamed repository", not the full contents of - # templates/this--description in git.git - if ($desc =~ /\AUnnamed repository/) { - $desc = "$default_desc [epoch $epoch]" if defined($epoch); - } - - my $reference; - chomp(my $alt = try_cat("$git_dir/objects/info/alternates")); - if ($alt) { - # n.b.: GitPython doesn't seem to handle comments or C-quoted - # strings like native git does; and we don't for now, either. - my @alt = split(/\n+/, $alt); - - # grokmirror only supports 1 alternate for "reference", - if (scalar(@alt) == 1) { - my $objdir = "$git_dir/objects"; - $reference = File::Spec->rel2abs($alt[0], $objdir); - $reference =~ s!/[^/]+/?\z!!; # basename - } - } - $manifest->{-abs2urlpath}->{$git_dir} = $url_path; - my $modified = $git->modified; - if ($modified > $manifest->{-mtime}) { - $manifest->{-mtime} = $modified; - } - $manifest->{$url_path} = { - owner => $owner, - reference => $reference, - description => $desc, - modified => $modified, - fingerprint => $fingerprint, - }; -} - -# manifest.js.gz -sub js ($$) { - my ($env, $list) = @_; - # $json won't be defined if IO::Compress::Gzip is missing - $json or return [ 404, [], [] ]; - - my $manifest = { -abs2urlpath => {}, -mtime => 0 }; - for my $ibx (@$list) { - if (defined(my $max = $ibx->max_git_epoch)) { - my $desc = $ibx->description; - for my $epoch (0..$max) { - manifest_add($manifest, $ibx, $epoch, $desc); - } - } else { - manifest_add($manifest, $ibx); - } - } - my $abs2urlpath = delete $manifest->{-abs2urlpath}; - my $mtime = delete $manifest->{-mtime}; - while (my ($url_path, $repo) = each %$manifest) { - defined(my $abs = $repo->{reference}) or next; - $repo->{reference} = $abs2urlpath->{$abs}; - } - my $out; - gzip(\($json->encode($manifest)) => \$out); - $manifest = undef; - [ 200, [ qw(Content-Type application/gzip), - 'Last-Modified', time2str($mtime), - 'Content-Length', bytes::length($out) ], [ $out ] ]; -} - # not really a stand-alone PSGI app, but maybe it could be... sub call { my ($self, $env) = @_; @@ -239,7 +126,7 @@ sub call { if ($env->{PATH_INFO} eq '/manifest.js.gz') { # grokmirror uses relative paths, so it's domain-dependent my $list = $self->{manifest_cb}->($self, $env, 'manifest'); - js($env, $list); + PublicInbox::ManifestJsGz::response($env, $list); } else { # / my $list = $self->{www_cb}->($self, $env, 'www'); html($env, $list); diff --git a/t/www_listing.t b/t/www_listing.t index c4511cd1..4309a5e1 100644 --- a/t/www_listing.t +++ b/t/www_listing.t @@ -10,9 +10,10 @@ use PublicInbox::Import; require_mods(qw(URI::Escape Plack::Builder Digest::SHA IO::Compress::Gzip IO::Uncompress::Gunzip HTTP::Tiny)); require PublicInbox::WwwListing; +require PublicInbox::ManifestJsGz; my $json = do { no warnings 'once'; - $PublicInbox::WwwListing::json; + $PublicInbox::ManifestJsGz::json; } or plan skip_all => "JSON module missing"; use_ok 'PublicInbox::Git'; @@ -20,7 +21,7 @@ use_ok 'PublicInbox::Git'; my ($tmpdir, $for_destroy) = tmpdir(); my $bare = PublicInbox::Git->new("$tmpdir/bare.git"); PublicInbox::Import::init_bare($bare->{git_dir}); -is(PublicInbox::WwwListing::fingerprint($bare), undef, +is(PublicInbox::ManifestJsGz::fingerprint($bare), undef, 'empty repo has no fingerprint'); { my $fi_data = './t/git.fast-import-data'; @@ -30,7 +31,7 @@ is(PublicInbox::WwwListing::fingerprint($bare), undef, 'fast-import'); } -like(PublicInbox::WwwListing::fingerprint($bare), qr/\A[a-f0-9]{40}\z/, +like(PublicInbox::ManifestJsGz::fingerprint($bare), qr/\A[a-f0-9]{40}\z/, 'got fingerprint with non-empty repo'); sub tiny_test {