user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 08/43] gzipfilter: replace Compress::Raw::Deflate usages
Date: Sun,  5 Jul 2020 23:27:24 +0000	[thread overview]
Message-ID: <20200705232759.3161-9-e@yhbt.net> (raw)
In-Reply-To: <20200705232759.3161-1-e@yhbt.net>

The new ->zmore and ->zflush APIs make it possible to replace
existing verbose usages of Compress::Raw::Zlib::Deflate and simplify
buffering logic for streaming large gzipped data.

One potentially user-visible change is that we now break the mbox.gz
response on zlib failures, instead of silently continuing on to
the next message.  zlib only seems to fail on OOM, which should
be rare, so it's ideal that we drop the connection anyway.
---
 lib/PublicInbox/GzipFilter.pm | 27 ++++++++++-------------
 lib/PublicInbox/MboxGz.pm     | 41 ++++++++++-------------------------
 lib/PublicInbox/WwwStream.pm  | 27 ++++++++---------------
 3 files changed, 31 insertions(+), 64 deletions(-)

diff --git a/lib/PublicInbox/GzipFilter.pm b/lib/PublicInbox/GzipFilter.pm
index 8cc5ea00b..d2eb4e664 100644
--- a/lib/PublicInbox/GzipFilter.pm
+++ b/lib/PublicInbox/GzipFilter.pm
@@ -6,7 +6,7 @@ package PublicInbox::GzipFilter;
 use strict;
 use parent qw(Exporter);
 use Compress::Raw::Zlib qw(Z_FINISH Z_OK);
-our @EXPORT_OK = qw(gzip_maybe gzf_maybe);
+our @EXPORT_OK = qw(gzf_maybe);
 my %OPT = (-WindowBits => 15 + 16, -AppendOutput => 1);
 my @GZIP_HDRS = qw(Vary Accept-Encoding Content-Encoding gzip);
 
@@ -19,24 +19,23 @@ sub attach {
 	$self
 }
 
-sub gzip_maybe ($$) {
+# returns `0' and not `undef' on failure (see Www*Stream)
+sub gzf_maybe ($$) {
 	my ($res_hdr, $env) = @_;
-	return if (($env->{HTTP_ACCEPT_ENCODING}) // '') !~ /\bgzip\b/;
-
+	return 0 if (($env->{HTTP_ACCEPT_ENCODING}) // '') !~ /\bgzip\b/;
 	my ($gz, $err) = Compress::Raw::Zlib::Deflate->new(%OPT);
-	return if $err != Z_OK;
+	return 0 if $err != Z_OK;
 
 	# in case Plack::Middleware::Deflater is loaded:
 	$env->{'plack.skip-deflater'} = 1;
-
 	push @$res_hdr, @GZIP_HDRS;
-	$gz;
+	bless { gz => $gz }, __PACKAGE__;
 }
 
-sub gzf_maybe ($$) {
-	my ($res_hdr, $env) = @_;
-	my $gz = gzip_maybe($res_hdr, $env) or return 0;
-	bless { gz => $gz }, __PACKAGE__;
+sub gzip_or_die () {
+	my ($gz, $err) = Compress::Raw::Zlib::Deflate->new(%OPT);
+	$err == Z_OK or die "Deflate->new failed: $err";
+	$gz;
 }
 
 # for GetlineBody (via Qspawn) when NOT using $env->{'pi-httpd.async'}
@@ -47,11 +46,7 @@ sub translate ($$) {
 	# allocate the zlib context lazily here, instead of in ->new.
 	# Deflate contexts are memory-intensive and this object may
 	# be sitting in the Qspawn limiter queue for a while.
-	my $gz = $self->{gz} //= do {
-		my ($g, $err) = Compress::Raw::Zlib::Deflate->new(%OPT);
-		$err == Z_OK or die "Deflate->new failed: $err";
-		$g;
-	};
+	my $gz = $self->{gz} //= gzip_or_die();
 	my $zbuf = delete($self->{zbuf});
 	if (defined $_[1]) { # my $buf = $_[1];
 		my $err = $gz->deflate($_[1], $zbuf);
diff --git a/lib/PublicInbox/MboxGz.pm b/lib/PublicInbox/MboxGz.pm
index f7fc4afc1..535ef96c9 100644
--- a/lib/PublicInbox/MboxGz.pm
+++ b/lib/PublicInbox/MboxGz.pm
@@ -2,19 +2,19 @@
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 package PublicInbox::MboxGz;
 use strict;
-use warnings;
+use parent 'PublicInbox::GzipFilter';
 use PublicInbox::Eml;
 use PublicInbox::Hval qw/to_filename/;
 use PublicInbox::Mbox;
-use Compress::Raw::Zlib qw(Z_FINISH Z_OK);
-my %OPT = (-WindowBits => 15 + 16, -AppendOutput => 1);
 
 sub new {
 	my ($class, $ctx, $cb) = @_;
 	$ctx->{base_url} = $ctx->{-inbox}->base_url($ctx->{env});
-	my ($gz, $err) = Compress::Raw::Zlib::Deflate->new(%OPT);
-	$err == Z_OK or die "Deflate->new failed: $err";
-	bless { gz => $gz, cb => $cb, ctx => $ctx }, $class;
+	bless {
+		gz => PublicInbox::GzipFilter::gzip_or_die(),
+		cb => $cb,
+		ctx => $ctx
+	}, $class;
 }
 
 sub response {
@@ -27,40 +27,21 @@ sub response {
 	[ 200, $h, $body ];
 }
 
-sub gzip_fail ($$) {
-	my ($ctx, $err) = @_;
-	$ctx->{env}->{'psgi.errors'}->print("deflate failed: $err\n");
-	'';
-}
-
 # called by Plack::Util::foreach or similar
 sub getline {
 	my ($self) = @_;
 	my $ctx = $self->{ctx} or return;
-	my $gz = $self->{gz};
-	my $buf = delete($self->{buf});
 	while (my $smsg = $self->{cb}->($ctx)) {
 		my $mref = $ctx->{-inbox}->msg_by_smsg($smsg) or next;
 		my $h = PublicInbox::Eml->new($mref)->header_obj;
-
-		my $err = $gz->deflate(
-			PublicInbox::Mbox::msg_hdr($ctx, $h, $smsg->{mid}),
-		        $buf);
-		return gzip_fail($ctx, $err) if $err != Z_OK;
-
-		$err = $gz->deflate(PublicInbox::Mbox::msg_body($$mref), $buf);
-		return gzip_fail($ctx, $err) if $err != Z_OK;
-
-		return $buf if length($buf) >= 8192;
-
-		# be fair to other clients on public-inbox-httpd:
-		$self->{buf} = $buf;
-		return '';
+		$self->zmore(
+			PublicInbox::Mbox::msg_hdr($ctx, $h, $smsg->{mid})
+		);
+		return $self->translate(PublicInbox::Mbox::msg_body($$mref));
 	}
 	# signal that we're done and can return undef next call:
 	delete $self->{ctx};
-	my $err = $gz->flush($buf, Z_FINISH);
-	($err == Z_OK) ? $buf : gzip_fail($ctx, $err);
+	$self->zflush;
 }
 
 sub close {} # noop
diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm
index c964dbd41..8623440b8 100644
--- a/lib/PublicInbox/WwwStream.pm
+++ b/lib/PublicInbox/WwwStream.pm
@@ -13,8 +13,7 @@ use base qw(Exporter);
 our @EXPORT_OK = qw(html_oneshot);
 use bytes (); # length
 use PublicInbox::Hval qw(ascii_html prurl);
-use Compress::Raw::Zlib qw(Z_FINISH Z_OK);
-use PublicInbox::GzipFilter qw(gzip_maybe gzf_maybe);
+use PublicInbox::GzipFilter qw(gzf_maybe);
 our $TOR_URL = 'https://www.torproject.org/';
 our $CODE_URL = 'https://public-inbox.org/public-inbox.git';
 
@@ -190,25 +189,17 @@ sub html_oneshot ($$;$) {
 		base_url => base_url($ctx),
 	}, __PACKAGE__;
 	my @x;
-	my $h = [ 'Content-Type' => 'text/html; charset=UTF-8' ];
-	if (my $gz = gzip_maybe($h, $ctx->{env})) {
-		my $err = $gz->deflate(_html_top($self), $x[0]);
-		die "gzip->deflate: $err" if $err != Z_OK;
-		if ($sref) {
-			$err = $gz->deflate($sref, $x[0]);
-			die "gzip->deflate: $err" if $err != Z_OK;
-		}
-		$err = $gz->deflate(_html_end($self), $x[0]);
-		die "gzip->deflate: $err" if $err != Z_OK;
-		$err = $gz->flush($x[0], Z_FINISH);
-		die "gzip->flush: $err" if $err != Z_OK;
+	my $h = [ 'Content-Type' => 'text/html; charset=UTF-8',
+		'Content-Length' => undef ];
+	if (my $gzf = gzf_maybe($h, $ctx->{env})) {
+		$gzf->zmore(_html_top($self));
+		$gzf->zmore($$sref) if $sref;
+		$x[0] = $gzf->zflush(_html_end($self));
+		$h->[3] = length($x[0]);
 	} else {
 		@x = (_html_top($self), $sref ? $$sref : (), _html_end($self));
+		$h->[3] += bytes::length($_) for @x;
 	}
-
-	my $len = 0;
-	$len += bytes::length($_) for @x;
-	push @$h, 'Content-Length', $len;
 	[ $code, $h, \@x ]
 }
 

  parent reply	other threads:[~2020-07-05 23:28 UTC|newest]

Thread overview: 44+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-05 23:27 [PATCH 00/43] www: async git cat-file w/ -httpd Eric Wong
2020-07-05 23:27 ` [PATCH 01/43] gzipfilter: minor cleanups Eric Wong
2020-07-05 23:27 ` [PATCH 02/43] wwwstream: oneshot: perform gzip without middleware Eric Wong
2020-07-05 23:27 ` [PATCH 03/43] www*stream: gzip ->getline responses Eric Wong
2020-07-05 23:27 ` [PATCH 04/43] wwwtext: gzip text/plain responses, as well Eric Wong
2020-07-05 23:27 ` [PATCH 05/43] wwwtext: switch to html_oneshot Eric Wong
2020-07-05 23:27 ` [PATCH 06/43] www: need: use WwwStream::html_oneshot Eric Wong
2020-07-05 23:27 ` [PATCH 07/43] wwwlisting: use GzipFilter for HTML Eric Wong
2020-07-05 23:27 ` Eric Wong [this message]
2020-07-05 23:27 ` [PATCH 09/43] {gzip,noop}filter: ->zmore returns undef, always Eric Wong
2020-07-05 23:27 ` [PATCH 10/43] mbox: remove html_oneshot import Eric Wong
2020-07-05 23:27 ` [PATCH 11/43] wwwstatic: support gzipped directory listings Eric Wong
2020-07-05 23:27 ` [PATCH 12/43] qspawn: learn to gzip streaming responses Eric Wong
2020-07-05 23:27 ` [PATCH 13/43] stop auto-loading Plack::Middleware::Deflater Eric Wong
2020-07-05 23:27 ` [PATCH 14/43] mboxgz: do asynchronous git blob retrievals Eric Wong
2020-07-05 23:27 ` [PATCH 15/43] mboxgz: reduce hash depth Eric Wong
2020-07-05 23:27 ` [PATCH 16/43] mbox: async blob fetch for "single message" raw mboxrd Eric Wong
2020-07-05 23:27 ` [PATCH 17/43] wwwatomstream: simplify feed_update callers Eric Wong
2020-07-05 23:27 ` [PATCH 18/43] wwwatomstream: use PublicInbox::Inbox->modified for feed_updated Eric Wong
2020-07-05 23:27 ` [PATCH 19/43] wwwatomstream: reuse $ctx as $self Eric Wong
2020-07-05 23:27 ` [PATCH 20/43] xt/httpd-async-stream: allow more options Eric Wong
2020-07-05 23:27 ` [PATCH 21/43] wwwatomstream: support async blob fetch Eric Wong
2020-07-05 23:27 ` [PATCH 22/43] wwwstream: reduce object graph depth Eric Wong
2020-07-05 23:27 ` [PATCH 23/43] wwwstream: reduce blob fetch paths for ->getline Eric Wong
2020-07-05 23:27 ` [PATCH 24/43] www: start making gzipfilter the parent response class Eric Wong
2020-07-05 23:27 ` [PATCH 25/43] remove unused/redundant zlib-related imports Eric Wong
2020-07-05 23:27 ` [PATCH 26/43] wwwstream: use parent.pm and no warnings Eric Wong
2020-07-05 23:27 ` [PATCH 27/43] wwwstream: subclass off GzipFilter Eric Wong
2020-07-05 23:27 ` [PATCH 28/43] view: make /$INBOX/$MSGID/ permalink async Eric Wong
2020-07-05 23:27 ` [PATCH 29/43] view: /$INBOX/$MSGID/t/ reads blobs asynchronously Eric Wong
2020-07-05 23:27 ` [PATCH 30/43] view: update /$INBOX/$MSGID/T/ to be async Eric Wong
2020-07-05 23:27 ` [PATCH 31/43] feed: generate_i: eliminate pointless loop Eric Wong
2020-07-05 23:27 ` [PATCH 32/43] feed: /$INBOX/new.html fetches blobs asynchronously Eric Wong
2020-07-05 23:27 ` [PATCH 33/43] ssearchview: /$INBOX/?q=$QUERY&x=t uses async blobs Eric Wong
2020-07-05 23:27 ` [PATCH 34/43] view: eml_entry: reduce parameters Eric Wong
2020-07-05 23:27 ` [PATCH 35/43] view: /$INBOX/$MSGID/t/: avoid extra hash lookup in eml case Eric Wong
2020-07-05 23:27 ` [PATCH 36/43] wwwstream: eliminate ::response, use html_oneshot Eric Wong
2020-07-05 23:27 ` [PATCH 37/43] www: update internal docs Eric Wong
2020-07-05 23:27 ` [PATCH 38/43] view: simplify eml_entry callers further Eric Wong
2020-07-05 23:27 ` [PATCH 39/43] wwwtext: simplify gzf_maybe use Eric Wong
2020-07-05 23:27 ` [PATCH 40/43] wwwattach: support async blob retrievals Eric Wong
2020-07-05 23:27 ` [PATCH 41/43] gzipfilter: drop HTTP connection on bugs or data corruption Eric Wong
2020-07-05 23:27 ` [PATCH 42/43] daemon: warn on missing blobs Eric Wong
2020-07-05 23:27 ` [PATCH 43/43] gzipfilter: check http->{forward} for client disconnects Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200705232759.3161-9-e@yhbt.net \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).