user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 20/38] view: reduce ascii_html calls and {obuf} use
  2022-09-10  8:16  7% [PATCH 00/38] www: reduce memory usage Eric Wong
@ 2022-09-10  8:17  4% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2022-09-10  8:17 UTC (permalink / raw)
  To: meta

We can rely on {-html_tip} for some things at the top of the
page, and reduce ascii_html and obfuscate_addrs calls by
working on the whole buffer at once.
---
 lib/PublicInbox/View.pm | 127 +++++++++++++++++-----------------------
 t/psgi_v2.t             |   4 +-
 2 files changed, 58 insertions(+), 73 deletions(-)

diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 08ba54bb..52d37a9f 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -38,7 +38,7 @@ sub msg_page_i {
 				: $ctx->gone('over');
 		$ctx->{mhref} = ($ctx->{nr} || $ctx->{smsg}) ?
 				"../${\mid_href($smsg->{mid})}/" : '';
-		if (_msg_page_prepare_obuf($eml, $ctx)) {
+		if (_msg_page_prepare($eml, $ctx)) {
 			$eml->each_part(\&add_text_body, $ctx, 1);
 			$ctx->zmore('</pre><hr>');
 		}
@@ -56,7 +56,7 @@ sub no_over_html ($) {
 	my $eml = PublicInbox::Eml->new($bref);
 	$ctx->{mhref} = '';
 	PublicInbox::WwwStream::init($ctx);
-	if (_msg_page_prepare_obuf($eml, $ctx)) { # sets {-title_html}
+	if (_msg_page_prepare($eml, $ctx)) { # sets {-title_html}
 		$eml->each_part(\&add_text_body, $ctx, 1);
 		$ctx->zmore('</pre><hr>');
 	}
@@ -635,11 +635,9 @@ sub add_text_body { # callback for each_part
 	}
 }
 
-sub _msg_page_prepare_obuf {
+sub _msg_page_prepare {
 	my ($eml, $ctx) = @_;
 	my $have_over = !!$ctx->{ibx}->over;
-	my $obfs_ibx = $ctx->{-obfs_ibx};
-	$ctx->{obuf} = \(my $rv = '');
 	my $mids = mids_for_index($eml);
 	my $nr = $ctx->{nr}++;
 	if ($nr) { # unlikely
@@ -647,80 +645,86 @@ sub _msg_page_prepare_obuf {
 			warn "W: BUG? @$mids not deduplicated properly\n";
 			return;
 		}
-		$rv .=
+		$ctx->{-html_tip} =
 "<pre>WARNING: multiple messages have this Message-ID\n</pre><pre>";
 	} else {
 		$ctx->{first_hdr} = $eml->header_obj;
 		$ctx->{chash} = content_hash($eml) if $ctx->{smsg}; # reused MID
-		$rv .= "<pre\nid=b>"; # anchor for body start
+		$ctx->{-html_tip} = "<pre\nid=b>"; # anchor for body start
 	}
 	$ctx->{-upfx} = '../';
 	my @title; # (Subject[0], From[0])
+	my $hbuf = '';
 	for my $v ($eml->header('From')) {
 		my @n = PublicInbox::Address::names($v);
-		$v = ascii_html($v);
-		$title[1] //= ascii_html(join(', ', @n));
-		if ($obfs_ibx) {
-			obfuscate_addrs($obfs_ibx, $v);
-			obfuscate_addrs($obfs_ibx, $title[1]);
-		}
-		$rv .= "From: $v\n" if $v ne '';
+		$title[1] //= join(', ', @n);
+		$hbuf .= "From: $v\n" if $v ne '';
 	}
-	foreach my $h (qw(To Cc)) {
+	for my $h (qw(To Cc)) {
 		for my $v ($eml->header($h)) {
 			fold_addresses($v);
-			$v = ascii_html($v);
-			obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx;
-			$rv .= "$h: $v\n" if $v ne '';
+			$hbuf .= "$h: $v\n" if $v ne '';
 		}
 	}
 	my @subj = $eml->header('Subject');
-	if (@subj) {
-		my $v = ascii_html(shift @subj);
-		obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx;
-		$rv .= 'Subject: ';
-		$rv .= $have_over ? qq(<a\nhref="#r"\nid=t>$v</a>\n) : "$v\n";
-		$title[0] = $v;
-		for $v (@subj) { # multi-Subject message :<
-			$v = ascii_html($v);
-			obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx;
-			$rv .= "Subject: $v\n";
-		}
-	} else { # dummy anchor for thread skeleton at bottom of page
-		$rv .= qq(<a\nhref="#r"\nid=t></a>) if $have_over;
-		$title[0] = '(no subject)';
-	}
-	for my $v ($eml->header('Date')) {
-		$v = ascii_html($v);
-		obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; # possible :P
-		$rv .= qq{Date: $v\n};
+	$hbuf .= "Subject: $_\n" for @subj;
+	$title[0] = $subj[0] // '(no subject)';
+	$hbuf .= "Date: $_\n" for $eml->header('Date');
+	$hbuf = ascii_html($hbuf);
+	$ctx->{-title_html} = ascii_html(join(' - ', @title));
+	if (my $obfs_ibx = $ctx->{-obfs_ibx}) {
+		obfuscate_addrs($obfs_ibx, $hbuf);
+		obfuscate_addrs($obfs_ibx, $ctx->{-title_html});
 	}
+
 	# [thread overview] link is typically added after Date,
 	# but added after Subject, or even nothing.
 	if ($have_over) {
-		chop $rv; # drop "\n", or noop if $rv eq ''
-		$rv .= qq{\t<a\nhref="#r">[thread overview]</a>\n};
+		chop $hbuf; # drop "\n", or noop if $rv eq ''
+		$hbuf .= qq{\t<a\nhref="#r">[thread overview]</a>\n};
+		$hbuf =~ s!^Subject:\x20(.*?)(\n[A-Z]|\z)
+				!Subject: <a\nhref="#r"\nid=t>$1</a>$2!msx or
+			$hbuf .= qq(<a\nhref="#r\nid=t></a>);
+	}
+	if (scalar(@$mids) == 1) { # common case
+		my $x = ascii_html($mids->[0]);
+		$hbuf .= qq[Message-ID: &lt;$x&gt; (<a href="raw">raw</a>)\n];
 	}
 	if (!$nr) { # first (and only) message, common case
-		$ctx->{-title_html} = join(' - ', @title);
-		$rv = $ctx->html_top . $rv;
+		$ctx->zmore($ctx->html_top, $hbuf);
+	} else {
+		delete $ctx->{-title_html};
+		$ctx->zmore($ctx->{-html_tip}, $hbuf);
 	}
-
 	$ctx->{-linkify} //= PublicInbox::Linkify->new;
-	if (scalar(@$mids) == 1) { # common case
-		my $mhtml = ascii_html($mids->[0]);
-		$rv .= qq[Message-ID: &lt;$mhtml&gt; (<a href="raw">raw</a>)\n];
-	} else {
+	$hbuf = '';
+	if (scalar(@$mids) != 1) { # unlikely, but it happens :<
 		# X-Alt-Message-ID can happen if a message is injected from
 		# public-inbox-nntpd because of multiple Message-ID headers.
-		my $s = '';
 		for my $h (qw(Message-ID X-Alt-Message-ID)) {
-			$s .= "$h: $_\n" for ($eml->header_raw($h));
+			$hbuf .= "$h: $_\n" for ($eml->header_raw($h));
 		}
-		$ctx->{-linkify}->linkify_mids('..', \$s, 1);
-		$rv .= $s;
+		$ctx->{-linkify}->linkify_mids('..', \$hbuf, 1); # escapes HTML
+		$ctx->zmore($hbuf);
+		$hbuf = '';
+	}
+	my @irt = $eml->header_raw('In-Reply-To');
+	my $refs;
+	if (!@irt) {
+		$refs = references($eml);
+		$irt[0] = pop(@$refs) if scalar @$refs;
 	}
-	$rv .= _parent_headers($ctx, $eml);
+	$hbuf .= "In-Reply-To: $_\n" for @irt;
+
+	# do not display References: if search is present,
+	# we show the thread skeleton at the bottom, instead.
+	if (!$have_over) {
+		$refs //= references($eml);
+		$hbuf .= 'References: <'.join(">\n\t<", @$refs).">\n" if @$refs;
+	}
+	$ctx->{-linkify}->linkify_mids('..', \$hbuf); # escapes HTML
+	$ctx->zmore($hbuf .= "\n");
+	${$ctx->{obuf}} = ''; # TODO remove
 	1;
 }
 
@@ -770,27 +774,6 @@ sub thread_skel ($$$) {
 	$ctx->{parent_msg} = $parent;
 }
 
-sub _parent_headers {
-	my ($ctx, $hdr) = @_;
-	my @irt = $hdr->header_raw('In-Reply-To');
-	my $refs;
-	my $s = '';
-	if (!@irt) {
-		$refs = references($hdr);
-		$irt[0] = pop(@$refs) if scalar @$refs;
-	}
-	$s .= "In-Reply-To: $_\n" for @irt;
-
-	# do not display References: if search is present,
-	# we show the thread skeleton at the bottom, instead.
-	if (!$ctx->{ibx}->over) {
-		$refs //= references($hdr);
-		$s .= 'References: <'.join(">\n\t<", @$refs).">\n" if @$refs;
-	}
-	$ctx->{-linkify}->linkify_mids('..', \$s); # escapes HTML
-	$s .= "\n";
-}
-
 # appends to obuf
 sub html_footer {
 	my ($ctx, $hdr) = @_;
diff --git a/t/psgi_v2.t b/t/psgi_v2.t
index 7d73b606..6b1b3a39 100644
--- a/t/psgi_v2.t
+++ b/t/psgi_v2.t
@@ -1,5 +1,5 @@
 #!perl -w
-# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use strict;
 use v5.10.1;
@@ -209,6 +209,8 @@ my $client1 = sub {
 	local $SIG{__WARN__} = 'DEFAULT';
 	$res = $cb->(GET('/v2test/a-mid@b/'));
 	$raw = $res->content;
+	like($raw, qr/WARNING: multiple messages have this Message-ID/,
+		'warned about duplicate Message-IDs');
 	like($raw, qr/^hello world$/m, 'got first message');
 	like($raw, qr/^hello world!$/m, 'got second message');
 	like($raw, qr/^hello ghosts$/m, 'got third message');

^ permalink raw reply related	[relevance 4%]

* [PATCH 00/38] www: reduce memory usage
@ 2022-09-10  8:16  7% Eric Wong
  2022-09-10  8:17  4% ` [PATCH 20/38] view: reduce ascii_html calls and {obuf} use Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2022-09-10  8:16 UTC (permalink / raw)
  To: meta

I'm over the moon with this series since this drops dozens of
megabytes of scratchpad use while providing tiny speedups along
the way.  For me, that's a 10-15% reduction in memory use under
public-inbox-netd w/ mwrap-perl[1] overhead.

This scratchpad use has been bothering me for a long time
(since I fixed all the other leaks, including one in the core
Encode module).

There's more coming, of course, but this series is big enough
and has shown good results on https://yhbt.net/lore/

It also provides a good pattern/guidance going forward
on how to efficiently implement future features.

I actually started out in this series trying to buffer
everything using gzip to avoid space-wasting uncompressed
strings living in memory.  Unfortunately,
Compress::Raw::Zlib::deflate calls proved too expensive to call
frequently for short strings.

Going back to `.=' ops via a ->zadd method brought back some of
the speed while consolidating the scratchpad to a single place;
but I didn't like the performance regression.

I kept those detours in the history presented here since I
figure they're worth showing.

Finally, relying on PerlIO::scalar with print|say ops proved to
be the fastest since OO ->method dispatch overhead can be avoided
and there's no scratchpad use at all from these, either.

As before, we still call C:R:Z:deflate after every full message
and flush to the socket periodically.

I may even consider using PerlIO::gzip in the future, but that's
a non-standard module.  However, I definitely took inspiration
from it since I saw that it would buffer uncompressed data into
memory before compressing it.

There are also a few small simplifications and speedups I noticed
along the way, and several other bugfixes I posted independently
while working on this series.

[1] I used https://80x24.org/mwrap-perl.git to check malloc use

Eric Wong (38):
  xt: fold perf-obfuscate into perf-msgview, future-proof
  www: gzip_filter: implicitly flush {obuf} on zmore/zflush
  view: rework single message page to compress earlier
  www_atom_stream: require 200 response
  www_stream: aresponse assumes 200, too
  www_text: reduce parameter passing for response header
  viewvcs: use shorter and simpler ctx->html_done
  www_listing: consolidate some ->zmore dispatches
  www_listing: avoid unnecessary work for common cases
  www: viewdiff: use return value for diff_hunk
  view: simplify _parent_headers
  view: eml_entry: reduce manipulation of ctx->{obuf}
  gzip_filter: ->translate can reuse zmore/zflush
  view: remove multipart_text_as_html
  view: reduce subroutine calls for submsg_hdr
  view: attach_link: reduce obuf manipulation
  viewdiff: reuse existing string in diff_before_or_after
  view: _th_index_lite: avoid one s///, improve symmetry
  view: _th_index_lite: use `//' defined-or op
  view: reduce ascii_html calls and {obuf} use
  view: html_footer: golf out a few lines
  view: html_footer: remove obuf dependency
  view: html_footer: avoid escaping " in a few places
  viewdiff: diff_hunk: shorten conditionals, slightly
  view: switch a few things to ctx->zmore
  www: drop {obuf} use entirely, for now
  www: switch to zadd for the majority of buffering
  www: use PerlIO::scalar (zfh) for buffering
  viewdiff: diff_before_or_after: avoid extra capture
  viewdiff: diff_header: shorten function, slightly
  www_static: switch to `print $zfh', and optimize
  httpd/async: describe which ->write subs it can call
  translate: support multiple buffer args
  gzip_filter: write: use multi-arg translate
  feed: new_html_i: switch from zmore to `print $zfh'
  mbox*: use multi-arg ->translate and ->write
  www_listing: switch to `print $zfh'
  viewvcs: switch to `print $zfh'

 Documentation/mknews.perl        |   3 +-
 MANIFEST                         |   1 -
 lib/PublicInbox/CompressNoop.pm  |   4 +-
 lib/PublicInbox/Feed.pm          |  12 +-
 lib/PublicInbox/GzipFilter.pm    |  62 +++---
 lib/PublicInbox/HTTPD/Async.pm   |   9 +-
 lib/PublicInbox/Mbox.pm          |  11 +-
 lib/PublicInbox/MboxGz.pm        |   3 +-
 lib/PublicInbox/SearchView.pm    |   8 +-
 lib/PublicInbox/View.pm          | 312 ++++++++++++-------------------
 lib/PublicInbox/ViewDiff.pm      | 115 +++++-------
 lib/PublicInbox/ViewVCS.pm       |  17 +-
 lib/PublicInbox/WwwAtomStream.pm |  19 +-
 lib/PublicInbox/WwwListing.pm    |  40 ++--
 lib/PublicInbox/WwwStatic.pm     |  32 ++--
 lib/PublicInbox/WwwStream.pm     |  23 ++-
 lib/PublicInbox/WwwText.pm       |  35 ++--
 t/psgi_v2.t                      |   4 +-
 xt/perf-msgview.t                |  10 +-
 xt/perf-obfuscate.t              |  66 -------
 20 files changed, 320 insertions(+), 466 deletions(-)
 delete mode 100644 xt/perf-obfuscate.t

^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2022-09-10  8:16  7% [PATCH 00/38] www: reduce memory usage Eric Wong
2022-09-10  8:17  4% ` [PATCH 20/38] view: reduce ascii_html calls and {obuf} use Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).