"view: stop storing all MIME objects on large threads"

user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed

Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |

* [PATCH 08/13] www: remove smsg_mime API and adjust callers
  2020-06-01 10:06  5% [PATCH 00/13] smsg: remove tricky {mime} field Eric Wong
  2020-06-01 10:06  4% ` [PATCH 01/13] inbox: introduce smsg_eml method Eric Wong
@ 2020-06-01 10:06  2% ` Eric Wong
  1 sibling, 0 replies; 10+ results
From: Eric Wong @ 2020-06-01 10:06 UTC (permalink / raw)
  To: meta

To further simplify callers and avoid embarrassing memory
explosions[1], we can finally eliminate this method in
favor of smsg_eml.

[1] commit 7d02b9e64455831d3bda20cd2e64e0c15dc07df5
    ("view: stop storing all MIME objects on large threads")
    fixed a huge memory blowup.
---
 Documentation/mknews.perl     |  7 ++--
 lib/PublicInbox/Feed.pm       |  6 ++--
 lib/PublicInbox/Inbox.pm      | 12 ++-----
 lib/PublicInbox/SearchView.pm |  4 +--
 lib/PublicInbox/Smsg.pm       |  7 ++--
 lib/PublicInbox/View.pm       | 63 +++++++++++++++++------------------
 t/v2mda.t                     |  4 +--
 7 files changed, 50 insertions(+), 53 deletions(-)

diff --git a/Documentation/mknews.perl b/Documentation/mknews.perl
index 3bdebfce7a5..965c30c1dcb 100755
--- a/Documentation/mknews.perl
+++ b/Documentation/mknews.perl
@@ -102,9 +102,10 @@ sub mime2txt {
 }
 
 sub mime2html {
-	my ($out, $mime, $ctx) = @_;
-	my $smsg = bless { mime => $mime }, 'PublicInbox::Smsg';
-	print $out PublicInbox::View::index_entry($smsg, $ctx, 1) or die;
+	my ($out, $eml, $ctx) = @_;
+	my $smsg = bless {}, 'PublicInbox::Smsg';
+	$smsg->populate($eml);
+	print $out PublicInbox::View::eml_entry($ctx, $smsg, $eml, 1) or die;
 }
 
 sub html_start {
diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm
index b770a35077c..4c1056b4665 100644
--- a/lib/PublicInbox/Feed.pm
+++ b/lib/PublicInbox/Feed.pm
@@ -53,9 +53,9 @@ sub new_html_i {
 	my ($nr, $ctx) = @_;
 	my $msgs = $ctx->{msgs};
 	while (my $smsg = shift @$msgs) {
-		my $m = $ctx->{-inbox}->smsg_mime($smsg) or next;
-		my $more = scalar @$msgs;
-		return PublicInbox::View::index_entry($m, $ctx, $more);
+		my $eml = $ctx->{-inbox}->smsg_eml($smsg) or next;
+		return PublicInbox::View::eml_entry($ctx, $smsg, $eml,
+							scalar @$msgs);
 	}
 	PublicInbox::View::pagination_footer($ctx, './new.html');
 }
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index 38abdfe5847..af034358b15 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -326,18 +326,12 @@ sub msg_by_smsg ($$) {
 	git($self)->cat_file($blob);
 }
 
-sub smsg_mime {
-	my ($self, $smsg) = @_;
-	if (my $s = msg_by_smsg($self, $smsg)) {
-		$smsg->{mime} = PublicInbox::Eml->new($s);
-		return $smsg;
-	}
-}
-
 sub smsg_eml {
 	my ($self, $smsg) = @_;
 	my $bref = msg_by_smsg($self, $smsg) or return;
-	PublicInbox::Eml->new($bref);
+	my $eml = PublicInbox::Eml->new($bref);
+	$smsg->populate($eml) unless exists($smsg->{num}); # v1 w/o SQLite
+	$eml;
 }
 
 sub mid2num($$) {
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 249cf53926d..d53a533e53c 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -300,8 +300,8 @@ sub mset_thread_i {
 	my ($nr, $ctx) = @_;
 	my $msgs = $ctx->{msgs} or return;
 	while (my $smsg = pop @$msgs) {
-		$ctx->{-inbox}->smsg_mime($smsg) or next;
-		return PublicInbox::View::index_entry($smsg, $ctx,
+		my $eml = $ctx->{-inbox}->smsg_eml($smsg) or next;
+		return PublicInbox::View::eml_entry($ctx, $smsg, $eml,
 							scalar @$msgs);
 	}
 	my ($skel) = delete @$ctx{qw(skel msgs)};
diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm
index 446bca81b48..9688c5592a2 100644
--- a/lib/PublicInbox/Smsg.pm
+++ b/lib/PublicInbox/Smsg.pm
@@ -12,7 +12,7 @@ use strict;
 use warnings;
 use base qw(Exporter);
 our @EXPORT_OK = qw(subject_normalized);
-use PublicInbox::MID qw/mid_mime/;
+use PublicInbox::MID qw(mid_mime mids);
 use PublicInbox::Address;
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 use Time::Local qw(timegm);
@@ -105,7 +105,7 @@ sub __hdr ($$) {
 	};
 }
 
-# for Import and v1 WWW code paths
+# for Import and v1 non-SQLite WWW code paths
 sub populate {
 	my ($self, $hdr, $v2w) = @_;
 	for my $f (qw(From To Cc Subject)) {
@@ -133,6 +133,9 @@ sub populate {
 	$self->{-ts} = [ my @ts = msg_timestamp($hdr, $v2w->{cotime}) ];
 	$self->{ds} //= $ds[0]; # no zone
 	$self->{ts} //= $ts[0];
+
+	# for v1 users w/o SQLite
+	$self->{mid} //= eval { mids($hdr)->[0] } // '';
 }
 
 sub subject ($) { __hdr($_[0], 'Subject') }
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index a05ac4142f2..0bc2b06e4dc 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -74,11 +74,10 @@ sub msg_page_more { # cold
 	my $ibx = $ctx->{-inbox};
 	my $next = $ibx->over->next_by_mid($ctx->{mid}, \$id, \$prev);
 	$ctx->{more} = [ $id, $prev, $next ] if $next;
-	$smsg = $ibx->smsg_mime($smsg) or return '';
+	my $eml = $ibx->smsg_eml($smsg) or return '';
 	$ctx->{mhref} = '../' . mid_href($smsg->{mid}) . '/';
-	my $mime = delete $smsg->{mime};
-	$ctx->{obuf} = _msg_page_prepare_obuf($mime->header_obj, $ctx, $nr);
-	multipart_text_as_html($mime, $ctx);
+	$ctx->{obuf} = _msg_page_prepare_obuf($eml->header_obj, $ctx, $nr);
+	multipart_text_as_html($eml, $ctx);
 	${delete $ctx->{obuf}} .= '</pre><hr>';
 }
 
@@ -181,14 +180,14 @@ sub nr_to_s ($$$) {
 # human-friendly format
 sub fmt_ts ($) { strftime('%Y-%m-%d %k:%M', gmtime($_[0])) }
 
+# Displays the text of of the message for /$INBOX/$MSGID/[Tt]/ endpoint
 # this is already inside a <pre>
-sub index_entry {
-	my ($smsg, $ctx, $more) = @_;
-	my $subj = $smsg->subject;
-	my $mid_raw = $smsg->mid;
+sub eml_entry {
+	my ($ctx, $smsg, $eml, $more) = @_;
+	my $subj = delete $smsg->{subject};
+	my $mid_raw = $smsg->{mid};
 	my $id = id_compress($mid_raw, 1);
 	my $id_m = 'm'.$id;
-
 	my $root_anchor = $ctx->{root_anchor} || '';
 	my $irt;
 	my $obfs_ibx = $ctx->{-obfs_ibx};
@@ -201,12 +200,12 @@ sub index_entry {
 	$rv .= $subj . "\n";
 	$rv .= _th_index_lite($mid_raw, \$irt, $id, $ctx);
 	my @tocc;
-	my $ds = $smsg->ds; # for v1 non-Xapian/SQLite users
-	# deleting {mime} is critical to memory use,
-	# the rest of the fields saves about 400K as we iterate across 1K msgs
-	my ($mime) = delete @$smsg{qw(mime ds ts blob subject)};
+	my $ds = delete $smsg->{ds}; # for v1 non-Xapian/SQLite users
+
+	# Deleting these fields saves about 400K as we iterate across 1K msgs
+	delete @$smsg{qw(ts blob)};
 
-	my $hdr = $mime->header_obj;
+	my $hdr = $eml->header_obj;
 	my $from = _hdr_names_html($hdr, 'From');
 	obfuscate_addrs($obfs_ibx, $from) if $obfs_ibx;
 	$rv .= "From: $from @ ".fmt_ts($ds)." UTC";
@@ -244,7 +243,7 @@ sub index_entry {
 	# scan through all parts, looking for displayable text
 	$ctx->{mhref} = $mhref;
 	$ctx->{obuf} = \$rv;
-	$mime->each_part(\&add_text_body, $ctx, 1);
+	$eml->each_part(\&add_text_body, $ctx, 1);
 	delete $ctx->{obuf};
 
 	# add the footer
@@ -372,10 +371,10 @@ sub pre_thread  { # walk_thread callback
 	skel_dump($ctx, $level, $node);
 }
 
-sub thread_index_entry {
-	my ($ctx, $level, $smsg) = @_;
+sub thread_eml_entry {
+	my ($ctx, $level, $smsg, $eml) = @_;
 	my ($beg, $end) = thread_adj_level($ctx, $level);
-	$beg . '<pre>' . index_entry($smsg, $ctx, 0) . '</pre>' . $end;
+	$beg . '<pre>' . eml_entry($ctx, $smsg, $eml, 0) . '</pre>' . $end;
 }
 
 sub stream_thread_i { # PublicInbox::WwwStream::getline callback
@@ -387,8 +386,8 @@ sub stream_thread_i { # PublicInbox::WwwStream::getline callback
 		my $node = shift @$q or next;
 		my $cl = $level + 1;
 		unshift @$q, map { ($cl, $_) } @{$node->{children}};
-		if ($ctx->{-inbox}->smsg_mime($node)) {
-			return thread_index_entry($ctx, $level, $node);
+		if (my $eml = $ctx->{-inbox}->smsg_eml($node)) {
+			return thread_eml_entry($ctx, $level, $node, $eml);
 		} else {
 			return ghost_index_entry($ctx, $level, $node);
 		}
@@ -400,19 +399,19 @@ sub stream_thread ($$) {
 	my ($rootset, $ctx) = @_;
 	my $ibx = $ctx->{-inbox};
 	my @q = map { (0, $_) } @$rootset;
-	my ($smsg, $level);
+	my ($smsg, $eml, $level);
 	while (@q) {
 		$level = shift @q;
-		my $node = shift @q or next;
+		$smsg = shift @q or next;
 		my $cl = $level + 1;
-		unshift @q, map { ($cl, $_) } @{$node->{children}};
-		$smsg = $ibx->smsg_mime($node) and last;
+		unshift @q, map { ($cl, $_) } @{$smsg->{children}};
+		$eml = $ibx->smsg_eml($smsg) and last;
 	}
-	return missing_thread($ctx) unless $smsg;
+	return missing_thread($ctx) unless $eml;
 
 	$ctx->{-obfs_ibx} = $ibx->{obfuscate} ? $ibx : undef;
 	$ctx->{-title_html} = ascii_html($smsg->{subject});
-	$ctx->{-html_tip} = thread_index_entry($ctx, $level, $smsg);
+	$ctx->{-html_tip} = thread_eml_entry($ctx, $level, $smsg, $eml);
 	$ctx->{-queue} = \@q;
 	PublicInbox::WwwStream->response($ctx, 200, \&stream_thread_i);
 }
@@ -452,13 +451,13 @@ sub thread_html {
 	return stream_thread($rootset, $ctx) unless $ctx->{flat};
 
 	# flat display: lazy load the full message from smsg
-	my $smsg;
-	while (my $m = shift @$msgs) {
-		$smsg = $ibx->smsg_mime($m) and last;
+	my ($smsg, $eml);
+	while ($smsg = shift @$msgs) {
+		$eml = $ibx->smsg_eml($smsg) and last;
 	}
 	return missing_thread($ctx) unless $smsg;
 	$ctx->{-title_html} = ascii_html($smsg->{subject});
-	$ctx->{-html_tip} = '<pre>'.index_entry($smsg, $ctx, scalar @$msgs);
+	$ctx->{-html_tip} = '<pre>'.eml_entry($ctx, $smsg, $eml, scalar @$msgs);
 	$ctx->{msgs} = $msgs;
 	PublicInbox::WwwStream->response($ctx, 200, \&thread_html_i);
 }
@@ -467,8 +466,8 @@ sub thread_html_i { # PublicInbox::WwwStream::getline callback
 	my ($nr, $ctx) = @_;
 	my $msgs = $ctx->{msgs} or return;
 	while (my $smsg = shift @$msgs) {
-		$ctx->{-inbox}->smsg_mime($smsg) or next;
-		return index_entry($smsg, $ctx, scalar @$msgs);
+		my $eml = $ctx->{-inbox}->smsg_eml($smsg) or next;
+		return eml_entry($ctx, $smsg, $eml, scalar @$msgs);
 	}
 	my ($skel) = delete @$ctx{qw(skel msgs)};
 	$$skel;
diff --git a/t/v2mda.t b/t/v2mda.t
index 36f43ff096c..7666eb2dacd 100644
--- a/t/v2mda.t
+++ b/t/v2mda.t
@@ -52,8 +52,8 @@ if ($V == 1) {
 }
 my $msgs = $ibx->search->query('');
 is(scalar(@$msgs), 1, 'only got one message');
-my $saved = $ibx->smsg_mime($msgs->[0]);
-is($saved->{mime}->as_string, $mime->as_string, 'injected message');
+my $eml = $ibx->smsg_eml($msgs->[0]);
+is($eml->as_string, $mime->as_string, 'injected message');
 
 {
 	my @new = glob("$faildir/new/*");

^ permalink raw reply related	[relevance 2%]

* [PATCH 00/13] smsg: remove tricky {mime} field
@ 2020-06-01 10:06  5% Eric Wong
  2020-06-01 10:06  4% ` [PATCH 01/13] inbox: introduce smsg_eml method Eric Wong
  2020-06-01 10:06  2% ` [PATCH 08/13] www: remove smsg_mime API and adjust callers Eric Wong
  0 siblings, 2 replies; 10+ results
From: Eric Wong @ 2020-06-01 10:06 UTC (permalink / raw)
  To: meta

Storing a large PublicInbox::Eml (or in the past, Email::MIME)
object inside a small PublicInbox::Smsg object has historically
been bloat-prone[1] since there may be many small smsgs in
memory at once.

Hundreds or thousands of $smsg objects can linger in memory due
to search results and message threading operations.  So keep
$eml and $smsg objects independent of each other, for now.
Instead, we'll introduce a $smsg->populate($eml) API to handle
filling in the keys for the importer, indexer, and
non-SQLite-using WWW users.

Furthermore, $smsg->$field dispatch has always been measurably
slower than $smsg->{$field} access in NNTP.  Since $smsg->$field
became read-only with the removal of $smsg->{mime}, we can
abandon the $smsg->$field invocations in favor of direct hash
access.

[1] the prime example being what commit 7d02b9e64455831d fixed
    ("view: stop storing all MIME objects on large threads")

Eric Wong (13):
  inbox: introduce smsg_eml method
  wwwatomstream: convert callers to use smsg_eml
  v2writable: fix non-sensical interpolation in BUG message
  import: modernize to use Perl 5.10 features
  smsg: introduce ->populate method
  smsg: get rid of ->wrap initializer, too
  inbox: msg_by_*: remove $(size)ref args
  www: remove smsg_mime API and adjust callers
  nntp: smsg_range_i: favor ->{$field} lookups when possible
  smsg: get rid of remaining {mime} users
  smsg: remove ->bytes and ->lines methods
  smsg: remove remaining accessor methods
  wwwatomstream: drop smsg->{mid} fallback for non-SQLite

 Documentation/mknews.perl        |   7 +-
 lib/PublicInbox/ExtMsg.pm        |   2 +-
 lib/PublicInbox/Feed.pm          |   8 +-
 lib/PublicInbox/Import.pm        |  69 ++++++++---------
 lib/PublicInbox/Inbox.pm         |  32 ++++----
 lib/PublicInbox/Mbox.pm          |   2 +-
 lib/PublicInbox/NNTP.pm          |  14 +++-
 lib/PublicInbox/OverIdx.pm       |   3 +-
 lib/PublicInbox/SearchIdx.pm     |  33 ++++-----
 lib/PublicInbox/SearchView.pm    |   6 +-
 lib/PublicInbox/Smsg.pm          | 123 +++++++++++--------------------
 lib/PublicInbox/SolverGit.pm     |   4 +-
 lib/PublicInbox/V2Writable.pm    |  11 +--
 lib/PublicInbox/View.pm          |  63 ++++++++--------
 lib/PublicInbox/WwwAtomStream.pm |   8 +-
 t/altid.t                        |   3 +-
 t/altid_v2.t                     |   3 +-
 t/import.t                       |   3 +-
 t/search.t                       |  46 +++++++-----
 t/v2mda.t                        |   4 +-
 t/v2writable.t                   |   5 +-
 21 files changed, 207 insertions(+), 242 deletions(-)


^ permalink raw reply	[relevance 5%]

* [PATCH 01/13] inbox: introduce smsg_eml method
  2020-06-01 10:06  5% [PATCH 00/13] smsg: remove tricky {mime} field Eric Wong
@ 2020-06-01 10:06  4% ` Eric Wong
  2020-06-01 10:06  2% ` [PATCH 08/13] www: remove smsg_mime API and adjust callers Eric Wong
  1 sibling, 0 replies; 10+ results
From: Eric Wong @ 2020-06-01 10:06 UTC (permalink / raw)
  To: meta

The goal of this is to eventually remove the $smsg->{mime} field,
which is easy to misuse and causes memory explosions that
necessitated fixes like commit 7d02b9e64455831d
("view: stop storing all MIME objects on large threads").
---
 lib/PublicInbox/Inbox.pm     | 6 ++++++
 lib/PublicInbox/SolverGit.pm | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index c295b2677e4..bd1489e3699 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -334,6 +334,12 @@ sub smsg_mime {
 	}
 }
 
+sub smsg_eml {
+	my ($self, $smsg) = @_;
+	my $bref = msg_by_smsg($self, $smsg) or return;
+	PublicInbox::Eml->new($bref);
+}
+
 sub mid2num($$) {
 	my ($self, $mid) = @_;
 	my $mm = mm($self) or return;
diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm
index f718e28cbd5..b1cb1ae97c4 100644
--- a/lib/PublicInbox/SolverGit.pm
+++ b/lib/PublicInbox/SolverGit.pm
@@ -233,8 +233,8 @@ sub find_extract_diffs ($$$) {
 
 	my $diffs = [];
 	foreach my $smsg (@$msgs) {
-		$ibx->smsg_mime($smsg) or next;
-		delete($smsg->{mime})->each_part(\&extract_diff,
+		my $eml = $ibx->smsg_eml($smsg) or next;
+		$eml->each_part(\&extract_diff,
 				[$self, $diffs, $pre, $post, $ibx, $smsg], 1);
 	}
 	@$diffs ? $diffs : undef;

^ permalink raw reply related	[relevance 4%]

* [PATCH 0/2] v2writable: reduce smsg->{mime} impact
@ 2020-02-24  8:08  4% Eric Wong
  0 siblings, 0 replies; 10+ results
From: Eric Wong @ 2020-02-24  8:08 UTC (permalink / raw)
  To: meta

Stuffing a full MIME object into $smsg is probably a bad idea
as witnessed by the memory bloat fixed with:
https://public-inbox.org/meta/20190108004606.23760-1-e@80x24.org/
("view: stop storing all MIME objects on large threads")

So slowly start getting rid of smsg->{mime} and improve some
v2writable behaviors while we're at it.

Eric Wong (2):
  v2writable: make remove return-compatible w/ Import::remove
  v2writable: lookup_content => content_exists

 lib/PublicInbox/V2Writable.pm | 34 ++++++++++++++++------------------
 t/v2writable.t                |  7 +++++--
 2 files changed, 21 insertions(+), 20 deletions(-)


^ permalink raw reply	[relevance 4%]

* Re: how's memory usage on public-inbox-httpd?
  2019-06-06 21:45  0%     ` Konstantin Ryabitsev
@ 2019-06-06 22:10  0%       ` Eric Wong
  0 siblings, 0 replies; 10+ results
From: Eric Wong @ 2019-06-06 22:10 UTC (permalink / raw)
  To: Konstantin Ryabitsev; +Cc: meta

Konstantin Ryabitsev <konstantin@linuxfoundation.org> wrote:
> On Thu, Jun 06, 2019 at 08:37:52PM +0000, Eric Wong wrote:
> > Do you have commit 7d02b9e64455831d3bda20cd2e64e0c15dc07df5?
> > ("view: stop storing all MIME objects on large threads")
> > That was most significant.
> 
> Yes. We're running 743ac758 with a few cherry-picked patches on top of that
> (like epoch roll-over fix).
> 
> > Otherwise it's probably a combination of several things...
> > httpd and nntpd both supports streaming, arbitrarily  large
> > endpoints (all.mbox.gz, and /T/, /t/, /t.mbox.gz threads with
> > thousands of messages, giant NNTP BODY/ARTICLE ranges).
> > 
> > All those endpoints should detect backpressure from a slow
> > client (varnish/nginx in your case) using the ->getline method.
> 
> Wouldn't that spike up and down? The size I'm seeing stays pretty constant
> without any significant changes across requests.

Nope.  That's the thing with glibc malloc not wanting to trim
the heap for good benchmarks.

You could also try starting with MALLOC_MMAP_THRESHOLD_=131072
in env (or some smaller/larger number in bytes) to force it to
use mmap in more cases instead of sbrk.

> > Also, are you only using the default of -W/--worker-process=1
> > on a 16-core machine?  Just checked public-inbox-httpd(8), the
> > -W switch is documented :)  You can use SIGTTIN/TTOU to
> > increase, decrease workers w/o restarting, too.
> 
> D'oh, yes... though it's not been a problem yet. :) I'm not sure I want to
> bump that up, though, if that means we're going to have multiple 19GB-sized
> processes instead of one. :)

You'd probably end up with several smaller processes totalling
up to 19GB.

In any case, killing individual workers with QUIT/INT/TERM is
graceful and won't drop connections if memory use on one goes
awry.

> > Do you have any stats on the number of simultaneous connections
> > public-inbox-httpd/nginx/varnish handles (and logging of that
> > info at peek)?  (perhaps running "ss -tan" periodically)(*)
> 
> We don't collect that info, but I'm not sure it's the number of concurrent
> connections that's the culprit, as there is no fluctuation in RSS size based
> on the number of responses.

Without concurrent connections, I can't see that happening
unless there's a single message which is gigabytes in size.  I'm
already irked that Email::MIME requires slurping entire emails
into memory; but it should not be using more than one
Email::MIME object in memory-at-a-time for a single client.

Anything from varnish/nginx logs can't keep up for some reason?

Come to think of it, nginx proxy buffering might be redundant
and even harmful if varnish is already doing it.

Perhaps "proxy_buffering off" in nginx is worth trying...
I use yahns instead of nginx, which does lazy buffering (but
scary Ruby experimental server warning applies :x).

Last I checked: nginx is either buffer-in-full-before-first-byte
or no buffering at all (which is probably fine with varnish).

> To answer the questions in your follow-up:
> 
> It would appear to be all in anon memory. Mem_usage [1] reports:
> 
> # ./Mem_usage 18275
> Backed by file:
>  Executable                r-x  16668
>  Write/Exec (jump tables)  rwx  0
>  RO data                   r--  106908
>  Data                      rw-  232
>  Unreadable                ---  94072
>  Unknown                        0
> Anonymous:
>  Writable code (stack)     rwx  0
>  Data (malloc, mmap)       rw-  19988892
>  RO data                   r--  0
>  Unreadable                ---  0
>  Unknown                        12
> 
> I've been looking at lsof -p of that process and I see sqlite and xapian
> showing up and disappearing. The lkml ones are being accessed almost all the
> time, but even there I see them showing up with different FD entries, so
> they are being closed and reopened properly.

Yep, that's expected.  It's to better detect DB changes in case
of compact/copydatabase/xcpdb for Xapian.

Might not be strictly necessary for SQLite, but maybe
somebody could be running VACUUM offline; then flock-ing
inbox.lock and rename-ing it into place or something (and
retrying/restarting the VACUUM if out-of-date, seq_lock style).

^ permalink raw reply	[relevance 0%]

* Re: how's memory usage on public-inbox-httpd?
  2019-06-06 20:37  4%   ` Eric Wong
@ 2019-06-06 21:45  0%     ` Konstantin Ryabitsev
  2019-06-06 22:10  0%       ` Eric Wong
  0 siblings, 1 reply; 10+ results
From: Konstantin Ryabitsev @ 2019-06-06 21:45 UTC (permalink / raw)
  To: Eric Wong; +Cc: meta

On Thu, Jun 06, 2019 at 08:37:52PM +0000, Eric Wong wrote:
>Do you have commit 7d02b9e64455831d3bda20cd2e64e0c15dc07df5?
>("view: stop storing all MIME objects on large threads")
>That was most significant.

Yes. We're running 743ac758 with a few cherry-picked patches on top of 
that (like epoch roll-over fix).

>Otherwise it's probably a combination of several things...
>httpd and nntpd both supports streaming, arbitrarily  large
>endpoints (all.mbox.gz, and /T/, /t/, /t.mbox.gz threads with
>thousands of messages, giant NNTP BODY/ARTICLE ranges).
>
>All those endpoints should detect backpressure from a slow
>client (varnish/nginx in your case) using the ->getline method.

Wouldn't that spike up and down? The size I'm seeing stays pretty 
constant without any significant changes across requests.

>Also, are you only using the default of -W/--worker-process=1
>on a 16-core machine?  Just checked public-inbox-httpd(8), the
>-W switch is documented :)  You can use SIGTTIN/TTOU to
>increase, decrease workers w/o restarting, too.

D'oh, yes... though it's not been a problem yet. :) I'm not sure I want 
to bump that up, though, if that means we're going to have multiple 
19GB-sized processes instead of one. :)

>Do you have any stats on the number of simultaneous connections
>public-inbox-httpd/nginx/varnish handles (and logging of that
>info at peek)?  (perhaps running "ss -tan" periodically)(*)

We don't collect that info, but I'm not sure it's the number of 
concurrent connections that's the culprit, as there is no fluctuation in 
RSS size based on the number of responses.

To answer the questions in your follow-up:

It would appear to be all in anon memory. Mem_usage [1] reports:

# ./Mem_usage 18275
Backed by file:
  Executable                r-x  16668
  Write/Exec (jump tables)  rwx  0
  RO data                   r--  106908
  Data                      rw-  232
  Unreadable                ---  94072
  Unknown                        0
Anonymous:
  Writable code (stack)     rwx  0
  Data (malloc, mmap)       rw-  19988892
  RO data                   r--  0
  Unreadable                ---  0
  Unknown                        12

I've been looking at lsof -p of that process and I see sqlite and xapian 
showing up and disappearing. The lkml ones are being accessed almost all 
the time, but even there I see them showing up with different FD 
entries, so they are being closed and reopened properly.

Hope this helps.

-K

.. [1] https://elinux.org/images/d/d3/Mem_usage

^ permalink raw reply	[relevance 0%]

* Re: how's memory usage on public-inbox-httpd?
  @ 2019-06-06 20:37  4%   ` Eric Wong
  2019-06-06 21:45  0%     ` Konstantin Ryabitsev
  0 siblings, 1 reply; 10+ results
From: Eric Wong @ 2019-06-06 20:37 UTC (permalink / raw)
  To: Konstantin Ryabitsev; +Cc: meta

Konstantin Ryabitsev <konstantin@linuxfoundation.org> wrote:
> Hello:
> 
> This is an old-ish discussion, but we finally had a chance to run the httpd
> daemon for a long time without restarting it to add more lists, and the
> memory usage on it is actually surprising:

Thanks for getting back to this.

> $ ps -eF | grep public-inbox
> publici+ 17741     1  0 52667 24836   8 May24 ?        00:00:00 /usr/bin/perl -w /usr/local/bin/public-inbox-nntpd -1 /var/log/public-inbox/nntpd.out.log
> publici+ 17744 17741  0 69739 90288   9 May24 ?        00:38:43 /usr/bin/perl -w /usr/local/bin/public-inbox-nntpd -1 /var/log/public-inbox/nntpd.out.log
> publici+ 18273     1  0 52599 23832   9 May24 ?        00:00:00 /usr/bin/perl -w /usr/local/bin/public-inbox-httpd -1 /var/log/public-inbox/httpd.out.log
> publici+ 18275 18273  4 5016115 19713872 10 May24 ?    13:59:13 /usr/bin/perl -w /usr/local/bin/public-inbox-httpd -1 /var/log/public-inbox/httpd.out.log
> 
> You'll notice that process 18275 has been running since May 24 and takes up
> 19GB in RSS. This is a 16-core 64-GB system, so it's not necessarily super
> alarming, but seems large. :)

Yes, it's large and ugly :<  I don't even have 19GB and even
90MB RSS worries me.

Do you have commit 7d02b9e64455831d3bda20cd2e64e0c15dc07df5?
("view: stop storing all MIME objects on large threads")
That was most significant.

Also, it looks like you've yet to configure the wacky
coderepo+solver stuff, so that's not a culprit...

Otherwise it's probably a combination of several things...
httpd and nntpd both support streaming, arbitrarily large
endpoints (all.mbox.gz, and /T/, /t/, /t.mbox.gz threads with
thousands of messages, giant NNTP BODY/ARTICLE ranges).

All those endpoints should detect backpressure from a slow
client (varnish/nginx in your case) using the ->getline method.

gzip (for compressed mbox) also uses a truckload of memory and I
would like to add options to control zlib window sizes to reduce
memory use (at the cost of less compression).  nginx has these
options, too, but they're not documented AFAIK.

For the case of varnish/nginx or whatever's in front of it not
keeping up...  the old design choice of Danga::Socket (now
inherited to PublicInbox::DS) made it buffer slow client data to
RAM, which doesn't make sense to me...  I prefer buffering to
the FS (similar to nginx/varnish) to avoid malloc fragmentation
and also to avoid delaying the extra kernel-to-user copy if
using sendfile.

By default, glibc malloc is really averse to releasing memory
back to the OS, too.  It's fast in benchmarks that way; (until
the system starts swapping and slowdowns cascade to failure).

I'm also unsure about malloc fragmentation behavior at such
sizes and how it hurts locality.  So my preference is to avoid
putting big objects into heap and let the kernel/FS deal with
big buffers.

httpd/nntpd both try to avoid buffering at all with the
backpressure handling based on ->getline; but sometimes it's not
effective enough because some big chunks still end up in heap.

In any case, you can safely SIGQUIT the individual worker and
it'll restart gracefully w/o dropping active connections.

Also, are you only using the default of -W/--worker-process=1
on a 16-core machine?  Just checked public-inbox-httpd(8), the
-W switch is documented :)  You can use SIGTTIN/TTOU to
increase, decrease workers w/o restarting, too.

nntpd would have the same problem if people used it more;
but at the moment it doesn't do gzip.  I'm happy to see it's
at least gotten some traffic :)

> Is that normal, and if not, what can I do to help troubleshoot where it's
> all going?

There's definitely some problems with big threads, giant
messages and gzip overhead.  I was looking into a few big
threads earlier this year but forgot the Message-IDs :x

Do you have any stats on the number of simultaneous connections
public-inbox-httpd/nginx/varnish handles (and logging of that
info at peak)?  (perhaps running "ss -tan" periodically)(*)

Are you using the Plack::Middleware::Deflater endpoint in PSGI?
Removing it and doing gzip in varnish/nginx may be a little
faster since it can utilize multiple cores, but at higher IPC
cost.  I've gotten rid of the annoying warning for that
middleware install as a result...

But gzipped mboxes have the same problem, though, so adding
zlib window-size options would be necessary...

So I think supporting buffer-to-FS behavior in ::DS along
with gzip options should alleviate much of the memory use.  But
immediately you can increase worker process counts to distribute
the load between cores a bit...

I've also tried nicing down nginx/varnish so they're prioritized
by the kernel and don't bottleneck -httpd.  Makes sense to me in
theory but I was also making a lot of changes around the same
time to reduce httpd memory use.

Limiting HTTP endpoint response size isn't a real option to
protect the server; IMHO, because NNTP requires supporting giant
responses anyways.

(*) I did "raindrops" with Ruby+C back in the day but haven't really
    looked at it in ages, and I don't think the IPv6 counting was
    accurate <https://raindrops-demo.bogomips.org/>
    That's -httpd on :280

^ permalink raw reply	[relevance 4%]

* [PATCH 0/7] psgi: more memory reductions
@ 2019-01-10 21:35  5% Eric Wong
  0 siblings, 0 replies; 10+ results
From: Eric Wong @ 2019-01-10 21:35 UTC (permalink / raw)
  To: meta

While none of these are as significant as the patch to avoid inadvertent
MIME object storage in threads(*), they add up to some meaningful
reductions and can make it easier for memory-starved VPS to
serve public-inboxes.

I've diffed output of /T/, /t/ and &x=t endpoints of various HTML
pages before and after without finding differences.

There's definitely more that can be done in this area, though...

Sprinkling Devel::Size::total_size calls in various places (mostly
->getline iterators/callbacks ) was instrumental in the development
of these patches.

(*) https://public-inbox.org/meta/20190108004606.23760-1-e@80x24.org/
    ("view: stop storing all MIME objects on large threads")

Eric Wong (7):
  httpd: remove psgix.harakiri reference
  searchmsg: get rid of termlist scanning for mid
  searchmsg: remove Xapian::Document field
  searchview: drop unused {seen} hashref
  searchmsg: remove unused fields for PSGI in Xapian results
  over: cull unneeded fields for get_thread
  view: more culling for search threads

 lib/PublicInbox/HTTPD.pm        |  1 -
 lib/PublicInbox/Inbox.pm        |  5 ++--
 lib/PublicInbox/Over.pm         | 19 ++++++++-----
 lib/PublicInbox/SearchIdx.pm    |  6 ++--
 lib/PublicInbox/SearchMsg.pm    | 49 ++++++++++++++++-----------------
 lib/PublicInbox/SearchThread.pm |  5 ++++
 lib/PublicInbox/SearchView.pm   |  1 -
 lib/PublicInbox/View.pm         | 10 +++++--
 t/search.t                      | 10 ++++---
 9 files changed, 60 insertions(+), 46 deletions(-)

-- 
EW


^ permalink raw reply	[relevance 5%]

* [PATCH] view: fix wrong date for non-Xapian/SQLite v1 users
@ 2019-01-08 10:34  4% Eric Wong
  0 siblings, 0 replies; 10+ results
From: Eric Wong @ 2019-01-08 10:34 UTC (permalink / raw)
  To: meta

We need to parse the MIME object in order to get the
datestamp for those sites.

Fixes: 7d02b9e64455 ("view: stop storing all MIME objects on large threads")
---
 lib/PublicInbox/View.pm | 3 ++-
 t/plack.t               | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 416cb4a..5ddb842 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -218,11 +218,12 @@ sub index_entry {
 	$rv .= $subj . "\n";
 	$rv .= _th_index_lite($mid_raw, \$irt, $id, $ctx);
 	my @tocc;
+	my $ds = $smsg->ds; # for v1 non-Xapian/SQLite users
 	my $mime = delete $smsg->{mime}; # critical to memory use
 	my $hdr = $mime->header_obj;
 	my $from = _hdr_names_html($hdr, 'From');
 	obfuscate_addrs($obfs_ibx, $from) if $obfs_ibx;
-	$rv .= "From: $from @ ".fmt_ts($smsg->ds)." UTC";
+	$rv .= "From: $from @ ".fmt_ts($ds)." UTC";
 	my $upfx = $ctx->{-upfx};
 	my $mhref = $upfx . mid_escape($mid_raw) . '/';
 	$rv .= qq{ (<a\nhref="$mhref">permalink</a> / };
diff --git a/t/plack.t b/t/plack.t
index 9901186..2ff98e9 100644
--- a/t/plack.t
+++ b/t/plack.t
@@ -46,7 +46,7 @@ To: You <you\@example.com>
 Cc: $addr
 Message-Id: <blah\@example.com>
 Subject: hihi
-Date: Thu, 01 Jan 1970 00:00:00 +0000
+Date: Fri, 02 Oct 1993 00:00:00 +0000
 
 zzzzzz
 EOF
@@ -129,6 +129,7 @@ EOF
 			'atom URL generated');
 		like($res->content, qr!href="blah\@example\.com/"!,
 			'index generated');
+		like($res->content, qr!1993-10-02!, 'date set');
 	});
 
 	test_psgi($app, sub {
-- 
EW


^ permalink raw reply related	[relevance 4%]

* [PATCH] view: stop storing all MIME objects on large threads
@ 2019-01-08  0:46  7% Eric Wong
  0 siblings, 0 replies; 10+ results
From: Eric Wong @ 2019-01-08  0:46 UTC (permalink / raw)
  To: meta

While we try to discard the $smsg (SearchMsg) objects quickly,
they remain referenced via $node (SearchThread::Msg) objects,
which are stored forever in $ctx->{mapping} to cull redundant
words out of subjects in the thread skeleton.

This significantly cuts memory bloat with large search results
with '&x=t'.  Now, the search results overhead of
SearchThread::Msg and linked objects are stable at around 350K
instead of ~7M per response in a rough test (there's more
savings to be had in the same areas).

Several hundred kilobytes is still huge and a large per-client
cost; but it's far better than MEGABYTES per-client.
---
 lib/PublicInbox/View.pm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index bb49c03..416cb4a 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -218,7 +218,7 @@ sub index_entry {
 	$rv .= $subj . "\n";
 	$rv .= _th_index_lite($mid_raw, \$irt, $id, $ctx);
 	my @tocc;
-	my $mime = $smsg->{mime};
+	my $mime = delete $smsg->{mime}; # critical to memory use
 	my $hdr = $mime->header_obj;
 	my $from = _hdr_names_html($hdr, 'From');
 	obfuscate_addrs($obfs_ibx, $from) if $obfs_ibx;
-- 
EW

^ permalink raw reply related	[relevance 7%]

Results 1-10 of 10 | reverse | options above

-- pct% links below jump to the message on this page, permalinks otherwise --
2018-12-01 19:44     how's memory usage on public-inbox-httpd? Eric Wong
2019-06-06 19:04     ` Konstantin Ryabitsev
2019-06-06 20:37  4%   ` Eric Wong
2019-06-06 21:45  0%     ` Konstantin Ryabitsev
2019-06-06 22:10  0%       ` Eric Wong
2019-01-08  0:46  7% [PATCH] view: stop storing all MIME objects on large threads Eric Wong
2019-01-08 10:34  4% [PATCH] view: fix wrong date for non-Xapian/SQLite v1 users Eric Wong
2019-01-10 21:35  5% [PATCH 0/7] psgi: more memory reductions Eric Wong
2020-02-24  8:08  4% [PATCH 0/2] v2writable: reduce smsg->{mime} impact Eric Wong
2020-06-01 10:06  5% [PATCH 00/13] smsg: remove tricky {mime} field Eric Wong
2020-06-01 10:06  4% ` [PATCH 01/13] inbox: introduce smsg_eml method Eric Wong
2020-06-01 10:06  2% ` [PATCH 08/13] www: remove smsg_mime API and adjust callers Eric Wong
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).