user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH 0/3] www: more aggressive cleanup of SQLite DBs
@ 2021-09-28 23:11 Eric Wong
  2021-09-28 23:11 ` [PATCH 1/3] www: do not bump {over} refcnt on long responses Eric Wong
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Eric Wong @ 2021-09-28 23:11 UTC (permalink / raw)
  To: meta

While extindex allows the elimination of per-inbox Xapian DB
shards, per-inbox over.sqlite3 and msgmap.sqlite3 are still
required, and they can waste system resources when there are
hundreds/thousands of inboxes and most of them sit idle..

Since reopening them isn't expensive, close them aggressively to
reduce memory pressure and the likelyhood of swapping.

We'll also take advantage of the new ->add_uniq_timer API
to amortize the cost of cleanups and avoid globals in the
inbox class.

Eric Wong (3):
  www: do not bump {over} refcnt on long responses
  inbox: rewrite cleanup to be more aggressive
  inbox: drop memoization/preload, cleanup expires caches

 lib/PublicInbox/Git.pm        | 12 ++++---
 lib/PublicInbox/GzipFilter.pm |  6 ++++
 lib/PublicInbox/Inbox.pm      | 60 ++++++++++++++---------------------
 lib/PublicInbox/Mbox.pm       | 29 +++++++----------
 lib/PublicInbox/View.pm       |  6 ++--
 lib/PublicInbox/WWW.pm        |  9 ------
 t/nntp.t                      |  1 -
 7 files changed, 52 insertions(+), 71 deletions(-)

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 1/3] www: do not bump {over} refcnt on long responses
  2021-09-28 23:11 [PATCH 0/3] www: more aggressive cleanup of SQLite DBs Eric Wong
@ 2021-09-28 23:11 ` Eric Wong
  2021-09-28 23:11 ` [PATCH 2/3] inbox: rewrite cleanup to be more aggressive Eric Wong
  2021-09-28 23:11 ` [PATCH 3/3] inbox: drop memoization/preload, cleanup expires caches Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2021-09-28 23:11 UTC (permalink / raw)
  To: meta

SQLite files may be replaced or removed by admins while
generating a large threads or mailbox responses.  Ensure we
don't hold onto DBI handles and associated file descriptors
past their cleanup.
---
 lib/PublicInbox/GzipFilter.pm |  6 ++++++
 lib/PublicInbox/Mbox.pm       | 29 ++++++++++++-----------------
 lib/PublicInbox/View.pm       |  6 ++++--
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/lib/PublicInbox/GzipFilter.pm b/lib/PublicInbox/GzipFilter.pm
index c50c26c5..624c2ed3 100644
--- a/lib/PublicInbox/GzipFilter.pm
+++ b/lib/PublicInbox/GzipFilter.pm
@@ -84,6 +84,12 @@ sub gzip_or_die () {
 	$gz;
 }
 
+sub gone { # what: search/over/mm
+	my ($ctx, $what) = @_;
+	warn "W: `$ctx->{ibx}->{name}' $what went away unexpectedly\n";
+	undef;
+}
+
 # for GetlineBody (via Qspawn) when NOT using $env->{'pi-httpd.async'}
 # Also used for ->getline callbacks
 sub translate ($$) {
diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index cec76182..dede4825 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -124,8 +124,9 @@ sub thread_cb {
 			return $smsg;
 		}
 		# refill result set
-		$ctx->{msgs} = $msgs = $ctx->{over}->get_thread($ctx->{mid},
-								$ctx->{prev});
+		my $over = $ctx->{ibx}->over or return $ctx->gone('over');
+		$ctx->{msgs} = $msgs = $over->get_thread($ctx->{mid},
+							$ctx->{prev});
 		return unless @$msgs;
 		$ctx->{prev} = $msgs->[-1];
 	}
@@ -136,7 +137,6 @@ sub thread_mbox {
 	my $msgs = $ctx->{msgs} = $over->get_thread($ctx->{mid}, {});
 	return [404, [qw(Content-Type text/plain)], []] if !@$msgs;
 	$ctx->{prev} = $msgs->[-1];
-	$ctx->{over} = $over; # bump refcnt
 	require PublicInbox::MboxGz;
 	PublicInbox::MboxGz::mbox_gz($ctx, \&thread_cb, $msgs->[0]->{subject});
 }
@@ -155,22 +155,23 @@ sub emit_range {
 
 sub all_ids_cb {
 	my ($ctx) = @_;
+	my $over = $ctx->{ibx}->over or return $ctx->gone('over');
 	my $ids = $ctx->{ids};
 	do {
 		while ((my $num = shift @$ids)) {
-			my $smsg = $ctx->{over}->get_art($num) or next;
+			my $smsg = $over->get_art($num) or next;
 			return $smsg;
 		}
-		$ctx->{ids} = $ids = $ctx->{over}->ids_after(\($ctx->{prev}));
+		$ctx->{ids} = $ids = $over->ids_after(\($ctx->{prev}));
 	} while (@$ids);
 }
 
 sub mbox_all_ids {
 	my ($ctx) = @_;
 	my $prev = 0;
-	$ctx->{over} = $ctx->{ibx}->over or
+	my $over = $ctx->{ibx}->over or
 		return PublicInbox::WWW::need($ctx, 'Overview');
-	my $ids = $ctx->{over}->ids_after(\$prev) or return
+	my $ids = $over->ids_after(\$prev) or return
 		[404, [qw(Content-Type text/plain)], ["No results found\n"]];
 	$ctx->{ids} = $ids;
 	$ctx->{prev} = $prev;
@@ -179,22 +180,16 @@ sub mbox_all_ids {
 	PublicInbox::MboxGz::mbox_gz($ctx, \&all_ids_cb, 'all');
 }
 
-sub gone ($$) {
-	my ($ctx, $what) = @_;
-	warn "W: `$ctx->{ibx}->{inboxdir}' $what went away unexpectedly\n";
-	undef;
-}
-
 sub results_cb {
 	my ($ctx) = @_;
-	my $over = $ctx->{ibx}->over or return gone($ctx, 'over');
+	my $over = $ctx->{ibx}->over or return $ctx->gone('over');
 	while (1) {
 		while (defined(my $num = shift(@{$ctx->{ids}}))) {
 			my $smsg = $over->get_art($num) or next;
 			return $smsg;
 		}
 		# refill result set, deprioritize since there's many results
-		my $srch = $ctx->{ibx}->isrch or return gone($ctx, 'search');
+		my $srch = $ctx->{ibx}->isrch or return $ctx->gone('search');
 		my $mset = $srch->mset($ctx->{query}, $ctx->{qopts});
 		my $size = $mset->size or return;
 		$ctx->{qopts}->{offset} += $size;
@@ -206,7 +201,7 @@ sub results_cb {
 sub results_thread_cb {
 	my ($ctx) = @_;
 
-	my $over = $ctx->{ibx}->over or return gone($ctx, 'over');
+	my $over = $ctx->{ibx}->over or return $ctx->gone('over');
 	while (1) {
 		while (defined(my $num = shift(@{$ctx->{xids}}))) {
 			my $smsg = $over->get_art($num) or next;
@@ -217,7 +212,7 @@ sub results_thread_cb {
 		next if $over->expand_thread($ctx);
 
 		# refill result set, deprioritize since there's many results
-		my $srch = $ctx->{ibx}->isrch or return gone($ctx, 'search');
+		my $srch = $ctx->{ibx}->isrch or return $ctx->gone('search');
 		my $mset = $srch->mset($ctx->{query}, $ctx->{qopts});
 		my $size = $mset->size or return;
 		$ctx->{qopts}->{offset} += $size;
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 805e785b..069b9680 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -31,7 +31,9 @@ sub msg_page_i {
 	my ($ctx, $eml) = @_;
 	if ($eml) { # called by WwwStream::async_eml or getline
 		my $smsg = $ctx->{smsg};
-		$ctx->{smsg} = $ctx->{over}->next_by_mid(@{$ctx->{next_arg}});
+		my $over = $ctx->{ibx}->over;
+		$ctx->{smsg} = $over ? $over->next_by_mid(@{$ctx->{next_arg}})
+				: $ctx->gone('over');
 		$ctx->{mhref} = ($ctx->{nr} || $ctx->{smsg}) ?
 				"../${\mid_href($smsg->{mid})}/" : '';
 		my $obuf = $ctx->{obuf} = _msg_page_prepare_obuf($eml, $ctx);
@@ -70,7 +72,7 @@ sub msg_page {
 	my ($ctx) = @_;
 	my $ibx = $ctx->{ibx};
 	$ctx->{-obfs_ibx} = $ibx->{obfuscate} ? $ibx : undef;
-	my $over = $ctx->{over} = $ibx->over or return no_over_html($ctx);
+	my $over = $ibx->over or return no_over_html($ctx);
 	my ($id, $prev);
 	my $next_arg = $ctx->{next_arg} = [ $ctx->{mid}, \$id, \$prev ];
 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 2/3] inbox: rewrite cleanup to be more aggressive
  2021-09-28 23:11 [PATCH 0/3] www: more aggressive cleanup of SQLite DBs Eric Wong
  2021-09-28 23:11 ` [PATCH 1/3] www: do not bump {over} refcnt on long responses Eric Wong
@ 2021-09-28 23:11 ` Eric Wong
  2021-09-28 23:11 ` [PATCH 3/3] inbox: drop memoization/preload, cleanup expires caches Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2021-09-28 23:11 UTC (permalink / raw)
  To: meta

Avoid relying on a giant cleanup hash and instead use the new
DS->add_uniq_timer API to amortize the pause times associated
with having to cleanup many inboxes.  We can also use smaller
intervals for this, as well.

We now discard SQLite DB handles at cleanup.  Each of these can
use several megabytes of memory, which adds up with
hundreds/thousands of inboxes.  Since per-inbox access intervals
are unpredictable and opening an SQLite handle is relatively
inexpensive, release memory more aggressively to avoid the heap
having to hit swap.
---
 lib/PublicInbox/Git.pm   | 12 +++++++-----
 lib/PublicInbox/Inbox.pm | 41 ++++++++++++++--------------------------
 2 files changed, 21 insertions(+), 32 deletions(-)

diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm
index 3c577ab3..d5b1d39d 100644
--- a/lib/PublicInbox/Git.pm
+++ b/lib/PublicInbox/Git.pm
@@ -392,8 +392,10 @@ sub async_wait_all ($) {
 
 # returns true if there are pending "git cat-file" processes
 sub cleanup {
-	my ($self) = @_;
+	my ($self, $lazy) = @_;
 	local $in_cleanup = 1;
+	return 1 if $lazy && (scalar(@{$self->{inflight_c} // []}) ||
+				scalar(@{$self->{inflight} // []}));
 	delete $self->{async_cat};
 	async_wait_all($self);
 	delete $self->{inflight};
@@ -403,7 +405,6 @@ sub cleanup {
 	defined($self->{pid}) || defined($self->{pid_c});
 }
 
-
 # assuming a well-maintained repo, this should be a somewhat
 # accurate estimation of its size
 # TODO: show this in the WWW UI as a hint to potential cloners
@@ -526,18 +527,19 @@ sub manifest_entry {
 # returns true if there are pending cat-file processes
 sub cleanup_if_unlinked {
 	my ($self) = @_;
-	return cleanup($self) if $^O ne 'linux';
+	return cleanup($self, 1) if $^O ne 'linux';
 	# Linux-specific /proc/$PID/maps access
 	# TODO: support this inside git.git
 	my $ret = 0;
 	for my $fld (qw(pid pid_c)) {
 		my $pid = $self->{$fld} // next;
-		open my $fh, '<', "/proc/$pid/maps" or return cleanup($self);
+		open my $fh, '<', "/proc/$pid/maps" or return cleanup($self, 1);
 		while (<$fh>) {
 			# n.b. we do not restart for unlinked multi-pack-index
 			# since it's not too huge, and the startup cost may
 			# be higher.
-			return cleanup($self) if /\.(?:idx|pack) \(deleted\)$/;
+			/\.(?:idx|pack) \(deleted\)$/ and
+				return cleanup($self, 1);
 		}
 		++$ret;
 	}
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index 1d5fc708..04b7d764 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -10,14 +10,6 @@ use PublicInbox::Eml;
 use List::Util qw(max);
 use Carp qw(croak);
 
-# Long-running "git-cat-file --batch" processes won't notice
-# unlinked packs, so we need to restart those processes occasionally.
-# Xapian and SQLite file handles are mostly stable, but sometimes an
-# admin will attempt to replace them atomically after compact/vacuum
-# and we need to be prepared for that.
-my $cleanup_timer;
-my $CLEANUP = {}; # string(inbox) -> inbox
-
 sub git_cleanup ($) {
 	my ($self) = @_;
 	my $git = $self->{git} // return undef;
@@ -25,7 +17,7 @@ sub git_cleanup ($) {
 	# keep process+pipe counts in check.  ExtSearch may have high startup
 	# cost (e.g. ->ALL) and but likely one per-daemon, so cleanup only
 	# if there's unlinked files
-	my $live = $self->isa(__PACKAGE__) ? $git->cleanup
+	my $live = $self->isa(__PACKAGE__) ? $git->cleanup(1)
 					: $git->cleanup_if_unlinked;
 	delete($self->{git}) unless $live;
 	$live;
@@ -34,29 +26,22 @@ sub git_cleanup ($) {
 # returns true if further checking is required
 sub cleanup_shards { $_[0]->{search} ? $_[0]->{search}->cleanup_shards : undef }
 
-sub cleanup_task () {
-	$cleanup_timer = undef;
-	my $next = {};
-	for my $ibx (values %$CLEANUP) {
-		my $again = git_cleanup($ibx);
-		$ibx->cleanup_shards and $again = 1;
-		for my $git (@{$ibx->{-repo_objs}}) {
-			$again = 1 if $git->cleanup;
-		}
-		check_inodes($ibx);
-		$next->{"$ibx"} = $ibx if $again;
+sub do_cleanup {
+	my ($ibx) = @_;
+	my $live = git_cleanup($ibx);
+	$ibx->cleanup_shards and $live = 1;
+	for my $git (@{$ibx->{-repo_objs}}) {
+		$live = 1 if $git->cleanup(1);
 	}
-	$CLEANUP = $next;
-	$cleanup_timer //= PublicInbox::DS::later(\&cleanup_task);
+	delete @$ibx{qw(over mm)};
+	PublicInbox::DS::add_uniq_timer($ibx+0, 5, \&do_cleanup, $ibx) if $live;
 }
 
 sub _cleanup_later ($) {
 	# no need to require DS, here, if it were enabled another
 	# module would've require'd it, already
-	if (eval { PublicInbox::DS::in_loop() }) {
-		$cleanup_timer //= PublicInbox::DS::later(\&cleanup_task);
-		$CLEANUP->{"$_[0]"} = $_[0]; # $self
-	}
+	eval { PublicInbox::DS::in_loop() } and
+		PublicInbox::DS::add_uniq_timer($_[0]+0, 30, \&do_cleanup, @_)
 }
 
 sub _set_limiter ($$$) {
@@ -156,6 +141,7 @@ sub mm {
 	my ($self, $req) = @_;
 	$self->{mm} //= eval {
 		require PublicInbox::Msgmap;
+		_cleanup_later($self);
 		my $dir = $self->{inboxdir};
 		if ($self->version >= 2) {
 			PublicInbox::Msgmap->new_file("$dir/msgmap.sqlite3");
@@ -186,6 +172,7 @@ sub over {
 			require PublicInbox::Search;
 			PublicInbox::Search->new($self);
 		};
+		_cleanup_later($self);
 		my $over = PublicInbox::Over->new("$srch->{xpfx}/over.sqlite3");
 		$over->dbh; # may fail
 		$self->{over} = $over;
@@ -387,7 +374,7 @@ sub unsubscribe_unlock {
 
 sub check_inodes ($) {
 	my ($self) = @_;
-	for (qw(over mm)) { # TODO: search
+	for (qw(over mm)) {
 		$self->{$_}->check_inodes if $self->{$_};
 	}
 }

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 3/3] inbox: drop memoization/preload, cleanup expires caches
  2021-09-28 23:11 [PATCH 0/3] www: more aggressive cleanup of SQLite DBs Eric Wong
  2021-09-28 23:11 ` [PATCH 1/3] www: do not bump {over} refcnt on long responses Eric Wong
  2021-09-28 23:11 ` [PATCH 2/3] inbox: rewrite cleanup to be more aggressive Eric Wong
@ 2021-09-28 23:11 ` Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2021-09-28 23:11 UTC (permalink / raw)
  To: meta

cloneurl, description, and base_url are no longer memoized.  The
non-$env form of base_url is rare in WWW, and is fast enough to
not require memoization.

cloneurl and description are now expired during cleanup,
allowing admins to change these files without restarting
(or SIGHUP).

-altid_map is no longer cached nor memoized at all, since the
endpoint(s) which hit it seem rarely accessed.

nntp_url and imap_url are now cached (instead of memoized) in
case an inbox is unvisited for a long time.  They remain cached
since the truthiness check gets called in every per-inbox HTML
page, which can potentially be expensive.
---
 lib/PublicInbox/Inbox.pm | 21 ++++++++++-----------
 lib/PublicInbox/WWW.pm   |  9 ---------
 t/nntp.t                 |  1 -
 3 files changed, 10 insertions(+), 21 deletions(-)

diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index 04b7d764..c525f4d1 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -33,7 +33,8 @@ sub do_cleanup {
 	for my $git (@{$ibx->{-repo_objs}}) {
 		$live = 1 if $git->cleanup(1);
 	}
-	delete @$ibx{qw(over mm)};
+	delete @$ibx{qw(over mm description cloneurl
+			-altid_map -imap_url -nntp_url)};
 	PublicInbox::DS::add_uniq_timer($ibx+0, 5, \&do_cleanup, $ibx) if $live;
 }
 
@@ -219,15 +220,13 @@ sub base_url {
 		return $url .= $self->{name} . '/';
 	}
 	# called from a non-PSGI environment (e.g. NNTP/POP3):
-	$self->{-base_url} ||= do {
-		my $url = $self->{url} // return undef;
-		$url = $url->[0] // return undef;
-		# expand protocol-relative URLs to HTTPS if we're
-		# not inside a web server
-		$url = "https:$url" if $url =~ m!\A//!;
-		$url .= '/' if $url !~ m!/\z!;
-		$url;
-	};
+	my $url = $self->{url} // return undef;
+	$url = $url->[0] // return undef;
+	# expand protocol-relative URLs to HTTPS if we're
+	# not inside a web server
+	substr($url, 0, 0, 'https:') if substr($url, 0, 2) eq '//';
+	$url .= '/' if substr($url, -1, 1) ne '/';
+	$url;
 }
 
 sub _x_url ($$$) {
@@ -350,7 +349,7 @@ sub modified {
 # (pathname is NOT public, but prefix is used for Xapian queries)
 sub altid_map ($) {
 	my ($self) = @_;
-	$self->{-altid_map} //= eval {
+	eval {
 		require PublicInbox::AltId;
 		my $altid = $self->{altid} or return {};
 		my %h = map {;
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index a7c961f4..5e83f769 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -173,18 +173,9 @@ sub preload {
 		$self->cgit;
 		$self->stylesheets_prepare($_) for ('', '../', '../../');
 		$self->news_www;
-		$pi_cfg->each_inbox(\&preload_inbox);
 	}
 }
 
-sub preload_inbox {
-	my $ibx = shift;
-	$ibx->altid_map;
-	$ibx->cloneurl;
-	$ibx->description;
-	$ibx->base_url;
-}
-
 # private functions below
 
 sub r404 {
diff --git a/t/nntp.t b/t/nntp.t
index 5bad9dfe..655af398 100644
--- a/t/nntp.t
+++ b/t/nntp.t
@@ -126,7 +126,6 @@ use PublicInbox::Config;
 	is_deeply([ $mime->header('Xref') ], [ 'example.com test:1' ],
 		'Xref: set');
 
-	$ibx->{-base_url} = 'http://mirror.example.com/m/';
 	$smsg->{num} = 2;
 	PublicInbox::NNTP::set_nntp_headers($hdr, $smsg);
 	is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ],

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-09-28 23:11 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-28 23:11 [PATCH 0/3] www: more aggressive cleanup of SQLite DBs Eric Wong
2021-09-28 23:11 ` [PATCH 1/3] www: do not bump {over} refcnt on long responses Eric Wong
2021-09-28 23:11 ` [PATCH 2/3] inbox: rewrite cleanup to be more aggressive Eric Wong
2021-09-28 23:11 ` [PATCH 3/3] inbox: drop memoization/preload, cleanup expires caches Eric Wong

Code repositories for project(s) associated with this inbox:

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).