user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH 0/5] more ->ALL usage
@ 2020-12-04 22:03 Eric Wong
  2020-12-04 22:03 ` [PATCH 1/5] nntp: xref_by_tc: simplify slightly Eric Wong
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-04 22:03 UTC (permalink / raw)
  To: meta

Some tiny speedups to NNTP, and more WWW bits start using ->ALL
to avoid iterating through hundreds/thousands of inboxes.

Still experimenting with reindexing bits (and it's still slow :<)

Eric Wong (5):
  nntp: xref_by_tc: simplify slightly
  nntp: small speed up for multi-line responses
  search: remove mdocid export
  newswww: use ->ALL to avoid O(n) inbox scan
  extmsg: use ->ALL for "global" MID lookups

 lib/PublicInbox/Config.pm     |  4 ++--
 lib/PublicInbox/ExtMsg.pm     | 36 ++++++++++++++++++++++++++++++++---
 lib/PublicInbox/NNTP.pm       | 24 +++++++++++------------
 lib/PublicInbox/NNTPD.pm      |  9 +++++++--
 lib/PublicInbox/NewsWWW.pm    | 30 ++++++++++++++++++++++-------
 lib/PublicInbox/Search.pm     |  2 +-
 lib/PublicInbox/SearchView.pm | 11 +++++------
 7 files changed, 82 insertions(+), 34 deletions(-)

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 1/5] nntp: xref_by_tc: simplify slightly
  2020-12-04 22:03 [PATCH 0/5] more ->ALL usage Eric Wong
@ 2020-12-04 22:03 ` Eric Wong
  2020-12-04 22:03 ` [PATCH 2/5] nntp: small speed up for multi-line responses Eric Wong
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-04 22:03 UTC (permalink / raw)
  To: meta

We can invalidate ibx->{newsgroup} at config load-time to avoid
having to check ibx->{newsgroup} validity in To/Cc: matching.
This saves us some hash lookups in all cases.
---
 lib/PublicInbox/NNTP.pm  | 3 +--
 lib/PublicInbox/NNTPD.pm | 9 +++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index e0916011..6728f9c5 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -422,14 +422,13 @@ sub header_append ($$$) {
 sub xref_by_tc ($$$) {
 	my ($xref, $pi_cfg, $smsg) = @_;
 	my $by_addr = $pi_cfg->{-by_addr};
-	my $groups = $pi_cfg->{-by_newsgroup};
 	my $mid = $smsg->{mid};
 	for my $f (qw(to cc)) {
 		my @ibxs = map {
 			$by_addr->{lc($_)} // ()
 		} (PublicInbox::Address::emails($smsg->{$f} // ''));
 		for my $ibx (@ibxs) {
-			$groups->{my $ngname = $ibx->{newsgroup}} or next;
+			my $ngname = $ibx->{newsgroup} // next;
 			next if defined $xref->{$ngname};
 			$xref->{$ngname} = eval { $ibx->mm->num_for($mid) };
 		}
diff --git a/lib/PublicInbox/NNTPD.pm b/lib/PublicInbox/NNTPD.pm
index 967850e9..03c56db3 100644
--- a/lib/PublicInbox/NNTPD.pm
+++ b/lib/PublicInbox/NNTPD.pm
@@ -38,13 +38,18 @@ sub refresh_groups {
 	my $groups = $pi_config->{-by_newsgroup}; # filled during each_inbox
 	$pi_config->each_inbox(sub {
 		my ($ibx) = @_;
-		my $ngname = $ibx->{newsgroup} // return;
-		if ($ibx->nntp_usable) { # only valid if msgmap and over works
+		my $ngname = $ibx->{newsgroup};
+		if (defined($ngname) && $ibx->nntp_usable) {
+			# only valid if msgmap and over works
 			# preload to avoid fragmentation:
 			$ibx->description;
 			$ibx->base_url;
 		} else {
 			delete $groups->{$ngname};
+			delete $ibx->{newsgroup};
+			# Note: don't be tempted to delete more for memory
+			# savings just yet: NNTP, IMAP, and WWW may all
+			# run in the same process someday.
 		}
 	});
 	$self->{groupnames} = [ sort(keys %$groups) ];

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/5] nntp: small speed up for multi-line responses
  2020-12-04 22:03 [PATCH 0/5] more ->ALL usage Eric Wong
  2020-12-04 22:03 ` [PATCH 1/5] nntp: xref_by_tc: simplify slightly Eric Wong
@ 2020-12-04 22:03 ` Eric Wong
  2020-12-04 22:03 ` [PATCH 3/5] search: remove mdocid export Eric Wong
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-04 22:03 UTC (permalink / raw)
  To: meta

Using a non-zero-length separator for `join' requires extra work
inside Perl.  We can shove the cost of appending "\r\n" into the
`map' loop, instead.  This speeds up the `join' operation.

The "deferred" log entry for a "LISTGROUP org.kernel.vger.linux-kernel"
command (with nearly 3.8 million messages) goes from ~3.96s to ~3.86s
on my workstation.
---
 lib/PublicInbox/NNTP.pm | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 6728f9c5..05d2d13b 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -204,7 +204,7 @@ sub listgroup_range_i {
 	my ($self, $beg, $end) = @_;
 	my $r = $self->{ng}->mm->msg_range($beg, $end, 'num');
 	scalar(@$r) or return;
-	more($self, join("\r\n", map { $_->[0] } @$r));
+	$self->msg_more(join('', map { "$_->[0]\r\n" } @$r));
 	1;
 }
 
@@ -327,10 +327,9 @@ sub newnews_i {
 		if (my $over = $ibx->over) {
 			my $msgs = $over->query_ts($ts, $$prev);
 			if (scalar @$msgs) {
-				more($self, '<' .
-					join(">\r\n<",
-						map { $_->{mid} } @$msgs ) .
-					'>');
+				$self->msg_more(join('', map {
+							"<$_->{mid}>\r\n";
+						} @$msgs));
 				$$prev = $msgs->[-1]->{num};
 				return 1; # continue on current group
 			}
@@ -707,7 +706,7 @@ sub hdr_msgid_range_i {
 	my ($self, $beg, $end) = @_;
 	my $r = $self->{ng}->mm->msg_range($beg, $end);
 	@$r or return;
-	more($self, join("\r\n", map { "$_->[0] <$_->[1]>" } @$r));
+	$self->msg_more(join('', map { "$_->[0] <$_->[1]>\r\n" } @$r));
 	1;
 }
 
@@ -774,8 +773,8 @@ sub xref_range_i {
 	my $msgs = $ng->over->query_xover($$beg, $end);
 	scalar(@$msgs) or return;
 	$$beg = $msgs->[-1]->{num} + 1;
-	more($self, join("\r\n", map {
-		"$_->{num} ".xref($self, $ng, $_);
+	$self->msg_more(join('', map {
+		"$_->{num} ".xref($self, $ng, $_) . "\r\n";
 	} @$msgs));
 	1;
 }
@@ -934,7 +933,7 @@ sub over_line ($$$) {
 		$smsg->{lines},
 		"Xref: " . xref($self, $ng, $smsg));
 	utf8::encode($s);
-	$s
+	$s .= "\r\n";
 }
 
 sub cmd_over ($;$) {
@@ -953,7 +952,7 @@ sub cmd_over ($;$) {
 			$smsg->{-orig_num} = $smsg->{num};
 			$smsg->{num} = 0;
 		}
-		more($self, over_line($self, $ng, $smsg));
+		$self->msg_more(over_line($self, $ng, $smsg));
 		'.';
 	} else {
 		cmd_xover($self, $range);
@@ -967,7 +966,7 @@ sub xover_i {
 	my $nr = scalar @$msgs or return;
 
 	# OVERVIEW.FMT
-	more($self, join("\r\n", map {
+	$self->msg_more(join('', map {
 		over_line($self, $ng, $_);
 		} @$msgs));
 	$$beg = $msgs->[-1]->{num} + 1;

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/5] search: remove mdocid export
  2020-12-04 22:03 [PATCH 0/5] more ->ALL usage Eric Wong
  2020-12-04 22:03 ` [PATCH 1/5] nntp: xref_by_tc: simplify slightly Eric Wong
  2020-12-04 22:03 ` [PATCH 2/5] nntp: small speed up for multi-line responses Eric Wong
@ 2020-12-04 22:03 ` Eric Wong
  2020-12-04 22:03 ` [PATCH 4/5] newswww: use ->ALL to avoid O(n) inbox scan Eric Wong
  2020-12-04 22:03 ` [PATCH 5/5] extmsg: use ->ALL for "global" MID lookups Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-04 22:03 UTC (permalink / raw)
  To: meta

There's no need to export mdocid, as shown by the change to
SearchView.  This should pave the way for making search
more flexible and allow per-Inbox search to reuse ->ALL.
---
 lib/PublicInbox/Search.pm     |  2 +-
 lib/PublicInbox/SearchView.pm | 11 +++++------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 574bc145..7e72913f 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -6,7 +6,7 @@
 package PublicInbox::Search;
 use strict;
 use parent qw(Exporter);
-our @EXPORT_OK = qw(mdocid retry_reopen);
+our @EXPORT_OK = qw(retry_reopen);
 use List::Util qw(max);
 
 # values for searching, changing the numeric value breaks
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index c482f1c9..26426c01 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -14,7 +14,7 @@ use PublicInbox::WwwAtomStream;
 use PublicInbox::WwwStream qw(html_oneshot);
 use PublicInbox::SearchThread;
 use PublicInbox::SearchQuery;
-use PublicInbox::Search qw(mdocid);
+use PublicInbox::Search;
 my %rmap_inc;
 
 sub mbox_results {
@@ -287,13 +287,12 @@ sub get_pct ($) {
 sub mset_thread {
 	my ($ctx, $mset, $q) = @_;
 	my $ibx = $ctx->{-inbox};
-	my $nshard = $ibx->search->{nshard} // 1;
-	my %pct = map { mdocid($nshard, $_) => get_pct($_) } $mset->items;
-	my $msgs = $ibx->over->get_all(keys %pct);
-	$_->{pct} = $pct{$_->{num}} for @$msgs;
+	my @pct = map { get_pct($_) } $mset->items;
+	my $msgs = $ibx->search->mset_to_smsg($ibx, $mset);
+	my $i = 0;
+	$_->{pct} = $pct[$i++] for @$msgs;
 	my $r = $q->{r};
 	if ($r) { # for descriptions in search_nav_bot
-		my @pct = values %pct;
 		$q->{-min_pct} = min(@pct);
 		$q->{-max_pct} = max(@pct);
 	}

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 4/5] newswww: use ->ALL to avoid O(n) inbox scan
  2020-12-04 22:03 [PATCH 0/5] more ->ALL usage Eric Wong
                   ` (2 preceding siblings ...)
  2020-12-04 22:03 ` [PATCH 3/5] search: remove mdocid export Eric Wong
@ 2020-12-04 22:03 ` Eric Wong
  2020-12-04 22:03 ` [PATCH 5/5] extmsg: use ->ALL for "global" MID lookups Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-04 22:03 UTC (permalink / raw)
  To: meta

We can avoid doing a Message-ID lookup on every single inbox
by using ->ALL to scan its over.sqlite3 DB.  This mimics NNTP
behavior and picks the first message indexed, though redirecting
to /all/$MESSAGE_ID/ could be done instead.

With the current lore.kernel.org set of inboxes (~140), this
provides a 10-40% speedup depending on inbox ordering.
---
 lib/PublicInbox/Config.pm  |  4 ++--
 lib/PublicInbox/NewsWWW.pm | 30 +++++++++++++++++++++++-------
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index 9b9d5c19..ba0ead6e 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -33,6 +33,7 @@ sub new {
 	$self->{-by_list_id} = {};
 	$self->{-by_name} = {};
 	$self->{-by_newsgroup} = {};
+	$self->{-by_eidx_key} = {};
 	$self->{-no_obfuscate} = {};
 	$self->{-limiters} = {};
 	$self->{-code_repos} = {}; # nick => PublicInbox::Git object
@@ -476,8 +477,7 @@ EOF
 			push @$repo_objs, $repo if $repo;
 		}
 	}
-
-	$ibx
+	$self->{-by_eidx_key}->{$ibx->eidx_key} = $ibx;
 }
 
 sub _fill_ei ($$) {
diff --git a/lib/PublicInbox/NewsWWW.pm b/lib/PublicInbox/NewsWWW.pm
index 6bed0103..ade8dfd1 100644
--- a/lib/PublicInbox/NewsWWW.pm
+++ b/lib/PublicInbox/NewsWWW.pm
@@ -63,7 +63,6 @@ sub call {
 		return redirect($code, $url);
 	}
 
-	my $res;
 	my @try = (join('/', @parts));
 
 	# trailing slash is in the rest of our WWW, so maybe some users
@@ -72,13 +71,30 @@ sub call {
 		pop @parts;
 		push @try, join('/', @parts);
 	}
-
-	foreach my $mid (@try) {
-		my $arg = [ $mid ];
-		$pi_config->each_inbox(\&try_inbox, $arg);
-		defined($res = $arg->[1]) and last;
+	my $ALL = $pi_config->ALL;
+	if (my $over = $ALL ? $ALL->over : undef) {
+		my $by_eidx_key = $pi_config->{-by_eidx_key};
+		for my $mid (@try) {
+			my ($id, $prev);
+			while (my $x = $over->next_by_mid($mid, \$id, \$prev)) {
+				my $xr3 = $over->get_xref3($x->{num});
+				for (@$xr3) {
+					s/:[0-9]+:$x->{blob}\z// or next;
+					my $ibx = $by_eidx_key->{$_} // next;
+					my $url = $ibx->base_url or next;
+					$url .= mid_escape($mid) . '/';
+					return redirect(302, $url);
+				}
+			}
+		}
+	} else { # slow path, scan every inbox
+		for my $mid (@try) {
+			my $arg = [ $mid ]; # [1] => result
+			$pi_config->each_inbox(\&try_inbox, $arg);
+			return $arg->[1] if $arg->[1];
+		}
 	}
-	$res || [ 404, [qw(Content-Type text/plain)], ["404 Not Found\n"] ];
+	[ 404, [qw(Content-Type text/plain)], ["404 Not Found\n"] ];
 }
 
 1;

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 5/5] extmsg: use ->ALL for "global" MID lookups
  2020-12-04 22:03 [PATCH 0/5] more ->ALL usage Eric Wong
                   ` (3 preceding siblings ...)
  2020-12-04 22:03 ` [PATCH 4/5] newswww: use ->ALL to avoid O(n) inbox scan Eric Wong
@ 2020-12-04 22:03 ` Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-04 22:03 UTC (permalink / raw)
  To: meta

As with NewsWWW and NNTP, we can use ->ALL to completely
avoid trying SQLite/Xapian lookups across hundreds/thousands
of inboxes.
---
 lib/PublicInbox/ExtMsg.pm | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm
index 03faf3a1..2a0a3e46 100644
--- a/lib/PublicInbox/ExtMsg.pm
+++ b/lib/PublicInbox/ExtMsg.pm
@@ -103,9 +103,37 @@ sub ext_msg_step {
 	}
 }
 
+sub ext_msg_ALL ($) {
+	my ($ctx) = @_;
+	my $ALL = $ctx->{www}->{pi_config}->ALL or return;
+	my $by_eidx_key = $ctx->{www}->{pi_config}->{-by_eidx_key};
+	my $cur_key = $ctx->{-inbox}->eidx_key;
+	my %seen = ($cur_key => 1);
+	my ($id, $prev);
+	while (my $x = $ALL->over->next_by_mid($ctx->{mid}, \$id, \$prev)) {
+		my $xr3 = $ALL->over->get_xref3($x->{num});
+		for my $k (@$xr3) {
+			$k =~ s/:[0-9]+:$x->{blob}\z// or next;
+			next if $k eq $cur_key;
+			my $ibx = $by_eidx_key->{$k} // next;
+			my $url = $ibx->base_url or next;
+			push(@{$ctx->{found}}, $ibx) unless $seen{$k}++;
+		}
+	}
+	return exact($ctx) if $ctx->{found};
+
+	# fall back to partial MID matching
+	for my $ibxish ($ctx->{-inbox}, $ALL) {
+		my $mids = search_partial($ibxish, $ctx->{mid}) or next;
+		push @{$ctx->{partial}}, [ $ibxish, $mids ];
+		last if ($ctx->{n_partial} += scalar(@$mids)) >= PARTIAL_MAX;
+	}
+	partial_response($ctx);
+}
+
 sub ext_msg {
 	my ($ctx) = @_;
-	sub {
+	ext_msg_ALL($ctx) // sub {
 		$ctx->{-wcb} = $_[0]; # HTTP server write callback
 
 		if ($ctx->{env}->{'pi-httpd.async'}) {
@@ -159,7 +187,7 @@ sub finalize_exact {
 	finalize_partial($ctx);
 }
 
-sub finalize_partial {
+sub partial_response ($) {
 	my ($ctx) = @_;
 	my $mid = $ctx->{mid};
 	my $code = 404;
@@ -192,9 +220,11 @@ sub finalize_partial {
 	$ctx->{-html_tip} = $s .= '</pre>';
 	$ctx->{-title_html} = $title;
 	$ctx->{-upfx} = '../';
-	$ctx->{-wcb}->(html_oneshot($ctx, $code));
+	html_oneshot($ctx, $code);
 }
 
+sub finalize_partial ($) { $_[0]->{-wcb}->(partial_response($_[0])) }
+
 sub ext_urls {
 	my ($ctx, $mid, $href, $html) = @_;
 

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-12-04 22:03 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-04 22:03 [PATCH 0/5] more ->ALL usage Eric Wong
2020-12-04 22:03 ` [PATCH 1/5] nntp: xref_by_tc: simplify slightly Eric Wong
2020-12-04 22:03 ` [PATCH 2/5] nntp: small speed up for multi-line responses Eric Wong
2020-12-04 22:03 ` [PATCH 3/5] search: remove mdocid export Eric Wong
2020-12-04 22:03 ` [PATCH 4/5] newswww: use ->ALL to avoid O(n) inbox scan Eric Wong
2020-12-04 22:03 ` [PATCH 5/5] extmsg: use ->ALL for "global" MID lookups Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).