user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 12/12] nntp: xref: use ->ALL extindex if available
Date: Fri, 27 Nov 2020 09:52:54 +0000	[thread overview]
Message-ID: <20201127095254.21624-13-e@80x24.org> (raw)
In-Reply-To: <20201127095254.21624-1-e@80x24.org>

Getting Xref for cross-posted messages is an O(n) operation
where `n' is the number of newsgroups on the server.  This works
acceptably when there are dozens of groups, but would be
unnacceptable when there's tens of thousands of newsgroups.

With ~140 newsgroups, a lore.kernel.org mirror already handles
"XHDR Xref $MESSAGE_ID" requests around 30% faster after
creating the xref3.idx_nntp index.

The SQL additions to ExtSearch.pm may be a bit strange and
seem more appropriate for Over.pm; however it currently makes
sense to me since those bits of over.sqlite3 access are
exclusive to ExtSearch and can't be used by traditional
v1/v2 inboxes...
---
 lib/PublicInbox/ExtSearch.pm | 56 ++++++++++++++++++++++++++++++++++++
 lib/PublicInbox/NNTP.pm      | 23 +++++++++------
 lib/PublicInbox/OverIdx.pm   |  5 ++++
 t/extsearch.t                | 40 ++++++++++++++++++++++----
 t/nntp.t                     |  7 ++++-
 5 files changed, 116 insertions(+), 15 deletions(-)

diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm
index dd93cd32..20ec3224 100644
--- a/lib/PublicInbox/ExtSearch.pm
+++ b/lib/PublicInbox/ExtSearch.pm
@@ -11,6 +11,7 @@ use PublicInbox::Over;
 use PublicInbox::Inbox;
 use File::Spec ();
 use PublicInbox::MiscSearch;
+use DBI qw(:sql_types); # SQL_BLOB
 
 # for ->reopen, ->mset, ->mset_to_artnums
 use parent qw(PublicInbox::Search);
@@ -49,6 +50,61 @@ sub git {
 	$self->{git} //= PublicInbox::Git->new("$self->{topdir}/ALL.git");
 }
 
+# returns an arrayref of [ $NEWSGROUP_NAME:$ART_NO ] using
+# the `xref3' table
+sub nntp_xref_for { # NNTP only
+	my ($self, $xibx, $xsmsg) = @_;
+	my $dbh = over($self)->dbh;
+
+	my $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1
+
+	$sth->execute($xibx->{newsgroup});
+	my $xibx_id = $sth->fetchrow_array // do {
+		warn "W: `$xibx->{newsgroup}' not found in $self->{topdir}\n";
+		return;
+	};
+
+	$sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT docid FROM xref3 WHERE oidbin = ? AND xnum = ? AND ibx_id = ? LIMIT 1
+
+	$sth->bind_param(1, pack('H*', $xsmsg->{blob}), SQL_BLOB);
+	$sth->bind_param(2, $xsmsg->{num});
+	$sth->bind_param(3, $xibx_id);
+	$sth->execute;
+	my $docid = $sth->fetchrow_array // do {
+		warn <<EOF;
+W: `$xibx->{newsgroup}:$xsmsg->{num}' not found in $self->{topdir}"
+EOF
+		return;
+	};
+
+	# LIMIT is number of newsgroups on server:
+	$sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT ibx_id,xnum FROM xref3 WHERE docid = ?
+
+	$sth->execute($docid);
+	my $rows = $sth->fetchall_arrayref;
+
+	my $eidx_key_sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1
+
+	my %xref = map {
+		my ($ibx_id, $xnum) = @$_;
+		if ($ibx_id == $xibx_id) {
+			();
+		} else {
+			$eidx_key_sth->execute($ibx_id);
+			my $eidx_key = $eidx_key_sth->fetchrow_array;
+
+			# only include if there's a newsgroup name
+			$eidx_key && index($eidx_key, '/') >= 0 ?
+				() : ($eidx_key => $xnum)
+		}
+	} @$rows;
+	[ map { "$_:$xref{$_}" } sort keys %xref ]; # match NNTP LIST order
+}
+
 sub mm { undef }
 
 sub altid_map { {} }
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 39ff5257..8eec6b91 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -413,14 +413,21 @@ sub xref ($$$) {
 	my $nntpd = $self->{nntpd};
 	my $cur_ngname = $cur_ibx->{newsgroup};
 	my $ret = "$nntpd->{servername} $cur_ngname:$smsg->{num}";
-
-	my $mid = $smsg->{mid};
-	my $groups = $nntpd->{pi_config}->{-by_newsgroup};
-	for my $xngname (@{$nntpd->{groupnames}}) {
-		next if $cur_ngname eq $xngname;
-		my $xibx = $groups->{$xngname} or next;
-		my $num = eval { $xibx->mm->num_for($mid) } or next;
-		$ret .= " $xngname:$num";
+	if (my $ALL = $nntpd->{pi_config}->ALL) {
+		if (my $ary = $ALL->nntp_xref_for($cur_ibx, $smsg)) {
+			$ret .= join(' ', '', @$ary) if scalar(@$ary);
+		}
+		# better off wrong than slow if there's thousands of groups,
+		# so no fallback to the slow path below:
+	} else { # slow path
+		my $mid = $smsg->{mid};
+		my $groups = $nntpd->{pi_config}->{-by_newsgroup};
+		for my $xngname (@{$nntpd->{groupnames}}) {
+			next if $cur_ngname eq $xngname;
+			my $xibx = $groups->{$xngname} or next;
+			my $num = eval { $xibx->mm->num_for($mid) } or next;
+			$ret .= " $xngname:$num";
+		}
 	}
 	$ret;
 }
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 173e3220..8bec08da 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -542,6 +542,11 @@ CREATE TABLE IF NOT EXISTS xref3 (
 
 	$dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)');
 
+	# performance critical, this is not UNIQUE since we may need to
+	# tolerate some old bugs from indexing mirrors
+	$dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '.
+		'xref3 (oidbin,xnum,ibx_id)');
+
 		$dbh->do(<<'');
 CREATE TABLE IF NOT EXISTS eidx_meta (
 	key VARCHAR(255) PRIMARY KEY,
diff --git a/t/extsearch.t b/t/extsearch.t
index 778ba32d..f9f74e5c 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -11,6 +11,8 @@ require_git(2.6);
 require_mods(qw(DBD::SQLite Search::Xapian));
 use_ok 'PublicInbox::ExtSearch';
 use_ok 'PublicInbox::ExtSearchIdx';
+my $sock = tcp_server();
+my $host_port = $sock->sockhost . ':' . $sock->sockport;
 my ($home, $for_destroy) = tmpdir();
 local $ENV{HOME} = $home;
 mkdir "$home/.public-inbox" or BAIL_OUT $!;
@@ -35,7 +37,7 @@ seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
 
 run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda';
 
-ok(run_script([qw(-init -V1 v1test), "$home/v1test",
+ok(run_script([qw(-init -V1 v1test --newsgroup v1.example), "$home/v1test",
 	'http://example.com/v1test', $v1addr ]), 'v1test init');
 
 $eml->header_set('List-Id', '<v1.example.com>');
@@ -51,6 +53,36 @@ run_script(['-index', "$home/v1test"]) or BAIL_OUT "index $?";
 
 ok(run_script([qw(-extindex --all), "$home/extindex"]), 'extindex init');
 
+
+{ # TODO: -extindex should write this to config
+	open $fh, '>>', "$home/.public-inbox/config" or BAIL_OUT $!;
+	print $fh <<EOF or BAIL_OUT $!;
+; for ->ALL
+[extindex "all"]
+	topdir = $home/extindex
+EOF
+	close $fh or BAIL_OUT $!;
+
+	my $pi_cfg = PublicInbox::Config->new;
+	$pi_cfg->fill_all;
+	ok($pi_cfg->ALL, '->ALL');
+	my $ibx = $pi_cfg->{-by_newsgroup}->{'v2.example'};
+	my $ret = $pi_cfg->ALL->nntp_xref_for($ibx, $ibx->over->get_art(1));
+	is_deeply($ret, ['v1.example:1'], '->nntp_xref_for');
+}
+
+SKIP: {
+	require_mods(qw(Net::NNTP), 1);
+	my ($out, $err) = ("$home/nntpd.out.log", "$home/nntpd.err.log");
+	my $cmd = [ '-nntpd', '-W0' ]; #, "--stdout=$out", "--stderr=$err" ];
+	my $td = start_script($cmd, undef, { 3 => $sock });
+	my $n = Net::NNTP->new($host_port);
+	$n->group('v1.example');
+	my $res = $n->head(1);
+	@$res = grep(/^Xref: /, @$res);
+	like($res->[0], qr/ v1\.example:1 v2\.example:1/, 'nntp_xref works');
+}
+
 my $es = PublicInbox::ExtSearch->new("$home/extindex");
 {
 	my $smsg = $es->over->get_art(1);
@@ -58,7 +90,7 @@ my $es = PublicInbox::ExtSearch->new("$home/extindex");
 	is($es->over->get_art(2), undef, 'only one added');
 	my $xref3 = $es->over->get_xref3(1);
 	like($xref3->[0], qr/\A\Qv2.example\E:1:/, 'order preserved 1');
-	like($xref3->[1], qr!\A\Q$home/v1test\E:1:!, 'order preserved 2');
+	like($xref3->[1], qr/\A\Qv1.example\E:1:/, 'order preserved 2');
 	is(scalar(@$xref3), 2, 'only to entries');
 }
 
@@ -93,9 +125,5 @@ my @it = $misc->mset('')->items;
 is(scalar(@it), 2, 'two inboxes');
 like($it[0]->get_document->get_data, qr/v2test/, 'docdata matched v2');
 like($it[1]->get_document->get_data, qr/v1test/, 'docdata matched v1');
-my $pi_cfg = PublicInbox::Config->new;
-$pi_cfg->fill_all;
-my $ret = $misc->newsgroup_matches('', $pi_cfg);
-is_deeply($pi_cfg->{-by_newsgroup}, $ret, '->newsgroup_matches');
 
 done_testing;
diff --git a/t/nntp.t b/t/nntp.t
index 9a482acb..91a2aff7 100644
--- a/t/nntp.t
+++ b/t/nntp.t
@@ -8,6 +8,7 @@ use PublicInbox::Eml;
 require_mods(qw(DBD::SQLite Data::Dumper));
 use_ok 'PublicInbox::NNTP';
 use_ok 'PublicInbox::Inbox';
+use PublicInbox::Config;
 
 {
 	sub quote_str {
@@ -110,7 +111,11 @@ use_ok 'PublicInbox::Inbox';
 	my $mime = PublicInbox::Eml->new("Message-ID: <$mid>\r\n\r\n");
 	my $hdr = $mime->header_obj;
 	my $mock_self = {
-		nntpd => { grouplist => [], servername => 'example.com' },
+		nntpd => {
+			grouplist => [],
+			servername => 'example.com',
+			pi_config => bless {}, 'PublicInbox::Config',
+		},
 		ng => $ng,
 	};
 	my $smsg = { num => 1, mid => $mid, nntp => $mock_self, -ibx => $ng };

  parent reply	other threads:[~2020-11-27  9:52 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-11-27  9:52 [PATCH 00/12] some NNTP-related fixes + speedups Eric Wong
2020-11-27  9:52 ` [PATCH 01/12] nntp: use Inbox->uidvalidity instead of ->mm->created_at Eric Wong
2020-11-27  9:52 ` [PATCH 02/12] nntpd: share {groups} hash with {-by_newsgroup} in Config Eric Wong
2020-11-27  9:52 ` [PATCH 03/12] mm: min/max: return 0 instead of undef Eric Wong
2020-11-27  9:52 ` [PATCH 04/12] nntp: use grep operation for wildmat matching Eric Wong
2020-11-27  9:52 ` [PATCH 05/12] nntp: NEWNEWS: speed up filtering Eric Wong
2020-11-27  9:52 ` [PATCH 06/12] miscsearch: implement ->newsgroup_matches Eric Wong
2020-11-27  9:52 ` [PATCH 07/12] nntp: LIST ACTIVE.TIMES use angle brackets around address Eric Wong
2020-11-27  9:52 ` [PATCH 08/12] nntp: move LIST iterators to long_response Eric Wong
2020-11-27  9:52 ` [PATCH 09/12] t/extsearch: show a more realistic case Eric Wong
2020-11-27  9:52 ` [PATCH 10/12] nntp: some minor golfing Eric Wong
2020-11-27  9:52 ` [PATCH 11/12] nntp: xref: simplify sub signature Eric Wong
2020-11-27  9:52 ` Eric Wong [this message]
2020-11-30 19:42   ` xref3 + NNTP problems Eric Wong
2020-11-30 23:37     ` [PATCH] nntp: make ->ALL Xref generation more fuzzy Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201127095254.21624-13-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).