From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 14D1A1FD6E for ; Fri, 27 Nov 2020 09:52:56 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 12/12] nntp: xref: use ->ALL extindex if available Date: Fri, 27 Nov 2020 09:52:54 +0000 Message-Id: <20201127095254.21624-13-e@80x24.org> In-Reply-To: <20201127095254.21624-1-e@80x24.org> References: <20201127095254.21624-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Getting Xref for cross-posted messages is an O(n) operation where `n' is the number of newsgroups on the server. This works acceptably when there are dozens of groups, but would be unnacceptable when there's tens of thousands of newsgroups. With ~140 newsgroups, a lore.kernel.org mirror already handles "XHDR Xref $MESSAGE_ID" requests around 30% faster after creating the xref3.idx_nntp index. The SQL additions to ExtSearch.pm may be a bit strange and seem more appropriate for Over.pm; however it currently makes sense to me since those bits of over.sqlite3 access are exclusive to ExtSearch and can't be used by traditional v1/v2 inboxes... --- lib/PublicInbox/ExtSearch.pm | 56 ++++++++++++++++++++++++++++++++++++ lib/PublicInbox/NNTP.pm | 23 +++++++++------ lib/PublicInbox/OverIdx.pm | 5 ++++ t/extsearch.t | 40 ++++++++++++++++++++++---- t/nntp.t | 7 ++++- 5 files changed, 116 insertions(+), 15 deletions(-) diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm index dd93cd32..20ec3224 100644 --- a/lib/PublicInbox/ExtSearch.pm +++ b/lib/PublicInbox/ExtSearch.pm @@ -11,6 +11,7 @@ use PublicInbox::Over; use PublicInbox::Inbox; use File::Spec (); use PublicInbox::MiscSearch; +use DBI qw(:sql_types); # SQL_BLOB # for ->reopen, ->mset, ->mset_to_artnums use parent qw(PublicInbox::Search); @@ -49,6 +50,61 @@ sub git { $self->{git} //= PublicInbox::Git->new("$self->{topdir}/ALL.git"); } +# returns an arrayref of [ $NEWSGROUP_NAME:$ART_NO ] using +# the `xref3' table +sub nntp_xref_for { # NNTP only + my ($self, $xibx, $xsmsg) = @_; + my $dbh = over($self)->dbh; + + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1 + + $sth->execute($xibx->{newsgroup}); + my $xibx_id = $sth->fetchrow_array // do { + warn "W: `$xibx->{newsgroup}' not found in $self->{topdir}\n"; + return; + }; + + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT docid FROM xref3 WHERE oidbin = ? AND xnum = ? AND ibx_id = ? LIMIT 1 + + $sth->bind_param(1, pack('H*', $xsmsg->{blob}), SQL_BLOB); + $sth->bind_param(2, $xsmsg->{num}); + $sth->bind_param(3, $xibx_id); + $sth->execute; + my $docid = $sth->fetchrow_array // do { + warn <{newsgroup}:$xsmsg->{num}' not found in $self->{topdir}" +EOF + return; + }; + + # LIMIT is number of newsgroups on server: + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id,xnum FROM xref3 WHERE docid = ? + + $sth->execute($docid); + my $rows = $sth->fetchall_arrayref; + + my $eidx_key_sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1 + + my %xref = map { + my ($ibx_id, $xnum) = @$_; + if ($ibx_id == $xibx_id) { + (); + } else { + $eidx_key_sth->execute($ibx_id); + my $eidx_key = $eidx_key_sth->fetchrow_array; + + # only include if there's a newsgroup name + $eidx_key && index($eidx_key, '/') >= 0 ? + () : ($eidx_key => $xnum) + } + } @$rows; + [ map { "$_:$xref{$_}" } sort keys %xref ]; # match NNTP LIST order +} + sub mm { undef } sub altid_map { {} } diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index 39ff5257..8eec6b91 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -413,14 +413,21 @@ sub xref ($$$) { my $nntpd = $self->{nntpd}; my $cur_ngname = $cur_ibx->{newsgroup}; my $ret = "$nntpd->{servername} $cur_ngname:$smsg->{num}"; - - my $mid = $smsg->{mid}; - my $groups = $nntpd->{pi_config}->{-by_newsgroup}; - for my $xngname (@{$nntpd->{groupnames}}) { - next if $cur_ngname eq $xngname; - my $xibx = $groups->{$xngname} or next; - my $num = eval { $xibx->mm->num_for($mid) } or next; - $ret .= " $xngname:$num"; + if (my $ALL = $nntpd->{pi_config}->ALL) { + if (my $ary = $ALL->nntp_xref_for($cur_ibx, $smsg)) { + $ret .= join(' ', '', @$ary) if scalar(@$ary); + } + # better off wrong than slow if there's thousands of groups, + # so no fallback to the slow path below: + } else { # slow path + my $mid = $smsg->{mid}; + my $groups = $nntpd->{pi_config}->{-by_newsgroup}; + for my $xngname (@{$nntpd->{groupnames}}) { + next if $cur_ngname eq $xngname; + my $xibx = $groups->{$xngname} or next; + my $num = eval { $xibx->mm->num_for($mid) } or next; + $ret .= " $xngname:$num"; + } } $ret; } diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 173e3220..8bec08da 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -542,6 +542,11 @@ CREATE TABLE IF NOT EXISTS xref3 ( $dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)'); + # performance critical, this is not UNIQUE since we may need to + # tolerate some old bugs from indexing mirrors + $dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '. + 'xref3 (oidbin,xnum,ibx_id)'); + $dbh->do(<<''); CREATE TABLE IF NOT EXISTS eidx_meta ( key VARCHAR(255) PRIMARY KEY, diff --git a/t/extsearch.t b/t/extsearch.t index 778ba32d..f9f74e5c 100644 --- a/t/extsearch.t +++ b/t/extsearch.t @@ -11,6 +11,8 @@ require_git(2.6); require_mods(qw(DBD::SQLite Search::Xapian)); use_ok 'PublicInbox::ExtSearch'; use_ok 'PublicInbox::ExtSearchIdx'; +my $sock = tcp_server(); +my $host_port = $sock->sockhost . ':' . $sock->sockport; my ($home, $for_destroy) = tmpdir(); local $ENV{HOME} = $home; mkdir "$home/.public-inbox" or BAIL_OUT $!; @@ -35,7 +37,7 @@ seek($fh, 0, SEEK_SET) or BAIL_OUT $!; run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda'; -ok(run_script([qw(-init -V1 v1test), "$home/v1test", +ok(run_script([qw(-init -V1 v1test --newsgroup v1.example), "$home/v1test", 'http://example.com/v1test', $v1addr ]), 'v1test init'); $eml->header_set('List-Id', ''); @@ -51,6 +53,36 @@ run_script(['-index', "$home/v1test"]) or BAIL_OUT "index $?"; ok(run_script([qw(-extindex --all), "$home/extindex"]), 'extindex init'); + +{ # TODO: -extindex should write this to config + open $fh, '>>', "$home/.public-inbox/config" or BAIL_OUT $!; + print $fh <ALL +[extindex "all"] + topdir = $home/extindex +EOF + close $fh or BAIL_OUT $!; + + my $pi_cfg = PublicInbox::Config->new; + $pi_cfg->fill_all; + ok($pi_cfg->ALL, '->ALL'); + my $ibx = $pi_cfg->{-by_newsgroup}->{'v2.example'}; + my $ret = $pi_cfg->ALL->nntp_xref_for($ibx, $ibx->over->get_art(1)); + is_deeply($ret, ['v1.example:1'], '->nntp_xref_for'); +} + +SKIP: { + require_mods(qw(Net::NNTP), 1); + my ($out, $err) = ("$home/nntpd.out.log", "$home/nntpd.err.log"); + my $cmd = [ '-nntpd', '-W0' ]; #, "--stdout=$out", "--stderr=$err" ]; + my $td = start_script($cmd, undef, { 3 => $sock }); + my $n = Net::NNTP->new($host_port); + $n->group('v1.example'); + my $res = $n->head(1); + @$res = grep(/^Xref: /, @$res); + like($res->[0], qr/ v1\.example:1 v2\.example:1/, 'nntp_xref works'); +} + my $es = PublicInbox::ExtSearch->new("$home/extindex"); { my $smsg = $es->over->get_art(1); @@ -58,7 +90,7 @@ my $es = PublicInbox::ExtSearch->new("$home/extindex"); is($es->over->get_art(2), undef, 'only one added'); my $xref3 = $es->over->get_xref3(1); like($xref3->[0], qr/\A\Qv2.example\E:1:/, 'order preserved 1'); - like($xref3->[1], qr!\A\Q$home/v1test\E:1:!, 'order preserved 2'); + like($xref3->[1], qr/\A\Qv1.example\E:1:/, 'order preserved 2'); is(scalar(@$xref3), 2, 'only to entries'); } @@ -93,9 +125,5 @@ my @it = $misc->mset('')->items; is(scalar(@it), 2, 'two inboxes'); like($it[0]->get_document->get_data, qr/v2test/, 'docdata matched v2'); like($it[1]->get_document->get_data, qr/v1test/, 'docdata matched v1'); -my $pi_cfg = PublicInbox::Config->new; -$pi_cfg->fill_all; -my $ret = $misc->newsgroup_matches('', $pi_cfg); -is_deeply($pi_cfg->{-by_newsgroup}, $ret, '->newsgroup_matches'); done_testing; diff --git a/t/nntp.t b/t/nntp.t index 9a482acb..91a2aff7 100644 --- a/t/nntp.t +++ b/t/nntp.t @@ -8,6 +8,7 @@ use PublicInbox::Eml; require_mods(qw(DBD::SQLite Data::Dumper)); use_ok 'PublicInbox::NNTP'; use_ok 'PublicInbox::Inbox'; +use PublicInbox::Config; { sub quote_str { @@ -110,7 +111,11 @@ use_ok 'PublicInbox::Inbox'; my $mime = PublicInbox::Eml->new("Message-ID: <$mid>\r\n\r\n"); my $hdr = $mime->header_obj; my $mock_self = { - nntpd => { grouplist => [], servername => 'example.com' }, + nntpd => { + grouplist => [], + servername => 'example.com', + pi_config => bless {}, 'PublicInbox::Config', + }, ng => $ng, }; my $smsg = { num => 1, mid => $mid, nntp => $mock_self, -ibx => $ng };