From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH] nntp: make ->ALL Xref generation more fuzzy
Date: Mon, 30 Nov 2020 23:37:42 +0000 [thread overview]
Message-ID: <20201130233742.GA3077@dcvr> (raw)
In-Reply-To: <20201130194201.GA6687@dcvr>
Eric Wong <e@80x24.org> wrote:
> So every message posted to linux-mtd@lists.infradead.org
> nntp://rskvuqcfnfizkjg6h5jvovwb3wkikzcwskf54lfpymus6mxrzw67b5ad.onion/org.infradead.lists.linux-mtd
> fails to match cross posts to Xref:, which is bad, too...
>
> I have an idea to make it less bad, maybe it'll work...
Deployed to
nntp://rskvuqcfnfizkjg6h5jvovwb3wkikzcwskf54lfpymus6mxrzw67b5ad.onion/
and it seems to be OK when comparing messages cross-posted
between org.kernel.vger.* and org.infradead.lists.*
-------------8<------------
Subject: [PATCH] nntp: make ->ALL Xref generation more fuzzy
For ->ALL users, this mitigates the regression introduced
by commit 811b8d3cbaa790f59b7b107140b86248da16499b
("nntp: xref: use ->ALL extindex if available"), since
it's common to cross post messages to some mailing
lists with per-list trailers for unsubscribe information.
We won't bother dealing with Bcc-ed messages since those
are nearly all spam when it comes to public mailing lists.
Fixes: 811b8d3cbaa790f5 ("nntp: xref: use ->ALL extindex if available")
Link: https://public-inbox.org/meta/20201130194201.GA6687@dcvr/
---
lib/PublicInbox/ExtSearch.pm | 31 +++++++++++-----------
lib/PublicInbox/NNTP.pm | 50 ++++++++++++++++++++++++++----------
t/extsearch.t | 3 ++-
3 files changed, 53 insertions(+), 31 deletions(-)
diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm
index 20ec3224..80455d8d 100644
--- a/lib/PublicInbox/ExtSearch.pm
+++ b/lib/PublicInbox/ExtSearch.pm
@@ -50,8 +50,7 @@ sub git {
$self->{git} //= PublicInbox::Git->new("$self->{topdir}/ALL.git");
}
-# returns an arrayref of [ $NEWSGROUP_NAME:$ART_NO ] using
-# the `xref3' table
+# returns a hashref of { $NEWSGROUP_NAME => $ART_NO } using the `xref3' table
sub nntp_xref_for { # NNTP only
my ($self, $xibx, $xsmsg) = @_;
my $dbh = over($self)->dbh;
@@ -69,7 +68,9 @@ SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1
SELECT docid FROM xref3 WHERE oidbin = ? AND xnum = ? AND ibx_id = ? LIMIT 1
$sth->bind_param(1, pack('H*', $xsmsg->{blob}), SQL_BLOB);
- $sth->bind_param(2, $xsmsg->{num});
+
+ # NNTP::cmd_over can set {num} to zero according to RFC 3977 8.3.2
+ $sth->bind_param(2, $xsmsg->{num} || $xsmsg->{-orig_num});
$sth->bind_param(3, $xibx_id);
$sth->execute;
my $docid = $sth->fetchrow_array // do {
@@ -81,9 +82,9 @@ EOF
# LIMIT is number of newsgroups on server:
$sth = $dbh->prepare_cached(<<'', undef, 1);
-SELECT ibx_id,xnum FROM xref3 WHERE docid = ?
+SELECT ibx_id,xnum FROM xref3 WHERE docid = ? AND ibx_id != ?
- $sth->execute($docid);
+ $sth->execute($docid, $xibx_id);
my $rows = $sth->fetchall_arrayref;
my $eidx_key_sth = $dbh->prepare_cached(<<'', undef, 1);
@@ -91,18 +92,16 @@ SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1
my %xref = map {
my ($ibx_id, $xnum) = @$_;
- if ($ibx_id == $xibx_id) {
- ();
- } else {
- $eidx_key_sth->execute($ibx_id);
- my $eidx_key = $eidx_key_sth->fetchrow_array;
-
- # only include if there's a newsgroup name
- $eidx_key && index($eidx_key, '/') >= 0 ?
- () : ($eidx_key => $xnum)
- }
+
+ $eidx_key_sth->execute($ibx_id);
+ my $eidx_key = $eidx_key_sth->fetchrow_array;
+
+ # only include if there's a newsgroup name
+ $eidx_key && index($eidx_key, '/') >= 0 ?
+ () : ($eidx_key => $xnum)
} @$rows;
- [ map { "$_:$xref{$_}" } sort keys %xref ]; # match NNTP LIST order
+ $xref{$xibx->{newsgroup}} = $xsmsg->{num};
+ \%xref;
}
sub mm { undef }
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 3b16a66a..e0916011 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -17,6 +17,8 @@ use PublicInbox::DS qw(now);
use Digest::SHA qw(sha1_hex);
use Time::Local qw(timegm timelocal);
use PublicInbox::GitAsyncCat;
+use PublicInbox::Address;
+
use constant {
LINE_MAX => 512, # RFC 977 section 2.3
r501 => '501 command syntax error',
@@ -417,27 +419,42 @@ sub header_append ($$$) {
$hdr->header_set($k, @v, $v);
}
+sub xref_by_tc ($$$) {
+ my ($xref, $pi_cfg, $smsg) = @_;
+ my $by_addr = $pi_cfg->{-by_addr};
+ my $groups = $pi_cfg->{-by_newsgroup};
+ my $mid = $smsg->{mid};
+ for my $f (qw(to cc)) {
+ my @ibxs = map {
+ $by_addr->{lc($_)} // ()
+ } (PublicInbox::Address::emails($smsg->{$f} // ''));
+ for my $ibx (@ibxs) {
+ $groups->{my $ngname = $ibx->{newsgroup}} or next;
+ next if defined $xref->{$ngname};
+ $xref->{$ngname} = eval { $ibx->mm->num_for($mid) };
+ }
+ }
+}
+
sub xref ($$$) {
my ($self, $cur_ibx, $smsg) = @_;
my $nntpd = $self->{nntpd};
- my $cur_ngname = $cur_ibx->{newsgroup};
- my $ret = "$nntpd->{servername} $cur_ngname:$smsg->{num}";
+ my $cur_ng = $cur_ibx->{newsgroup};
+ my $xref;
if (my $ALL = $nntpd->{pi_config}->ALL) {
- if (my $ary = $ALL->nntp_xref_for($cur_ibx, $smsg)) {
- $ret .= join(' ', '', @$ary) if scalar(@$ary);
- }
- # better off wrong than slow if there's thousands of groups,
- # so no fallback to the slow path below:
+ $xref = $ALL->nntp_xref_for($cur_ibx, $smsg);
+ xref_by_tc($xref, $nntpd->{pi_config}, $smsg);
} else { # slow path
+ $xref = { $cur_ng => $smsg->{num} };
my $mid = $smsg->{mid};
- my $groups = $nntpd->{pi_config}->{-by_newsgroup};
- for my $xngname (@{$nntpd->{groupnames}}) {
- next if $cur_ngname eq $xngname;
- my $xibx = $groups->{$xngname} or next;
- my $num = eval { $xibx->mm->num_for($mid) } or next;
- $ret .= " $xngname:$num";
+ for my $ibx (values %{$nntpd->{pi_config}->{-by_newsgroup}}) {
+ next if defined($xref->{$ibx->{newsgroup}});
+ my $num = eval { $ibx->mm->num_for($mid) } // next;
+ $xref->{$ibx->{newsgroup}} = $num;
}
}
+ my $ret = "$nntpd->{servername} $cur_ng:".delete($xref->{$cur_ng});
+ $ret .= " $_:$xref->{$_}" for (sort keys %$xref);
$ret;
}
@@ -930,8 +947,13 @@ sub cmd_over ($;$) {
more($self, '224 Overview information follows (multi-line)');
# Only set article number column if it's the current group
+ # (RFC 3977 8.3.2)
my $self_ng = $self->{ng};
- $smsg->{num} = 0 if (!$self_ng || $self_ng ne $ng);
+ if (!$self_ng || $self_ng ne $ng) {
+ # set {-orig_num} for nntp_xref_for
+ $smsg->{-orig_num} = $smsg->{num};
+ $smsg->{num} = 0;
+ }
more($self, over_line($self, $ng, $smsg));
'.';
} else {
diff --git a/t/extsearch.t b/t/extsearch.t
index 2a1b05c7..2b8b88ea 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -72,7 +72,8 @@ EOF
ok($pi_cfg->ALL, '->ALL');
my $ibx = $pi_cfg->{-by_newsgroup}->{'v2.example'};
my $ret = $pi_cfg->ALL->nntp_xref_for($ibx, $ibx->over->get_art(1));
- is_deeply($ret, ['v1.example:1'], '->nntp_xref_for');
+ is_deeply($ret, { 'v1.example' => 1, 'v2.example' => 1 },
+ '->nntp_xref_for');
}
SKIP: {
prev parent reply other threads:[~2020-11-30 23:37 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-11-27 9:52 [PATCH 00/12] some NNTP-related fixes + speedups Eric Wong
2020-11-27 9:52 ` [PATCH 01/12] nntp: use Inbox->uidvalidity instead of ->mm->created_at Eric Wong
2020-11-27 9:52 ` [PATCH 02/12] nntpd: share {groups} hash with {-by_newsgroup} in Config Eric Wong
2020-11-27 9:52 ` [PATCH 03/12] mm: min/max: return 0 instead of undef Eric Wong
2020-11-27 9:52 ` [PATCH 04/12] nntp: use grep operation for wildmat matching Eric Wong
2020-11-27 9:52 ` [PATCH 05/12] nntp: NEWNEWS: speed up filtering Eric Wong
2020-11-27 9:52 ` [PATCH 06/12] miscsearch: implement ->newsgroup_matches Eric Wong
2020-11-27 9:52 ` [PATCH 07/12] nntp: LIST ACTIVE.TIMES use angle brackets around address Eric Wong
2020-11-27 9:52 ` [PATCH 08/12] nntp: move LIST iterators to long_response Eric Wong
2020-11-27 9:52 ` [PATCH 09/12] t/extsearch: show a more realistic case Eric Wong
2020-11-27 9:52 ` [PATCH 10/12] nntp: some minor golfing Eric Wong
2020-11-27 9:52 ` [PATCH 11/12] nntp: xref: simplify sub signature Eric Wong
2020-11-27 9:52 ` [PATCH 12/12] nntp: xref: use ->ALL extindex if available Eric Wong
2020-11-30 19:42 ` xref3 + NNTP problems Eric Wong
2020-11-30 23:37 ` Eric Wong [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20201130233742.GA3077@dcvr \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).