From 82ffb3c183ac20e00effa8a5a7b664eda59672de Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 30 Nov 2020 23:37:42 +0000 Subject: nntp: make ->ALL Xref generation more fuzzy For ->ALL users, this mitigates the regression introduced by commit 811b8d3cbaa790f59b7b107140b86248da16499b ("nntp: xref: use ->ALL extindex if available"), since it's common to cross post messages to some mailing lists with per-list trailers for unsubscribe information. We won't bother dealing with Bcc-ed messages since those are nearly all spam when it comes to public mailing lists. Fixes: 811b8d3cbaa790f5 ("nntp: xref: use ->ALL extindex if available") Link: https://public-inbox.org/meta/20201130194201.GA6687@dcvr/ --- lib/PublicInbox/ExtSearch.pm | 31 +++++++++++++-------------- lib/PublicInbox/NNTP.pm | 50 +++++++++++++++++++++++++++++++------------- 2 files changed, 51 insertions(+), 30 deletions(-) (limited to 'lib/PublicInbox') diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm index 20ec3224..80455d8d 100644 --- a/lib/PublicInbox/ExtSearch.pm +++ b/lib/PublicInbox/ExtSearch.pm @@ -50,8 +50,7 @@ sub git { $self->{git} //= PublicInbox::Git->new("$self->{topdir}/ALL.git"); } -# returns an arrayref of [ $NEWSGROUP_NAME:$ART_NO ] using -# the `xref3' table +# returns a hashref of { $NEWSGROUP_NAME => $ART_NO } using the `xref3' table sub nntp_xref_for { # NNTP only my ($self, $xibx, $xsmsg) = @_; my $dbh = over($self)->dbh; @@ -69,7 +68,9 @@ SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1 SELECT docid FROM xref3 WHERE oidbin = ? AND xnum = ? AND ibx_id = ? LIMIT 1 $sth->bind_param(1, pack('H*', $xsmsg->{blob}), SQL_BLOB); - $sth->bind_param(2, $xsmsg->{num}); + + # NNTP::cmd_over can set {num} to zero according to RFC 3977 8.3.2 + $sth->bind_param(2, $xsmsg->{num} || $xsmsg->{-orig_num}); $sth->bind_param(3, $xibx_id); $sth->execute; my $docid = $sth->fetchrow_array // do { @@ -81,9 +82,9 @@ EOF # LIMIT is number of newsgroups on server: $sth = $dbh->prepare_cached(<<'', undef, 1); -SELECT ibx_id,xnum FROM xref3 WHERE docid = ? +SELECT ibx_id,xnum FROM xref3 WHERE docid = ? AND ibx_id != ? - $sth->execute($docid); + $sth->execute($docid, $xibx_id); my $rows = $sth->fetchall_arrayref; my $eidx_key_sth = $dbh->prepare_cached(<<'', undef, 1); @@ -91,18 +92,16 @@ SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1 my %xref = map { my ($ibx_id, $xnum) = @$_; - if ($ibx_id == $xibx_id) { - (); - } else { - $eidx_key_sth->execute($ibx_id); - my $eidx_key = $eidx_key_sth->fetchrow_array; - - # only include if there's a newsgroup name - $eidx_key && index($eidx_key, '/') >= 0 ? - () : ($eidx_key => $xnum) - } + + $eidx_key_sth->execute($ibx_id); + my $eidx_key = $eidx_key_sth->fetchrow_array; + + # only include if there's a newsgroup name + $eidx_key && index($eidx_key, '/') >= 0 ? + () : ($eidx_key => $xnum) } @$rows; - [ map { "$_:$xref{$_}" } sort keys %xref ]; # match NNTP LIST order + $xref{$xibx->{newsgroup}} = $xsmsg->{num}; + \%xref; } sub mm { undef } diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index 3b16a66a..e0916011 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -17,6 +17,8 @@ use PublicInbox::DS qw(now); use Digest::SHA qw(sha1_hex); use Time::Local qw(timegm timelocal); use PublicInbox::GitAsyncCat; +use PublicInbox::Address; + use constant { LINE_MAX => 512, # RFC 977 section 2.3 r501 => '501 command syntax error', @@ -417,27 +419,42 @@ sub header_append ($$$) { $hdr->header_set($k, @v, $v); } +sub xref_by_tc ($$$) { + my ($xref, $pi_cfg, $smsg) = @_; + my $by_addr = $pi_cfg->{-by_addr}; + my $groups = $pi_cfg->{-by_newsgroup}; + my $mid = $smsg->{mid}; + for my $f (qw(to cc)) { + my @ibxs = map { + $by_addr->{lc($_)} // () + } (PublicInbox::Address::emails($smsg->{$f} // '')); + for my $ibx (@ibxs) { + $groups->{my $ngname = $ibx->{newsgroup}} or next; + next if defined $xref->{$ngname}; + $xref->{$ngname} = eval { $ibx->mm->num_for($mid) }; + } + } +} + sub xref ($$$) { my ($self, $cur_ibx, $smsg) = @_; my $nntpd = $self->{nntpd}; - my $cur_ngname = $cur_ibx->{newsgroup}; - my $ret = "$nntpd->{servername} $cur_ngname:$smsg->{num}"; + my $cur_ng = $cur_ibx->{newsgroup}; + my $xref; if (my $ALL = $nntpd->{pi_config}->ALL) { - if (my $ary = $ALL->nntp_xref_for($cur_ibx, $smsg)) { - $ret .= join(' ', '', @$ary) if scalar(@$ary); - } - # better off wrong than slow if there's thousands of groups, - # so no fallback to the slow path below: + $xref = $ALL->nntp_xref_for($cur_ibx, $smsg); + xref_by_tc($xref, $nntpd->{pi_config}, $smsg); } else { # slow path + $xref = { $cur_ng => $smsg->{num} }; my $mid = $smsg->{mid}; - my $groups = $nntpd->{pi_config}->{-by_newsgroup}; - for my $xngname (@{$nntpd->{groupnames}}) { - next if $cur_ngname eq $xngname; - my $xibx = $groups->{$xngname} or next; - my $num = eval { $xibx->mm->num_for($mid) } or next; - $ret .= " $xngname:$num"; + for my $ibx (values %{$nntpd->{pi_config}->{-by_newsgroup}}) { + next if defined($xref->{$ibx->{newsgroup}}); + my $num = eval { $ibx->mm->num_for($mid) } // next; + $xref->{$ibx->{newsgroup}} = $num; } } + my $ret = "$nntpd->{servername} $cur_ng:".delete($xref->{$cur_ng}); + $ret .= " $_:$xref->{$_}" for (sort keys %$xref); $ret; } @@ -930,8 +947,13 @@ sub cmd_over ($;$) { more($self, '224 Overview information follows (multi-line)'); # Only set article number column if it's the current group + # (RFC 3977 8.3.2) my $self_ng = $self->{ng}; - $smsg->{num} = 0 if (!$self_ng || $self_ng ne $ng); + if (!$self_ng || $self_ng ne $ng) { + # set {-orig_num} for nntp_xref_for + $smsg->{-orig_num} = $smsg->{num}; + $smsg->{num} = 0; + } more($self, over_line($self, $ng, $smsg)); '.'; } else { -- cgit v1.2.3-24-ge0c7