From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 3/3] mid: add $MID_EXTRACT regexp for export
Date: Wed, 1 Apr 2020 06:16:20 +0000 [thread overview]
Message-ID: <20200401061620.29276-4-e@yhbt.net> (raw)
In-Reply-To: <20200401061620.29276-1-e@yhbt.net>
This allows us to consistently enforce the same Message-ID
extraction rules everywhere and makes it easier for us to
make changes in the future.
Update scripts/ssoma-replay, as well, but don't rely on
PublicInbox::* modules in that since it's legacy and
public-inbox was never a dependency of ssoma.
---
lib/PublicInbox/Linkify.pm | 3 ++-
lib/PublicInbox/MID.pm | 12 +++++++-----
lib/PublicInbox/NNTP.pm | 16 ++++++++--------
lib/PublicInbox/SearchThread.pm | 3 ++-
lib/PublicInbox/View.pm | 5 +++--
scripts/ssoma-replay | 2 +-
6 files changed, 23 insertions(+), 18 deletions(-)
diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm
index 2bd8f64a..b85bedfe 100644
--- a/lib/PublicInbox/Linkify.pm
+++ b/lib/PublicInbox/Linkify.pm
@@ -14,6 +14,7 @@ use strict;
use warnings;
use Digest::SHA qw/sha1_hex/;
use PublicInbox::Hval qw(ascii_html mid_href);
+use PublicInbox::MID qw($MID_EXTRACT);
my $SALT = rand;
my $LINK_RE = qr{([\('!])?\b((?:ftps?|https?|nntps?|gopher)://
@@ -93,7 +94,7 @@ sub linkify_2 {
# with $pfx being the URL prefix
sub linkify_mids {
my ($self, $pfx, $str, $raw) = @_;
- $$str =~ s!<([^>]+)>!
+ $$str =~ s!$MID_EXTRACT!
my $mid = $1;
my $html = ascii_html($mid);
my $href = mid_href($mid);
diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm
index d2bbaec7..dddde092 100644
--- a/lib/PublicInbox/MID.pm
+++ b/lib/PublicInbox/MID.pm
@@ -6,8 +6,8 @@ package PublicInbox::MID;
use strict;
use warnings;
use base qw/Exporter/;
-our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime mid_escape MID_ESC
- mids references mids_for_index/;
+our @EXPORT_OK = qw(mid_clean id_compress mid2path mid_mime mid_escape MID_ESC
+ mids references mids_for_index $MID_EXTRACT);
use URI::Escape qw(uri_escape_utf8);
use Digest::SHA qw/sha1_hex/;
require PublicInbox::Address;
@@ -16,11 +16,13 @@ use constant {
MAX_MID_SIZE => 244, # max term size (Xapian limitation) - length('Q')
};
+our $MID_EXTRACT = qr/<([^>]+)>/s;
+
sub mid_clean {
my ($mid) = @_;
defined($mid) or die "no Message-ID";
# MDA->precheck did more checking for us
- if ($mid =~ /<([^>]+)>/) {
+ if ($mid =~ $MID_EXTRACT) {
$mid = $1;
}
$mid;
@@ -58,7 +60,7 @@ sub mid_mime ($) { mids($_[0]->header_obj)->[0] }
sub extract_mids {
my @mids;
for my $v (@_) {
- my @cur = ($v =~ /<([^>]+)>/sg);
+ my @cur = ($v =~ /$MID_EXTRACT/g);
if (@cur) {
push(@mids, @cur);
} else {
@@ -92,7 +94,7 @@ sub references ($) {
foreach my $f (qw(References In-Reply-To)) {
my @v = $hdr->header_raw($f);
foreach my $v (@v) {
- push(@mids, ($v =~ /<([^>]+)>/sg));
+ push(@mids, ($v =~ /$MID_EXTRACT/g));
}
}
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 277657e6..39e2f88e 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -7,7 +7,7 @@ use strict;
use warnings;
use base qw(PublicInbox::DS);
use fields qw(nntpd article ng long_cb);
-use PublicInbox::MID qw(mid_escape);
+use PublicInbox::MID qw(mid_escape $MID_EXTRACT);
use Email::Simple;
use POSIX qw(strftime);
use PublicInbox::DS qw(now);
@@ -24,7 +24,7 @@ use constant {
};
use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT);
use Errno qw(EAGAIN);
-
+my $ONE_MSGID = qr/\A$MID_EXTRACT\z/;
my @OVERVIEW = qw(Subject From Date Message-ID References);
my $OVERVIEW_FMT = join(":\r\n", @OVERVIEW, qw(Bytes Lines), '') .
"Xref:full\r\n";
@@ -450,7 +450,7 @@ sub art_lookup ($$$) {
$err = '423 no such article number in this group';
$n = int($art);
goto find_mid;
- } elsif ($art =~ /\A<([^>]+)>\z/) {
+ } elsif ($art =~ $ONE_MSGID) {
$mid = $1;
$err = r430;
$n = $ng->mm->num_for($mid) if $ng;
@@ -653,7 +653,7 @@ sub hdr_msgid_range_i {
sub hdr_message_id ($$$) { # optimize XHDR Message-ID [range] for slrnpull.
my ($self, $xhdr, $range) = @_;
- if (defined $range && $range =~ /\A<(.+)>\z/) { # Message-ID
+ if (defined $range && $range =~ $ONE_MSGID) {
my ($ng, $n) = mid_lookup($self, $1);
return r430 unless $n;
hdr_mid_response($self, $xhdr, $ng, $n, $range, $range);
@@ -696,7 +696,7 @@ sub xref_range_i {
sub hdr_xref ($$$) { # optimize XHDR Xref [range] for rtin
my ($self, $xhdr, $range) = @_;
- if (defined $range && $range =~ /\A<(.+)>\z/) { # Message-ID
+ if (defined $range && $range =~ $ONE_MSGID) {
my $mid = $1;
my ($ng, $n) = mid_lookup($self, $mid);
return r430 unless $n;
@@ -734,7 +734,7 @@ sub smsg_range_i {
sub hdr_smsg ($$$$) {
my ($self, $xhdr, $field, $range) = @_;
- if (defined $range && $range =~ /\A<(.+)>\z/) { # Message-ID
+ if (defined $range && $range =~ $ONE_MSGID) {
my ($ng, $n) = mid_lookup($self, $1);
return r430 unless defined $n;
my $v = over_header_for($ng->over, $n, $field);
@@ -843,7 +843,7 @@ sub over_line ($$$$) {
sub cmd_over ($;$) {
my ($self, $range) = @_;
- if ($range && $range =~ /\A<(.+)>\z/) {
+ if ($range && $range =~ $ONE_MSGID) {
my ($ng, $n) = mid_lookup($self, $1);
defined $n or return r430;
my $smsg = $ng->over->get_art($n) or return r430;
@@ -911,7 +911,7 @@ sub zflush {} # overridden by NNTPdeflate
sub cmd_xpath ($$) {
my ($self, $mid) = @_;
- return r501 unless $mid =~ /\A<(.+)>\z/;
+ return r501 unless $mid =~ $ONE_MSGID;
$mid = $1;
my @paths;
foreach my $ng (values %{$self->{nntpd}->{groups}}) {
diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm
index ab2f1a84..8b2cb805 100644
--- a/lib/PublicInbox/SearchThread.pm
+++ b/lib/PublicInbox/SearchThread.pm
@@ -20,6 +20,7 @@
package PublicInbox::SearchThread;
use strict;
use warnings;
+use PublicInbox::MID qw($MID_EXTRACT);
sub thread {
my ($msgs, $ordersub, $ctx) = @_;
@@ -67,7 +68,7 @@ sub _add_message ($$) {
# everything is perfectly referenced, only the last ref
# matters.
my $prev;
- foreach my $ref ($refs =~ m/<([^>]+)>/g) {
+ foreach my $ref ($refs =~ m/$MID_EXTRACT/go) {
# Find a Container object for the given Message-ID
my $cont = _get_cont_for_id($id_table, $ref);
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 5baaffaf..89174296 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -10,7 +10,8 @@ use bytes (); # only for bytes::length
use PublicInbox::MsgTime qw(msg_datestamp);
use PublicInbox::Hval qw(ascii_html obfuscate_addrs prurl mid_href);
use PublicInbox::Linkify;
-use PublicInbox::MID qw/id_compress mids mids_for_index references/;
+use PublicInbox::MID qw(id_compress mids mids_for_index references
+ $MID_EXTRACT);
use PublicInbox::MsgIter;
use PublicInbox::Address;
use PublicInbox::WwwStream;
@@ -299,7 +300,7 @@ sub _th_index_lite {
if (my $smsg = $node->{smsg}) {
# delete saves about 200KB on a 1K message thread
if (my $refs = delete $smsg->{references}) {
- ($$irt) = ($refs =~ m/<([^>]+)>\z/);
+ ($$irt) = ($refs =~ m/$MID_EXTRACT\z/o);
}
}
my $irt_map = $mapping->{$$irt} if defined $$irt;
diff --git a/scripts/ssoma-replay b/scripts/ssoma-replay
index 46b15d7e..3e928084 100755
--- a/scripts/ssoma-replay
+++ b/scripts/ssoma-replay
@@ -52,7 +52,7 @@ if (defined $list_id) {
if (defined $domain) {
$archive_url = "https://$domain/$user/";
my $mid = $header_obj->header('Message-Id');
- if ($mid =~ /\A<(.+)>\z/) {
+ if ($mid =~ /<[ \t]*([^>]+)?[ \t]*>/s) {
$mid = $1;
}
$mid = uri_escape_utf8($mid,
prev parent reply other threads:[~2020-04-01 6:16 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-04-01 6:16 [PATCH 0/3] Message-ID-related cleanups Eric Wong
2020-04-01 6:16 ` [PATCH 1/3] smsg: inline _extract_mid functionality Eric Wong
2020-04-01 6:16 ` [PATCH 2/3] searchidx: v1: skip mid_clean on mid_mime results Eric Wong
2020-04-01 6:16 ` Eric Wong [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200401061620.29276-4-e@yhbt.net \
--to=e@yhbt.net \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).