From 906393b801050e303d2ec2a660c85de4a5fa4740 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 1 Apr 2020 06:16:20 +0000 Subject: mid: add $MID_EXTRACT regexp for export This allows us to consistently enforce the same Message-ID extraction rules everywhere and makes it easier for us to make changes in the future. Update scripts/ssoma-replay, as well, but don't rely on PublicInbox::* modules in that since it's legacy and public-inbox was never a dependency of ssoma. --- lib/PublicInbox/Linkify.pm | 3 ++- lib/PublicInbox/MID.pm | 12 +++++++----- lib/PublicInbox/NNTP.pm | 16 ++++++++-------- lib/PublicInbox/SearchThread.pm | 3 ++- lib/PublicInbox/View.pm | 5 +++-- 5 files changed, 22 insertions(+), 17 deletions(-) (limited to 'lib/PublicInbox') diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm index 2bd8f64a..b85bedfe 100644 --- a/lib/PublicInbox/Linkify.pm +++ b/lib/PublicInbox/Linkify.pm @@ -14,6 +14,7 @@ use strict; use warnings; use Digest::SHA qw/sha1_hex/; use PublicInbox::Hval qw(ascii_html mid_href); +use PublicInbox::MID qw($MID_EXTRACT); my $SALT = rand; my $LINK_RE = qr{([\('!])?\b((?:ftps?|https?|nntps?|gopher):// @@ -93,7 +94,7 @@ sub linkify_2 { # with $pfx being the URL prefix sub linkify_mids { my ($self, $pfx, $str, $raw) = @_; - $$str =~ s!<([^>]+)>! + $$str =~ s!$MID_EXTRACT! my $mid = $1; my $html = ascii_html($mid); my $href = mid_href($mid); diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm index d2bbaec7..dddde092 100644 --- a/lib/PublicInbox/MID.pm +++ b/lib/PublicInbox/MID.pm @@ -6,8 +6,8 @@ package PublicInbox::MID; use strict; use warnings; use base qw/Exporter/; -our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime mid_escape MID_ESC - mids references mids_for_index/; +our @EXPORT_OK = qw(mid_clean id_compress mid2path mid_mime mid_escape MID_ESC + mids references mids_for_index $MID_EXTRACT); use URI::Escape qw(uri_escape_utf8); use Digest::SHA qw/sha1_hex/; require PublicInbox::Address; @@ -16,11 +16,13 @@ use constant { MAX_MID_SIZE => 244, # max term size (Xapian limitation) - length('Q') }; +our $MID_EXTRACT = qr/<([^>]+)>/s; + sub mid_clean { my ($mid) = @_; defined($mid) or die "no Message-ID"; # MDA->precheck did more checking for us - if ($mid =~ /<([^>]+)>/) { + if ($mid =~ $MID_EXTRACT) { $mid = $1; } $mid; @@ -58,7 +60,7 @@ sub mid_mime ($) { mids($_[0]->header_obj)->[0] } sub extract_mids { my @mids; for my $v (@_) { - my @cur = ($v =~ /<([^>]+)>/sg); + my @cur = ($v =~ /$MID_EXTRACT/g); if (@cur) { push(@mids, @cur); } else { @@ -92,7 +94,7 @@ sub references ($) { foreach my $f (qw(References In-Reply-To)) { my @v = $hdr->header_raw($f); foreach my $v (@v) { - push(@mids, ($v =~ /<([^>]+)>/sg)); + push(@mids, ($v =~ /$MID_EXTRACT/g)); } } diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index 277657e6..39e2f88e 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -7,7 +7,7 @@ use strict; use warnings; use base qw(PublicInbox::DS); use fields qw(nntpd article ng long_cb); -use PublicInbox::MID qw(mid_escape); +use PublicInbox::MID qw(mid_escape $MID_EXTRACT); use Email::Simple; use POSIX qw(strftime); use PublicInbox::DS qw(now); @@ -24,7 +24,7 @@ use constant { }; use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); use Errno qw(EAGAIN); - +my $ONE_MSGID = qr/\A$MID_EXTRACT\z/; my @OVERVIEW = qw(Subject From Date Message-ID References); my $OVERVIEW_FMT = join(":\r\n", @OVERVIEW, qw(Bytes Lines), '') . "Xref:full\r\n"; @@ -450,7 +450,7 @@ sub art_lookup ($$$) { $err = '423 no such article number in this group'; $n = int($art); goto find_mid; - } elsif ($art =~ /\A<([^>]+)>\z/) { + } elsif ($art =~ $ONE_MSGID) { $mid = $1; $err = r430; $n = $ng->mm->num_for($mid) if $ng; @@ -653,7 +653,7 @@ sub hdr_msgid_range_i { sub hdr_message_id ($$$) { # optimize XHDR Message-ID [range] for slrnpull. my ($self, $xhdr, $range) = @_; - if (defined $range && $range =~ /\A<(.+)>\z/) { # Message-ID + if (defined $range && $range =~ $ONE_MSGID) { my ($ng, $n) = mid_lookup($self, $1); return r430 unless $n; hdr_mid_response($self, $xhdr, $ng, $n, $range, $range); @@ -696,7 +696,7 @@ sub xref_range_i { sub hdr_xref ($$$) { # optimize XHDR Xref [range] for rtin my ($self, $xhdr, $range) = @_; - if (defined $range && $range =~ /\A<(.+)>\z/) { # Message-ID + if (defined $range && $range =~ $ONE_MSGID) { my $mid = $1; my ($ng, $n) = mid_lookup($self, $mid); return r430 unless $n; @@ -734,7 +734,7 @@ sub smsg_range_i { sub hdr_smsg ($$$$) { my ($self, $xhdr, $field, $range) = @_; - if (defined $range && $range =~ /\A<(.+)>\z/) { # Message-ID + if (defined $range && $range =~ $ONE_MSGID) { my ($ng, $n) = mid_lookup($self, $1); return r430 unless defined $n; my $v = over_header_for($ng->over, $n, $field); @@ -843,7 +843,7 @@ sub over_line ($$$$) { sub cmd_over ($;$) { my ($self, $range) = @_; - if ($range && $range =~ /\A<(.+)>\z/) { + if ($range && $range =~ $ONE_MSGID) { my ($ng, $n) = mid_lookup($self, $1); defined $n or return r430; my $smsg = $ng->over->get_art($n) or return r430; @@ -911,7 +911,7 @@ sub zflush {} # overridden by NNTPdeflate sub cmd_xpath ($$) { my ($self, $mid) = @_; - return r501 unless $mid =~ /\A<(.+)>\z/; + return r501 unless $mid =~ $ONE_MSGID; $mid = $1; my @paths; foreach my $ng (values %{$self->{nntpd}->{groups}}) { diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm index ab2f1a84..8b2cb805 100644 --- a/lib/PublicInbox/SearchThread.pm +++ b/lib/PublicInbox/SearchThread.pm @@ -20,6 +20,7 @@ package PublicInbox::SearchThread; use strict; use warnings; +use PublicInbox::MID qw($MID_EXTRACT); sub thread { my ($msgs, $ordersub, $ctx) = @_; @@ -67,7 +68,7 @@ sub _add_message ($$) { # everything is perfectly referenced, only the last ref # matters. my $prev; - foreach my $ref ($refs =~ m/<([^>]+)>/g) { + foreach my $ref ($refs =~ m/$MID_EXTRACT/go) { # Find a Container object for the given Message-ID my $cont = _get_cont_for_id($id_table, $ref); diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 5baaffaf..89174296 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -10,7 +10,8 @@ use bytes (); # only for bytes::length use PublicInbox::MsgTime qw(msg_datestamp); use PublicInbox::Hval qw(ascii_html obfuscate_addrs prurl mid_href); use PublicInbox::Linkify; -use PublicInbox::MID qw/id_compress mids mids_for_index references/; +use PublicInbox::MID qw(id_compress mids mids_for_index references + $MID_EXTRACT); use PublicInbox::MsgIter; use PublicInbox::Address; use PublicInbox::WwwStream; @@ -299,7 +300,7 @@ sub _th_index_lite { if (my $smsg = $node->{smsg}) { # delete saves about 200KB on a 1K message thread if (my $refs = delete $smsg->{references}) { - ($$irt) = ($refs =~ m/<([^>]+)>\z/); + ($$irt) = ($refs =~ m/$MID_EXTRACT\z/o); } } my $irt_map = $mapping->{$$irt} if defined $$irt; -- cgit v1.2.3-24-ge0c7