user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 5/9] treewide: consolidate "From " line removal
  2023-10-11  7:20  7% [PATCH 0/9] lei + import-related updates Eric Wong
@ 2023-10-11  7:20  4% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2023-10-11  7:20 UTC (permalink / raw)
  To: meta

Aside from our prior import bugs (fixed in a0c07cba0e5d8b6a
(mda: drop leading "From " lines again, 2016-06-26)), we'll
always have to deal with mutt piping messages to us and
`git format-patch' output.  So just share the regexp so we
can use it everywhere.

It may be desirable to allow importing messages with a leading
"From " line for FUSE, even.

Additionally, some instances of this regexp needlessly added
optional `\r?' (CR) checks ahead of the `\n' (LF) element; but
they're pointless anyways since [^\n]* already matches all
non-LF bytes, CR included.
---
 lib/PublicInbox/Eml.pm        |  6 ++++++
 lib/PublicInbox/IMAP.pm       |  2 +-
 lib/PublicInbox/Import.pm     |  8 +++-----
 lib/PublicInbox/LeiInput.pm   |  5 +----
 lib/PublicInbox/LeiInspect.pm |  2 +-
 lib/PublicInbox/LeiToMail.pm  |  3 +--
 lib/PublicInbox/Mbox.pm       | 16 +++++++---------
 lib/PublicInbox/MboxReader.pm |  2 +-
 lib/PublicInbox/NNTP.pm       |  3 +--
 script/public-inbox-edit      |  5 ++---
 script/public-inbox-learn     |  2 +-
 script/public-inbox-mda       |  4 ++--
 script/public-inbox-purge     |  4 ++--
 13 files changed, 29 insertions(+), 33 deletions(-)

diff --git a/lib/PublicInbox/Eml.pm b/lib/PublicInbox/Eml.pm
index 8b999e1a..24060ec8 100644
--- a/lib/PublicInbox/Eml.pm
+++ b/lib/PublicInbox/Eml.pm
@@ -528,4 +528,10 @@ sub willneed { re_memo($_) for @_ }
 willneed(qw(From To Cc Date Subject Content-Type In-Reply-To References
 		Message-ID X-Alt-Message-ID));
 
+# This fixes an old bug from import (pre-a0c07cba0e5d8b6a)
+# mutt also pipes single RFC822 messages with a "From " line,
+# but no Content-Length or "From " escaping.
+# "git format-patch" also generates such files by default.
+sub strip_from { $_[0] =~ s/\A[\r\n]*From [^\n]*\n//s }
+
 1;
diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm
index 3c64cefa..e4a9e304 100644
--- a/lib/PublicInbox/IMAP.pm
+++ b/lib/PublicInbox/IMAP.pm
@@ -664,7 +664,7 @@ sub op_eml_new { $_[4] = PublicInbox::Eml->new($_[3]) }
 # s/From / fixes old bug from import (pre-a0c07cba0e5d8b6a)
 sub to_crlf_full {
 	${$_[0]} =~ s/(?<!\r)\n/\r\n/sg;
-	${$_[0]} =~ s/\A[\r\n]*From [^\r\n]*\r\n//s;
+	PublicInbox::Eml::strip_from(${$_[0]});
 }
 
 sub op_crlf_bref { to_crlf_full($_[3]) }
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 7175884c..cd03da05 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -118,9 +118,6 @@ sub _cat_blob ($$$) {
 	$n == $len or croak "cat-blob: short read: $n < $len";
 	my $lf = chop $buf;
 	croak "bad read on final byte: <$lf>" if $lf ne "\n";
-
-	# fixup some bugginess in old versions:
-	$buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
 	\$buf;
 }
 
@@ -136,8 +133,9 @@ sub check_remove_v1 {
 	my $info = _check_path($r, $w, $tip, $path) or return ('MISSING',undef);
 	$info =~ m!\A100644 blob ([a-f0-9]{40,})\t!s or die "not blob: $info";
 	my $oid = $1;
-	my $msg = _cat_blob($r, $w, $oid) or die "BUG: cat-blob $1 failed";
-	my $cur = PublicInbox::Eml->new($msg);
+	my $bref = _cat_blob($r, $w, $oid) or die "BUG: cat-blob $1 failed";
+	PublicInbox::Eml::strip_from($$bref);
+	my $cur = PublicInbox::Eml->new($bref);
 	my $cur_s = $cur->header('Subject') // '';
 	my $cur_m = $mime->header('Subject') // '';
 	if ($cur_s ne $cur_m || norm_body($cur) ne norm_body($mime)) {
diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm
index 93f8b6b8..28b73ca9 100644
--- a/lib/PublicInbox/LeiInput.pm
+++ b/lib/PublicInbox/LeiInput.pm
@@ -84,10 +84,7 @@ sub input_fh {
 			return $self->{lei}->child_error(0, <<"");
 error reading $name: $!
 
-		# mutt pipes single RFC822 messages with a "From " line,
-		# but no Content-Length or "From " escaping.
-		# "git format-patch" also generates such files by default.
-		$buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
+		PublicInbox::Eml::strip_from($buf);
 
 		# a user may feed just a body: git diff | lei rediff -U9
 		if ($self->{-force_eml}) {
diff --git a/lib/PublicInbox/LeiInspect.pm b/lib/PublicInbox/LeiInspect.pm
index f801610f..65c64cf2 100644
--- a/lib/PublicInbox/LeiInspect.pm
+++ b/lib/PublicInbox/LeiInspect.pm
@@ -254,7 +254,7 @@ sub inspect_start ($$) {
 sub do_inspect { # lei->do_env cb
 	my ($lei) = @_;
 	my $str = delete $lei->{istr};
-	$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
+	PublicInbox::Eml::strip_from($str);
 	my $eml = PublicInbox::Eml->new(\$str);
 	inspect_start($lei, [ 'blob:'.$lei->git_oid($eml)->hexdigest,
 			map { "mid:$_" } @{mids($eml)} ]);
diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm
index 8771592d..ead60b38 100644
--- a/lib/PublicInbox/LeiToMail.pm
+++ b/lib/PublicInbox/LeiToMail.pm
@@ -53,8 +53,7 @@ sub _mbox_hdr_buf ($$$) {
 	}
 	my $buf = delete $eml->{hdr};
 
-	# fixup old bug from import (pre-a0c07cba0e5d8b6a)
-	$$buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
+	PublicInbox::Eml::strip_from($$buf);
 	my $ident = $smsg->{blob} // 'lei';
 	if (defined(my $pct = $smsg->{pct})) { $ident .= "=$pct" }
 
diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index bf61bb0e..52f88ae3 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -89,17 +89,15 @@ sub emit_raw {
 
 sub msg_hdr ($$) {
 	my ($ctx, $eml) = @_;
-	my $header_obj = $eml->header_obj;
 
-	# drop potentially confusing headers, ssoma already should've dropped
-	# Lines and Content-Length
-	foreach my $d (qw(Lines Bytes Content-Length Status)) {
-		$header_obj->header_set($d);
+	# drop potentially confusing headers, various importers should've
+	# already dropped these, but we can't trust stuff we've cloned
+	for my $d (qw(Lines Bytes Content-Length Status)) {
+		$eml->header_set($d);
 	}
-	my $crlf = $header_obj->crlf;
-	my $buf = $header_obj->as_string;
-	# fixup old bug from import (pre-a0c07cba0e5d8b6a)
-	$buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
+	my $crlf = $eml->crlf;
+	my $buf = $eml->header_obj->as_string;
+	PublicInbox::Eml::strip_from($buf);
 	"From mboxrd\@z Thu Jan  1 00:00:00 1970" . $crlf . $buf . $crlf;
 }
 
diff --git a/lib/PublicInbox/MboxReader.pm b/lib/PublicInbox/MboxReader.pm
index e4209022..d67fb4eb 100644
--- a/lib/PublicInbox/MboxReader.pm
+++ b/lib/PublicInbox/MboxReader.pm
@@ -93,7 +93,7 @@ sub _mbox_cl ($$$;@) {
 			undef $mbfh;
 		}
 		while (my $hdr = _extract_hdr(\$buf)) {
-			$$hdr =~ s/\A[\r\n]*From [^\n]*\n//s or
+			PublicInbox::Eml::strip_from($$hdr) or
 				die "E: no 'From ' line in:\n", Dumper($hdr);
 			my $eml = PublicInbox::Eml->new($hdr);
 			next unless $eml->raw_size;
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 316b7775..603cf094 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -523,8 +523,7 @@ sub msg_hdr_write ($$) {
 	set_nntp_headers($eml, $smsg);
 
 	my $hdr = $eml->{hdr} // \(my $x = '');
-	# fixup old bug from import (pre-a0c07cba0e5d8b6a)
-	$$hdr =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
+	PublicInbox::Eml::strip_from($$hdr);
 	$$hdr =~ s/(?<!\r)\n/\r\n/sg; # Alpine barfs without this
 
 	# for leafnode compatibility, we need to ensure Message-ID headers
diff --git a/script/public-inbox-edit b/script/public-inbox-edit
index 1fbaf5a7..1fb6f32b 100755
--- a/script/public-inbox-edit
+++ b/script/public-inbox-edit
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 #
 # Used for editing messages in a public-inbox.
@@ -188,8 +188,7 @@ retry_edit:
 		"read $edit_fn: $!\n";
 
 	if (!$opt->{raw}) {
-		# get rid of the From we added
-		$new_raw =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
+		PublicInbox::Eml::strip_from($new_raw);
 
 		# check if user forgot to purge (in mutt) after editing
 		if ($new_raw =~ /^From /sm) {
diff --git a/script/public-inbox-learn b/script/public-inbox-learn
index 8b8e1b77..6e1978a7 100755
--- a/script/public-inbox-learn
+++ b/script/public-inbox-learn
@@ -40,7 +40,7 @@ my $pi_cfg = PublicInbox::Config->new;
 my $err;
 my $mime = PublicInbox::Eml->new(do{
 	defined(my $data = do { local $/; <STDIN> }) or die "read STDIN: $!\n";
-	$data =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
+	PublicInbox::Eml::strip_from($data);
 
 	if ($train ne 'rm') {
 		eval {
diff --git a/script/public-inbox-mda b/script/public-inbox-mda
index ba498956..cac819ac 100755
--- a/script/public-inbox-mda
+++ b/script/public-inbox-mda
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2013-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 #
 # Mail delivery agent for public-inbox, run from your MTA upon mail delivery
@@ -39,7 +39,7 @@ use PublicInbox::Spamcheck;
 my $emergency = $ENV{PI_EMERGENCY} || "$ENV{HOME}/.public-inbox/emergency/";
 $ems = PublicInbox::Emergency->new($emergency);
 my $str = do { local $/; <STDIN> };
-$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
+PublicInbox::Eml::strip_from($str);
 $ems->prepare(\$str);
 my $eml = PublicInbox::Eml->new(\$str);
 my $cfg = PublicInbox::Config->new;
diff --git a/script/public-inbox-purge b/script/public-inbox-purge
index 121027cc..8f9b0b16 100755
--- a/script/public-inbox-purge
+++ b/script/public-inbox-purge
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 #
 # Used for purging messages entirely from a public-inbox.  Currently
@@ -34,7 +34,7 @@ my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt);
 PublicInbox::AdminEdit::check_editable(\@ibxs);
 
 defined(my $data = do { local $/; <STDIN> }) or die "read STDIN: $!\n";
-$data =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
+PublicInbox::Eml::strip_from($data);
 my $n_purged = 0;
 
 foreach my $ibx (@ibxs) {

^ permalink raw reply related	[relevance 4%]

* [PATCH 0/9] lei + import-related updates
@ 2023-10-11  7:20  7% Eric Wong
  2023-10-11  7:20  4% ` [PATCH 5/9] treewide: consolidate "From " line removal Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2023-10-11  7:20 UTC (permalink / raw)
  To: meta

A few more ProcessIO conversions to start with, and then
cleanups while I started working on import-related stuff.
Some of this will tie in nicely for FUSE, too...

I've realized msgtime warnings were pointless anyways since
there's nothing anybody can really do about bad messages that
get through various upstream spam filters.

5/9 is a long-overdue cleanup I noticed while going
over Import.pm

9/9 ought to fix the fragile t/lei-store-fail.t test
by using new features.

Eric Wong (9):
  lei rediff: use ProcessIO for --drq support
  lei_xsearch: improve curl progress reporting
  msgtime: quiet warnings we can do nothing about
  msgtime: simplify msg_timestamp and msg_datestamp
  treewide: consolidate "From " line removal
  import: switch to Unix stream socket for fast-import
  import: cat_blob is a no-op w/o live fast-import
  lei blob: run cat_blob on lei/store for pending blobs
  lei import|tag|rm: support --commit-delay=SECONDS

 lib/PublicInbox/Eml.pm        |   6 ++
 lib/PublicInbox/IMAP.pm       |   2 +-
 lib/PublicInbox/Import.pm     | 138 ++++++++++++++++------------------
 lib/PublicInbox/LEI.pm        |  23 +++---
 lib/PublicInbox/LeiBlob.pm    |  16 ++--
 lib/PublicInbox/LeiInput.pm   |   5 +-
 lib/PublicInbox/LeiInspect.pm |   2 +-
 lib/PublicInbox/LeiRediff.pm  |  33 ++++----
 lib/PublicInbox/LeiStore.pm   |  11 +++
 lib/PublicInbox/LeiToMail.pm  |   3 +-
 lib/PublicInbox/LeiXSearch.pm |  34 +++++----
 lib/PublicInbox/Mbox.pm       |  16 ++--
 lib/PublicInbox/MboxReader.pm |   2 +-
 lib/PublicInbox/MsgTime.pm    |  49 +++++-------
 lib/PublicInbox/NNTP.pm       |   3 +-
 lib/PublicInbox/ProcessIO.pm  |  18 ++---
 lib/PublicInbox/Spawn.pm      |   1 +
 script/public-inbox-convert   |  18 ++---
 script/public-inbox-edit      |   5 +-
 script/public-inbox-learn     |   2 +-
 script/public-inbox-mda       |   4 +-
 script/public-inbox-purge     |   4 +-
 t/lei-import.t                |  13 ++++
 t/lei-store-fail.t            |  20 +++--
 t/lei-tag.t                   |  15 +++-
 25 files changed, 230 insertions(+), 213 deletions(-)


^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2023-10-11  7:20  7% [PATCH 0/9] lei + import-related updates Eric Wong
2023-10-11  7:20  4% ` [PATCH 5/9] treewide: consolidate "From " line removal Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).