user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 05/13] smsg: introduce ->populate method
Date: Mon,  1 Jun 2020 10:06:49 +0000	[thread overview]
Message-ID: <20200601100657.14700-6-e@yhbt.net> (raw)
In-Reply-To: <20200601100657.14700-1-e@yhbt.net>

This will eventually replace the methods which call __hdr() and
eradicate {mime} usage from Smsg.  For now, we can eliminate
PublicInbox::Smsg->new since most callers already rely on an
open-coded `bless' to avoid the old {mime} arg.
---
 lib/PublicInbox/Import.pm     | 40 ++++++++++++++++----------------
 lib/PublicInbox/SearchIdx.pm  | 31 +++++++++++--------------
 lib/PublicInbox/Smsg.pm       | 43 +++++++++++++++++++++++++++--------
 lib/PublicInbox/V2Writable.pm |  9 ++++----
 t/import.t                    |  3 ++-
 5 files changed, 73 insertions(+), 53 deletions(-)

diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 1a7ed9ce878..ab75aa00dc2 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -12,7 +12,8 @@ use v5.10.1;
 use PublicInbox::Spawn qw(spawn popen_rd);
 use PublicInbox::MID qw(mids mid2path);
 use PublicInbox::Address;
-use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+use PublicInbox::Smsg;
+use PublicInbox::MsgTime qw(msg_datestamp);
 use PublicInbox::ContentHash qw(content_digest);
 use PublicInbox::MDA;
 use PublicInbox::Eml;
@@ -269,8 +270,8 @@ sub remove {
 	(($self->{tip} = ":$commit"), $cur);
 }
 
-sub git_timestamp {
-	my ($ts, $zone) = @_;
+sub git_timestamp ($) {
+	my ($ts, $zone) = @{$_[0]};
 	$ts = 0 if $ts < 0; # git uses unsigned times
 	"$ts $zone";
 }
@@ -278,10 +279,13 @@ sub git_timestamp {
 sub extract_cmt_info ($;$) {
 	my ($mime, $smsg) = @_;
 	# $mime is PublicInbox::Eml, but remains Email::MIME-compatible
+	$smsg //= bless {}, 'PublicInbox::Smsg';
 
-	my $sender = '';
 	my $hdr = $mime->header_obj;
-	my $from = $hdr->header('From') // '';
+	$smsg->populate($hdr);
+
+	my $sender = '';
+	my $from = delete($smsg->{From}) // '';
 	my ($email) = PublicInbox::Address::emails($from);
 	my ($name) = PublicInbox::Address::names($from);
 	if (!defined($name) || !defined($email)) {
@@ -313,17 +317,11 @@ sub extract_cmt_info ($;$) {
 		warn "no name in From: $from or Sender: $sender\n";
 	}
 
-	my $subject = $hdr->header('Subject') // '(no subject)';
-	# MIME decoding can create nulls replace them with spaces to protect git
-	$subject =~ tr/\0/ /;
+	my $subject = delete($smsg->{Subject}) // '(no subject)';
 	utf8::encode($subject);
-	my $at = git_timestamp(my @at = msg_datestamp($hdr));
-	my $ct = git_timestamp(my @ct = msg_timestamp($hdr));
-	if ($smsg) {
-		$smsg->{ds} = $at[0];
-		$smsg->{ts} = $ct[0];
-	}
-	($name, $email, $at, $ct, $subject);
+	my $at = git_timestamp(delete $smsg->{-ds});
+	my $ct = git_timestamp(delete $smsg->{-ts});
+	("$name <$email>", $at, $ct, $subject);
 }
 
 # kill potentially confusing/misleading headers
@@ -370,7 +368,7 @@ sub clean_tree_v2 ($$$) {
 sub add {
 	my ($self, $mime, $check_cb, $smsg) = @_;
 
-	my ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime, $smsg);
+	my ($author, $at, $ct, $subject) = extract_cmt_info($mime, $smsg);
 	my $path_type = $self->{path_type};
 	my $path;
 	if ($path_type eq '2/38') {
@@ -414,7 +412,7 @@ sub add {
 	}
 
 	print $w "commit $ref\nmark :$commit\n",
-		"author $name <$email> $at\n",
+		"author $author $at\n",
 		"committer $self->{ident} $ct\n" or wfail;
 	print $w "data ", (length($subject) + 1), "\n",
 		$subject, "\n\n" or wfail;
@@ -502,11 +500,11 @@ sub digest2mid ($$) {
 
 sub rewrite_commit ($$$$) {
 	my ($self, $oids, $buf, $mime) = @_;
-	my ($name, $email, $at, $ct, $subject);
+	my ($author, $at, $ct, $subject);
 	if ($mime) {
-		($name, $email, $at, $ct, $subject) = extract_cmt_info($mime);
+		($author, $at, $ct, $subject) = extract_cmt_info($mime);
 	} else {
-		$name = $email = '';
+		$author = '<>';
 		$subject = 'purged '.join(' ', @$oids);
 	}
 	@$oids = ();
@@ -515,7 +513,7 @@ sub rewrite_commit ($$$$) {
 		my $l = $buf->[$i];
 		if ($l =~ /^author .* ([0-9]+ [\+-]?[0-9]+)$/) {
 			$at //= $1;
-			$buf->[$i] = "author $name <$email> $at\n";
+			$buf->[$i] = "author $author $at\n";
 		} elsif ($l =~ /^committer .* ([0-9]+ [\+-]?[0-9]+)$/) {
 			$ct //= $1;
 			$buf->[$i] = "committer $self->{ident} $ct\n";
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index b4088933dbf..eb228e6bba7 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -156,16 +156,14 @@ sub index_text ($$$$) {
 	}
 }
 
-sub index_users ($$) {
+sub index_headers ($$) {
 	my ($self, $smsg) = @_;
-
-	my $from = $smsg->from;
-	my $to = $smsg->to;
-	my $cc = $smsg->cc;
-
-	index_text($self, $from, 1, 'A'); # A - author
-	index_text($self, $to, 1, 'XTO') if $to ne '';
-	index_text($self, $cc, 1, 'XCC') if $cc ne '';
+	my @x = (from => 'A', # Author
+		subject => 'S', to => 'XTO', cc => 'XCC');
+	while (my ($field, $pfx) = splice(@x, 0, 2)) {
+		my $val = $smsg->{$field};
+		index_text($self, $val, 1, $pfx) if $val ne '';
+	}
 }
 
 sub index_diff_inc ($$$$) {
@@ -285,9 +283,9 @@ sub index_xapian { # msg_iter callback
 	if ($part->{is_submsg}) {
 		my $mids = mids_for_index($part);
 		index_ids($self, $doc, $part, $mids);
-		my $smsg = PublicInbox::Smsg->new($part);
-		index_users($self, $smsg);
-		index_text($self, $smsg->subject, 1, 'S') if $smsg->subject;
+		my $smsg = bless {}, 'PublicInbox::Smsg';
+		$smsg->populate($part);
+		index_headers($self, $smsg);
 	}
 
 	my ($s, undef) = msg_part_text($part, $ct);
@@ -335,10 +333,8 @@ sub index_ids ($$$$) {
 
 sub add_xapian ($$$$) {
 	my ($self, $mime, $smsg, $mids) = @_;
-	$smsg->{mime} = $mime; # XXX dangerous
 	my $hdr = $mime->header_obj;
 	my $doc = $X->{Document}->new;
-	my $subj = $smsg->subject;
 	add_val($doc, PublicInbox::Search::TS(), $smsg->{ts});
 	my @ds = gmtime($smsg->{ds});
 	my $yyyymmdd = strftime('%Y%m%d', @ds);
@@ -348,8 +344,7 @@ sub add_xapian ($$$$) {
 
 	my $tg = term_generator($self);
 	$tg->set_document($doc);
-	index_text($self, $subj, 1, 'S') if $subj;
-	index_users($self, $smsg);
+	index_headers($self, $smsg);
 
 	msg_iter($mime, \&index_xapian, [ $self, $doc ]);
 	index_ids($self, $doc, $hdr, $mids);
@@ -392,8 +387,7 @@ sub add_message {
 	};
 
 	# v1 and tests only:
-	$smsg->{ds} //= msg_datestamp($hdr, $self->{autime});
-	$smsg->{ts} //= msg_timestamp($hdr, $self->{cotime});
+	$smsg->populate($hdr, $self);
 
 	eval {
 		# order matters, overview stores every possible piece of
@@ -649,6 +643,7 @@ sub read_log {
 		my $mime = do_cat_mail($git, $blob, \$bytes);
 		$del_cb->($self, $mime);
 	}
+	delete @$self{qw(autime cotime)};
 	$batch_cb->($nr, $latest, $newest);
 }
 
diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm
index 7a2766d8ff8..8e2771274a1 100644
--- a/lib/PublicInbox/Smsg.pm
+++ b/lib/PublicInbox/Smsg.pm
@@ -17,11 +17,6 @@ use PublicInbox::Address;
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 use Time::Local qw(timegm);
 
-sub new {
-	my ($class, $mime) = @_;
-	bless { mime => $mime }, $class;
-}
-
 sub wrap {
 	my ($class, $mid) = @_;
 	bless { mid => $mid }, $class;
@@ -36,11 +31,11 @@ sub get_val ($$) {
 sub to_doc_data {
 	my ($self) = @_;
 	join("\n",
-		$self->subject,
-		$self->from,
+		$self->{subject},
+		$self->{from},
 		$self->references,
-		$self->to,
-		$self->cc,
+		$self->{to},
+		$self->{cc},
 		$self->{blob},
 		$self->{mid},
 		$self->{bytes} // '',
@@ -115,6 +110,36 @@ sub __hdr ($$) {
 	};
 }
 
+# for Import and v1 WWW code paths
+sub populate {
+	my ($self, $hdr, $v2w) = @_;
+	for my $f (qw(From To Cc Subject)) {
+		my @all = $hdr->header($f);
+		my $val = join(', ', @all);
+		$val =~ tr/\r//d;
+		# MIME decoding can create NULs, replace them with spaces
+		# to protect git and NNTP clients
+		$val =~ tr/\0\t\n/   /;
+
+		# lower-case fields for read-only stuff
+		$self->{lc($f)} = $val;
+
+		# Capitalized From/Subject for git-fast-import
+		next if $f eq 'To' || $f eq 'Cc';
+		if (scalar(@all) > 1) {
+			$val = $all[0];
+			$val =~ tr/\r//d;
+			$val =~ tr/\0\t\n/   /;
+		}
+		$self->{$f} = $val if $val ne '';
+	}
+	$v2w //= {};
+	$self->{-ds} = [ my @ds = msg_datestamp($hdr, $v2w->{autime}) ];
+	$self->{-ts} = [ my @ts = msg_timestamp($hdr, $v2w->{cotime}) ];
+	$self->{ds} //= $ds[0]; # no zone
+	$self->{ts} //= $ts[0];
+}
+
 sub subject ($) { __hdr($_[0], 'Subject') }
 sub to ($) { __hdr($_[0], 'To') }
 sub cc ($) { __hdr($_[0], 'Cc') }
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 1a824531f3c..79bee7f9f3d 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -19,7 +19,6 @@ use PublicInbox::OverIdx;
 use PublicInbox::Msgmap;
 use PublicInbox::Spawn qw(spawn popen_rd);
 use PublicInbox::SearchIdx;
-use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 use PublicInbox::MultiMidQueue;
 use IO::Handle; # ->autoflush
 use File::Temp qw(tempfile);
@@ -156,8 +155,6 @@ sub add {
 # indexes a message, returns true if checkpointing is needed
 sub do_idx ($$$$) {
 	my ($self, $msgref, $mime, $smsg) = @_;
-	$smsg->{ds} //= msg_datestamp($mime->header_obj, $self->{autime});
-	$smsg->{ts} //= msg_timestamp($mime->header_obj, $self->{cotime});
 	$self->{over}->add_overview($mime, $smsg);
 	my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
 	$idx->index_raw($msgref, $mime, $smsg);
@@ -575,6 +572,8 @@ W: $list
 			num => $smsg->{num},
 			mid => $smsg->{mid},
 		}, 'PublicInbox::Smsg';
+		my $v2w = { autime => $smsg->{ds}, cotime => $smsg->{ts} };
+		$new_smsg->populate($new_mime, $v2w);
 		do_idx($self, \$raw, $new_mime, $new_smsg);
 	}
 	$rewritten->{rewrites};
@@ -968,6 +967,7 @@ sub reindex_oid_m ($$$$;$) {
 		blob => $oid,
 		mid => $mid0,
 	}, 'PublicInbox::Smsg';
+	$smsg->populate($mime, $self);
 	if (do_idx($self, $msgref, $mime, $smsg)) {
 		reindex_checkpoint($self, $sync, $git);
 	}
@@ -1059,6 +1059,7 @@ sub reindex_oid ($$$$) {
 		blob => $oid,
 		mid => $mid0,
 	}, 'PublicInbox::Smsg';
+	$smsg->populate($mime, $self);
 	if (do_idx($self, $msgref, $mime, $smsg)) {
 		reindex_checkpoint($self, $sync, $git);
 	}
@@ -1298,7 +1299,7 @@ sub index_epoch ($$$) {
 		}
 	}
 	close $fh or die "git log failed: \$?=$?";
-	delete $self->{reindex_pipe};
+	delete @$self{qw(reindex_pipe autime cotime)};
 	update_last_commit($self, $git, $i, $cmt) if defined $cmt;
 }
 
diff --git a/t/import.t b/t/import.t
index 3f308299148..f987b1141f7 100644
--- a/t/import.t
+++ b/t/import.t
@@ -4,6 +4,7 @@ use strict;
 use warnings;
 use Test::More;
 use PublicInbox::Eml;
+use PublicInbox::Smsg;
 use PublicInbox::Git;
 use PublicInbox::Import;
 use PublicInbox::Spawn qw(spawn);
@@ -26,7 +27,7 @@ hello world
 EOF
 
 my $v2 = require_git(2.6, 1);
-my $smsg = {} if $v2;
+my $smsg = bless {}, 'PublicInbox::Smsg' if $v2;
 like($im->add($mime, undef, $smsg), qr/\A:[0-9]+\z/, 'added one message');
 
 if ($v2) {

  parent reply	other threads:[~2020-06-01 10:06 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-06-01 10:06 [PATCH 00/13] smsg: remove tricky {mime} field Eric Wong
2020-06-01 10:06 ` [PATCH 01/13] inbox: introduce smsg_eml method Eric Wong
2020-06-01 10:06 ` [PATCH 02/13] wwwatomstream: convert callers to use smsg_eml Eric Wong
2020-06-01 10:06 ` [PATCH 03/13] v2writable: fix non-sensical interpolation in BUG message Eric Wong
2020-06-01 10:06 ` [PATCH 04/13] import: modernize to use Perl 5.10 features Eric Wong
2020-06-01 10:06 ` Eric Wong [this message]
2020-06-01 10:06 ` [PATCH 06/13] smsg: get rid of ->wrap initializer, too Eric Wong
2020-06-01 10:06 ` [PATCH 07/13] inbox: msg_by_*: remove $(size)ref args Eric Wong
2020-06-01 10:06 ` [PATCH 08/13] www: remove smsg_mime API and adjust callers Eric Wong
2020-06-01 10:06 ` [PATCH 09/13] nntp: smsg_range_i: favor ->{$field} lookups when possible Eric Wong
2020-06-01 10:06 ` [PATCH 10/13] smsg: get rid of remaining {mime} users Eric Wong
2020-06-01 10:06 ` [PATCH 11/13] smsg: remove ->bytes and ->lines methods Eric Wong
2020-06-01 10:06 ` [PATCH 12/13] smsg: remove remaining accessor methods Eric Wong
2020-06-01 10:06 ` [PATCH 13/13] wwwatomstream: drop smsg->{mid} fallback for non-SQLite Eric Wong
2020-06-01 16:45 ` [PATCH 00/13] smsg: remove tricky {mime} field Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200601100657.14700-6-e@yhbt.net \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).