From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 05/13] smsg: introduce ->populate method
Date: Mon, 1 Jun 2020 10:06:49 +0000 [thread overview]
Message-ID: <20200601100657.14700-6-e@yhbt.net> (raw)
In-Reply-To: <20200601100657.14700-1-e@yhbt.net>
This will eventually replace the __hdr() calling methods and
eradicate {mime} usage from Smsg. For now, we can eliminate
PublicInbox::Smsg->new since most callers already rely on an
open-coded `bless' to avoid the old {mime} arg.
---
lib/PublicInbox/Import.pm | 40 ++++++++++++++++----------------
lib/PublicInbox/SearchIdx.pm | 31 +++++++++++--------------
lib/PublicInbox/Smsg.pm | 43 +++++++++++++++++++++++++++--------
lib/PublicInbox/V2Writable.pm | 9 ++++----
t/import.t | 3 ++-
5 files changed, 73 insertions(+), 53 deletions(-)
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 1a7ed9ce878..ab75aa00dc2 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -12,7 +12,8 @@ use v5.10.1;
use PublicInbox::Spawn qw(spawn popen_rd);
use PublicInbox::MID qw(mids mid2path);
use PublicInbox::Address;
-use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+use PublicInbox::Smsg;
+use PublicInbox::MsgTime qw(msg_datestamp);
use PublicInbox::ContentHash qw(content_digest);
use PublicInbox::MDA;
use PublicInbox::Eml;
@@ -269,8 +270,8 @@ sub remove {
(($self->{tip} = ":$commit"), $cur);
}
-sub git_timestamp {
- my ($ts, $zone) = @_;
+sub git_timestamp ($) {
+ my ($ts, $zone) = @{$_[0]};
$ts = 0 if $ts < 0; # git uses unsigned times
"$ts $zone";
}
@@ -278,10 +279,13 @@ sub git_timestamp {
sub extract_cmt_info ($;$) {
my ($mime, $smsg) = @_;
# $mime is PublicInbox::Eml, but remains Email::MIME-compatible
+ $smsg //= bless {}, 'PublicInbox::Smsg';
- my $sender = '';
my $hdr = $mime->header_obj;
- my $from = $hdr->header('From') // '';
+ $smsg->populate($hdr);
+
+ my $sender = '';
+ my $from = delete($smsg->{From}) // '';
my ($email) = PublicInbox::Address::emails($from);
my ($name) = PublicInbox::Address::names($from);
if (!defined($name) || !defined($email)) {
@@ -313,17 +317,11 @@ sub extract_cmt_info ($;$) {
warn "no name in From: $from or Sender: $sender\n";
}
- my $subject = $hdr->header('Subject') // '(no subject)';
- # MIME decoding can create nulls replace them with spaces to protect git
- $subject =~ tr/\0/ /;
+ my $subject = delete($smsg->{Subject}) // '(no subject)';
utf8::encode($subject);
- my $at = git_timestamp(my @at = msg_datestamp($hdr));
- my $ct = git_timestamp(my @ct = msg_timestamp($hdr));
- if ($smsg) {
- $smsg->{ds} = $at[0];
- $smsg->{ts} = $ct[0];
- }
- ($name, $email, $at, $ct, $subject);
+ my $at = git_timestamp(delete $smsg->{-ds});
+ my $ct = git_timestamp(delete $smsg->{-ts});
+ ("$name <$email>", $at, $ct, $subject);
}
# kill potentially confusing/misleading headers
@@ -370,7 +368,7 @@ sub clean_tree_v2 ($$$) {
sub add {
my ($self, $mime, $check_cb, $smsg) = @_;
- my ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime, $smsg);
+ my ($author, $at, $ct, $subject) = extract_cmt_info($mime, $smsg);
my $path_type = $self->{path_type};
my $path;
if ($path_type eq '2/38') {
@@ -414,7 +412,7 @@ sub add {
}
print $w "commit $ref\nmark :$commit\n",
- "author $name <$email> $at\n",
+ "author $author $at\n",
"committer $self->{ident} $ct\n" or wfail;
print $w "data ", (length($subject) + 1), "\n",
$subject, "\n\n" or wfail;
@@ -502,11 +500,11 @@ sub digest2mid ($$) {
sub rewrite_commit ($$$$) {
my ($self, $oids, $buf, $mime) = @_;
- my ($name, $email, $at, $ct, $subject);
+ my ($author, $at, $ct, $subject);
if ($mime) {
- ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime);
+ ($author, $at, $ct, $subject) = extract_cmt_info($mime);
} else {
- $name = $email = '';
+ $author = '<>';
$subject = 'purged '.join(' ', @$oids);
}
@$oids = ();
@@ -515,7 +513,7 @@ sub rewrite_commit ($$$$) {
my $l = $buf->[$i];
if ($l =~ /^author .* ([0-9]+ [\+-]?[0-9]+)$/) {
$at //= $1;
- $buf->[$i] = "author $name <$email> $at\n";
+ $buf->[$i] = "author $author $at\n";
} elsif ($l =~ /^committer .* ([0-9]+ [\+-]?[0-9]+)$/) {
$ct //= $1;
$buf->[$i] = "committer $self->{ident} $ct\n";
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index b4088933dbf..eb228e6bba7 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -156,16 +156,14 @@ sub index_text ($$$$) {
}
}
-sub index_users ($$) {
+sub index_headers ($$) {
my ($self, $smsg) = @_;
-
- my $from = $smsg->from;
- my $to = $smsg->to;
- my $cc = $smsg->cc;
-
- index_text($self, $from, 1, 'A'); # A - author
- index_text($self, $to, 1, 'XTO') if $to ne '';
- index_text($self, $cc, 1, 'XCC') if $cc ne '';
+ my @x = (from => 'A', # Author
+ subject => 'S', to => 'XTO', cc => 'XCC');
+ while (my ($field, $pfx) = splice(@x, 0, 2)) {
+ my $val = $smsg->{$field};
+ index_text($self, $val, 1, $pfx) if $val ne '';
+ }
}
sub index_diff_inc ($$$$) {
@@ -285,9 +283,9 @@ sub index_xapian { # msg_iter callback
if ($part->{is_submsg}) {
my $mids = mids_for_index($part);
index_ids($self, $doc, $part, $mids);
- my $smsg = PublicInbox::Smsg->new($part);
- index_users($self, $smsg);
- index_text($self, $smsg->subject, 1, 'S') if $smsg->subject;
+ my $smsg = bless {}, 'PublicInbox::Smsg';
+ $smsg->populate($part);
+ index_headers($self, $smsg);
}
my ($s, undef) = msg_part_text($part, $ct);
@@ -335,10 +333,8 @@ sub index_ids ($$$$) {
sub add_xapian ($$$$) {
my ($self, $mime, $smsg, $mids) = @_;
- $smsg->{mime} = $mime; # XXX dangerous
my $hdr = $mime->header_obj;
my $doc = $X->{Document}->new;
- my $subj = $smsg->subject;
add_val($doc, PublicInbox::Search::TS(), $smsg->{ts});
my @ds = gmtime($smsg->{ds});
my $yyyymmdd = strftime('%Y%m%d', @ds);
@@ -348,8 +344,7 @@ sub add_xapian ($$$$) {
my $tg = term_generator($self);
$tg->set_document($doc);
- index_text($self, $subj, 1, 'S') if $subj;
- index_users($self, $smsg);
+ index_headers($self, $smsg);
msg_iter($mime, \&index_xapian, [ $self, $doc ]);
index_ids($self, $doc, $hdr, $mids);
@@ -392,8 +387,7 @@ sub add_message {
};
# v1 and tests only:
- $smsg->{ds} //= msg_datestamp($hdr, $self->{autime});
- $smsg->{ts} //= msg_timestamp($hdr, $self->{cotime});
+ $smsg->populate($hdr, $self);
eval {
# order matters, overview stores every possible piece of
@@ -649,6 +643,7 @@ sub read_log {
my $mime = do_cat_mail($git, $blob, \$bytes);
$del_cb->($self, $mime);
}
+ delete @$self{qw(autime cotime)};
$batch_cb->($nr, $latest, $newest);
}
diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm
index 7a2766d8ff8..8e2771274a1 100644
--- a/lib/PublicInbox/Smsg.pm
+++ b/lib/PublicInbox/Smsg.pm
@@ -17,11 +17,6 @@ use PublicInbox::Address;
use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
use Time::Local qw(timegm);
-sub new {
- my ($class, $mime) = @_;
- bless { mime => $mime }, $class;
-}
-
sub wrap {
my ($class, $mid) = @_;
bless { mid => $mid }, $class;
@@ -36,11 +31,11 @@ sub get_val ($$) {
sub to_doc_data {
my ($self) = @_;
join("\n",
- $self->subject,
- $self->from,
+ $self->{subject},
+ $self->{from},
$self->references,
- $self->to,
- $self->cc,
+ $self->{to},
+ $self->{cc},
$self->{blob},
$self->{mid},
$self->{bytes} // '',
@@ -115,6 +110,36 @@ sub __hdr ($$) {
};
}
+# for Import and v1 WWW code paths
+sub populate {
+ my ($self, $hdr, $v2w) = @_;
+ for my $f (qw(From To Cc Subject)) {
+ my @all = $hdr->header($f);
+ my $val = join(', ', @all);
+ $val =~ tr/\r//d;
+ # MIME decoding can create NULs, replace them with spaces
+ # to protect git and NNTP clients
+ $val =~ tr/\0\t\n/ /;
+
+ # lower-case fields for read-only stuff
+ $self->{lc($f)} = $val;
+
+ # Capitalized From/Subject for git-fast-import
+ next if $f eq 'To' || $f eq 'Cc';
+ if (scalar(@all) > 1) {
+ $val = $all[0];
+ $val =~ tr/\r//d;
+ $val =~ tr/\0\t\n/ /;
+ }
+ $self->{$f} = $val if $val ne '';
+ }
+ $v2w //= {};
+ $self->{-ds} = [ my @ds = msg_datestamp($hdr, $v2w->{autime}) ];
+ $self->{-ts} = [ my @ts = msg_timestamp($hdr, $v2w->{cotime}) ];
+ $self->{ds} //= $ds[0]; # no zone
+ $self->{ts} //= $ts[0];
+}
+
sub subject ($) { __hdr($_[0], 'Subject') }
sub to ($) { __hdr($_[0], 'To') }
sub cc ($) { __hdr($_[0], 'Cc') }
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 1a824531f3c..79bee7f9f3d 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -19,7 +19,6 @@ use PublicInbox::OverIdx;
use PublicInbox::Msgmap;
use PublicInbox::Spawn qw(spawn popen_rd);
use PublicInbox::SearchIdx;
-use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
use PublicInbox::MultiMidQueue;
use IO::Handle; # ->autoflush
use File::Temp qw(tempfile);
@@ -156,8 +155,6 @@ sub add {
# indexes a message, returns true if checkpointing is needed
sub do_idx ($$$$) {
my ($self, $msgref, $mime, $smsg) = @_;
- $smsg->{ds} //= msg_datestamp($mime->header_obj, $self->{autime});
- $smsg->{ts} //= msg_timestamp($mime->header_obj, $self->{cotime});
$self->{over}->add_overview($mime, $smsg);
my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
$idx->index_raw($msgref, $mime, $smsg);
@@ -575,6 +572,8 @@ W: $list
num => $smsg->{num},
mid => $smsg->{mid},
}, 'PublicInbox::Smsg';
+ my $v2w = { autime => $smsg->{ds}, cotime => $smsg->{ts} };
+ $new_smsg->populate($new_mime, $v2w);
do_idx($self, \$raw, $new_mime, $new_smsg);
}
$rewritten->{rewrites};
@@ -968,6 +967,7 @@ sub reindex_oid_m ($$$$;$) {
blob => $oid,
mid => $mid0,
}, 'PublicInbox::Smsg';
+ $smsg->populate($mime, $self);
if (do_idx($self, $msgref, $mime, $smsg)) {
reindex_checkpoint($self, $sync, $git);
}
@@ -1059,6 +1059,7 @@ sub reindex_oid ($$$$) {
blob => $oid,
mid => $mid0,
}, 'PublicInbox::Smsg';
+ $smsg->populate($mime, $self);
if (do_idx($self, $msgref, $mime, $smsg)) {
reindex_checkpoint($self, $sync, $git);
}
@@ -1298,7 +1299,7 @@ sub index_epoch ($$$) {
}
}
close $fh or die "git log failed: \$?=$?";
- delete $self->{reindex_pipe};
+ delete @$self{qw(reindex_pipe autime cotime)};
update_last_commit($self, $git, $i, $cmt) if defined $cmt;
}
diff --git a/t/import.t b/t/import.t
index 3f308299148..f987b1141f7 100644
--- a/t/import.t
+++ b/t/import.t
@@ -4,6 +4,7 @@ use strict;
use warnings;
use Test::More;
use PublicInbox::Eml;
+use PublicInbox::Smsg;
use PublicInbox::Git;
use PublicInbox::Import;
use PublicInbox::Spawn qw(spawn);
@@ -26,7 +27,7 @@ hello world
EOF
my $v2 = require_git(2.6, 1);
-my $smsg = {} if $v2;
+my $smsg = bless {}, 'PublicInbox::Smsg' if $v2;
like($im->add($mime, undef, $smsg), qr/\A:[0-9]+\z/, 'added one message');
if ($v2) {
next prev parent reply other threads:[~2020-06-01 10:06 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-06-01 10:06 [PATCH 00/13] smsg: remove tricky {mime} field Eric Wong
2020-06-01 10:06 ` [PATCH 01/13] inbox: introduce smsg_eml method Eric Wong
2020-06-01 10:06 ` [PATCH 02/13] wwwatomstream: convert callers to use smsg_eml Eric Wong
2020-06-01 10:06 ` [PATCH 03/13] v2writable: fix non-sensical interpolation in BUG message Eric Wong
2020-06-01 10:06 ` [PATCH 04/13] import: modernize to use Perl 5.10 features Eric Wong
2020-06-01 10:06 ` Eric Wong [this message]
2020-06-01 10:06 ` [PATCH 06/13] smsg: get rid of ->wrap initializer, too Eric Wong
2020-06-01 10:06 ` [PATCH 07/13] inbox: msg_by_*: remove $(size)ref args Eric Wong
2020-06-01 10:06 ` [PATCH 08/13] www: remove smsg_mime API and adjust callers Eric Wong
2020-06-01 10:06 ` [PATCH 09/13] nntp: smsg_range_i: favor ->{$field} lookups when possible Eric Wong
2020-06-01 10:06 ` [PATCH 10/13] smsg: get rid of remaining {mime} users Eric Wong
2020-06-01 10:06 ` [PATCH 11/13] smsg: remove ->bytes and ->lines methods Eric Wong
2020-06-01 10:06 ` [PATCH 12/13] smsg: remove remaining accessor methods Eric Wong
2020-06-01 10:06 ` [PATCH 13/13] wwwatomstream: drop smsg->{mid} fallback for non-SQLite Eric Wong
2020-06-01 16:45 ` [PATCH 00/13] smsg: remove tricky {mime} field Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200601100657.14700-6-e@yhbt.net \
--to=e@yhbt.net \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).