about summary refs log tree commit homepage
diff options
context:
space:
mode:
-rw-r--r-- lib/PublicInbox/Import.pm 40
-rw-r--r-- lib/PublicInbox/SearchIdx.pm 31
-rw-r--r-- lib/PublicInbox/Smsg.pm 43
-rw-r--r-- lib/PublicInbox/V2Writable.pm 9
-rw-r--r-- t/import.t 3
5 files changed, 73 insertions, 53 deletions
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 1a7ed9ce..ab75aa00 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -12,7 +12,8 @@ use v5.10.1;
 use PublicInbox::Spawn qw(spawn popen_rd);
 use PublicInbox::MID qw(mids mid2path);
 use PublicInbox::Address;
-use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+use PublicInbox::Smsg;
+use PublicInbox::MsgTime qw(msg_datestamp);
 use PublicInbox::ContentHash qw(content_digest);
 use PublicInbox::MDA;
 use PublicInbox::Eml;
@@ -269,8 +270,8 @@ sub remove {
         (($self->{tip} = ":$commit"), $cur);
 }
 
-sub git_timestamp {
-        my ($ts, $zone) = @_;
+sub git_timestamp ($) { # arg: arrayref holding ($ts, $zone); returns "$ts $zone"
+        my ($ts, $zone) = @{$_[0]};
         $ts = 0 if $ts < 0; # git uses unsigned times
         "$ts $zone";
 }
@@ -278,10 +279,13 @@ sub git_timestamp {
 sub extract_cmt_info ($;$) {
         my ($mime, $smsg) = @_;
         # $mime is PublicInbox::Eml, but remains Email::MIME-compatible
+        $smsg //= bless {}, 'PublicInbox::Smsg';
 
-        my $sender = '';
         my $hdr = $mime->header_obj;
-        my $from = $hdr->header('From') // '';
+        $smsg->populate($hdr);
+
+        my $sender = '';
+        my $from = delete($smsg->{From}) // '';
         my ($email) = PublicInbox::Address::emails($from);
         my ($name) = PublicInbox::Address::names($from);
         if (!defined($name) || !defined($email)) {
@@ -313,17 +317,11 @@ sub extract_cmt_info ($;$) {
                 warn "no name in From: $from or Sender: $sender\n";
         }
 
-        my $subject = $hdr->header('Subject') // '(no subject)';
-        # MIME decoding can create nulls replace them with spaces to protect git
-        $subject =~ tr/\0/ /;
+        my $subject = delete($smsg->{Subject}) // '(no subject)';
         utf8::encode($subject);
-        my $at = git_timestamp(my @at = msg_datestamp($hdr));
-        my $ct = git_timestamp(my @ct = msg_timestamp($hdr));
-        if ($smsg) {
-                $smsg->{ds} = $at[0];
-                $smsg->{ts} = $ct[0];
-        }
-        ($name, $email, $at, $ct, $subject);
+        my $at = git_timestamp(delete $smsg->{-ds});
+        my $ct = git_timestamp(delete $smsg->{-ts});
+        ("$name <$email>", $at, $ct, $subject);
 }
 
 # kill potentially confusing/misleading headers
@@ -370,7 +368,7 @@ sub clean_tree_v2 ($$$) {
 sub add {
         my ($self, $mime, $check_cb, $smsg) = @_;
 
-        my ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime, $smsg);
+        my ($author, $at, $ct, $subject) = extract_cmt_info($mime, $smsg);
         my $path_type = $self->{path_type};
         my $path;
         if ($path_type eq '2/38') {
@@ -414,7 +412,7 @@ sub add {
         }
 
         print $w "commit $ref\nmark :$commit\n",
-                "author $name <$email> $at\n",
+                "author $author $at\n",
                 "committer $self->{ident} $ct\n" or wfail;
         print $w "data ", (length($subject) + 1), "\n",
                 $subject, "\n\n" or wfail;
@@ -502,11 +500,11 @@ sub digest2mid ($$) {
 
 sub rewrite_commit ($$$$) {
         my ($self, $oids, $buf, $mime) = @_;
-        my ($name, $email, $at, $ct, $subject);
+        my ($author, $at, $ct, $subject);
         if ($mime) {
-                ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime);
+                ($author, $at, $ct, $subject) = extract_cmt_info($mime);
         } else {
-                $name = $email = '';
+                $author = '<>';
                 $subject = 'purged '.join(' ', @$oids);
         }
         @$oids = ();
@@ -515,7 +513,7 @@ sub rewrite_commit ($$$$) {
                 my $l = $buf->[$i];
                 if ($l =~ /^author .* ([0-9]+ [\+-]?[0-9]+)$/) {
                         $at //= $1;
-                        $buf->[$i] = "author $name <$email> $at\n";
+                        $buf->[$i] = "author $author $at\n";
                 } elsif ($l =~ /^committer .* ([0-9]+ [\+-]?[0-9]+)$/) {
                         $ct //= $1;
                         $buf->[$i] = "committer $self->{ident} $ct\n";
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index b4088933..eb228e6b 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -156,16 +156,14 @@ sub index_text ($$$$) {
         }
 }
 
-sub index_users ($$) {
+sub index_headers ($$) { # feeds the from/subject/to/cc values of $smsg to index_text
         my ($self, $smsg) = @_;
-
-        my $from = $smsg->from;
-        my $to = $smsg->to;
-        my $cc = $smsg->cc;
-
-        index_text($self, $from, 1, 'A'); # A - author
-        index_text($self, $to, 1, 'XTO') if $to ne '';
-        index_text($self, $cc, 1, 'XCC') if $cc ne '';
+        my @x = (from => 'A', # Author
+                subject => 'S', to => 'XTO', cc => 'XCC'); # $smsg key => prefix passed to index_text
+        while (my ($field, $pfx) = splice(@x, 0, 2)) { # take (key, prefix) pairs until @x is empty
+                my $val = $smsg->{$field}; # read straight from the hash, no accessor call
+                index_text($self, $val, 1, $pfx) if $val ne ''; # empty values are not indexed
+        }
 }
 
 sub index_diff_inc ($$$$) {
@@ -285,9 +283,9 @@ sub index_xapian { # msg_iter callback
         if ($part->{is_submsg}) {
                 my $mids = mids_for_index($part);
                 index_ids($self, $doc, $part, $mids);
-                my $smsg = PublicInbox::Smsg->new($part);
-                index_users($self, $smsg);
-                index_text($self, $smsg->subject, 1, 'S') if $smsg->subject;
+                my $smsg = bless {}, 'PublicInbox::Smsg';
+                $smsg->populate($part);
+                index_headers($self, $smsg);
         }
 
         my ($s, undef) = msg_part_text($part, $ct);
@@ -335,10 +333,8 @@ sub index_ids ($$$$) {
 
 sub add_xapian ($$$$) {
         my ($self, $mime, $smsg, $mids) = @_;
-        $smsg->{mime} = $mime; # XXX dangerous
         my $hdr = $mime->header_obj;
         my $doc = $X->{Document}->new;
-        my $subj = $smsg->subject;
         add_val($doc, PublicInbox::Search::TS(), $smsg->{ts});
         my @ds = gmtime($smsg->{ds});
         my $yyyymmdd = strftime('%Y%m%d', @ds);
@@ -348,8 +344,7 @@ sub add_xapian ($$$$) {
 
         my $tg = term_generator($self);
         $tg->set_document($doc);
-        index_text($self, $subj, 1, 'S') if $subj;
-        index_users($self, $smsg);
+        index_headers($self, $smsg);
 
         msg_iter($mime, \&index_xapian, [ $self, $doc ]);
         index_ids($self, $doc, $hdr, $mids);
@@ -392,8 +387,7 @@ sub add_message {
         };
 
         # v1 and tests only:
-        $smsg->{ds} //= msg_datestamp($hdr, $self->{autime});
-        $smsg->{ts} //= msg_timestamp($hdr, $self->{cotime});
+        $smsg->populate($hdr, $self);
 
         eval {
                 # order matters, overview stores every possible piece of
@@ -649,6 +643,7 @@ sub read_log {
                 my $mime = do_cat_mail($git, $blob, \$bytes);
                 $del_cb->($self, $mime);
         }
+        delete @$self{qw(autime cotime)};
         $batch_cb->($nr, $latest, $newest);
 }
 
diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm
index 7a2766d8..8e277127 100644
--- a/lib/PublicInbox/Smsg.pm
+++ b/lib/PublicInbox/Smsg.pm
@@ -17,11 +17,6 @@ use PublicInbox::Address;
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 use Time::Local qw(timegm);
 
-sub new {
-        my ($class, $mime) = @_;
-        bless { mime => $mime }, $class;
-}
-
 sub wrap {
         my ($class, $mid) = @_;
         bless { mid => $mid }, $class;
@@ -36,11 +31,11 @@ sub get_val ($$) {
 sub to_doc_data {
         my ($self) = @_;
         join("\n",
-                $self->subject,
-                $self->from,
+                $self->{subject},
+                $self->{from},
                 $self->references,
-                $self->to,
-                $self->cc,
+                $self->{to},
+                $self->{cc},
                 $self->{blob},
                 $self->{mid},
                 $self->{bytes} // '',
@@ -115,6 +110,36 @@ sub __hdr ($$) {
         };
 }
 
+# for Import and v1 WWW code paths: fills $self from the From/To/Cc/Subject values and times of $hdr
+sub populate {
+        my ($self, $hdr, $v2w) = @_; # $v2w is optional; only its {autime} and {cotime} are read
+        for my $f (qw(From To Cc Subject)) {
+                my @all = $hdr->header($f); # list context: every value returned for this header
+                my $val = join(', ', @all); # multiple values are merged into one string
+                $val =~ tr/\r//d; # carriage returns are dropped outright
+                # MIME decoding can create NULs, replace them with spaces
+                # to protect git and NNTP clients (tabs and newlines get the same treatment)
+                $val =~ tr/\0\t\n/   /;
+
+                # lower-case fields for read-only stuff (always set, possibly to '')
+                $self->{lc($f)} = $val;
+
+                # Capitalized From/Subject for git-fast-import (To and Cc get no capitalized key)
+                next if $f eq 'To' || $f eq 'Cc';
+                if (scalar(@all) > 1) { # repeated header: the capitalized key keeps only the first value
+                        $val = $all[0];
+                        $val =~ tr/\r//d; # same cleanup as applied to the joined value above
+                        $val =~ tr/\0\t\n/   /;
+                }
+                $self->{$f} = $val if $val ne ''; # key is left unset when the value is empty
+        }
+        $v2w //= {}; # lets the {autime}/{cotime} lookups below work when $v2w was omitted
+        $self->{-ds} = [ my @ds = msg_datestamp($hdr, $v2w->{autime}) ]; # arrayref of the full returned list
+        $self->{-ts} = [ my @ts = msg_timestamp($hdr, $v2w->{cotime}) ]; # arrayref of the full returned list
+        $self->{ds} //= $ds[0]; # no zone; a ds value that is already defined is kept
+        $self->{ts} //= $ts[0]; # likewise, an already-defined ts is kept
+}
+
 sub subject ($) { __hdr($_[0], 'Subject') }
 sub to ($) { __hdr($_[0], 'To') }
 sub cc ($) { __hdr($_[0], 'Cc') }
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 1a824531..79bee7f9 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -19,7 +19,6 @@ use PublicInbox::OverIdx;
 use PublicInbox::Msgmap;
 use PublicInbox::Spawn qw(spawn popen_rd);
 use PublicInbox::SearchIdx;
-use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 use PublicInbox::MultiMidQueue;
 use IO::Handle; # ->autoflush
 use File::Temp qw(tempfile);
@@ -156,8 +155,6 @@ sub add {
 # indexes a message, returns true if checkpointing is needed
 sub do_idx ($$$$) {
         my ($self, $msgref, $mime, $smsg) = @_;
-        $smsg->{ds} //= msg_datestamp($mime->header_obj, $self->{autime});
-        $smsg->{ts} //= msg_timestamp($mime->header_obj, $self->{cotime});
         $self->{over}->add_overview($mime, $smsg);
         my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
         $idx->index_raw($msgref, $mime, $smsg);
@@ -575,6 +572,8 @@ W: $list
                         num => $smsg->{num},
                         mid => $smsg->{mid},
                 }, 'PublicInbox::Smsg';
+                my $v2w = { autime => $smsg->{ds}, cotime => $smsg->{ts} };
+                $new_smsg->populate($new_mime, $v2w);
                 do_idx($self, \$raw, $new_mime, $new_smsg);
         }
         $rewritten->{rewrites};
@@ -968,6 +967,7 @@ sub reindex_oid_m ($$$$;$) {
                 blob => $oid,
                 mid => $mid0,
         }, 'PublicInbox::Smsg';
+        $smsg->populate($mime, $self);
         if (do_idx($self, $msgref, $mime, $smsg)) {
                 reindex_checkpoint($self, $sync, $git);
         }
@@ -1059,6 +1059,7 @@ sub reindex_oid ($$$$) {
                 blob => $oid,
                 mid => $mid0,
         }, 'PublicInbox::Smsg';
+        $smsg->populate($mime, $self);
         if (do_idx($self, $msgref, $mime, $smsg)) {
                 reindex_checkpoint($self, $sync, $git);
         }
@@ -1298,7 +1299,7 @@ sub index_epoch ($$$) {
                 }
         }
         close $fh or die "git log failed: \$?=$?";
-        delete $self->{reindex_pipe};
+        delete @$self{qw(reindex_pipe autime cotime)};
         update_last_commit($self, $git, $i, $cmt) if defined $cmt;
 }
 
diff --git a/t/import.t b/t/import.t
index 3f308299..f987b114 100644
--- a/t/import.t
+++ b/t/import.t
@@ -4,6 +4,7 @@ use strict;
 use warnings;
 use Test::More;
 use PublicInbox::Eml;
+use PublicInbox::Smsg;
 use PublicInbox::Git;
 use PublicInbox::Import;
 use PublicInbox::Spawn qw(spawn);
@@ -26,7 +27,7 @@ hello world
 EOF
 
 my $v2 = require_git(2.6, 1);
-my $smsg = {} if $v2;
+my $smsg = bless {}, 'PublicInbox::Smsg' if $v2;
 like($im->add($mime, undef, $smsg), qr/\A:[0-9]+\z/, 'added one message');
 
 if ($v2) {