about summary refs log tree commit homepage
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2021-01-21 19:46:16 +0000
committerEric Wong <e@80x24.org>2021-01-22 16:18:01 -0400
commit0bc18178bc8cd08a28befbb6b3d2e7cccfc62589 (patch)
tree3c11d8c580bf1be71c8b13ce4f365dcaf9fd9125
parentaa680c4c254aaaa531d263ae7a85e5015d8a2f6a (diff)
downloadpublic-inbox-0bc18178bc8cd08a28befbb6b3d2e7cccfc62589.tar.gz
From_ lines are shown when mbox* variants are output to stdout,
making {oid} and {pct} information visible without risking being
propagated to other importer processes if they were in
lei-specific X-* headers.

Maildirs already had OIDs in the filename, now they gain Xapian
{pct} in case anybody cares.
-rw-r--r--lib/PublicInbox/LeiOverview.pm9
-rw-r--r--lib/PublicInbox/LeiToMail.pm60
-rw-r--r--t/lei_to_mail.t41
3 files changed, 61 insertions, 49 deletions
diff --git a/lib/PublicInbox/LeiOverview.pm b/lib/PublicInbox/LeiOverview.pm
index 47d9eb31..7a4fa857 100644
--- a/lib/PublicInbox/LeiOverview.pm
+++ b/lib/PublicInbox/LeiOverview.pm
@@ -224,8 +224,9 @@ sub ovv_each_smsg_cb { # runs in wq worker usually
                 my $git_dir = $git->{git_dir};
                 sub {
                         my ($smsg, $mitem) = @_;
-                        $l2m->wq_do('write_mail', \@io, $git_dir,
-                                        $smsg->{blob}, $lei_ipc, $smsg->{kw});
+                        $smsg->{pct} = get_pct($mitem) if $mitem;
+                        $l2m->wq_do('write_mail', \@io, $git_dir, $smsg,
+                                        $lei_ipc);
                 }
         } elsif ($l2m) {
                 my $wcb = $l2m->write_cb($lei);
@@ -234,8 +235,8 @@ sub ovv_each_smsg_cb { # runs in wq worker usually
                 my $g2m = $l2m->can('git_to_mail');
                 sub {
                         my ($smsg, $mitem) = @_;
-                        $git->cat_async($smsg->{blob}, $g2m,
-                                        [ $wcb, $smsg->{kw} ]);
+                        $smsg->{pct} = get_pct($mitem) if $mitem;
+                        $git->cat_async($smsg->{blob}, $g2m, [ $wcb, $smsg ]);
                 };
         } elsif ($self->{fmt} =~ /\A(concat)?json\z/ && $lei->{opt}->{pretty}) {
                 my $EOR = ($1//'') eq 'concat' ? "\n}" : "\n},";
diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm
index 1be0b09c..3dcce9e7 100644
--- a/lib/PublicInbox/LeiToMail.pm
+++ b/lib/PublicInbox/LeiToMail.pm
@@ -32,14 +32,14 @@ my %kw2status = (
 );
 
 sub _mbox_hdr_buf ($$$) {
-        my ($eml, $type, $kw) = @_;
+        my ($eml, $type, $smsg) = @_;
         $eml->header_set($_) for (qw(Lines Bytes Content-Length));
 
         # Messages are always 'O' (non-\Recent in IMAP), it saves
         # MUAs the trouble of rewriting the mbox if no other
         # changes are made
         my %hdr = (Status => [ 'O' ]); # set Status, X-Status
-        for my $k (@$kw) {
+        for my $k (@{$smsg->{kw} // []}) {
                 if (my $ent = $kw2status{$k}) {
                         push @{$hdr{$ent->[0]}}, $ent->[1];
                 } else { # X-Label?
@@ -53,9 +53,11 @@ sub _mbox_hdr_buf ($$$) {
 
         # fixup old bug from import (pre-a0c07cba0e5d8b6a)
         $$buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
+        my $ident = $smsg->{blob} // 'lei';
+        if (defined(my $pct = $smsg->{pct})) { $ident .= "=$pct" }
 
         substr($$buf, 0, 0, # prepend From line
-                "From lei\@$type Thu Jan  1 00:00:00 1970$eml->{crlf}");
+                "From $ident\@$type Thu Jan  1 00:00:00 1970$eml->{crlf}");
         $buf;
 }
 
@@ -71,8 +73,8 @@ sub _print_full {
 }
 
 sub eml2mboxrd ($;$) {
-        my ($eml, $kw) = @_;
-        my $buf = _mbox_hdr_buf($eml, 'mboxrd', $kw);
+        my ($eml, $smsg) = @_;
+        my $buf = _mbox_hdr_buf($eml, 'mboxrd', $smsg);
         if (my $bdy = delete $eml->{bdy}) {
                 $$bdy =~ s/^(>*From )/>$1/gm;
                 $$buf .= $eml->{crlf};
@@ -84,8 +86,8 @@ sub eml2mboxrd ($;$) {
 }
 
 sub eml2mboxo {
-        my ($eml, $kw) = @_;
-        my $buf = _mbox_hdr_buf($eml, 'mboxo', $kw);
+        my ($eml, $smsg) = @_;
+        my $buf = _mbox_hdr_buf($eml, 'mboxo', $smsg);
         if (my $bdy = delete $eml->{bdy}) {
                 $$bdy =~ s/^From />From /gm;
                 $$buf .= $eml->{crlf};
@@ -108,8 +110,8 @@ sub _mboxcl_common ($$$) {
 
 # mboxcl still escapes "From " lines
 sub eml2mboxcl {
-        my ($eml, $kw) = @_;
-        my $buf = _mbox_hdr_buf($eml, 'mboxcl', $kw);
+        my ($eml, $smsg) = @_;
+        my $buf = _mbox_hdr_buf($eml, 'mboxcl', $smsg);
         my $crlf = $eml->{crlf};
         if (my $bdy = delete $eml->{bdy}) {
                 $$bdy =~ s/^From />From /gm;
@@ -121,8 +123,8 @@ sub eml2mboxcl {
 
 # mboxcl2 has no "From " escaping
 sub eml2mboxcl2 {
-        my ($eml, $kw) = @_;
-        my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $kw);
+        my ($eml, $smsg) = @_;
+        my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $smsg);
         my $crlf = $eml->{crlf};
         if (my $bdy = delete $eml->{bdy}) {
                 _mboxcl_common($buf, $bdy, $crlf);
@@ -140,10 +142,11 @@ sub git_to_mail { # git->cat_async callback
                         warn "unexpected type=$type for $oid\n";
                 }
         }
-        if ($size > 0) {
-                my ($write_cb, $kw) = @$arg;
-                $write_cb->($bref, $oid, $kw);
+        my ($write_cb, $smsg) = @$arg;
+        if ($smsg->{blob} ne $oid) {
+                die "BUG: expected=$smsg->{blob} got=$oid";
         }
+        $write_cb->($bref, $smsg) if $size > 0;
 }
 
 sub reap_compress { # dwaitpid callback
@@ -247,11 +250,11 @@ sub _mbox_write_cb ($$) {
         my $dedupe = $lei->{dedupe};
         $dedupe->prepare_dedupe;
         sub { # for git_to_mail
-                my ($buf, $oid, $kw) = @_;
+                my ($buf, $smsg) = @_;
                 return unless $out;
                 my $eml = PublicInbox::Eml->new($buf);
-                if (!$dedupe->is_dup($eml, $oid)) {
-                        $buf = $eml2mbox->($eml, $kw);
+                if (!$dedupe->is_dup($eml, $smsg->{blob})) {
+                        $buf = $eml2mbox->($eml, $smsg);
                         my $lk = $ovv->lock_for_scope;
                         eval { $write->($out, $buf) };
                         if ($@) {
@@ -283,12 +286,15 @@ sub _augment_file { # _maildir_each_file cb
 sub _unlink { unlink($_[0]) }
 
 sub _buf2maildir {
-        my ($dst, $buf, $oid, $kw) = @_;
+        my ($dst, $buf, $smsg) = @_;
+        my $kw = $smsg->{kw} // [];
         my $sfx = join('', sort(map { $kw2char{$_} // () } @$kw));
         my $rand = ''; # chosen by die roll :P
         my ($tmp, $fh, $final);
+        my $common = $smsg->{blob};
+        if (defined(my $pct = $smsg->{pct})) { $common .= "=$pct" }
         do {
-                $tmp = $dst.'tmp/'.$rand."oid=$oid";
+                $tmp = $dst.'tmp/'.$rand.$common;
         } while (!sysopen($fh, $tmp, O_CREAT|O_EXCL|O_WRONLY) &&
                 $! == EEXIST && ($rand = int(rand 0x7fffffff).','));
         if (print $fh $$buf and close($fh)) {
@@ -299,14 +305,14 @@ sub _buf2maildir {
                 $dst .= 'cur/';
                 $rand = '';
                 do {
-                        $final = $dst.$rand."oid=$oid:2,$sfx";
+                        $final = $dst.$rand.$common.':2,'.$sfx;
                 } while (!link($tmp, $final) && $! == EEXIST &&
                         ($rand = int(rand 0x7fffffff).','));
                 unlink($tmp) or warn "W: failed to unlink $tmp: $!\n";
         } else {
                 my $err = $!;
                 unlink($tmp);
-                die "Error writing $oid to $dst: $err";
+                die "Error writing $smsg->{blob} to $dst: $err";
         }
 }
 
@@ -316,12 +322,12 @@ sub _maildir_write_cb ($$) {
         $dedupe->prepare_dedupe;
         my $dst = $lei->{ovv}->{dst};
         sub { # for git_to_mail
-                my ($buf, $oid, $kw) = @_;
-                return _buf2maildir($dst, $buf, $oid, $kw) if !$dedupe;
+                my ($buf, $smsg) = @_;
+                return _buf2maildir($dst, $buf, $smsg) if !$dedupe;
                 my $eml = PublicInbox::Eml->new($$buf); # copy buf
-                return if $dedupe->is_dup($eml, $oid);
+                return if $dedupe->is_dup($eml, $smsg->{blob});
                 undef $eml;
-                _buf2maildir($dst, $buf, $oid, $kw);
+                _buf2maildir($dst, $buf, $smsg);
         }
 }
 
@@ -447,7 +453,7 @@ sub post_augment { # fast (spawn compressor or mkdir), runs in main daemon
 }
 
 sub write_mail { # via ->wq_do
-        my ($self, $git_dir, $oid, $lei, $kw) = @_;
+        my ($self, $git_dir, $smsg, $lei) = @_;
         my $not_done = delete $self->{4}; # write end of {each_smsg_done}
         my $wcb = $self->{wcb} //= do { # first message
                 my %sig = $lei->atfork_child_wq($self);
@@ -456,7 +462,7 @@ sub write_mail { # via ->wq_do
                 $self->write_cb($lei);
         };
         my $git = $self->{"$$\0$git_dir"} //= PublicInbox::Git->new($git_dir);
-        $git->cat_async($oid, \&git_to_mail, [ $wcb, $kw, $not_done ]);
+        $git->cat_async($smsg->{blob}, \&git_to_mail, [$wcb, $smsg, $not_done]);
 }
 
 sub ipc_atfork_prepare {
diff --git a/t/lei_to_mail.t b/t/lei_to_mail.t
index 6673d9a6..47c0e3d4 100644
--- a/t/lei_to_mail.t
+++ b/t/lei_to_mail.t
@@ -18,11 +18,12 @@ my $noeol = "Subject: x\n\nFrom hell";
 my $crlf = $noeol;
 $crlf =~ s/\n/\r\n/g;
 my $kw = [qw(seen answered flagged)];
+my $smsg = { kw => $kw, blob => '0'x40 };
 my @MBOX = qw(mboxcl2 mboxrd mboxcl mboxo);
 for my $mbox (@MBOX) {
         my $m = "eml2$mbox";
         my $cb = PublicInbox::LeiToMail->can($m);
-        my $s = $cb->(PublicInbox::Eml->new($from), $kw);
+        my $s = $cb->(PublicInbox::Eml->new($from), $smsg);
         is(substr($$s, -1, 1), "\n", "trailing LF in normal $mbox");
         my $eml = PublicInbox::Eml->new($s);
         is($eml->header('Status'), 'OR', "Status: set by $m");
@@ -40,7 +41,7 @@ for my $mbox (@MBOX) {
         } else {
                 is(scalar(@cl), 0, "$m clobbered Content-Length");
         }
-        $s = $cb->(PublicInbox::Eml->new($noeol), $kw);
+        $s = $cb->(PublicInbox::Eml->new($noeol), $smsg);
         is(substr($$s, -1, 1), "\n",
                 "trailing LF added by $m when original lacks EOL");
         $eml = PublicInbox::Eml->new($s);
@@ -49,7 +50,7 @@ for my $mbox (@MBOX) {
         } else {
                 is($eml->body_raw, ">From hell\n", "From escaped once by $m");
         }
-        $s = $cb->(PublicInbox::Eml->new($crlf), $kw);
+        $s = $cb->(PublicInbox::Eml->new($crlf), $smsg);
         is(substr($$s, -2, 2), "\r\n",
                 "trailing CRLF added $m by original lacks EOL");
         $eml = PublicInbox::Eml->new($s);
@@ -62,7 +63,7 @@ for my $mbox (@MBOX) {
                 is($eml->header('Content-Length') + length("\r\n"),
                         length($eml->body_raw), "$m Content-Length matches");
         } elsif ($mbox eq 'mboxrd') {
-                $s = $cb->($eml, $kw);
+                $s = $cb->($eml, $smsg);
                 $eml = PublicInbox::Eml->new($s);
                 is($eml->body_raw,
                         ">>From hell\r\n\r\n", "From escaped again by $m");
@@ -102,11 +103,12 @@ my $wcb_get = sub {
         $cb;
 };
 
+my $deadbeef = { blob => 'deadbeef', kw => [ qw(seen) ] };
 my $orig = do {
         my $wcb = $wcb_get->($mbox, $fn);
         is(ref $wcb, 'CODE', 'write_cb returned callback');
         ok(-f $fn && !-s _, 'empty file created');
-        $wcb->(\(my $dup = $buf), 'deadbeef', [ qw(seen) ]);
+        $wcb->(\(my $dup = $buf), $deadbeef);
         undef $wcb;
         open my $fh, '<', $fn or BAIL_OUT $!;
         my $raw = do { local $/; <$fh> };
@@ -116,7 +118,7 @@ my $orig = do {
         local $lei->{opt} = { jobs => 2 };
         $wcb = $wcb_get->($mbox, $fn);
         ok(-f $fn && !-s _, 'truncated mbox destination');
-        $wcb->(\($dup = $buf), 'deadbeef', [ qw(seen) ]);
+        $wcb->(\($dup = $buf), $deadbeef);
         undef $wcb;
         open $fh, '<', $fn or BAIL_OUT $!;
         is(do { local $/; <$fh> }, $raw, 'jobs > 1');
@@ -131,7 +133,7 @@ for my $zsfx (qw(gz bz2 xz)) { # XXX should we support zst, zz, lzo, lzma?
                 ok($dc_cmd, "decompressor for .$zsfx");
                 my $f = "$fn.$zsfx";
                 my $wcb = $wcb_get->($mbox, $f);
-                $wcb->(\(my $dup = $buf), 'deadbeef', [ qw(seen) ]);
+                $wcb->(\(my $dup = $buf), $deadbeef);
                 undef $wcb;
                 my $uncompressed = xqx([@$dc_cmd, $f]);
                 is($uncompressed, $orig, "$zsfx works unlocked");
@@ -139,13 +141,13 @@ for my $zsfx (qw(gz bz2 xz)) { # XXX should we support zst, zz, lzo, lzma?
                 local $lei->{opt} = { jobs => 2 }; # for atomic writes
                 unlink $f or BAIL_OUT "unlink $!";
                 $wcb = $wcb_get->($mbox, $f);
-                $wcb->(\($dup = $buf), 'deadbeef', [ qw(seen) ]);
+                $wcb->(\($dup = $buf), $deadbeef);
                 undef $wcb;
                 is(xqx([@$dc_cmd, $f]), $orig, "$zsfx matches with lock");
 
                 local $lei->{opt} = { augment => 1 };
                 $wcb = $wcb_get->($mbox, $f);
-                $wcb->(\($dup = $buf . "\nx\n"), 'deadbeef', [ qw(seen) ]);
+                $wcb->(\($dup = $buf . "\nx\n"), $deadbeef);
                 undef $wcb; # commit
 
                 my $cat = popen_rd([@$dc_cmd, $f]);
@@ -157,7 +159,7 @@ for my $zsfx (qw(gz bz2 xz)) { # XXX should we support zst, zz, lzo, lzma?
 
                 local $lei->{opt} = { augment => 1, jobs => 2 };
                 $wcb = $wcb_get->($mbox, $f);
-                $wcb->(\($dup = $buf . "\ny\n"), 'deadbeef', [ qw(seen) ]);
+                $wcb->(\($dup = $buf . "\ny\n"), $deadbeef);
                 undef $wcb; # commit
 
                 my @raw3;
@@ -179,7 +181,8 @@ my $as_orig = sub {
 unlink $fn or BAIL_OUT $!;
 if ('default deduplication uses content_hash') {
         my $wcb = $wcb_get->('mboxo', $fn);
-        $wcb->(\(my $x = $buf), 'deadbeef', []) for (1..2);
+        $deadbeef->{kw} = [];
+        $wcb->(\(my $x = $buf), $deadbeef) for (1..2);
         undef $wcb; # undef to commit changes
         my $cmp = '';
         open my $fh, '<', $fn or BAIL_OUT $!;
@@ -188,7 +191,7 @@ if ('default deduplication uses content_hash') {
 
         local $lei->{opt} = { augment => 1 };
         $wcb = $wcb_get->('mboxo', $fn);
-        $wcb->(\($x = $buf . "\nx\n"), 'deadbeef', []) for (1..2);
+        $wcb->(\($x = $buf . "\nx\n"), $deadbeef) for (1..2);
         undef $wcb; # undef to commit changes
         open $fh, '<', $fn or BAIL_OUT $!;
         my @x;
@@ -202,7 +205,7 @@ if ('default deduplication uses content_hash') {
         open my $tmp, '+>', undef or BAIL_OUT $!;
         local $lei->{1} = $tmp;
         my $wcb = $wcb_get->('mboxrd', '/dev/stdout');
-        $wcb->(\(my $x = $buf), 'deadbeef', []);
+        $wcb->(\(my $x = $buf), $deadbeef);
         undef $wcb; # commit
         seek($tmp, 0, SEEK_SET) or BAIL_OUT $!;
         my $cmp = '';
@@ -216,7 +219,7 @@ SKIP: { # FIFO support
         mkfifo($fn, 0600) or skip("mkfifo not supported: $!", 1);
         my $cat = popen_rd([which('cat'), $fn]);
         my $wcb = $wcb_get->('mboxo', $fn);
-        $wcb->(\(my $x = $buf), 'deadbeef', []);
+        $wcb->(\(my $x = $buf), $deadbeef);
         undef $wcb; # commit
         my $cmp = '';
         PublicInbox::MboxReader->mboxo($cat, sub { $cmp .= $as_orig->(@_) });
@@ -227,7 +230,8 @@ SKIP: { # FIFO support
         my $md = "$tmpdir/maildir/";
         my $wcb = $wcb_get->('maildir', $md);
         is(ref($wcb), 'CODE', 'got Maildir callback');
-        $wcb->(\(my $x = $buf), 'badc0ffee', []);
+        my $b4dc0ffee = { blob => 'badc0ffee', kw => [] };
+        $wcb->(\(my $x = $buf), $b4dc0ffee);
 
         my @f;
         PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @f, shift });
@@ -235,7 +239,8 @@ SKIP: { # FIFO support
         is(do { local $/; <$fh> }, $buf, 'wrote to Maildir');
 
         $wcb = $wcb_get->('maildir', $md);
-        $wcb->(\($x = $buf."\nx\n"), 'deadcafe', []);
+        my $deadcafe = { blob => 'deadcafe', kw => [] };
+        $wcb->(\($x = $buf."\nx\n"), $deadcafe);
 
         my @x = ();
         PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @x, shift });
@@ -246,8 +251,8 @@ SKIP: { # FIFO support
 
         local $lei->{opt}->{augment} = 1;
         $wcb = $wcb_get->('maildir', $md);
-        $wcb->(\($x = $buf."\ny\n"), 'deadcafe', []);
-        $wcb->(\($x = $buf."\ny\n"), 'b4dc0ffee', []); # skipped by dedupe
+        $wcb->(\($x = $buf."\ny\n"), $deadcafe);
+        $wcb->(\($x = $buf."\ny\n"), $b4dc0ffee); # skipped by dedupe
         @f = ();
         PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @f, shift });
         is(scalar grep(/\A\Q$x[0]\E\z/, @f), 1, 'old file still there');