From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 04/12] lei: show {pct} and {oid} in From_ lines and filenames
Date: Thu, 21 Jan 2021 19:46:16 +0000 [thread overview]
Message-ID: <20210121194624.32002-5-e@80x24.org> (raw)
In-Reply-To: <20210121194624.32002-1-e@80x24.org>
From_ lines are shown when mbox* variants are output to stdout,
making {oid} and {pct} information visible without the risk of it
being propagated to other importer processes, as could happen if it
were stored in lei-specific X-* headers.
Maildirs already had OIDs in the filename; now they also gain the
Xapian {pct} in case anybody cares.
---
lib/PublicInbox/LeiOverview.pm | 9 ++---
lib/PublicInbox/LeiToMail.pm | 60 +++++++++++++++++++---------------
t/lei_to_mail.t | 41 +++++++++++++----------
3 files changed, 61 insertions(+), 49 deletions(-)
diff --git a/lib/PublicInbox/LeiOverview.pm b/lib/PublicInbox/LeiOverview.pm
index 47d9eb31..7a4fa857 100644
--- a/lib/PublicInbox/LeiOverview.pm
+++ b/lib/PublicInbox/LeiOverview.pm
@@ -224,8 +224,9 @@ sub ovv_each_smsg_cb { # runs in wq worker usually
my $git_dir = $git->{git_dir};
sub {
my ($smsg, $mitem) = @_;
- $l2m->wq_do('write_mail', \@io, $git_dir,
- $smsg->{blob}, $lei_ipc, $smsg->{kw});
+ $smsg->{pct} = get_pct($mitem) if $mitem;
+ $l2m->wq_do('write_mail', \@io, $git_dir, $smsg,
+ $lei_ipc);
}
} elsif ($l2m) {
my $wcb = $l2m->write_cb($lei);
@@ -234,8 +235,8 @@ sub ovv_each_smsg_cb { # runs in wq worker usually
my $g2m = $l2m->can('git_to_mail');
sub {
my ($smsg, $mitem) = @_;
- $git->cat_async($smsg->{blob}, $g2m,
- [ $wcb, $smsg->{kw} ]);
+ $smsg->{pct} = get_pct($mitem) if $mitem;
+ $git->cat_async($smsg->{blob}, $g2m, [ $wcb, $smsg ]);
};
} elsif ($self->{fmt} =~ /\A(concat)?json\z/ && $lei->{opt}->{pretty}) {
my $EOR = ($1//'') eq 'concat' ? "\n}" : "\n},";
diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm
index 1be0b09c..3dcce9e7 100644
--- a/lib/PublicInbox/LeiToMail.pm
+++ b/lib/PublicInbox/LeiToMail.pm
@@ -32,14 +32,14 @@ my %kw2status = (
);
sub _mbox_hdr_buf ($$$) {
- my ($eml, $type, $kw) = @_;
+ my ($eml, $type, $smsg) = @_;
$eml->header_set($_) for (qw(Lines Bytes Content-Length));
# Messages are always 'O' (non-\Recent in IMAP), it saves
# MUAs the trouble of rewriting the mbox if no other
# changes are made
my %hdr = (Status => [ 'O' ]); # set Status, X-Status
- for my $k (@$kw) {
+ for my $k (@{$smsg->{kw} // []}) {
if (my $ent = $kw2status{$k}) {
push @{$hdr{$ent->[0]}}, $ent->[1];
} else { # X-Label?
@@ -53,9 +53,11 @@ sub _mbox_hdr_buf ($$$) {
# fixup old bug from import (pre-a0c07cba0e5d8b6a)
$$buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
+ my $ident = $smsg->{blob} // 'lei';
+ if (defined(my $pct = $smsg->{pct})) { $ident .= "=$pct" }
substr($$buf, 0, 0, # prepend From line
- "From lei\@$type Thu Jan 1 00:00:00 1970$eml->{crlf}");
+ "From $ident\@$type Thu Jan 1 00:00:00 1970$eml->{crlf}");
$buf;
}
@@ -71,8 +73,8 @@ sub _print_full {
}
sub eml2mboxrd ($;$) {
- my ($eml, $kw) = @_;
- my $buf = _mbox_hdr_buf($eml, 'mboxrd', $kw);
+ my ($eml, $smsg) = @_;
+ my $buf = _mbox_hdr_buf($eml, 'mboxrd', $smsg);
if (my $bdy = delete $eml->{bdy}) {
$$bdy =~ s/^(>*From )/>$1/gm;
$$buf .= $eml->{crlf};
@@ -84,8 +86,8 @@ sub eml2mboxrd ($;$) {
}
sub eml2mboxo {
- my ($eml, $kw) = @_;
- my $buf = _mbox_hdr_buf($eml, 'mboxo', $kw);
+ my ($eml, $smsg) = @_;
+ my $buf = _mbox_hdr_buf($eml, 'mboxo', $smsg);
if (my $bdy = delete $eml->{bdy}) {
$$bdy =~ s/^From />From /gm;
$$buf .= $eml->{crlf};
@@ -108,8 +110,8 @@ sub _mboxcl_common ($$$) {
# mboxcl still escapes "From " lines
sub eml2mboxcl {
- my ($eml, $kw) = @_;
- my $buf = _mbox_hdr_buf($eml, 'mboxcl', $kw);
+ my ($eml, $smsg) = @_;
+ my $buf = _mbox_hdr_buf($eml, 'mboxcl', $smsg);
my $crlf = $eml->{crlf};
if (my $bdy = delete $eml->{bdy}) {
$$bdy =~ s/^From />From /gm;
@@ -121,8 +123,8 @@ sub eml2mboxcl {
# mboxcl2 has no "From " escaping
sub eml2mboxcl2 {
- my ($eml, $kw) = @_;
- my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $kw);
+ my ($eml, $smsg) = @_;
+ my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $smsg);
my $crlf = $eml->{crlf};
if (my $bdy = delete $eml->{bdy}) {
_mboxcl_common($buf, $bdy, $crlf);
@@ -140,10 +142,11 @@ sub git_to_mail { # git->cat_async callback
warn "unexpected type=$type for $oid\n";
}
}
- if ($size > 0) {
- my ($write_cb, $kw) = @$arg;
- $write_cb->($bref, $oid, $kw);
+ my ($write_cb, $smsg) = @$arg;
+ if ($smsg->{blob} ne $oid) {
+ die "BUG: expected=$smsg->{blob} got=$oid";
}
+ $write_cb->($bref, $smsg) if $size > 0;
}
sub reap_compress { # dwaitpid callback
@@ -247,11 +250,11 @@ sub _mbox_write_cb ($$) {
my $dedupe = $lei->{dedupe};
$dedupe->prepare_dedupe;
sub { # for git_to_mail
- my ($buf, $oid, $kw) = @_;
+ my ($buf, $smsg) = @_;
return unless $out;
my $eml = PublicInbox::Eml->new($buf);
- if (!$dedupe->is_dup($eml, $oid)) {
- $buf = $eml2mbox->($eml, $kw);
+ if (!$dedupe->is_dup($eml, $smsg->{blob})) {
+ $buf = $eml2mbox->($eml, $smsg);
my $lk = $ovv->lock_for_scope;
eval { $write->($out, $buf) };
if ($@) {
@@ -283,12 +286,15 @@ sub _augment_file { # _maildir_each_file cb
sub _unlink { unlink($_[0]) }
sub _buf2maildir {
- my ($dst, $buf, $oid, $kw) = @_;
+ my ($dst, $buf, $smsg) = @_;
+ my $kw = $smsg->{kw} // [];
my $sfx = join('', sort(map { $kw2char{$_} // () } @$kw));
my $rand = ''; # chosen by die roll :P
my ($tmp, $fh, $final);
+ my $common = $smsg->{blob};
+ if (defined(my $pct = $smsg->{pct})) { $common .= "=$pct" }
do {
- $tmp = $dst.'tmp/'.$rand."oid=$oid";
+ $tmp = $dst.'tmp/'.$rand.$common;
} while (!sysopen($fh, $tmp, O_CREAT|O_EXCL|O_WRONLY) &&
$! == EEXIST && ($rand = int(rand 0x7fffffff).','));
if (print $fh $$buf and close($fh)) {
@@ -299,14 +305,14 @@ sub _buf2maildir {
$dst .= 'cur/';
$rand = '';
do {
- $final = $dst.$rand."oid=$oid:2,$sfx";
+ $final = $dst.$rand.$common.':2,'.$sfx;
} while (!link($tmp, $final) && $! == EEXIST &&
($rand = int(rand 0x7fffffff).','));
unlink($tmp) or warn "W: failed to unlink $tmp: $!\n";
} else {
my $err = $!;
unlink($tmp);
- die "Error writing $oid to $dst: $err";
+ die "Error writing $smsg->{blob} to $dst: $err";
}
}
@@ -316,12 +322,12 @@ sub _maildir_write_cb ($$) {
$dedupe->prepare_dedupe;
my $dst = $lei->{ovv}->{dst};
sub { # for git_to_mail
- my ($buf, $oid, $kw) = @_;
- return _buf2maildir($dst, $buf, $oid, $kw) if !$dedupe;
+ my ($buf, $smsg) = @_;
+ return _buf2maildir($dst, $buf, $smsg) if !$dedupe;
my $eml = PublicInbox::Eml->new($$buf); # copy buf
- return if $dedupe->is_dup($eml, $oid);
+ return if $dedupe->is_dup($eml, $smsg->{blob});
undef $eml;
- _buf2maildir($dst, $buf, $oid, $kw);
+ _buf2maildir($dst, $buf, $smsg);
}
}
@@ -447,7 +453,7 @@ sub post_augment { # fast (spawn compressor or mkdir), runs in main daemon
}
sub write_mail { # via ->wq_do
- my ($self, $git_dir, $oid, $lei, $kw) = @_;
+ my ($self, $git_dir, $smsg, $lei) = @_;
my $not_done = delete $self->{4}; # write end of {each_smsg_done}
my $wcb = $self->{wcb} //= do { # first message
my %sig = $lei->atfork_child_wq($self);
@@ -456,7 +462,7 @@ sub write_mail { # via ->wq_do
$self->write_cb($lei);
};
my $git = $self->{"$$\0$git_dir"} //= PublicInbox::Git->new($git_dir);
- $git->cat_async($oid, \&git_to_mail, [ $wcb, $kw, $not_done ]);
+ $git->cat_async($smsg->{blob}, \&git_to_mail, [$wcb, $smsg, $not_done]);
}
sub ipc_atfork_prepare {
diff --git a/t/lei_to_mail.t b/t/lei_to_mail.t
index 6673d9a6..47c0e3d4 100644
--- a/t/lei_to_mail.t
+++ b/t/lei_to_mail.t
@@ -18,11 +18,12 @@ my $noeol = "Subject: x\n\nFrom hell";
my $crlf = $noeol;
$crlf =~ s/\n/\r\n/g;
my $kw = [qw(seen answered flagged)];
+my $smsg = { kw => $kw, blob => '0'x40 };
my @MBOX = qw(mboxcl2 mboxrd mboxcl mboxo);
for my $mbox (@MBOX) {
my $m = "eml2$mbox";
my $cb = PublicInbox::LeiToMail->can($m);
- my $s = $cb->(PublicInbox::Eml->new($from), $kw);
+ my $s = $cb->(PublicInbox::Eml->new($from), $smsg);
is(substr($$s, -1, 1), "\n", "trailing LF in normal $mbox");
my $eml = PublicInbox::Eml->new($s);
is($eml->header('Status'), 'OR', "Status: set by $m");
@@ -40,7 +41,7 @@ for my $mbox (@MBOX) {
} else {
is(scalar(@cl), 0, "$m clobbered Content-Length");
}
- $s = $cb->(PublicInbox::Eml->new($noeol), $kw);
+ $s = $cb->(PublicInbox::Eml->new($noeol), $smsg);
is(substr($$s, -1, 1), "\n",
"trailing LF added by $m when original lacks EOL");
$eml = PublicInbox::Eml->new($s);
@@ -49,7 +50,7 @@ for my $mbox (@MBOX) {
} else {
is($eml->body_raw, ">From hell\n", "From escaped once by $m");
}
- $s = $cb->(PublicInbox::Eml->new($crlf), $kw);
+ $s = $cb->(PublicInbox::Eml->new($crlf), $smsg);
is(substr($$s, -2, 2), "\r\n",
"trailing CRLF added $m by original lacks EOL");
$eml = PublicInbox::Eml->new($s);
@@ -62,7 +63,7 @@ for my $mbox (@MBOX) {
is($eml->header('Content-Length') + length("\r\n"),
length($eml->body_raw), "$m Content-Length matches");
} elsif ($mbox eq 'mboxrd') {
- $s = $cb->($eml, $kw);
+ $s = $cb->($eml, $smsg);
$eml = PublicInbox::Eml->new($s);
is($eml->body_raw,
">>From hell\r\n\r\n", "From escaped again by $m");
@@ -102,11 +103,12 @@ my $wcb_get = sub {
$cb;
};
+my $deadbeef = { blob => 'deadbeef', kw => [ qw(seen) ] };
my $orig = do {
my $wcb = $wcb_get->($mbox, $fn);
is(ref $wcb, 'CODE', 'write_cb returned callback');
ok(-f $fn && !-s _, 'empty file created');
- $wcb->(\(my $dup = $buf), 'deadbeef', [ qw(seen) ]);
+ $wcb->(\(my $dup = $buf), $deadbeef);
undef $wcb;
open my $fh, '<', $fn or BAIL_OUT $!;
my $raw = do { local $/; <$fh> };
@@ -116,7 +118,7 @@ my $orig = do {
local $lei->{opt} = { jobs => 2 };
$wcb = $wcb_get->($mbox, $fn);
ok(-f $fn && !-s _, 'truncated mbox destination');
- $wcb->(\($dup = $buf), 'deadbeef', [ qw(seen) ]);
+ $wcb->(\($dup = $buf), $deadbeef);
undef $wcb;
open $fh, '<', $fn or BAIL_OUT $!;
is(do { local $/; <$fh> }, $raw, 'jobs > 1');
@@ -131,7 +133,7 @@ for my $zsfx (qw(gz bz2 xz)) { # XXX should we support zst, zz, lzo, lzma?
ok($dc_cmd, "decompressor for .$zsfx");
my $f = "$fn.$zsfx";
my $wcb = $wcb_get->($mbox, $f);
- $wcb->(\(my $dup = $buf), 'deadbeef', [ qw(seen) ]);
+ $wcb->(\(my $dup = $buf), $deadbeef);
undef $wcb;
my $uncompressed = xqx([@$dc_cmd, $f]);
is($uncompressed, $orig, "$zsfx works unlocked");
@@ -139,13 +141,13 @@ for my $zsfx (qw(gz bz2 xz)) { # XXX should we support zst, zz, lzo, lzma?
local $lei->{opt} = { jobs => 2 }; # for atomic writes
unlink $f or BAIL_OUT "unlink $!";
$wcb = $wcb_get->($mbox, $f);
- $wcb->(\($dup = $buf), 'deadbeef', [ qw(seen) ]);
+ $wcb->(\($dup = $buf), $deadbeef);
undef $wcb;
is(xqx([@$dc_cmd, $f]), $orig, "$zsfx matches with lock");
local $lei->{opt} = { augment => 1 };
$wcb = $wcb_get->($mbox, $f);
- $wcb->(\($dup = $buf . "\nx\n"), 'deadbeef', [ qw(seen) ]);
+ $wcb->(\($dup = $buf . "\nx\n"), $deadbeef);
undef $wcb; # commit
my $cat = popen_rd([@$dc_cmd, $f]);
@@ -157,7 +159,7 @@ for my $zsfx (qw(gz bz2 xz)) { # XXX should we support zst, zz, lzo, lzma?
local $lei->{opt} = { augment => 1, jobs => 2 };
$wcb = $wcb_get->($mbox, $f);
- $wcb->(\($dup = $buf . "\ny\n"), 'deadbeef', [ qw(seen) ]);
+ $wcb->(\($dup = $buf . "\ny\n"), $deadbeef);
undef $wcb; # commit
my @raw3;
@@ -179,7 +181,8 @@ my $as_orig = sub {
unlink $fn or BAIL_OUT $!;
if ('default deduplication uses content_hash') {
my $wcb = $wcb_get->('mboxo', $fn);
- $wcb->(\(my $x = $buf), 'deadbeef', []) for (1..2);
+ $deadbeef->{kw} = [];
+ $wcb->(\(my $x = $buf), $deadbeef) for (1..2);
undef $wcb; # undef to commit changes
my $cmp = '';
open my $fh, '<', $fn or BAIL_OUT $!;
@@ -188,7 +191,7 @@ if ('default deduplication uses content_hash') {
local $lei->{opt} = { augment => 1 };
$wcb = $wcb_get->('mboxo', $fn);
- $wcb->(\($x = $buf . "\nx\n"), 'deadbeef', []) for (1..2);
+ $wcb->(\($x = $buf . "\nx\n"), $deadbeef) for (1..2);
undef $wcb; # undef to commit changes
open $fh, '<', $fn or BAIL_OUT $!;
my @x;
@@ -202,7 +205,7 @@ if ('default deduplication uses content_hash') {
open my $tmp, '+>', undef or BAIL_OUT $!;
local $lei->{1} = $tmp;
my $wcb = $wcb_get->('mboxrd', '/dev/stdout');
- $wcb->(\(my $x = $buf), 'deadbeef', []);
+ $wcb->(\(my $x = $buf), $deadbeef);
undef $wcb; # commit
seek($tmp, 0, SEEK_SET) or BAIL_OUT $!;
my $cmp = '';
@@ -216,7 +219,7 @@ SKIP: { # FIFO support
mkfifo($fn, 0600) or skip("mkfifo not supported: $!", 1);
my $cat = popen_rd([which('cat'), $fn]);
my $wcb = $wcb_get->('mboxo', $fn);
- $wcb->(\(my $x = $buf), 'deadbeef', []);
+ $wcb->(\(my $x = $buf), $deadbeef);
undef $wcb; # commit
my $cmp = '';
PublicInbox::MboxReader->mboxo($cat, sub { $cmp .= $as_orig->(@_) });
@@ -227,7 +230,8 @@ SKIP: { # FIFO support
my $md = "$tmpdir/maildir/";
my $wcb = $wcb_get->('maildir', $md);
is(ref($wcb), 'CODE', 'got Maildir callback');
- $wcb->(\(my $x = $buf), 'badc0ffee', []);
+ my $b4dc0ffee = { blob => 'badc0ffee', kw => [] };
+ $wcb->(\(my $x = $buf), $b4dc0ffee);
my @f;
PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @f, shift });
@@ -235,7 +239,8 @@ SKIP: { # FIFO support
is(do { local $/; <$fh> }, $buf, 'wrote to Maildir');
$wcb = $wcb_get->('maildir', $md);
- $wcb->(\($x = $buf."\nx\n"), 'deadcafe', []);
+ my $deadcafe = { blob => 'deadcafe', kw => [] };
+ $wcb->(\($x = $buf."\nx\n"), $deadcafe);
my @x = ();
PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @x, shift });
@@ -246,8 +251,8 @@ SKIP: { # FIFO support
local $lei->{opt}->{augment} = 1;
$wcb = $wcb_get->('maildir', $md);
- $wcb->(\($x = $buf."\ny\n"), 'deadcafe', []);
- $wcb->(\($x = $buf."\ny\n"), 'b4dc0ffee', []); # skipped by dedupe
+ $wcb->(\($x = $buf."\ny\n"), $deadcafe);
+ $wcb->(\($x = $buf."\ny\n"), $b4dc0ffee); # skipped by dedupe
@f = ();
PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @f, shift });
is(scalar grep(/\A\Q$x[0]\E\z/, @f), 1, 'old file still there');
next prev parent reply other threads:[~2021-01-21 19:46 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-01-21 19:46 [PATCH 00/12] lei: another dump Eric Wong
2021-01-21 19:46 ` [PATCH 01/12] lei_overview: rename {relevance} => {pct} Eric Wong
2021-01-21 19:46 ` [PATCH 02/12] lei q: retrieve keywords for local, non-external messages Eric Wong
2021-01-21 19:46 ` [PATCH 03/12] lei_xsearch: eliminate some unused, commented-out code Eric Wong
2021-01-21 19:46 ` Eric Wong [this message]
2021-01-21 19:46 ` [PATCH 05/12] lei: fix inadvertant FD sharing Eric Wong
2021-01-21 19:46 ` [PATCH 06/12] lei_to_mail: avoid segfault on exit Eric Wong
2021-01-21 19:46 ` [PATCH 07/12] lei: oneshot: use client $io[2] for placeholder Eric Wong
2021-01-21 19:46 ` [PATCH 08/12] lei: remove INT/QUIT/TERM handlers, fix daemon EOF Eric Wong
2021-01-21 19:46 ` [PATCH 09/12] lei_xsearch: reduce reference paths to lxs Eric Wong
2021-01-21 19:46 ` [PATCH 10/12] lei: remove @TO_CLOSE_ATFORK_CHILD Eric Wong
2021-01-21 19:46 ` [PATCH 11/12] lei: forget-external support with canonicalization Eric Wong
2021-01-21 19:46 ` [PATCH 12/12] lei forget-external: bash completion support Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210121194624.32002-5-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).