user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 04/12] lei: show {pct} and {oid} in From_ lines and filenames
Date: Thu, 21 Jan 2021 19:46:16 +0000	[thread overview]
Message-ID: <20210121194624.32002-5-e@80x24.org> (raw)
In-Reply-To: <20210121194624.32002-1-e@80x24.org>

From_ lines are shown when mbox* variants are written to stdout.
Putting {oid} and {pct} there makes that information visible
without the risk of it being propagated to other importer
processes, as could happen if it were stored in lei-specific
X-* headers.

Maildirs already had OIDs in the filename; now they also gain the
Xapian {pct}, in case anybody cares.
---
 lib/PublicInbox/LeiOverview.pm |  9 ++---
 lib/PublicInbox/LeiToMail.pm   | 60 +++++++++++++++++++---------------
 t/lei_to_mail.t                | 41 +++++++++++++----------
 3 files changed, 61 insertions(+), 49 deletions(-)

diff --git a/lib/PublicInbox/LeiOverview.pm b/lib/PublicInbox/LeiOverview.pm
index 47d9eb31..7a4fa857 100644
--- a/lib/PublicInbox/LeiOverview.pm
+++ b/lib/PublicInbox/LeiOverview.pm
@@ -224,8 +224,9 @@ sub ovv_each_smsg_cb { # runs in wq worker usually
 		my $git_dir = $git->{git_dir};
 		sub {
 			my ($smsg, $mitem) = @_;
-			$l2m->wq_do('write_mail', \@io, $git_dir,
-					$smsg->{blob}, $lei_ipc, $smsg->{kw});
+			$smsg->{pct} = get_pct($mitem) if $mitem;
+			$l2m->wq_do('write_mail', \@io, $git_dir, $smsg,
+					$lei_ipc);
 		}
 	} elsif ($l2m) {
 		my $wcb = $l2m->write_cb($lei);
@@ -234,8 +235,8 @@ sub ovv_each_smsg_cb { # runs in wq worker usually
 		my $g2m = $l2m->can('git_to_mail');
 		sub {
 			my ($smsg, $mitem) = @_;
-			$git->cat_async($smsg->{blob}, $g2m,
-					[ $wcb, $smsg->{kw} ]);
+			$smsg->{pct} = get_pct($mitem) if $mitem;
+			$git->cat_async($smsg->{blob}, $g2m, [ $wcb, $smsg ]);
 		};
 	} elsif ($self->{fmt} =~ /\A(concat)?json\z/ && $lei->{opt}->{pretty}) {
 		my $EOR = ($1//'') eq 'concat' ? "\n}" : "\n},";
diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm
index 1be0b09c..3dcce9e7 100644
--- a/lib/PublicInbox/LeiToMail.pm
+++ b/lib/PublicInbox/LeiToMail.pm
@@ -32,14 +32,14 @@ my %kw2status = (
 );
 
 sub _mbox_hdr_buf ($$$) {
-	my ($eml, $type, $kw) = @_;
+	my ($eml, $type, $smsg) = @_;
 	$eml->header_set($_) for (qw(Lines Bytes Content-Length));
 
 	# Messages are always 'O' (non-\Recent in IMAP), it saves
 	# MUAs the trouble of rewriting the mbox if no other
 	# changes are made
 	my %hdr = (Status => [ 'O' ]); # set Status, X-Status
-	for my $k (@$kw) {
+	for my $k (@{$smsg->{kw} // []}) {
 		if (my $ent = $kw2status{$k}) {
 			push @{$hdr{$ent->[0]}}, $ent->[1];
 		} else { # X-Label?
@@ -53,9 +53,11 @@ sub _mbox_hdr_buf ($$$) {
 
 	# fixup old bug from import (pre-a0c07cba0e5d8b6a)
 	$$buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s;
+	my $ident = $smsg->{blob} // 'lei';
+	if (defined(my $pct = $smsg->{pct})) { $ident .= "=$pct" }
 
 	substr($$buf, 0, 0, # prepend From line
-		"From lei\@$type Thu Jan  1 00:00:00 1970$eml->{crlf}");
+		"From $ident\@$type Thu Jan  1 00:00:00 1970$eml->{crlf}");
 	$buf;
 }
 
@@ -71,8 +73,8 @@ sub _print_full {
 }
 
 sub eml2mboxrd ($;$) {
-	my ($eml, $kw) = @_;
-	my $buf = _mbox_hdr_buf($eml, 'mboxrd', $kw);
+	my ($eml, $smsg) = @_;
+	my $buf = _mbox_hdr_buf($eml, 'mboxrd', $smsg);
 	if (my $bdy = delete $eml->{bdy}) {
 		$$bdy =~ s/^(>*From )/>$1/gm;
 		$$buf .= $eml->{crlf};
@@ -84,8 +86,8 @@ sub eml2mboxrd ($;$) {
 }
 
 sub eml2mboxo {
-	my ($eml, $kw) = @_;
-	my $buf = _mbox_hdr_buf($eml, 'mboxo', $kw);
+	my ($eml, $smsg) = @_;
+	my $buf = _mbox_hdr_buf($eml, 'mboxo', $smsg);
 	if (my $bdy = delete $eml->{bdy}) {
 		$$bdy =~ s/^From />From /gm;
 		$$buf .= $eml->{crlf};
@@ -108,8 +110,8 @@ sub _mboxcl_common ($$$) {
 
 # mboxcl still escapes "From " lines
 sub eml2mboxcl {
-	my ($eml, $kw) = @_;
-	my $buf = _mbox_hdr_buf($eml, 'mboxcl', $kw);
+	my ($eml, $smsg) = @_;
+	my $buf = _mbox_hdr_buf($eml, 'mboxcl', $smsg);
 	my $crlf = $eml->{crlf};
 	if (my $bdy = delete $eml->{bdy}) {
 		$$bdy =~ s/^From />From /gm;
@@ -121,8 +123,8 @@ sub eml2mboxcl {
 
 # mboxcl2 has no "From " escaping
 sub eml2mboxcl2 {
-	my ($eml, $kw) = @_;
-	my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $kw);
+	my ($eml, $smsg) = @_;
+	my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $smsg);
 	my $crlf = $eml->{crlf};
 	if (my $bdy = delete $eml->{bdy}) {
 		_mboxcl_common($buf, $bdy, $crlf);
@@ -140,10 +142,11 @@ sub git_to_mail { # git->cat_async callback
 			warn "unexpected type=$type for $oid\n";
 		}
 	}
-	if ($size > 0) {
-		my ($write_cb, $kw) = @$arg;
-		$write_cb->($bref, $oid, $kw);
+	my ($write_cb, $smsg) = @$arg;
+	if ($smsg->{blob} ne $oid) {
+		die "BUG: expected=$smsg->{blob} got=$oid";
 	}
+	$write_cb->($bref, $smsg) if $size > 0;
 }
 
 sub reap_compress { # dwaitpid callback
@@ -247,11 +250,11 @@ sub _mbox_write_cb ($$) {
 	my $dedupe = $lei->{dedupe};
 	$dedupe->prepare_dedupe;
 	sub { # for git_to_mail
-		my ($buf, $oid, $kw) = @_;
+		my ($buf, $smsg) = @_;
 		return unless $out;
 		my $eml = PublicInbox::Eml->new($buf);
-		if (!$dedupe->is_dup($eml, $oid)) {
-			$buf = $eml2mbox->($eml, $kw);
+		if (!$dedupe->is_dup($eml, $smsg->{blob})) {
+			$buf = $eml2mbox->($eml, $smsg);
 			my $lk = $ovv->lock_for_scope;
 			eval { $write->($out, $buf) };
 			if ($@) {
@@ -283,12 +286,15 @@ sub _augment_file { # _maildir_each_file cb
 sub _unlink { unlink($_[0]) }
 
 sub _buf2maildir {
-	my ($dst, $buf, $oid, $kw) = @_;
+	my ($dst, $buf, $smsg) = @_;
+	my $kw = $smsg->{kw} // [];
 	my $sfx = join('', sort(map { $kw2char{$_} // () } @$kw));
 	my $rand = ''; # chosen by die roll :P
 	my ($tmp, $fh, $final);
+	my $common = $smsg->{blob};
+	if (defined(my $pct = $smsg->{pct})) { $common .= "=$pct" }
 	do {
-		$tmp = $dst.'tmp/'.$rand."oid=$oid";
+		$tmp = $dst.'tmp/'.$rand.$common;
 	} while (!sysopen($fh, $tmp, O_CREAT|O_EXCL|O_WRONLY) &&
 		$! == EEXIST && ($rand = int(rand 0x7fffffff).','));
 	if (print $fh $$buf and close($fh)) {
@@ -299,14 +305,14 @@ sub _buf2maildir {
 		$dst .= 'cur/';
 		$rand = '';
 		do {
-			$final = $dst.$rand."oid=$oid:2,$sfx";
+			$final = $dst.$rand.$common.':2,'.$sfx;
 		} while (!link($tmp, $final) && $! == EEXIST &&
 			($rand = int(rand 0x7fffffff).','));
 		unlink($tmp) or warn "W: failed to unlink $tmp: $!\n";
 	} else {
 		my $err = $!;
 		unlink($tmp);
-		die "Error writing $oid to $dst: $err";
+		die "Error writing $smsg->{blob} to $dst: $err";
 	}
 }
 
@@ -316,12 +322,12 @@ sub _maildir_write_cb ($$) {
 	$dedupe->prepare_dedupe;
 	my $dst = $lei->{ovv}->{dst};
 	sub { # for git_to_mail
-		my ($buf, $oid, $kw) = @_;
-		return _buf2maildir($dst, $buf, $oid, $kw) if !$dedupe;
+		my ($buf, $smsg) = @_;
+		return _buf2maildir($dst, $buf, $smsg) if !$dedupe;
 		my $eml = PublicInbox::Eml->new($$buf); # copy buf
-		return if $dedupe->is_dup($eml, $oid);
+		return if $dedupe->is_dup($eml, $smsg->{blob});
 		undef $eml;
-		_buf2maildir($dst, $buf, $oid, $kw);
+		_buf2maildir($dst, $buf, $smsg);
 	}
 }
 
@@ -447,7 +453,7 @@ sub post_augment { # fast (spawn compressor or mkdir), runs in main daemon
 }
 
 sub write_mail { # via ->wq_do
-	my ($self, $git_dir, $oid, $lei, $kw) = @_;
+	my ($self, $git_dir, $smsg, $lei) = @_;
 	my $not_done = delete $self->{4}; # write end of {each_smsg_done}
 	my $wcb = $self->{wcb} //= do { # first message
 		my %sig = $lei->atfork_child_wq($self);
@@ -456,7 +462,7 @@ sub write_mail { # via ->wq_do
 		$self->write_cb($lei);
 	};
 	my $git = $self->{"$$\0$git_dir"} //= PublicInbox::Git->new($git_dir);
-	$git->cat_async($oid, \&git_to_mail, [ $wcb, $kw, $not_done ]);
+	$git->cat_async($smsg->{blob}, \&git_to_mail, [$wcb, $smsg, $not_done]);
 }
 
 sub ipc_atfork_prepare {
diff --git a/t/lei_to_mail.t b/t/lei_to_mail.t
index 6673d9a6..47c0e3d4 100644
--- a/t/lei_to_mail.t
+++ b/t/lei_to_mail.t
@@ -18,11 +18,12 @@ my $noeol = "Subject: x\n\nFrom hell";
 my $crlf = $noeol;
 $crlf =~ s/\n/\r\n/g;
 my $kw = [qw(seen answered flagged)];
+my $smsg = { kw => $kw, blob => '0'x40 };
 my @MBOX = qw(mboxcl2 mboxrd mboxcl mboxo);
 for my $mbox (@MBOX) {
 	my $m = "eml2$mbox";
 	my $cb = PublicInbox::LeiToMail->can($m);
-	my $s = $cb->(PublicInbox::Eml->new($from), $kw);
+	my $s = $cb->(PublicInbox::Eml->new($from), $smsg);
 	is(substr($$s, -1, 1), "\n", "trailing LF in normal $mbox");
 	my $eml = PublicInbox::Eml->new($s);
 	is($eml->header('Status'), 'OR', "Status: set by $m");
@@ -40,7 +41,7 @@ for my $mbox (@MBOX) {
 	} else {
 		is(scalar(@cl), 0, "$m clobbered Content-Length");
 	}
-	$s = $cb->(PublicInbox::Eml->new($noeol), $kw);
+	$s = $cb->(PublicInbox::Eml->new($noeol), $smsg);
 	is(substr($$s, -1, 1), "\n",
 		"trailing LF added by $m when original lacks EOL");
 	$eml = PublicInbox::Eml->new($s);
@@ -49,7 +50,7 @@ for my $mbox (@MBOX) {
 	} else {
 		is($eml->body_raw, ">From hell\n", "From escaped once by $m");
 	}
-	$s = $cb->(PublicInbox::Eml->new($crlf), $kw);
+	$s = $cb->(PublicInbox::Eml->new($crlf), $smsg);
 	is(substr($$s, -2, 2), "\r\n",
 		"trailing CRLF added $m by original lacks EOL");
 	$eml = PublicInbox::Eml->new($s);
@@ -62,7 +63,7 @@ for my $mbox (@MBOX) {
 		is($eml->header('Content-Length') + length("\r\n"),
 			length($eml->body_raw), "$m Content-Length matches");
 	} elsif ($mbox eq 'mboxrd') {
-		$s = $cb->($eml, $kw);
+		$s = $cb->($eml, $smsg);
 		$eml = PublicInbox::Eml->new($s);
 		is($eml->body_raw,
 			">>From hell\r\n\r\n", "From escaped again by $m");
@@ -102,11 +103,12 @@ my $wcb_get = sub {
 	$cb;
 };
 
+my $deadbeef = { blob => 'deadbeef', kw => [ qw(seen) ] };
 my $orig = do {
 	my $wcb = $wcb_get->($mbox, $fn);
 	is(ref $wcb, 'CODE', 'write_cb returned callback');
 	ok(-f $fn && !-s _, 'empty file created');
-	$wcb->(\(my $dup = $buf), 'deadbeef', [ qw(seen) ]);
+	$wcb->(\(my $dup = $buf), $deadbeef);
 	undef $wcb;
 	open my $fh, '<', $fn or BAIL_OUT $!;
 	my $raw = do { local $/; <$fh> };
@@ -116,7 +118,7 @@ my $orig = do {
 	local $lei->{opt} = { jobs => 2 };
 	$wcb = $wcb_get->($mbox, $fn);
 	ok(-f $fn && !-s _, 'truncated mbox destination');
-	$wcb->(\($dup = $buf), 'deadbeef', [ qw(seen) ]);
+	$wcb->(\($dup = $buf), $deadbeef);
 	undef $wcb;
 	open $fh, '<', $fn or BAIL_OUT $!;
 	is(do { local $/; <$fh> }, $raw, 'jobs > 1');
@@ -131,7 +133,7 @@ for my $zsfx (qw(gz bz2 xz)) { # XXX should we support zst, zz, lzo, lzma?
 		ok($dc_cmd, "decompressor for .$zsfx");
 		my $f = "$fn.$zsfx";
 		my $wcb = $wcb_get->($mbox, $f);
-		$wcb->(\(my $dup = $buf), 'deadbeef', [ qw(seen) ]);
+		$wcb->(\(my $dup = $buf), $deadbeef);
 		undef $wcb;
 		my $uncompressed = xqx([@$dc_cmd, $f]);
 		is($uncompressed, $orig, "$zsfx works unlocked");
@@ -139,13 +141,13 @@ for my $zsfx (qw(gz bz2 xz)) { # XXX should we support zst, zz, lzo, lzma?
 		local $lei->{opt} = { jobs => 2 }; # for atomic writes
 		unlink $f or BAIL_OUT "unlink $!";
 		$wcb = $wcb_get->($mbox, $f);
-		$wcb->(\($dup = $buf), 'deadbeef', [ qw(seen) ]);
+		$wcb->(\($dup = $buf), $deadbeef);
 		undef $wcb;
 		is(xqx([@$dc_cmd, $f]), $orig, "$zsfx matches with lock");
 
 		local $lei->{opt} = { augment => 1 };
 		$wcb = $wcb_get->($mbox, $f);
-		$wcb->(\($dup = $buf . "\nx\n"), 'deadbeef', [ qw(seen) ]);
+		$wcb->(\($dup = $buf . "\nx\n"), $deadbeef);
 		undef $wcb; # commit
 
 		my $cat = popen_rd([@$dc_cmd, $f]);
@@ -157,7 +159,7 @@ for my $zsfx (qw(gz bz2 xz)) { # XXX should we support zst, zz, lzo, lzma?
 
 		local $lei->{opt} = { augment => 1, jobs => 2 };
 		$wcb = $wcb_get->($mbox, $f);
-		$wcb->(\($dup = $buf . "\ny\n"), 'deadbeef', [ qw(seen) ]);
+		$wcb->(\($dup = $buf . "\ny\n"), $deadbeef);
 		undef $wcb; # commit
 
 		my @raw3;
@@ -179,7 +181,8 @@ my $as_orig = sub {
 unlink $fn or BAIL_OUT $!;
 if ('default deduplication uses content_hash') {
 	my $wcb = $wcb_get->('mboxo', $fn);
-	$wcb->(\(my $x = $buf), 'deadbeef', []) for (1..2);
+	$deadbeef->{kw} = [];
+	$wcb->(\(my $x = $buf), $deadbeef) for (1..2);
 	undef $wcb; # undef to commit changes
 	my $cmp = '';
 	open my $fh, '<', $fn or BAIL_OUT $!;
@@ -188,7 +191,7 @@ if ('default deduplication uses content_hash') {
 
 	local $lei->{opt} = { augment => 1 };
 	$wcb = $wcb_get->('mboxo', $fn);
-	$wcb->(\($x = $buf . "\nx\n"), 'deadbeef', []) for (1..2);
+	$wcb->(\($x = $buf . "\nx\n"), $deadbeef) for (1..2);
 	undef $wcb; # undef to commit changes
 	open $fh, '<', $fn or BAIL_OUT $!;
 	my @x;
@@ -202,7 +205,7 @@ if ('default deduplication uses content_hash') {
 	open my $tmp, '+>', undef or BAIL_OUT $!;
 	local $lei->{1} = $tmp;
 	my $wcb = $wcb_get->('mboxrd', '/dev/stdout');
-	$wcb->(\(my $x = $buf), 'deadbeef', []);
+	$wcb->(\(my $x = $buf), $deadbeef);
 	undef $wcb; # commit
 	seek($tmp, 0, SEEK_SET) or BAIL_OUT $!;
 	my $cmp = '';
@@ -216,7 +219,7 @@ SKIP: { # FIFO support
 	mkfifo($fn, 0600) or skip("mkfifo not supported: $!", 1);
 	my $cat = popen_rd([which('cat'), $fn]);
 	my $wcb = $wcb_get->('mboxo', $fn);
-	$wcb->(\(my $x = $buf), 'deadbeef', []);
+	$wcb->(\(my $x = $buf), $deadbeef);
 	undef $wcb; # commit
 	my $cmp = '';
 	PublicInbox::MboxReader->mboxo($cat, sub { $cmp .= $as_orig->(@_) });
@@ -227,7 +230,8 @@ SKIP: { # FIFO support
 	my $md = "$tmpdir/maildir/";
 	my $wcb = $wcb_get->('maildir', $md);
 	is(ref($wcb), 'CODE', 'got Maildir callback');
-	$wcb->(\(my $x = $buf), 'badc0ffee', []);
+	my $b4dc0ffee = { blob => 'badc0ffee', kw => [] };
+	$wcb->(\(my $x = $buf), $b4dc0ffee);
 
 	my @f;
 	PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @f, shift });
@@ -235,7 +239,8 @@ SKIP: { # FIFO support
 	is(do { local $/; <$fh> }, $buf, 'wrote to Maildir');
 
 	$wcb = $wcb_get->('maildir', $md);
-	$wcb->(\($x = $buf."\nx\n"), 'deadcafe', []);
+	my $deadcafe = { blob => 'deadcafe', kw => [] };
+	$wcb->(\($x = $buf."\nx\n"), $deadcafe);
 
 	my @x = ();
 	PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @x, shift });
@@ -246,8 +251,8 @@ SKIP: { # FIFO support
 
 	local $lei->{opt}->{augment} = 1;
 	$wcb = $wcb_get->('maildir', $md);
-	$wcb->(\($x = $buf."\ny\n"), 'deadcafe', []);
-	$wcb->(\($x = $buf."\ny\n"), 'b4dc0ffee', []); # skipped by dedupe
+	$wcb->(\($x = $buf."\ny\n"), $deadcafe);
+	$wcb->(\($x = $buf."\ny\n"), $b4dc0ffee); # skipped by dedupe
 	@f = ();
 	PublicInbox::LeiToMail::_maildir_each_file($md, sub { push @f, shift });
 	is(scalar grep(/\A\Q$x[0]\E\z/, @f), 1, 'old file still there');

  parent reply	other threads:[~2021-01-21 19:46 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-01-21 19:46 [PATCH 00/12] lei: another dump Eric Wong
2021-01-21 19:46 ` [PATCH 01/12] lei_overview: rename {relevance} => {pct} Eric Wong
2021-01-21 19:46 ` [PATCH 02/12] lei q: retrieve keywords for local, non-external messages Eric Wong
2021-01-21 19:46 ` [PATCH 03/12] lei_xsearch: eliminate some unused, commented-out code Eric Wong
2021-01-21 19:46 ` Eric Wong [this message]
2021-01-21 19:46 ` [PATCH 05/12] lei: fix inadvertent FD sharing Eric Wong
2021-01-21 19:46 ` [PATCH 06/12] lei_to_mail: avoid segfault on exit Eric Wong
2021-01-21 19:46 ` [PATCH 07/12] lei: oneshot: use client $io[2] for placeholder Eric Wong
2021-01-21 19:46 ` [PATCH 08/12] lei: remove INT/QUIT/TERM handlers, fix daemon EOF Eric Wong
2021-01-21 19:46 ` [PATCH 09/12] lei_xsearch: reduce reference paths to lxs Eric Wong
2021-01-21 19:46 ` [PATCH 10/12] lei: remove @TO_CLOSE_ATFORK_CHILD Eric Wong
2021-01-21 19:46 ` [PATCH 11/12] lei: forget-external support with canonicalization Eric Wong
2021-01-21 19:46 ` [PATCH 12/12] lei forget-external: bash completion support Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210121194624.32002-5-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank
  line before the message body.

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).