user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 3/5] lei: maildir: move shard support to MdirReader
Date: Mon,  5 Apr 2021 10:27:50 +0000	[thread overview]
Message-ID: <20210405102752.6249-4-e@80x24.org> (raw)
In-Reply-To: <20210405102752.6249-1-e@80x24.org>

We'll eventually want lei_input users like "lei import" and
"lei tag" to support parallel reads.
---
 lib/PublicInbox/InboxWritable.pm |  4 ++--
 lib/PublicInbox/LeiInput.pm      |  2 +-
 lib/PublicInbox/LeiToMail.pm     | 29 +++++++++--------------------
 lib/PublicInbox/MdirReader.pm    | 25 +++++++++++++++++++++----
 t/lei-convert.t                  |  2 +-
 t/lei_to_mail.t                  |  8 ++++----
 6 files changed, 38 insertions(+), 32 deletions(-)

diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
index eeebc485..45d8cdc7 100644
--- a/lib/PublicInbox/InboxWritable.pm
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -154,8 +154,8 @@ sub import_maildir {
 	my $im = $self->importer(1);
 	my @self = $self->filter($im) ? ($self) : ();
 	require PublicInbox::MdirReader;
-	PublicInbox::MdirReader::maildir_each_file(\&_each_maildir_fn,
-						$im, @self);
+	PublicInbox::MdirReader->new->maildir_each_file(\&_each_maildir_fn,
+							$im, @self);
 	$im->done;
 }
 
diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm
index 40d71f9e..e416d3ed 100644
--- a/lib/PublicInbox/LeiInput.pm
+++ b/lib/PublicInbox/LeiInput.pm
@@ -88,7 +88,7 @@ sub input_path_url {
 		return $lei->fail(<<EOM) if $ifmt && $ifmt ne 'maildir';
 $input appears to a be a maildir, not $ifmt
 EOM
-		PublicInbox::MdirReader::maildir_each_eml($input,
+		PublicInbox::MdirReader->new->maildir_each_eml($input,
 					$self->can('input_maildir_cb'),
 					$self, @args);
 	} else {
diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm
index 76a11b0e..2e736070 100644
--- a/lib/PublicInbox/LeiToMail.pm
+++ b/lib/PublicInbox/LeiToMail.pm
@@ -14,7 +14,6 @@ use PublicInbox::PktOp qw(pkt_do);
 use Symbol qw(gensym);
 use IO::Handle; # ->autoflush
 use Fcntl qw(SEEK_SET SEEK_END O_CREAT O_EXCL O_WRONLY);
-use Digest::SHA qw(sha256_hex);
 
 my %kw2char = ( # Maildir characters
 	draft => 'D',
@@ -234,17 +233,9 @@ sub update_kw_maybe ($$$$) {
 	}
 }
 
-sub _augment_or_unlink { # maildir_each_eml cb
-	my ($f, $kw, $eml, $lei, $lse, $mod, $shard, $unlink) = @_;
-	if ($mod) {
-		# can't get dirent.d_ino w/ pure Perl readdir, so we extract
-		# the OID if it looks like one instead of doing stat(2)
-		my $hex = $f =~ m!\b([a-f0-9]{40,})[^/]*\z! ?
-				$1 : sha256_hex($f);
-		my $recno = hex(substr($hex, 0, 8));
-		return if ($recno % $mod) != $shard;
-		update_kw_maybe($lei, $lse, $eml, $kw);
-	}
+sub _md_update { # maildir_each_eml cb
+	my ($f, $kw, $eml, $lei, $lse, $unlink) = @_;
+	update_kw_maybe($lei, $lse, $eml, $kw);
 	$unlink ? unlink($f) : _augment($eml, $lei);
 }
 
@@ -392,21 +383,19 @@ sub _do_augment_maildir {
 	my ($self, $lei) = @_;
 	my $dst = $lei->{ovv}->{dst};
 	my $lse = $lei->{opt}->{'import-before'} ? $lei->{lse} : undef;
-	my ($mod, $shard) = @{$self->{shard_info} // []};
+	my $mdr = PublicInbox::MdirReader->new;
 	if ($lei->{opt}->{augment}) {
 		my $dedupe = $lei->{dedupe};
 		if ($dedupe && $dedupe->prepare_dedupe) {
-			PublicInbox::MdirReader::maildir_each_eml($dst,
-						\&_augment_or_unlink,
-						$lei, $lse, $mod, $shard);
+			$mdr->{shard_info} = $self->{shard_info};
+			$mdr->maildir_each_eml($dst, \&_md_update, $lei, $lse);
 			$dedupe->pause_dedupe;
 		}
 	} elsif ($lse) {
-		PublicInbox::MdirReader::maildir_each_eml($dst,
-					\&_augment_or_unlink,
-					$lei, $lse, $mod, $shard, 1);
+		$mdr->{shard_info} = $self->{shard_info};
+		$mdr->maildir_each_eml($dst, \&_md_update, $lei, $lse, 1);
 	} else {# clobber existing Maildir
-		PublicInbox::MdirReader::maildir_each_file($dst, \&_unlink);
+		$mdr->maildir_each_file($dst, \&_unlink);
 	}
 }
 
diff --git a/lib/PublicInbox/MdirReader.pm b/lib/PublicInbox/MdirReader.pm
index 1685e4d8..b49c8ceb 100644
--- a/lib/PublicInbox/MdirReader.pm
+++ b/lib/PublicInbox/MdirReader.pm
@@ -8,6 +8,7 @@ package PublicInbox::MdirReader;
 use strict;
 use v5.10.1;
 use PublicInbox::InboxWritable qw(eml_from_path);
+use Digest::SHA qw(sha256_hex);
 
 # returns Maildir flags from a basename ('' for no flags, undef for invalid)
 sub maildir_basename_flags {
@@ -24,14 +25,25 @@ sub maildir_path_flags {
 	$i >= 0 ? maildir_basename_flags(substr($f, $i + 1)) : undef;
 }
 
-sub maildir_each_file ($$;@) {
-	my ($dir, $cb, @arg) = @_;
+sub shard_ok ($$$) {
+	my ($bn, $mod, $shard) = @_;
+	# can't get dirent.d_ino w/ pure Perl readdir, so we extract
+	# the OID if it looks like one instead of doing stat(2)
+	my $hex = $bn =~ m!\A([a-f0-9]{40,})! ? $1 : sha256_hex($bn);
+	my $recno = hex(substr($hex, 0, 8));
+	($recno % $mod) == $shard;
+}
+
+sub maildir_each_file {
+	my ($self, $dir, $cb, @arg) = @_;
 	$dir .= '/' unless substr($dir, -1) eq '/';
+	my ($mod, $shard) = @{$self->{shard_info} // []};
 	for my $d (qw(new/ cur/)) {
 		my $pfx = $dir.$d;
 		opendir my $dh, $pfx or next;
 		while (defined(my $bn = readdir($dh))) {
 			maildir_basename_flags($bn) // next;
+			next if defined($mod) && !shard_ok($bn, $mod, $shard);
 			$cb->($pfx.$bn, @arg);
 		}
 	}
@@ -40,15 +52,17 @@ sub maildir_each_file ($$;@) {
 my %c2kw = ('D' => 'draft', F => 'flagged', P => 'forwarded',
 	R => 'answered', S => 'seen');
 
-sub maildir_each_eml ($$;@) {
-	my ($dir, $cb, @arg) = @_;
+sub maildir_each_eml {
+	my ($self, $dir, $cb, @arg) = @_;
 	$dir .= '/' unless substr($dir, -1) eq '/';
+	my ($mod, $shard) = @{$self->{shard_info} // []};
 	my $pfx = $dir . 'new/';
 	if (opendir(my $dh, $pfx)) {
 		while (defined(my $bn = readdir($dh))) {
 			next if substr($bn, 0, 1) eq '.';
 			my @f = split(/:/, $bn, -1);
 			next if scalar(@f) != 1;
+			next if defined($mod) && !shard_ok($bn, $mod, $shard);
 			my $f = $pfx.$bn;
 			my $eml = eml_from_path($f) or next;
 			$cb->($f, [], $eml, @arg);
@@ -59,6 +73,7 @@ sub maildir_each_eml ($$;@) {
 	while (defined(my $bn = readdir($dh))) {
 		my $fl = maildir_basename_flags($bn) // next;
 		next if index($fl, 'T') >= 0;
+		next if defined($mod) && !shard_ok($bn, $mod, $shard);
 		my $f = $pfx.$bn;
 		my $eml = eml_from_path($f) or next;
 		my @kw = sort(map { $c2kw{$_} // () } split(//, $fl));
@@ -66,4 +81,6 @@ sub maildir_each_eml ($$;@) {
 	}
 }
 
+sub new { bless {}, __PACKAGE__ }
+
 1;
diff --git a/t/lei-convert.t b/t/lei-convert.t
index dc53b82c..0ea860c8 100644
--- a/t/lei-convert.t
+++ b/t/lei-convert.t
@@ -57,7 +57,7 @@ test_lei({ tmpdir => $tmpdir }, sub {
 	lei_ok('convert', '-o', "$d/md", "mboxrd:$d/foo.mboxrd");
 	ok(-d "$d/md", 'Maildir created');
 	my @md;
-	PublicInbox::MdirReader::maildir_each_eml("$d/md", sub {
+	PublicInbox::MdirReader->new->maildir_each_eml("$d/md", sub {
 		push @md, $_[2];
 	});
 	is(scalar(@md), scalar(@mboxrd), 'got expected emails in Maildir') or
diff --git a/t/lei_to_mail.t b/t/lei_to_mail.t
index 75314add..51357257 100644
--- a/t/lei_to_mail.t
+++ b/t/lei_to_mail.t
@@ -253,7 +253,7 @@ SKIP: { # FIFO support
 }
 
 { # Maildir support
-	my $each_file = PublicInbox::MdirReader->can('maildir_each_file');
+	my $mdr = PublicInbox::MdirReader->new;
 	my $md = "$tmpdir/maildir/";
 	my $wcb = $wcb_get->('maildir', $md);
 	is(ref($wcb), 'CODE', 'got Maildir callback');
@@ -261,7 +261,7 @@ SKIP: { # FIFO support
 	$wcb->(\(my $x = $buf), $b4dc0ffee);
 
 	my @f;
-	$each_file->($md, sub { push @f, shift });
+	$mdr->maildir_each_file($md, sub { push @f, shift });
 	open my $fh, $f[0] or BAIL_OUT $!;
 	is(do { local $/; <$fh> }, $buf, 'wrote to Maildir');
 
@@ -270,7 +270,7 @@ SKIP: { # FIFO support
 	$wcb->(\($x = $buf."\nx\n"), $deadcafe);
 
 	my @x = ();
-	$each_file->($md, sub { push @x, shift });
+	$mdr->maildir_each_file($md, sub { push @x, shift });
 	is(scalar(@x), 1, 'wrote one new file');
 	ok(!-f $f[0], 'old file clobbered');
 	open $fh, $x[0] or BAIL_OUT $!;
@@ -281,7 +281,7 @@ SKIP: { # FIFO support
 	$wcb->(\($x = $buf."\ny\n"), $deadcafe);
 	$wcb->(\($x = $buf."\ny\n"), $b4dc0ffee); # skipped by dedupe
 	@f = ();
-	$each_file->($md, sub { push @f, shift });
+	$mdr->maildir_each_file($md, sub { push @f, shift });
 	is(scalar grep(/\A\Q$x[0]\E\z/, @f), 1, 'old file still there');
 	my @new = grep(!/\A\Q$x[0]\E\z/, @f);
 	is(scalar @new, 1, '1 new file written (b4dc0ffee skipped)');

  parent reply	other threads:[~2021-04-05 10:27 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-05 10:27 [PATCH 0/5] lei_to_mail fixes Eric Wong
2021-04-05 10:27 ` [PATCH 1/5] lei_to_mail: trim down imports Eric Wong
2021-04-05 10:27 ` [PATCH 2/5] lei_tag: fix comments w.r.t support levels Eric Wong
2021-04-05 10:27 ` Eric Wong [this message]
2021-04-05 10:27 ` [PATCH 4/5] lei_to_mail: improve comments and reduce LoC Eric Wong
2021-04-05 10:27 ` [PATCH 5/5] lei q: fix auth IMAP --output with remote mboxrd Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210405102752.6249-4-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).