From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 3/5] lei: maildir: move shard support to MdirReader
Date: Mon, 5 Apr 2021 10:27:50 +0000 [thread overview]
Message-ID: <20210405102752.6249-4-e@80x24.org> (raw)
In-Reply-To: <20210405102752.6249-1-e@80x24.org>
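Shard selection now lives in MdirReader (driven by its {shard_info}
field) instead of in the LeiToMail callback, so every caller of
maildir_each_file and maildir_each_eml can use it.

We'll eventually want lei_input users like "lei import" and
"lei tag" to support parallel reads.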
---
lib/PublicInbox/InboxWritable.pm | 4 ++--
lib/PublicInbox/LeiInput.pm | 2 +-
lib/PublicInbox/LeiToMail.pm | 29 +++++++++--------------------
lib/PublicInbox/MdirReader.pm | 25 +++++++++++++++++++++----
t/lei-convert.t | 2 +-
t/lei_to_mail.t | 8 ++++----
6 files changed, 38 insertions(+), 32 deletions(-)
diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
index eeebc485..45d8cdc7 100644
--- a/lib/PublicInbox/InboxWritable.pm
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -154,8 +154,8 @@ sub import_maildir {
my $im = $self->importer(1);
my @self = $self->filter($im) ? ($self) : ();
require PublicInbox::MdirReader;
- PublicInbox::MdirReader::maildir_each_file(\&_each_maildir_fn,
- $im, @self);
+ PublicInbox::MdirReader->new->maildir_each_file(\&_each_maildir_fn,
+ $im, @self);
$im->done;
}
diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm
index 40d71f9e..e416d3ed 100644
--- a/lib/PublicInbox/LeiInput.pm
+++ b/lib/PublicInbox/LeiInput.pm
@@ -88,7 +88,7 @@ sub input_path_url {
return $lei->fail(<<EOM) if $ifmt && $ifmt ne 'maildir';
$input appears to a be a maildir, not $ifmt
EOM
- PublicInbox::MdirReader::maildir_each_eml($input,
+ PublicInbox::MdirReader->new->maildir_each_eml($input,
$self->can('input_maildir_cb'),
$self, @args);
} else {
diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm
index 76a11b0e..2e736070 100644
--- a/lib/PublicInbox/LeiToMail.pm
+++ b/lib/PublicInbox/LeiToMail.pm
@@ -14,7 +14,6 @@ use PublicInbox::PktOp qw(pkt_do);
use Symbol qw(gensym);
use IO::Handle; # ->autoflush
use Fcntl qw(SEEK_SET SEEK_END O_CREAT O_EXCL O_WRONLY);
-use Digest::SHA qw(sha256_hex);
my %kw2char = ( # Maildir characters
draft => 'D',
@@ -234,17 +233,9 @@ sub update_kw_maybe ($$$$) {
}
}
-sub _augment_or_unlink { # maildir_each_eml cb
- my ($f, $kw, $eml, $lei, $lse, $mod, $shard, $unlink) = @_;
- if ($mod) {
- # can't get dirent.d_ino w/ pure Perl readdir, so we extract
- # the OID if it looks like one instead of doing stat(2)
- my $hex = $f =~ m!\b([a-f0-9]{40,})[^/]*\z! ?
- $1 : sha256_hex($f);
- my $recno = hex(substr($hex, 0, 8));
- return if ($recno % $mod) != $shard;
- update_kw_maybe($lei, $lse, $eml, $kw);
- }
+sub _md_update { # maildir_each_eml cb
+ my ($f, $kw, $eml, $lei, $lse, $unlink) = @_;
+ update_kw_maybe($lei, $lse, $eml, $kw);
$unlink ? unlink($f) : _augment($eml, $lei);
}
@@ -392,21 +383,19 @@ sub _do_augment_maildir {
my ($self, $lei) = @_;
my $dst = $lei->{ovv}->{dst};
my $lse = $lei->{opt}->{'import-before'} ? $lei->{lse} : undef;
- my ($mod, $shard) = @{$self->{shard_info} // []};
+ my $mdr = PublicInbox::MdirReader->new;
if ($lei->{opt}->{augment}) {
my $dedupe = $lei->{dedupe};
if ($dedupe && $dedupe->prepare_dedupe) {
- PublicInbox::MdirReader::maildir_each_eml($dst,
- \&_augment_or_unlink,
- $lei, $lse, $mod, $shard);
+ $mdr->{shard_info} = $self->{shard_info};
+ $mdr->maildir_each_eml($dst, \&_md_update, $lei, $lse);
$dedupe->pause_dedupe;
}
} elsif ($lse) {
- PublicInbox::MdirReader::maildir_each_eml($dst,
- \&_augment_or_unlink,
- $lei, $lse, $mod, $shard, 1);
+ $mdr->{shard_info} = $self->{shard_info};
+ $mdr->maildir_each_eml($dst, \&_md_update, $lei, $lse, 1);
} else {# clobber existing Maildir
- PublicInbox::MdirReader::maildir_each_file($dst, \&_unlink);
+ $mdr->maildir_each_file($dst, \&_unlink);
}
}
diff --git a/lib/PublicInbox/MdirReader.pm b/lib/PublicInbox/MdirReader.pm
index 1685e4d8..b49c8ceb 100644
--- a/lib/PublicInbox/MdirReader.pm
+++ b/lib/PublicInbox/MdirReader.pm
@@ -8,6 +8,7 @@ package PublicInbox::MdirReader;
use strict;
use v5.10.1;
use PublicInbox::InboxWritable qw(eml_from_path);
+use Digest::SHA qw(sha256_hex);
# returns Maildir flags from a basename ('' for no flags, undef for invalid)
sub maildir_basename_flags {
@@ -24,14 +25,25 @@ sub maildir_path_flags {
$i >= 0 ? maildir_basename_flags(substr($f, $i + 1)) : undef;
}
-sub maildir_each_file ($$;@) {
- my ($dir, $cb, @arg) = @_;
+sub shard_ok ($$$) {
+ my ($bn, $mod, $shard) = @_;
+ # can't get dirent.d_ino w/ pure Perl readdir, so we extract
+ # the OID if it looks like one instead of doing stat(2)
+ my $hex = $bn =~ m!\A([a-f0-9]{40,})! ? $1 : sha256_hex($bn);
+ my $recno = hex(substr($hex, 0, 8));
+ ($recno % $mod) == $shard;
+}
+
+sub maildir_each_file {
+ my ($self, $dir, $cb, @arg) = @_;
$dir .= '/' unless substr($dir, -1) eq '/';
+ my ($mod, $shard) = @{$self->{shard_info} // []};
for my $d (qw(new/ cur/)) {
my $pfx = $dir.$d;
opendir my $dh, $pfx or next;
while (defined(my $bn = readdir($dh))) {
maildir_basename_flags($bn) // next;
+ next if defined($mod) && !shard_ok($bn, $mod, $shard);
$cb->($pfx.$bn, @arg);
}
}
@@ -40,15 +52,17 @@ sub maildir_each_file ($$;@) {
my %c2kw = ('D' => 'draft', F => 'flagged', P => 'forwarded',
R => 'answered', S => 'seen');
-sub maildir_each_eml ($$;@) {
- my ($dir, $cb, @arg) = @_;
+sub maildir_each_eml {
+ my ($self, $dir, $cb, @arg) = @_;
$dir .= '/' unless substr($dir, -1) eq '/';
+ my ($mod, $shard) = @{$self->{shard_info} // []};
my $pfx = $dir . 'new/';
if (opendir(my $dh, $pfx)) {
while (defined(my $bn = readdir($dh))) {
next if substr($bn, 0, 1) eq '.';
my @f = split(/:/, $bn, -1);
next if scalar(@f) != 1;
+ next if defined($mod) && !shard_ok($bn, $mod, $shard);
my $f = $pfx.$bn;
my $eml = eml_from_path($f) or next;
$cb->($f, [], $eml, @arg);
@@ -59,6 +73,7 @@ sub maildir_each_eml ($$;@) {
while (defined(my $bn = readdir($dh))) {
my $fl = maildir_basename_flags($bn) // next;
next if index($fl, 'T') >= 0;
+ next if defined($mod) && !shard_ok($bn, $mod, $shard);
my $f = $pfx.$bn;
my $eml = eml_from_path($f) or next;
my @kw = sort(map { $c2kw{$_} // () } split(//, $fl));
@@ -66,4 +81,6 @@ sub maildir_each_eml ($$;@) {
}
}
+sub new { bless {}, __PACKAGE__ }
+
1;
diff --git a/t/lei-convert.t b/t/lei-convert.t
index dc53b82c..0ea860c8 100644
--- a/t/lei-convert.t
+++ b/t/lei-convert.t
@@ -57,7 +57,7 @@ test_lei({ tmpdir => $tmpdir }, sub {
lei_ok('convert', '-o', "$d/md", "mboxrd:$d/foo.mboxrd");
ok(-d "$d/md", 'Maildir created');
my @md;
- PublicInbox::MdirReader::maildir_each_eml("$d/md", sub {
+ PublicInbox::MdirReader->new->maildir_each_eml("$d/md", sub {
push @md, $_[2];
});
is(scalar(@md), scalar(@mboxrd), 'got expected emails in Maildir') or
diff --git a/t/lei_to_mail.t b/t/lei_to_mail.t
index 75314add..51357257 100644
--- a/t/lei_to_mail.t
+++ b/t/lei_to_mail.t
@@ -253,7 +253,7 @@ SKIP: { # FIFO support
}
{ # Maildir support
- my $each_file = PublicInbox::MdirReader->can('maildir_each_file');
+ my $mdr = PublicInbox::MdirReader->new;
my $md = "$tmpdir/maildir/";
my $wcb = $wcb_get->('maildir', $md);
is(ref($wcb), 'CODE', 'got Maildir callback');
@@ -261,7 +261,7 @@ SKIP: { # FIFO support
$wcb->(\(my $x = $buf), $b4dc0ffee);
my @f;
- $each_file->($md, sub { push @f, shift });
+ $mdr->maildir_each_file($md, sub { push @f, shift });
open my $fh, $f[0] or BAIL_OUT $!;
is(do { local $/; <$fh> }, $buf, 'wrote to Maildir');
@@ -270,7 +270,7 @@ SKIP: { # FIFO support
$wcb->(\($x = $buf."\nx\n"), $deadcafe);
my @x = ();
- $each_file->($md, sub { push @x, shift });
+ $mdr->maildir_each_file($md, sub { push @x, shift });
is(scalar(@x), 1, 'wrote one new file');
ok(!-f $f[0], 'old file clobbered');
open $fh, $x[0] or BAIL_OUT $!;
@@ -281,7 +281,7 @@ SKIP: { # FIFO support
$wcb->(\($x = $buf."\ny\n"), $deadcafe);
$wcb->(\($x = $buf."\ny\n"), $b4dc0ffee); # skipped by dedupe
@f = ();
- $each_file->($md, sub { push @f, shift });
+ $mdr->maildir_each_file($md, sub { push @f, shift });
is(scalar grep(/\A\Q$x[0]\E\z/, @f), 1, 'old file still there');
my @new = grep(!/\A\Q$x[0]\E\z/, @f);
is(scalar @new, 1, '1 new file written (b4dc0ffee skipped)');
next prev parent reply other threads:[~2021-04-05 10:27 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-04-05 10:27 [PATCH 0/5] lei_to_mail fixes Eric Wong
2021-04-05 10:27 ` [PATCH 1/5] lei_to_mail: trim down imports Eric Wong
2021-04-05 10:27 ` [PATCH 2/5] lei_tag: fix comments w.r.t support levels Eric Wong
2021-04-05 10:27 ` Eric Wong [this message]
2021-04-05 10:27 ` [PATCH 4/5] lei_to_mail: improve comments and reduce LoC Eric Wong
2021-04-05 10:27 ` [PATCH 5/5] lei q: fix auth IMAP --output with remote mboxrd Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210405102752.6249-4-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).