about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2021-04-05 10:27:50 +0000
committerEric Wong <e@80x24.org>2021-04-05 19:10:14 +0000
commitca7b99270e0574de573d29d6ba5b8a3bbdff82c5 (patch)
treea811445aaea9a17cd75d86cb5660555ad25441da /lib
parentcfd9e2ed62acdca0168fc3561a96f24568aff0fe (diff)
downloadpublic-inbox-ca7b99270e0574de573d29d6ba5b8a3bbdff82c5.tar.gz
We'll eventually want lei_input users like "lei import" and
"lei tag" to support parallel reads.
Diffstat (limited to 'lib')
-rw-r--r--lib/PublicInbox/InboxWritable.pm4
-rw-r--r--lib/PublicInbox/LeiInput.pm2
-rw-r--r--lib/PublicInbox/LeiToMail.pm29
-rw-r--r--lib/PublicInbox/MdirReader.pm25
4 files changed, 33 insertions, 27 deletions
diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
index eeebc485..45d8cdc7 100644
--- a/lib/PublicInbox/InboxWritable.pm
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -154,8 +154,8 @@ sub import_maildir {
         my $im = $self->importer(1);
         my @self = $self->filter($im) ? ($self) : ();
         require PublicInbox::MdirReader;
-        PublicInbox::MdirReader::maildir_each_file(\&_each_maildir_fn,
-                                                $im, @self);
+        PublicInbox::MdirReader->new->maildir_each_file(\&_each_maildir_fn,
+                                                        $im, @self);
         $im->done;
 }
 
diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm
index 40d71f9e..e416d3ed 100644
--- a/lib/PublicInbox/LeiInput.pm
+++ b/lib/PublicInbox/LeiInput.pm
@@ -88,7 +88,7 @@ sub input_path_url {
                 return $lei->fail(<<EOM) if $ifmt && $ifmt ne 'maildir';
 $input appears to a be a maildir, not $ifmt
 EOM
-                PublicInbox::MdirReader::maildir_each_eml($input,
+                PublicInbox::MdirReader->new->maildir_each_eml($input,
                                         $self->can('input_maildir_cb'),
                                         $self, @args);
         } else {
diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm
index 76a11b0e..2e736070 100644
--- a/lib/PublicInbox/LeiToMail.pm
+++ b/lib/PublicInbox/LeiToMail.pm
@@ -14,7 +14,6 @@ use PublicInbox::PktOp qw(pkt_do);
 use Symbol qw(gensym);
 use IO::Handle; # ->autoflush
 use Fcntl qw(SEEK_SET SEEK_END O_CREAT O_EXCL O_WRONLY);
-use Digest::SHA qw(sha256_hex);
 
 my %kw2char = ( # Maildir characters
         draft => 'D',
@@ -234,17 +233,9 @@ sub update_kw_maybe ($$$$) {
         }
 }
 
-sub _augment_or_unlink { # maildir_each_eml cb
-        my ($f, $kw, $eml, $lei, $lse, $mod, $shard, $unlink) = @_;
-        if ($mod) {
-                # can't get dirent.d_ino w/ pure Perl readdir, so we extract
-                # the OID if it looks like one instead of doing stat(2)
-                my $hex = $f =~ m!\b([a-f0-9]{40,})[^/]*\z! ?
-                                $1 : sha256_hex($f);
-                my $recno = hex(substr($hex, 0, 8));
-                return if ($recno % $mod) != $shard;
-                update_kw_maybe($lei, $lse, $eml, $kw);
-        }
+sub _md_update { # maildir_each_eml cb
+        my ($f, $kw, $eml, $lei, $lse, $unlink) = @_;
+        update_kw_maybe($lei, $lse, $eml, $kw);
         $unlink ? unlink($f) : _augment($eml, $lei);
 }
 
@@ -392,21 +383,19 @@ sub _do_augment_maildir {
         my ($self, $lei) = @_;
         my $dst = $lei->{ovv}->{dst};
         my $lse = $lei->{opt}->{'import-before'} ? $lei->{lse} : undef;
-        my ($mod, $shard) = @{$self->{shard_info} // []};
+        my $mdr = PublicInbox::MdirReader->new;
         if ($lei->{opt}->{augment}) {
                 my $dedupe = $lei->{dedupe};
                 if ($dedupe && $dedupe->prepare_dedupe) {
-                        PublicInbox::MdirReader::maildir_each_eml($dst,
-                                                \&_augment_or_unlink,
-                                                $lei, $lse, $mod, $shard);
+                        $mdr->{shard_info} = $self->{shard_info};
+                        $mdr->maildir_each_eml($dst, \&_md_update, $lei, $lse);
                         $dedupe->pause_dedupe;
                 }
         } elsif ($lse) {
-                PublicInbox::MdirReader::maildir_each_eml($dst,
-                                        \&_augment_or_unlink,
-                                        $lei, $lse, $mod, $shard, 1);
+                $mdr->{shard_info} = $self->{shard_info};
+                $mdr->maildir_each_eml($dst, \&_md_update, $lei, $lse, 1);
         } else {# clobber existing Maildir
-                PublicInbox::MdirReader::maildir_each_file($dst, \&_unlink);
+                $mdr->maildir_each_file($dst, \&_unlink);
         }
 }
 
diff --git a/lib/PublicInbox/MdirReader.pm b/lib/PublicInbox/MdirReader.pm
index 1685e4d8..b49c8ceb 100644
--- a/lib/PublicInbox/MdirReader.pm
+++ b/lib/PublicInbox/MdirReader.pm
@@ -8,6 +8,7 @@ package PublicInbox::MdirReader;
 use strict;
 use v5.10.1;
 use PublicInbox::InboxWritable qw(eml_from_path);
+use Digest::SHA qw(sha256_hex);
 
 # returns Maildir flags from a basename ('' for no flags, undef for invalid)
 sub maildir_basename_flags {
@@ -24,14 +25,25 @@ sub maildir_path_flags {
         $i >= 0 ? maildir_basename_flags(substr($f, $i + 1)) : undef;
 }
 
-sub maildir_each_file ($$;@) {
-        my ($dir, $cb, @arg) = @_;
+sub shard_ok ($$$) {
+        my ($bn, $mod, $shard) = @_;
+        # can't get dirent.d_ino w/ pure Perl readdir, so we extract
+        # the OID if it looks like one instead of doing stat(2)
+        my $hex = $bn =~ m!\A([a-f0-9]{40,})! ? $1 : sha256_hex($bn);
+        my $recno = hex(substr($hex, 0, 8));
+        ($recno % $mod) == $shard;
+}
+
+sub maildir_each_file {
+        my ($self, $dir, $cb, @arg) = @_;
         $dir .= '/' unless substr($dir, -1) eq '/';
+        my ($mod, $shard) = @{$self->{shard_info} // []};
         for my $d (qw(new/ cur/)) {
                 my $pfx = $dir.$d;
                 opendir my $dh, $pfx or next;
                 while (defined(my $bn = readdir($dh))) {
                         maildir_basename_flags($bn) // next;
+                        next if defined($mod) && !shard_ok($bn, $mod, $shard);
                         $cb->($pfx.$bn, @arg);
                 }
         }
@@ -40,15 +52,17 @@ sub maildir_each_file ($$;@) {
 my %c2kw = ('D' => 'draft', F => 'flagged', P => 'forwarded',
         R => 'answered', S => 'seen');
 
-sub maildir_each_eml ($$;@) {
-        my ($dir, $cb, @arg) = @_;
+sub maildir_each_eml {
+        my ($self, $dir, $cb, @arg) = @_;
         $dir .= '/' unless substr($dir, -1) eq '/';
+        my ($mod, $shard) = @{$self->{shard_info} // []};
         my $pfx = $dir . 'new/';
         if (opendir(my $dh, $pfx)) {
                 while (defined(my $bn = readdir($dh))) {
                         next if substr($bn, 0, 1) eq '.';
                         my @f = split(/:/, $bn, -1);
                         next if scalar(@f) != 1;
+                        next if defined($mod) && !shard_ok($bn, $mod, $shard);
                         my $f = $pfx.$bn;
                         my $eml = eml_from_path($f) or next;
                         $cb->($f, [], $eml, @arg);
@@ -59,6 +73,7 @@ sub maildir_each_eml ($$;@) {
         while (defined(my $bn = readdir($dh))) {
                 my $fl = maildir_basename_flags($bn) // next;
                 next if index($fl, 'T') >= 0;
+                next if defined($mod) && !shard_ok($bn, $mod, $shard);
                 my $f = $pfx.$bn;
                 my $eml = eml_from_path($f) or next;
                 my @kw = sort(map { $c2kw{$_} // () } split(//, $fl));
@@ -66,4 +81,6 @@ sub maildir_each_eml ($$;@) {
         }
 }
 
+sub new { bless {}, __PACKAGE__ }
+
 1;