user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH] lei pmdir: fix nproc for <= 4 CPUs
@ 2021-06-08 23:56  7% Eric Wong
  0 siblings, 0 replies; 3+ results
From: Eric Wong @ 2021-06-08 23:56 UTC (permalink / raw)
  To: meta

I forgot that my FreeBSD VM actually has 8 cores, and I tweaked
the nproc detection on that machine before finalizing commit
10b523eb017162240b1ac3647f8dcbbf2be348a7
("lei import: speed up repeated Maildir imports").

Fixes: 10b523eb01716224 ("lei import: speed up repeated Maildir imports")
---
 lib/PublicInbox/LeiPmdir.pm | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/PublicInbox/LeiPmdir.pm b/lib/PublicInbox/LeiPmdir.pm
index 5efb012e..b71efe70 100644
--- a/lib/PublicInbox/LeiPmdir.pm
+++ b/lib/PublicInbox/LeiPmdir.pm
@@ -13,14 +13,14 @@ use parent qw(PublicInbox::IPC);
 sub new {
 	my ($cls, $lei, $ipt) = @_;
 	my $self = bless { -wq_ident => 'lei Maildir worker' }, $cls;
-	my $jobs = $lei->{opt}->{jobs};
+	my $jobs = $lei->{opt}->{jobs} // '';
 	$jobs =~ /\A[0-9]+,([0-9]+)\z/ and $jobs = $1;
-	my $nproc = $jobs // do {
-		# untested with >=4 CPUs, though I suspect I/O latency
+	my $nproc = $jobs || do {
+		# barely tested with >=4 CPUs, though I suspect I/O latency
 		# of SATA SSD storage will make >=4 processes unnecessary,
 		# here.  NVMe users may wish to use '-j'
 		my $n = $self->detect_nproc;
-		$n = 4 if $n > 4;
+		$n = $n > 4 ? 4 : $n;
 	};
 	my ($op_c, $ops) = $lei->workers_start($self, $nproc,
 		undef, { ipt => $ipt }); # LeiInput subclass

^ permalink raw reply related	[relevance 7%]

* [PATCH 3/3] lei import: speed up repeated Maildir imports
  2021-06-08  9:50  6% [PATCH 0/3] lei import: speedup repeated Maildir import Eric Wong
@ 2021-06-08  9:50  3% ` Eric Wong
  0 siblings, 0 replies; 3+ results
From: Eric Wong @ 2021-06-08  9:50 UTC (permalink / raw)
  To: meta

On a 4-core CPU, this speeds up "lei import" on a largish
Maildir inbox with 75K messages from ~8 minutes down to ~40s.

Parallelizing alone did not bring any improvement and may
even hurt performance slightly, depending on CPU availability.
However, creating the index on the "fid" and "name" columns in
blob2name is what yields the speedup.

Parallelizing IMAP makes more sense due to the fact most IMAP
stores are non-local and subject to network latency.

Followup-to: bdecd7ed8e0dcf0b45491b947cd737ba8cfe38a3 ("lei import: speed up kw updates for old IMAP messages")
---
 MANIFEST                       |  1 +
 lib/PublicInbox/LEI.pm         | 11 +++---
 lib/PublicInbox/LeiImport.pm   | 36 ++++++++++++------
 lib/PublicInbox/LeiIndex.pm    |  2 +-
 lib/PublicInbox/LeiInput.pm    | 31 +++++++++++-----
 lib/PublicInbox/LeiMailSync.pm | 14 +++++++
 lib/PublicInbox/LeiPmdir.pm    | 67 ++++++++++++++++++++++++++++++++++
 lib/PublicInbox/MdirReader.pm  | 22 ++++++-----
 t/lei-import-maildir.t         |  2 +-
 9 files changed, 148 insertions(+), 38 deletions(-)
 create mode 100644 lib/PublicInbox/LeiPmdir.pm

diff --git a/MANIFEST b/MANIFEST
index 5a70a144..7bdbf252 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -221,6 +221,7 @@ lib/PublicInbox/LeiMailSync.pm
 lib/PublicInbox/LeiMirror.pm
 lib/PublicInbox/LeiOverview.pm
 lib/PublicInbox/LeiP2q.pm
+lib/PublicInbox/LeiPmdir.pm
 lib/PublicInbox/LeiQuery.pm
 lib/PublicInbox/LeiRediff.pm
 lib/PublicInbox/LeiRemote.pm
diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm
index ed01e8de..77fc5b8f 100644
--- a/lib/PublicInbox/LEI.pm
+++ b/lib/PublicInbox/LEI.pm
@@ -240,7 +240,7 @@ our %CMD = ( # sorted in order of importance/use:
 	 @c_opt ],
 'import' => [ 'LOCATION...|--stdin',
 	'one-time import/update from URL or filesystem',
-	qw(stdin| offset=i recursive|r exclude=s include|I=s
+	qw(stdin| offset=i recursive|r exclude=s include|I=s jobs=s
 	lock=s@ in-format|F=s kw! verbose|v+ incremental! mail-sync!),
 	qw(no-torsocks torsocks=s), PublicInbox::LeiQuery::curl_opt(), @c_opt ],
 'forget-mail-sync' => [ 'LOCATION...',
@@ -421,7 +421,7 @@ my %CONFIG_KEYS = (
 	'leistore.dir' => 'top-level storage location',
 );
 
-my @WQ_KEYS = qw(lxs l2m wq1 ikw); # internal workers
+my @WQ_KEYS = qw(lxs l2m ikw pmd wq1); # internal workers
 
 sub _drop_wq {
 	my ($self) = @_;
@@ -566,7 +566,7 @@ sub pkt_op_pair {
 }
 
 sub workers_start {
-	my ($lei, $wq, $jobs, $ops) = @_;
+	my ($lei, $wq, $jobs, $ops, $flds) = @_;
 	$ops = {
 		'!' => [ \&fail_handler, $lei ],
 		'|' => [ \&sigpipe_handler, $lei ],
@@ -577,7 +577,8 @@ sub workers_start {
 	$ops->{''} //= [ $wq->can('_lei_wq_eof') || \&wq_eof, $lei ];
 	my $end = $lei->pkt_op_pair;
 	my $ident = $wq->{-wq_ident} // "lei-$lei->{cmd} worker";
-	$wq->wq_workers_start($ident, $jobs, $lei->oldset, { lei => $lei });
+	$flds->{lei} = $lei;
+	$wq->wq_workers_start($ident, $jobs, $lei->oldset, $flds);
 	delete $lei->{pkt_op_p};
 	my $op_c = delete $lei->{pkt_op_c};
 	# {-lei_sock} persists script/lei process until ops->{''} EOF callback
@@ -590,7 +591,7 @@ sub workers_start {
 # call this when we're ready to wait on events and yield to other clients
 sub wait_wq_events {
 	my ($lei, $op_c, $ops) = @_;
-	for my $wq (grep(defined, @$lei{qw(ikw)})) { # auxiliary WQs
+	for my $wq (grep(defined, @$lei{qw(ikw pmd)})) { # auxiliary WQs
 		$wq->wq_close(1);
 	}
 	$op_c->{ops} = $ops;
diff --git a/lib/PublicInbox/LeiImport.pm b/lib/PublicInbox/LeiImport.pm
index 222f75c8..b0e7ba6b 100644
--- a/lib/PublicInbox/LeiImport.pm
+++ b/lib/PublicInbox/LeiImport.pm
@@ -6,6 +6,7 @@ package PublicInbox::LeiImport;
 use strict;
 use v5.10.1;
 use parent qw(PublicInbox::IPC PublicInbox::LeiInput);
+use PublicInbox::InboxWritable qw(eml_from_path);
 
 # /^input_/ subs are used by (or override) PublicInbox::LeiInput superclass
 
@@ -28,17 +29,26 @@ sub input_mbox_cb { # MboxReader callback
 	input_eml_cb($self, $eml, $vmd);
 }
 
-sub input_maildir_cb { # maildir_each_eml cb
-	my ($f, $kw, $eml, $self) = @_;
+sub pmdir_cb { # called via wq_io_do from LeiPmdir->each_mdir_fn
+	my ($self, $f, @args) = @_;
+	my ($folder, $bn) = ($f =~ m!\A(.+?)/(?:new|cur)/([^/]+)\z!) or
+		die "BUG: $f was not from a Maildir?\n";
+	my $fl = PublicInbox::MdirReader::maildir_basename_flags($bn);
+	return if index($fl, 'T') >= 0; # no Trashed messages
+	my $kw = PublicInbox::MdirReader::flags2kw($fl);
+	substr($folder, 0, 0) = 'maildir:'; # add prefix
+	my $lms = $self->{-lms_ro};
+	my $oidbin = $lms ? $lms->name_oidbin($folder, $bn) : undef;
+	my @docids = defined($oidbin) ?
+			$self->{over}->oidbin_exists($oidbin) : ();
 	my $vmd = $self->{-import_kw} ? { kw => $kw } : undef;
-	if ($self->{-mail_sync}) {
-		if ($f =~ m!\A(.+?)/(?:new|cur)/([^/]+)\z!) { # ugh...
-			$vmd->{sync_info} = [ "maildir:$1", \(my $n = $2) ];
-		} else {
-			warn "E: $f was not from a Maildir?\n";
-		}
+	if (scalar @docids) {
+		$self->{lse}->kw_changed(undef, $kw, \@docids) or return;
+	}
+	if (my $eml = eml_from_path($f)) {
+		$vmd->{sync_info} = [ $folder, \$bn ] if $self->{-mail_sync};
+		$self->input_eml_cb($eml, $vmd);
 	}
-	$self->input_eml_cb($eml, $vmd);
 }
 
 sub input_net_cb { # imap_each / nntp_each
@@ -62,11 +72,13 @@ sub do_import_index ($$@) {
 	my $vmd_mod = $self->vmd_mod_extract(\@inputs);
 	return $lei->fail(join("\n", @{$vmd_mod->{err}})) if $vmd_mod->{err};
 	$self->{all_vmd} = $vmd_mod if scalar keys %$vmd_mod;
-	$self->prepare_inputs($lei, \@inputs) or return;
+	$lei->ale; # initialize for workers to read (before LeiPmdir->new)
 	$self->{-mail_sync} = $lei->{opt}->{'mail-sync'} // 1;
+	$self->prepare_inputs($lei, \@inputs) or return;
 
-	$lei->ale; # initialize for workers to read
-	my $j = $lei->{opt}->{jobs} // scalar(@{$self->{inputs}}) || 1;
+	my $j = $lei->{opt}->{jobs} // 0;
+	$j =~ /\A([0-9]+),[0-9]+\z/ and $j = $1 + 0;
+	$j ||= scalar(@{$self->{inputs}}) || 1;
 	my $ikw;
 	if (my $net = $lei->{net}) {
 		# $j = $net->net_concurrency($j); TODO
diff --git a/lib/PublicInbox/LeiIndex.pm b/lib/PublicInbox/LeiIndex.pm
index cc3e83e7..4be0c649 100644
--- a/lib/PublicInbox/LeiIndex.pm
+++ b/lib/PublicInbox/LeiIndex.pm
@@ -35,7 +35,7 @@ sub lei_index {
 
 no warnings 'once';
 no strict 'refs';
-for my $m (qw(input_maildir_cb input_net_cb)) {
+for my $m (qw(pmdir_cb input_net_cb)) {
 	*$m = PublicInbox::LeiImport->can($m);
 }
 
diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm
index 4ff7a379..24211bf0 100644
--- a/lib/PublicInbox/LeiInput.pm
+++ b/lib/PublicInbox/LeiInput.pm
@@ -151,9 +151,16 @@ sub input_path_url {
 		return $lei->fail(<<EOM) if $ifmt && $ifmt ne 'maildir';
 $input appears to be a maildir, not $ifmt
 EOM
-		PublicInbox::MdirReader->new->maildir_each_eml($input,
-					$self->can('input_maildir_cb'),
-					$self, @args);
+		my $mdr = PublicInbox::MdirReader->new;
+		if (my $pmd = $self->{pmd}) {
+			$mdr->maildir_each_file($input,
+						$pmd->can('each_mdir_fn'),
+						$pmd, @args);
+		} else {
+			$mdr->maildir_each_eml($input,
+						$self->can('input_maildir_cb'),
+						$self, @args);
+		}
 	} else {
 		$lei->fail("$input unsupported (TODO)");
 	}
@@ -215,7 +222,7 @@ sub prepare_inputs { # returns undef on error
 		push @{$sync->{no}}, '/dev/stdin' if $sync;
 	}
 	my $net = $lei->{net}; # NetWriter may be created by l2m
-	my (@f, @d);
+	my (@f, @md);
 	# e.g. Maildir:/home/user/Mail/ or imaps://example.com/INBOX
 	for my $input (@$inputs) {
 		my $input_path = $input;
@@ -247,11 +254,11 @@ sub prepare_inputs { # returns undef on error
 				PublicInbox::MboxReader->reads($ifmt) or return
 					$lei->fail("$ifmt not supported");
 			} elsif (-d $input_path) {
-				require PublicInbox::MdirReader;
 				$ifmt eq 'maildir' or return
 					$lei->fail("$ifmt not supported");
 				$sync and $input = 'maildir:'.
 						$lei->abs_path($input_path);
+				push @md, $input;
 			} else {
 				return $lei->fail("Unable to handle $input");
 			}
@@ -266,21 +273,18 @@ $input is `eml', not --in-format=$in_fmt
 			if ($devfd >= 0 || -f $input || -p _) {
 				push @{$sync->{no}}, $input if $sync;
 				push @f, $input;
-			} elsif (-d $input) {
+			} elsif (-d "$input/new" && -d "$input/cur") {
 				if ($sync) {
 					$input = $lei->abs_path($input);
 					push @{$sync->{ok}}, $input;
 				}
-				push @d, $input;
+				push @md, $input;
 			} else {
 				return $lei->fail("Unable to handle $input")
 			}
 		}
 	}
 	if (@f) { check_input_format($lei, \@f) or return }
-	if (@d) { # TODO: check for MH vs Maildir, here
-		require PublicInbox::MdirReader;
-	}
 	if ($sync && $sync->{no}) {
 		return $lei->fail(<<"") if !$sync->{ok};
 --mail-sync specified but no inputs support it
@@ -299,6 +303,13 @@ $input is `eml', not --in-format=$in_fmt
 		$lei->{auth} //= PublicInbox::LeiAuth->new;
 		$lei->{net} //= $net;
 	}
+	if (scalar(@md)) {
+		require PublicInbox::MdirReader;
+		if ($self->can('pmdir_cb')) {
+			require PublicInbox::LeiPmdir;
+			$self->{pmd} = PublicInbox::LeiPmdir->new($lei, $self);
+		}
+	}
 	$self->{inputs} = $inputs;
 }
 
diff --git a/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm
index 75603d89..ec05404a 100644
--- a/lib/PublicInbox/LeiMailSync.pm
+++ b/lib/PublicInbox/LeiMailSync.pm
@@ -66,6 +66,10 @@ CREATE TABLE IF NOT EXISTS blob2name (
 	UNIQUE (oidbin, fid, name)
 )
 
+	# speeds up LeiImport->pmdir_cb (for "lei import") by ~6x:
+	$dbh->do(<<'');
+CREATE INDEX IF NOT EXISTS idx_fid_name ON blob2name(fid,name)
+
 }
 
 sub fid_for {
@@ -375,6 +379,16 @@ EOM
 	$sth->fetchrow_array;
 }
 
+sub name_oidbin ($$$) {
+	my ($self, $mdir, $nm) = @_;
+	my $fid = $self->{fmap}->{$mdir} //= fid_for($self, $mdir) // return;
+	my $sth = $self->{dbh}->prepare_cached(<<EOM, undef, 1);
+SELECT oidbin FROM blob2name WHERE fid = ? AND name = ?
+EOM
+	$sth->execute($fid, $nm);
+	$sth->fetchrow_array;
+}
+
 sub imap_oid {
 	my ($self, $lei, $uid_uri) = @_;
 	my $mailbox_uri = $uid_uri->clone;
diff --git a/lib/PublicInbox/LeiPmdir.pm b/lib/PublicInbox/LeiPmdir.pm
new file mode 100644
index 00000000..5efb012e
--- /dev/null
+++ b/lib/PublicInbox/LeiPmdir.pm
@@ -0,0 +1,67 @@
+# Copyright (C) 2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# WQ worker for dealing with parallel Maildir reads;
+# this does NOT use the {shard_info} field of LeiToMail
+# (and we may remove {shard_info})
+# WQ key: {pmd}
+package PublicInbox::LeiPmdir;
+use strict;
+use v5.10.1;
+use parent qw(PublicInbox::IPC);
+
+sub new {
+	my ($cls, $lei, $ipt) = @_;
+	my $self = bless { -wq_ident => 'lei Maildir worker' }, $cls;
+	my $jobs = $lei->{opt}->{jobs};
+	$jobs =~ /\A[0-9]+,([0-9]+)\z/ and $jobs = $1;
+	my $nproc = $jobs // do {
+		# untested with >=4 CPUs, though I suspect I/O latency
+		# of SATA SSD storage will make >=4 processes unnecessary,
+		# here.  NVMe users may wish to use '-j'
+		my $n = $self->detect_nproc;
+		$n = 4 if $n > 4;
+	};
+	my ($op_c, $ops) = $lei->workers_start($self, $nproc,
+		undef, { ipt => $ipt }); # LeiInput subclass
+	$op_c->{ops} = $ops; # for PktOp->event_step
+	$lei->{pmd} = $self;
+}
+
+sub ipc_atfork_child {
+	my ($self) = @_;
+	my $lei = $self->{lei};
+	$lei->_lei_atfork_child;
+	my $ipt = $self->{ipt} // die 'BUG: no self->{ipt}';
+	$ipt->{lei} = $lei;
+	$ipt->{sto} = $lei->{sto} // die 'BUG: no lei->{sto}';
+	$ipt->{lse} = $ipt->{sto}->search;
+	$ipt->{over} = $ipt->{lse}->over;
+	$ipt->{-lms_ro} //= $ipt->{lse}->lms; # may be undef or '0'
+	$self->SUPER::ipc_atfork_child;
+}
+
+sub each_mdir_fn { # maildir_each_file callback
+	my ($f, $self, @args) = @_;
+	$self->wq_io_do('mdir_iter', [], $f, @args);
+}
+
+sub mdir_iter { # via wq_io_do
+	my ($self, $f, @args) = @_;
+	$self->{ipt}->pmdir_cb($f, @args);
+}
+
+sub pmd_done_wait {
+	my ($arg, $pid) = @_;
+	my ($self, $lei) = @$arg;
+	my $wait = $lei->{sto}->ipc_do('done');
+	$lei->can('wq_done_wait')->($arg, $pid);
+}
+
+sub _lei_wq_eof { # EOF callback for main lei daemon
+	my ($lei) = @_;
+	my $pmd = delete $lei->{pmd} or return $lei->fail;
+	$pmd->wq_wait_old(\&pmd_done_wait, $lei);
+}
+
+1;
diff --git a/lib/PublicInbox/MdirReader.pm b/lib/PublicInbox/MdirReader.pm
index 304be63d..484bf0a8 100644
--- a/lib/PublicInbox/MdirReader.pm
+++ b/lib/PublicInbox/MdirReader.pm
@@ -87,17 +87,21 @@ sub maildir_each_eml {
 sub new { bless {}, __PACKAGE__ }
 
 sub flags2kw ($) {
-	my @unknown;
-	my %kw;
-	for (split(//, $_[0])) {
-		my $k = $c2kw{$_};
-		if (defined($k)) {
-			$kw{$k} = 1;
-		} else {
-			push @unknown, $_;
+	if (wantarray) {
+		my @unknown;
+		my %kw;
+		for (split(//, $_[0])) {
+			my $k = $c2kw{$_};
+			if (defined($k)) {
+				$kw{$k} = 1;
+			} else {
+				push @unknown, $_;
+			}
 		}
+		(\%kw, \@unknown);
+	} else {
+		[ sort(map { $c2kw{$_} // () } split(//, $_[0])) ];
 	}
-	(\%kw, \@unknown);
 }
 
 1;
diff --git a/t/lei-import-maildir.t b/t/lei-import-maildir.t
index 688b10ce..c81e7805 100644
--- a/t/lei-import-maildir.t
+++ b/t/lei-import-maildir.t
@@ -28,7 +28,7 @@ test_lei(sub {
 	is(scalar(keys %v), 1, 'inspect handles relative and absolute paths');
 	my $inspect = json_utf8->decode([ keys %v ]->[0]);
 	is_deeply($inspect, {"maildir:$md" => { 'name.count' => 1 }},
-		'inspect maildir: path had expected output');
+		'inspect maildir: path had expected output') or xbail($inspect);
 
 	lei_ok(qw(q s:boolean));
 	my $res = json_utf8->decode($lei_out);

^ permalink raw reply related	[relevance 3%]

* [PATCH 0/3] lei import: speedup repeated Maildir import
@ 2021-06-08  9:50  6% Eric Wong
  2021-06-08  9:50  3% ` [PATCH 3/3] lei import: speed up repeated Maildir imports Eric Wong
  0 siblings, 1 reply; 3+ results
From: Eric Wong @ 2021-06-08  9:50 UTC (permalink / raw)
  To: meta

More importantly, this series starts us down the road of
generalizing the use of auxiliary WQs like {ikw} (import
keywords for IMAP) and {pmd} (parallel Maildir).

It took me a while to track down some consistency problems;
I reordered those fixes and split them out into 1/3.  2/3 should
make future work easier, and 3/3 gives the final speedup
along the lines of what I already did with IMAP.

Eric Wong (3):
  lei: safety fix for multiple WQ classes
  lei: generalize auxiliary WQ handling
  lei import: speed up repeated Maildir imports

 MANIFEST                       |  1 +
 lib/PublicInbox/LEI.pm         | 20 ++++++++--
 lib/PublicInbox/LeiBlob.pm     |  2 +-
 lib/PublicInbox/LeiConvert.pm  |  2 +-
 lib/PublicInbox/LeiExportKw.pm |  2 +-
 lib/PublicInbox/LeiImport.pm   | 39 +++++++++++++-------
 lib/PublicInbox/LeiIndex.pm    |  2 +-
 lib/PublicInbox/LeiInput.pm    | 31 +++++++++++-----
 lib/PublicInbox/LeiLsSearch.pm |  2 +-
 lib/PublicInbox/LeiMailSync.pm | 14 +++++++
 lib/PublicInbox/LeiMirror.pm   |  4 +-
 lib/PublicInbox/LeiP2q.pm      |  4 +-
 lib/PublicInbox/LeiPmdir.pm    | 67 ++++++++++++++++++++++++++++++++++
 lib/PublicInbox/LeiRediff.pm   |  2 +-
 lib/PublicInbox/LeiRm.pm       |  2 +-
 lib/PublicInbox/LeiTag.pm      |  2 +-
 lib/PublicInbox/LeiXSearch.pm  |  2 +-
 lib/PublicInbox/MdirReader.pm  | 22 ++++++-----
 lib/PublicInbox/PktOp.pm       |  6 ---
 t/lei-import-maildir.t         |  2 +-
 20 files changed, 171 insertions(+), 57 deletions(-)
 create mode 100644 lib/PublicInbox/LeiPmdir.pm

^ permalink raw reply	[relevance 6%]

Results 1-3 of 3 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2021-06-08  9:50  6% [PATCH 0/3] lei import: speedup repeated Maildir import Eric Wong
2021-06-08  9:50  3% ` [PATCH 3/3] lei import: speed up repeated Maildir imports Eric Wong
2021-06-08 23:56  7% [PATCH] lei pmdir: fix nproc for <= 4 CPUs Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).