user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 3/5] watch: improve fairness during full rescans
Date: Mon, 26 Jun 2017 03:15:43 +0000	[thread overview]
Message-ID: <20170626031545.11879-4-e@80x24.org> (raw)
In-Reply-To: <20170626031545.11879-1-e@80x24.org>

We need to ensure new messages are being processed
fairly during full rescans, so have the ->scan subroutine
yield and reschedule itself.  Additionally, having a
long-running task inside the signal handler is dangerous
and subject to reentrancy bugs.

Due to the limitations of the Filesys::Notify::Simple interface,
we cannot rely on multiplexing I/O interfaces (select, IO::Poll,
Danga::Socket, etc...) for this.  Forking a separate process
was considered, but it is more expensive for a mostly-idle
process.

So, we use a variant of the "self-pipe trick" via inotify (or
whatever Filesys::Notify::Simple gives us).  Instead of writing
to our own pipe, we write to a file in our own temporary
directory watched by Filesys::Notify::Simple to trigger events
in signal handlers.
---
 lib/PublicInbox/WatchMaildir.pm | 99 +++++++++++++++++++++++++++++++----------
 script/public-inbox-watch       |  2 +-
 t/watch_maildir.t               | 12 ++---
 3 files changed, 82 insertions(+), 31 deletions(-)

diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index f81a917..fc42ec4 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -13,25 +13,27 @@ use PublicInbox::Git;
 use PublicInbox::Import;
 use PublicInbox::MDA;
 use PublicInbox::Spawn qw(spawn);
+use File::Temp qw//;
 
 sub new {
 	my ($class, $config) = @_;
-	my (%mdmap, @mdir, $spamc);
+	my (%mdmap, @mdir, $spamc, $spamdir);
 
 	# "publicinboxwatch" is the documented namespace
 	# "publicinboxlearn" is legacy but may be supported
 	# indefinitely...
 	foreach my $pfx (qw(publicinboxwatch publicinboxlearn)) {
 		my $k = "$pfx.watchspam";
-		if (my $spamdir = $config->{$k}) {
-			if ($spamdir =~ s/\Amaildir://) {
-				$spamdir =~ s!/+\z!!;
+		if (my $dir = $config->{$k}) {
+			if ($dir =~ s/\Amaildir://) {
+				$dir =~ s!/+\z!!;
 				# skip "new", no MUA has seen it, yet.
-				my $cur = "$spamdir/cur";
+				my $cur = "$dir/cur";
+				$spamdir = $cur;
 				push @mdir, $cur;
 				$mdmap{$cur} = 'watchspam';
 			} else {
-				warn "unsupported $k=$spamdir\n";
+				warn "unsupported $k=$dir\n";
 			}
 		}
 	}
@@ -77,22 +79,41 @@ sub new {
 	$mdre = qr!\A($mdre)/!;
 	bless {
 		spamcheck => $spamcheck,
+		spamdir => $spamdir,
 		mdmap => \%mdmap,
 		mdir => \@mdir,
 		mdre => $mdre,
 		config => $config,
 		importers => {},
+		opendirs => {}, # dirname => dirhandle (in progress scans)
 	}, $class;
 }
 
 sub _done_for_now {
-	$_->done foreach values %{$_[0]->{importers}};
+	my ($self) = @_;
+	my $opendirs = $self->{opendirs};
+
+	# spamdir scanning means every importer remains open
+	my $spamdir = $self->{spamdir};
+	return if defined($spamdir) && $opendirs->{$spamdir};
+
+	foreach my $im (values %{$self->{importers}}) {
+		# not done if we're scanning
+		next if $opendirs->{$im->{git}->{git_dir}};
+		$im->done;
+	}
 }
 
 sub _try_fsn_paths {
-	my ($self, $paths) = @_;
-	_try_path($self, $_->{path}) foreach @$paths;
-	_done_for_now($self);
+	my ($self, $scan_re, $paths) = @_;
+	foreach (@$paths) {
+		my $path = $_->{path};
+		if ($path =~ $scan_re) {
+			scan($self, $path);
+		} else {
+			_try_path($self, $path);
+		}
+	}
 }
 
 sub _remove_spam {
@@ -183,31 +204,61 @@ sub quit { $_[0]->{quit} = 1 }
 
 sub watch {
 	my ($self) = @_;
-	my $cb = sub { _try_fsn_paths($self, \@_) };
-	my $mdir = $self->{mdir};
+	my $scan = File::Temp->newdir("public-inbox-watch.$$.scan.XXXXXX",
+					TMPDIR => 1);
+	my $scandir = $self->{scandir} = $scan->dirname;
+	my $re = qr!\A$scandir/!;
+	my $cb = sub { _try_fsn_paths($self, $re, \@_) };
 
 	# lazy load here, we may support watching via IMAP IDLE
 	# in the future...
 	require Filesys::Notify::Simple;
-	my $watcher = Filesys::Notify::Simple->new($mdir);
-	$watcher->wait($cb) until ($self->{quit});
+	my $fsn = Filesys::Notify::Simple->new([@{$self->{mdir}}, $scandir]);
+	$fsn->wait($cb) until ($self->{quit});
+}
+
+sub trigger_scan {
+	my ($self, $base) = @_;
+	my $dir = $self->{scandir} or die "not watch-ing, yet\n";
+	open my $fh, '>', "$dir/$base" or die "open $dir/$base failed: $!\n";
+	close $fh or die "close $dir/$base failed: $!\n";
 }
 
 sub scan {
-	my ($self) = @_;
-	my $mdir = $self->{mdir};
-	foreach my $dir (@$mdir) {
-		my $ok = opendir(my $dh, $dir);
-		unless ($ok) {
-			warn "failed to open $dir: $!\n";
-			next;
-		}
+	my ($self, $path) = @_;
+	my $max = 10;
+	my $opendirs = $self->{opendirs};
+	my @dirnames = keys %$opendirs;
+	foreach my $dir (@dirnames) {
+		my $dh = delete $opendirs->{$dir};
+		my $n = $max;
 		while (my $fn = readdir($dh)) {
 			_try_path($self, "$dir/$fn");
+			last if --$n < 0;
 		}
-		closedir $dh;
+		$opendirs->{$dir} = $dh if $n < 0;
+	}
+	if ($path =~ /full\z/) {
+		foreach my $dir (@{$self->{mdir}}) {
+			next if $opendirs->{$dir}; # already in progress
+			my $ok = opendir(my $dh, $dir);
+			unless ($ok) {
+				warn "failed to open $dir: $!\n";
+				next;
+			}
+			my $n = $max;
+			while (my $fn = readdir($dh)) {
+				_try_path($self, "$dir/$fn");
+				last if --$n < 0;
+			}
+			$opendirs->{$dir} = $dh if $n < 0;
+		}
+	}
+	if (keys %$opendirs) { # do we have more work to do?
+		trigger_scan($self, 'cont');
+	} else {
+		_done_for_now($self);
 	}
-	_done_for_now($self);
 }
 
 sub _path_to_mime {
diff --git a/script/public-inbox-watch b/script/public-inbox-watch
index a72180c..51f1baa 100755
--- a/script/public-inbox-watch
+++ b/script/public-inbox-watch
@@ -13,7 +13,7 @@ my $reload = sub {
 };
 $reload->();
 if ($watch_md) {
-	my $scan = sub { $watch_md->scan if $watch_md };
+	my $scan = sub { $watch_md->trigger_scan('full') if $watch_md };
 	$SIG{HUP} = $reload;
 	$SIG{USR1} = $scan;
 	$SIG{ALRM} = sub { $SIG{ALRM} = 'DEFAULT'; $scan->() };
diff --git a/t/watch_maildir.t b/t/watch_maildir.t
index 3969c80..e12e083 100644
--- a/t/watch_maildir.t
+++ b/t/watch_maildir.t
@@ -42,7 +42,7 @@ my $config = PublicInbox::Config->new({
 	"publicinboxlearn.watchspam" => "maildir:$spamdir",
 });
 
-PublicInbox::WatchMaildir->new($config)->scan;
+PublicInbox::WatchMaildir->new($config)->scan('full');
 my $git = PublicInbox::Git->new($git_dir);
 my @list = $git->qx(qw(rev-list refs/heads/master));
 is(scalar @list, 1, 'one revision in rev-list');
@@ -59,7 +59,7 @@ my $write_spam = sub {
 };
 $write_spam->();
 is(unlink(glob("$maildir/new/*")), 1, 'unlinked old spam');
-PublicInbox::WatchMaildir->new($config)->scan;
+PublicInbox::WatchMaildir->new($config)->scan('full');
 @list = $git->qx(qw(rev-list refs/heads/master));
 is(scalar @list, 2, 'two revisions in rev-list');
 @list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
@@ -72,7 +72,7 @@ To unsubscribe from this list: send the line "unsubscribe git" in
 the body of a message to majordomo\@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
-	PublicInbox::WatchMaildir->new($config)->scan;
+	PublicInbox::WatchMaildir->new($config)->scan('full');
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
 	is(scalar @list, 1, 'tree has one file');
 	my $mref = $git->cat_file('HEAD:'.$list[0]);
@@ -80,7 +80,7 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 
 	is(unlink(glob("$maildir/new/*")), 1, 'unlinked spam');
 	$write_spam->();
-	PublicInbox::WatchMaildir->new($config)->scan;
+	PublicInbox::WatchMaildir->new($config)->scan('full');
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
 	is(scalar @list, 0, 'tree is empty');
 	@list = $git->qx(qw(rev-list refs/heads/master));
@@ -96,7 +96,7 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	$config->{'publicinboxwatch.spamcheck'} = 'spamc';
 	{
 		local $SIG{__WARN__} = sub {}; # quiet spam check warning
-		PublicInbox::WatchMaildir->new($config)->scan;
+		PublicInbox::WatchMaildir->new($config)->scan('full');
 	}
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
 	is(scalar @list, 0, 'tree has no files spamc checked');
@@ -111,7 +111,7 @@ More majordomo info at  http://vger.kernel.org/majordomo-info.html\n);
 	PublicInbox::Emergency->new($maildir)->prepare(\$msg);
 	$config->{'publicinboxwatch.spamcheck'} = 'spamc';
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
-	PublicInbox::WatchMaildir->new($config)->scan;
+	PublicInbox::WatchMaildir->new($config)->scan('full');
 	@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master));
 	is(scalar @list, 1, 'tree has one file after spamc checked');
 
-- 
EW


  parent reply	other threads:[~2017-06-26  3:15 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-06-26  3:15 [PATCH 0/5] watch: various bugfixes and fairness improvements Eric Wong
2017-06-26  3:15 ` [PATCH 1/5] watch: ensure HUP causes the scanner to be reloaded Eric Wong
2017-06-26  3:15 ` [PATCH 2/5] spamc: retry on EINTR Eric Wong
2017-06-26  3:15 ` Eric Wong [this message]
2017-06-26  4:01   ` [PATCH 3/5] watch: improve fairness during full rescans Eric Wong
2017-06-26  3:15 ` [PATCH 4/5] watch: use "self-inotify-tempfile trick" for quit Eric Wong
2017-06-26  3:15 ` [PATCH 5/5] watch: commit changes to fast-import sooner Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170626031545.11879-4-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).