From f1f61f2e7ab16fdf4441e12b4c3cfb98d7e3e3c5 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 18 Dec 2020 20:53:09 +0000 Subject: wwwstream: linkify coderepo URLs It seems like a good idea to get more cgit visibility. --- lib/PublicInbox/WwwStream.pm | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm index 66e34a12..49940262 100644 --- a/lib/PublicInbox/WwwStream.pm +++ b/lib/PublicInbox/WwwStream.pm @@ -78,22 +78,20 @@ sub html_top ($) { sub coderepos ($) { my ($ctx) = @_; - my $ibx = $ctx->{ibx}; + my $cr = $ctx->{ibx}->{coderepo} // return (); + my $cfg = $ctx->{www}->{pi_cfg}; my @ret; - if (defined(my $cr = $ibx->{coderepo})) { - my $cfg = $ctx->{www}->{pi_cfg}; - my $env = $ctx->{env}; - for my $cr_name (@$cr) { - my $urls = $cfg->{"coderepo.$cr_name.cgiturl"}; - if ($urls) { - $ret[0] //= <{"coderepo.$cr_name.cgiturl"} // next; + $ret[0] //= <{env}, $u)); + $ret[0] .= qq(\n\t$u); } } - @ret; # may be empty + @ret; # may be empty, this sub is called as an arg for join() } sub code_footer ($) { -- cgit v1.2.3-24-ge0c7 From 9fcce78e40b0a7c61797be2aff6c5afeb474568e Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 20 Dec 2020 06:30:11 +0000 Subject: script/public-inbox-*: favor caller-provided pathnames We'll try to avoid calling Cwd::abs_path and use File::Spec->rel2abs instead, since abs_path will resolve symlinks the user specified on the command-line. Unfortunately, ->rel2abs still leaves "/.." and "/../" uncollapsed, so we still need to fall back to Cwd::abs_path in those cases. While we are at it, we'll also resolve inboxdir from deep inside v2 directories instead of misdetecting them as v1 bare git repos. In any case, stop matching directories by name and instead rely on the unique combination of st_dev + st_ino on stat() as we started doing in the extindex code. --- lib/PublicInbox/Admin.pm | 102 +++++++++++++++++++++++++++++--------------- script/public-inbox-convert | 27 +++--------- script/public-inbox-init | 10 +---- t/admin.t | 16 ++++--- 4 files changed, 87 insertions(+), 68 deletions(-) diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index 3977d812..ea82133a 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -6,15 +6,15 @@ package PublicInbox::Admin; use strict; use parent qw(Exporter); -use Cwd qw(abs_path); -use POSIX (); our @EXPORT_OK = qw(setup_signals); use PublicInbox::Config; use PublicInbox::Inbox; use PublicInbox::Spawn qw(popen_rd); +use File::Spec (); sub setup_signals { my ($cb, $arg) = @_; # optional + require POSIX; # we call exit() here instead of _exit() so DESTROY methods # get called (e.g. File::Temp::Dir and PublicInbox::Msgmap) @@ -27,21 +27,43 @@ sub setup_signals { }; } +# abs_path resolves symlinks, so we want to avoid it if rel2abs +# is sufficient and doesn't leave "/.." or "/../" +sub rel2abs_collapsed ($) { + my $p = File::Spec->rel2abs($_[0]); + return $p if substr($p, -3, 3) ne '/..' && index($p, '/../') < 0; # likely + require Cwd; + Cwd::abs_path($p); +} + sub resolve_inboxdir { my ($cd, $ver) = @_; - my $prefix = defined $cd ? $cd : './'; - if (-d $prefix && -f "$prefix/inbox.lock") { # v2 - $$ver = 2 if $ver; - return abs_path($prefix); + my $try = $cd // '.'; + my $root_dev_ino; + while (1) { # favor v2, first + if (-f "$try/inbox.lock") { + $$ver = 2 if $ver; + return rel2abs_collapsed($try); + } elsif (-d $try) { + my @try = stat _; + $root_dev_ino //= do { + my @root = stat('/') or die "stat /: $!\n"; + "$root[0]\0$root[1]"; + }; + last if "$try[0]\0$try[1]" eq $root_dev_ino; + $try .= '/..'; # continue, cd up + } else { + die "`$try' is not a directory\n"; + } } + # try v1 bare git dirs my $cmd = [ qw(git rev-parse --git-dir) ]; my $fh = popen_rd($cmd, undef, {-C => $cd}); my $dir = do { local $/; <$fh> }; - close $fh or die "error in ".join(' ', @$cmd)." (cwd:$cd): $!\n"; + close $fh or die "error in @$cmd (cwd:${\($cd // '.')}): $!\n"; chomp $dir; $$ver = 1 if $ver; - return abs_path($cd) if ($dir eq '.' && defined $cd); - abs_path($dir); + rel2abs_collapsed($dir eq '.' ? ($cd // $dir) : $dir); } # for unconfigured inboxes @@ -78,8 +100,8 @@ sub unconfigured_ibx ($$) { name => $name, address => [ "$name\@example.com" ], inboxdir => $dir, - # TODO: consumers may want to warn on this: - #-unconfigured => 1, + # consumers (-convert) warn on this: + -unconfigured => 1, }); } @@ -95,41 +117,53 @@ sub resolve_inboxes ($;$$) { } my $min_ver = $opt->{-min_inbox_version} || 0; + # lookup inboxes by st_dev + st_ino instead of {inboxdir} pathnames, + # pathnames are not unique due to symlinks and bind mounts my (@old, @ibxs); - my %dir2ibx; - my $all = $opt->{all} ? [] : undef; - if ($cfg) { + if ($opt->{all}) { $cfg->each_inbox(sub { my ($ibx) = @_; - my $path = abs_path($ibx->{inboxdir}); - if (defined($path)) { - $dir2ibx{$path} = $ibx; - push @$all, $ibx if $all; + if (-e $ibx->{inboxdir}) { + push(@ibxs, $ibx) if $ibx->version >= $min_ver; } else { - warn <{name} $ibx->{inboxdir}: $! -EOF + warn "W: $ibx->{name} $ibx->{inboxdir}: $!\n"; } }); - } - if ($all) { - @$all = grep { $_->version >= $min_ver } @$all; - @ibxs = @$all; } else { # directories specified on the command-line - my $i = 0; my @dirs = @$argv; push @dirs, '.' if !@dirs && $opt->{-use_cwd}; - foreach (@dirs) { - my $v; - my $dir = resolve_inboxdir($_, \$v); - if ($v < $min_ver) { + my %s2i; # "st_dev\0st_ino" => array index + for (my $i = 0; $i <= $#dirs; $i++) { + my $dir = $dirs[$i]; + my @st = stat($dir) or die "stat($dir): $!\n"; + $dir = resolve_inboxdir($dir, \(my $ver)); + if ($ver >= $min_ver) { + $s2i{"$st[0]\0$st[1]"} //= $i; + } else { push @old, $dir; - next; } - my $ibx = $dir2ibx{$dir} ||= unconfigured_ibx($dir, $i); - $i++; - push @ibxs, $ibx; } + my $done = \'done'; + eval { + $cfg->each_inbox(sub { + my ($ibx) = @_; + return if $ibx->version < $min_ver; + my $dir = $ibx->{inboxdir}; + if (my @s = stat $dir) { + my $i = delete($s2i{"$s[0]\0$s[1]"}) + // return; + $ibxs[$i] = $ibx; + die $done if !keys(%s2i); + } else { + warn "W: $ibx->{name} $dir: $!\n"; + } + }); + }; + die $@ if $@ && $@ ne $done; + for my $i (sort { $a <=> $b } values %s2i) { + $ibxs[$i] = unconfigured_ibx($dirs[$i], $i); + } + @ibxs = grep { defined } @ibxs; # duplicates are undef } if (@old) { die "-V$min_ver inboxes not supported by $0\n\t", diff --git a/script/public-inbox-convert b/script/public-inbox-convert index b61c743f..fbd527a6 100755 --- a/script/public-inbox-convert +++ b/script/public-inbox-convert @@ -47,34 +47,21 @@ die $help if (scalar(@ARGV) || $new_dir eq '' || $old_dir eq ''); die "$new_dir exists\n" if -d $new_dir; die "$old_dir not a directory\n" unless -d $old_dir; -require Cwd; -Cwd->import('abs_path'); +require PublicInbox::Admin; require PublicInbox::Config; require PublicInbox::InboxWritable; -my $abs = abs_path($old_dir); -die "failed to resolve $old_dir: $!\n" if (!defined($abs)); - my $cfg = PublicInbox::Config->new; -my $old; -$cfg->each_inbox(sub { - $old = $_[0] if abs_path($_[0]->{inboxdir}) eq $old_dir; -}); -if ($old) { - $old = PublicInbox::InboxWritable->new($old); -} else { +my @old = PublicInbox::Admin::resolve_inboxes([$old_dir], undef, $cfg); +@old > 1 and die "BUG: resolved several inboxes from $old_dir:\n", + map { "\t$_->{inboxdir}\n" } @old; +my $old = PublicInbox::InboxWritable->new($old[0]); +if (delete $old->{-unconfigured}) { warn "W: $old_dir not configured in " . PublicInbox::Config::default_file() . "\n"; - $old = PublicInbox::InboxWritable->new({ - inboxdir => $old_dir, - name => 'ignored', - -primary_address => 'old@example.com', - address => [ 'old@example.com' ], - }); } die "Only conversion from v1 inboxes is supported\n" if $old->version >= 2; -require File::Spec; require PublicInbox::Admin; my $detected = PublicInbox::Admin::detect_indexlevel($old); $old->{indexlevel} //= $detected; @@ -88,7 +75,7 @@ if ($opt->{'index'}) { } local %ENV = (%$env, %ENV) if $env; my $new = { %$old }; -$new->{inboxdir} = File::Spec->canonpath($new_dir); +$new->{inboxdir} = PublicInbox::Admin::rel2abs_collapsed($new_dir); $new->{version} = 2; $new = PublicInbox::InboxWritable->new($new, { nproc => $opt->{jobs} }); $new->{-no_fsync} = 1 if !$opt->{fsync}; diff --git a/script/public-inbox-init b/script/public-inbox-init index c775eb31..eb605a51 100755 --- a/script/public-inbox-init +++ b/script/public-inbox-init @@ -138,10 +138,9 @@ close($fh) or die "failed to close $pi_config_tmp: $!\n"; my $pfx = "publicinbox.$name"; my @x = (qw/git config/, "--file=$pi_config_tmp"); -require File::Spec; -$inboxdir = File::Spec->canonpath($inboxdir); +PublicInbox::Admin::rel2abs_collapsed($inboxdir); +die "`\\n' not allowed in `$inboxdir'\n" if index($inboxdir, "\n") >= 0; -die "`\\n' not allowed in `$inboxdir'\n" if $inboxdir =~ /\n/s; if (-f "$inboxdir/inbox.lock") { if (!defined $version) { $version = 2; @@ -186,11 +185,6 @@ if ($skip_docdata) { $ibx->{-skip_docdata} = $skip_docdata; } $ibx->init_inbox(0, $skip_epoch, $skip_artnum); -require Cwd; -my $tmp = Cwd::abs_path($inboxdir); -defined($tmp) or die "failed to resolve $inboxdir: $!\n"; -$inboxdir = $tmp; -die "`\\n' not allowed in `$inboxdir'\n" if $inboxdir =~ /\n/s; # needed for git prior to v2.1.0 umask(0077) if defined $perm; diff --git a/t/admin.t b/t/admin.t index af132577..60c6037d 100644 --- a/t/admin.t +++ b/t/admin.t @@ -12,10 +12,7 @@ my $v2_dir = "$tmpdir/v2"; my ($res, $err, $v); PublicInbox::Import::init_bare($git_dir); -*resolve_inboxdir = do { - no warnings 'once'; - *PublicInbox::Admin::resolve_inboxdir; -}; +*resolve_inboxdir = \&PublicInbox::Admin::resolve_inboxdir; # v1 is(resolve_inboxdir($git_dir), $git_dir, 'top-level GIT_DIR resolved'); @@ -72,16 +69,23 @@ SKIP: { ok(-e "$v2_dir/inbox.lock", 'exists'); is(resolve_inboxdir($v2_dir), $v2_dir, 'resolve_inboxdir works on v2_dir'); - ok(chdir($v2_dir), 'chdir v2_dir OK'); + chdir($v2_dir) or BAIL_OUT "chdir v2_dir: $!"; is(resolve_inboxdir(), $v2_dir, 'resolve_inboxdir works inside v2_dir'); $res = resolve_inboxdir(undef, \$v); is($v, 2, 'version 2 detected'); is($res, $v2_dir, 'detects directory along with version'); # TODO: should work from inside Xapian dirs, and git dirs, here... + PublicInbox::Import::init_bare("$v2_dir/git/0.git"); + my $objdir = "$v2_dir/git/0.git/objects"; + is($v2_dir, resolve_inboxdir($objdir, \$v), 'at $objdir'); + is($v, 2, 'version 2 detected at $objdir'); + chdir($objdir) or BAIL_OUT "chdir objdir: $!"; + is(resolve_inboxdir(undef, \$v), $v2_dir, 'inside $objdir'); + is($v, 2, 'version 2 detected inside $objdir'); } -chdir '/'; +chdir '/' or BAIL_OUT "chdir: $!"; my @pairs = ( '1g' => 1024 ** 3, -- cgit v1.2.3-24-ge0c7 From ab243aa2328e2fc4cf895c99c68345e57cc4653c Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 20 Dec 2020 06:30:12 +0000 Subject: inboxidle: remove needless check for {inboxdir} ->each_inbox will never attempt to iterate an object without {inboxdir}, and simplify + short-circuit the corresponding code --- lib/PublicInbox/Config.pm | 7 +++---- lib/PublicInbox/InboxIdle.pm | 7 +------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index cafd9c3b..199ce019 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -391,9 +391,9 @@ EOF } } - # backwards compatibility: - $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"}; - if (($ibx->{inboxdir} // '') =~ /\n/s) { + # "mainrepo" is backwards compatibility: + $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"} // return; + if ($ibx->{inboxdir} =~ /\n/s) { warn "E: `$ibx->{inboxdir}' must not contain `\\n'\n"; return; } @@ -415,7 +415,6 @@ EOF } } - return unless defined($ibx->{inboxdir}); my $name = $pfx; $name =~ s/\Apublicinbox\.//; diff --git a/lib/PublicInbox/InboxIdle.pm b/lib/PublicInbox/InboxIdle.pm index 2737bbbd..f1cbc012 100644 --- a/lib/PublicInbox/InboxIdle.pm +++ b/lib/PublicInbox/InboxIdle.pm @@ -7,7 +7,6 @@ package PublicInbox::InboxIdle; use strict; use parent qw(PublicInbox::DS); -use Cwd qw(abs_path); use PublicInbox::Syscall qw(EPOLLIN EPOLLET); my $IN_MODIFY = 0x02; # match Linux inotify my $ino_cls; @@ -22,11 +21,7 @@ require PublicInbox::In2Tie if $ino_cls; sub in2_arm ($$) { # PublicInbox::Config::each_inbox callback my ($ibx, $self) = @_; - my $dir = abs_path($ibx->{inboxdir}); - if (!defined($dir)) { - warn "W: $ibx->{inboxdir} not watched: $!\n"; - return; - } + my $dir = $ibx->{inboxdir}; my $inot = $self->{inot}; my $cur = $self->{pathmap}->{$dir} //= []; -- cgit v1.2.3-24-ge0c7 From ba958c1a44ca7e48a5790a2ecb2198636d6492db Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 20 Dec 2020 06:30:13 +0000 Subject: daemon: lazy load Cwd only for --daemonize users systemd users won't need it polluting the namespace; though other things are still likely to load it. --- lib/PublicInbox/Daemon.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm index a2171535..d1a42fc3 100644 --- a/lib/PublicInbox/Daemon.pm +++ b/lib/PublicInbox/Daemon.pm @@ -11,7 +11,6 @@ use IO::Socket; use POSIX qw(WNOHANG :signal_h); use Socket qw(IPPROTO_TCP SOL_SOCKET); sub SO_ACCEPTFILTER () { 0x1000 } -use Cwd qw/abs_path/; STDOUT->autoflush(1); STDERR->autoflush(1); use PublicInbox::DS qw(now); @@ -202,10 +201,11 @@ sub check_absolute ($$) { sub daemonize () { if ($daemonize) { + require Cwd; foreach my $i (0..$#ARGV) { my $arg = $ARGV[$i]; next unless -e $arg; - $ARGV[$i] = abs_path($arg); + $ARGV[$i] = Cwd::abs_path($arg); } check_absolute('stdout', $stdout); check_absolute('stderr', $stderr); -- cgit v1.2.3-24-ge0c7 From df708a4f7c4d62ef685ccf9868c78ff2709c1f40 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 20 Dec 2020 06:30:14 +0000 Subject: daemon: unconditionally call IO::Handle::blocking(0) IO::Handle::blocking will always return the initial value from the F_GETFL op and it won't issue F_SETFL if a socket is already non-blocking. --- lib/PublicInbox/Daemon.pm | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm index d1a42fc3..eeac3bd2 100644 --- a/lib/PublicInbox/Daemon.pm +++ b/lib/PublicInbox/Daemon.pm @@ -367,14 +367,12 @@ sub inherit ($) { foreach my $fd (3..$end) { my $s = IO::Handle->new_from_fd($fd, 'r'); if (my $k = sockname($s)) { - if ($s->blocking) { - $s->blocking(0); - warn <<""; + my $prev_was_blocking = $s->blocking(0); + warn <<"" if $prev_was_blocking; Inherited socket (fd=$fd) is blocking, making it non-blocking. Set 'NonBlocking = true' in the systemd.service unit to avoid stalled processes when multiple service instances start. - } $listener_names->{$k} = $s; push @rv, $s; } else { -- cgit v1.2.3-24-ge0c7 From e77241850763dea9995381f3c0c7b354aa791cc0 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 20 Dec 2020 06:30:15 +0000 Subject: daemon: kill_workers: eliminate unnecessary loop The `kill' perl op takes multiple PIDs, so there's no need to iterate through the %pids hash. --- lib/PublicInbox/Daemon.pm | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm index eeac3bd2..1762be0b 100644 --- a/lib/PublicInbox/Daemon.pm +++ b/lib/PublicInbox/Daemon.pm @@ -419,11 +419,8 @@ sub upgrade { # $_[0] = signal name or number (unused) } sub kill_workers ($) { - my ($s) = @_; - - while (my ($pid, $id) = each %pids) { - kill $s, $pid; - } + my ($sig) = @_; + kill $sig, keys(%pids); } sub upgrade_aborted ($) { -- cgit v1.2.3-24-ge0c7 From fb7d958dbc601f126dc391d915f5935d4cfc595e Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 20 Dec 2020 06:30:16 +0000 Subject: config: eliminate unnecessary join call up front We can rely on implicit join in string interpolation on die() iff needed. And just creating the arrayref up front to avoid an extra backslash seems nicer at the moment. --- lib/PublicInbox/Config.pm | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 199ce019..2f5c83cd 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -163,11 +163,10 @@ sub config_fh_parse ($$$) { sub git_config_dump { my ($file) = @_; return {} unless -e $file; - my @cmd = (qw/git config -z -l --includes/, "--file=$file"); - my $cmd = join(' ', @cmd); - my $fh = popen_rd(\@cmd); + my $cmd = [ qw(git config -z -l --includes), "--file=$file" ]; + my $fh = popen_rd($cmd); my $rv = config_fh_parse($fh, "\0", "\n"); - close $fh or die "failed to close ($cmd) pipe: $?"; + close $fh or die "failed to close (@$cmd) pipe: $?"; $rv; } -- cgit v1.2.3-24-ge0c7 From 3ce4c38119f13d419bb865a0aa9b66feff339308 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 21 Dec 2020 19:41:21 +0000 Subject: manifest.js.gz: fix per-inbox /$INBOX/manifest.js.gz /$INBOX/manifest.js.gz should not attempt to match every inbox in the domain (or every inbox); that is for /manifest.js.gz (without a /$INBOX prefix). Fixes: f303b4add8ea1883 ("wwwlisting: avoid hogging event loop") --- lib/PublicInbox/ManifestJsGz.pm | 7 +++++++ lib/PublicInbox/WWW.pm | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/PublicInbox/ManifestJsGz.pm b/lib/PublicInbox/ManifestJsGz.pm index 6d5b57ee..e02450fa 100644 --- a/lib/PublicInbox/ManifestJsGz.pm +++ b/lib/PublicInbox/ManifestJsGz.pm @@ -99,4 +99,11 @@ sub psgi_triple { 'Content-Length', bytes::length($out) ], [ $out ] ] } +sub per_inbox { + my ($ctx) = @_; + # only one inbox, slow is probably OK + slow_manifest_add($ctx, $ctx->{ibx}); + psgi_triple($ctx); +} + 1; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index a33d25ab..52630ae3 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -505,7 +505,7 @@ sub get_inbox_manifest ($$$) { my $r404 = invalid_inbox($ctx, $inbox); return $r404 if $r404; require PublicInbox::ManifestJsGz; - PublicInbox::ManifestJsGz->response($ctx); + PublicInbox::ManifestJsGz::per_inbox($ctx); } sub get_attach { -- cgit v1.2.3-24-ge0c7 From eea4ba7bac73b67ccd2cdd07946b176260ac5fac Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 21 Dec 2020 07:51:18 +0000 Subject: inbox: delay ->version detection Our read-only code won't need to know the version until an inbox is accessed. This is a small step towards eliminating many stat() calls on read-only daemon startup. --- lib/PublicInbox/Inbox.pm | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 8a3a0194..863a5de4 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -109,10 +109,6 @@ sub new { delete $opts->{feedmax}; } $opts->{nntpserver} ||= $pi_cfg->{'publicinbox.nntpserver'}; - my $dir = $opts->{inboxdir}; - if (defined $dir && -f "$dir/inbox.lock") { - $opts->{version} = 2; - } # allow any combination of multi-line or comma-delimited hide entries my $hide = {}; @@ -125,7 +121,9 @@ sub new { bless $opts, $class; } -sub version { $_[0]->{version} // 1 } +sub version { + $_[0]->{version} //= -f "$_[0]->{inboxdir}/inbox.lock" ? 2 : 1 +} sub git_epoch { my ($self, $epoch) = @_; # v2-only, callers always supply $epoch -- cgit v1.2.3-24-ge0c7 From bad84119fb0915abe3f19fe4fb9c34e24fe7e564 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 21 Dec 2020 07:51:19 +0000 Subject: isearch: use numeric sort for article numbers Perl sort is alphabetical by default and Xapian uses numeric document IDs, so sort must be told explicitly to use numeric comparisons even if the scalars are integer values (IV) internally. And eliminate extra hash marks ("#") since they're probably too noisy if there are many IDs. Note: I haven't seen this warning message in syslog, yet :> --- lib/PublicInbox/Isearch.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm index 8a1f257a..e362c80a 100644 --- a/lib/PublicInbox/Isearch.pm +++ b/lib/PublicInbox/Isearch.pm @@ -89,7 +89,7 @@ SELECT docid,xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks) } if (scalar keys %order) { warn "W: $self->{es}->{topdir} #", - join(', #', sort keys %order), + join(', ', sort { $a <=> $b } keys %order), " not mapped to `$self->{eidx_key}'\n"; warn "W: $self->{es}->{topdir} may need to be reindexed\n"; @xnums = grep { defined } @xnums; @@ -113,7 +113,7 @@ sub mset_to_smsg { } if (scalar keys %order) { warn "W: $ibx->{inboxdir} #", - join(', #', sort keys %order), + join(', ', sort { $a <=> $b } keys %order), " no longer valid\n"; warn "W: $self->{es}->{topdir} may need to be reindexed\n"; } -- cgit v1.2.3-24-ge0c7 From 3e9888ed30b7fe092b03789d19a8020d4bc0fb39 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 21 Dec 2020 07:51:20 +0000 Subject: use rel2abs_collapsed when loading Inbox objects We need to canonicalize paths for inboxes which do not have a newsgroup defined, otherwise ->eidx_key matches can fail in unexpected ways. --- lib/PublicInbox/Admin.pm | 11 +---------- lib/PublicInbox/Config.pm | 28 +++++++++++++++++++++++----- lib/PublicInbox/ExtSearchIdx.pm | 5 ----- script/public-inbox-convert | 2 +- script/public-inbox-init | 2 +- 5 files changed, 26 insertions(+), 22 deletions(-) diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index ea82133a..c972fb68 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -10,7 +10,7 @@ our @EXPORT_OK = qw(setup_signals); use PublicInbox::Config; use PublicInbox::Inbox; use PublicInbox::Spawn qw(popen_rd); -use File::Spec (); +*rel2abs_collapsed = \&PublicInbox::Config::rel2abs_collapsed; sub setup_signals { my ($cb, $arg) = @_; # optional @@ -27,15 +27,6 @@ sub setup_signals { }; } -# abs_path resolves symlinks, so we want to avoid it if rel2abs -# is sufficient and doesn't leave "/.." or "/../" -sub rel2abs_collapsed ($) { - my $p = File::Spec->rel2abs($_[0]); - return $p if substr($p, -3, 3) ne '/..' && index($p, '/../') < 0; # likely - require Cwd; - Cwd::abs_path($p); -} - sub resolve_inboxdir { my ($cd, $ver) = @_; my $try = $cd // '.'; diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 2f5c83cd..577337dc 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -368,6 +368,16 @@ sub git_bool { } } +# abs_path resolves symlinks, so we want to avoid it if rel2abs +# is sufficient and doesn't leave "/.." or "/../" +sub rel2abs_collapsed { + require File::Spec; + my $p = File::Spec->rel2abs($_[-1]); + return $p if substr($p, -3, 3) ne '/..' && index($p, '/../') < 0; + require Cwd; + Cwd::abs_path($p); +} + sub _fill { my ($self, $pfx) = @_; my $ibx = {}; @@ -391,9 +401,9 @@ EOF } # "mainrepo" is backwards compatibility: - $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"} // return; - if ($ibx->{inboxdir} =~ /\n/s) { - warn "E: `$ibx->{inboxdir}' must not contain `\\n'\n"; + my $dir = $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"} // return; + if (index($dir, "\n") >= 0) { + warn "E: `$dir' must not contain `\\n'\n"; return; } foreach my $k (qw(obfuscate)) { @@ -436,7 +446,7 @@ EOF $self->{-by_list_id}->{lc($list_id)} = $ibx; } } - if (my $ngname = $ibx->{newsgroup}) { + if (defined(my $ngname = $ibx->{newsgroup})) { if (ref($ngname)) { delete $ibx->{newsgroup}; warn 'multiple newsgroups not supported: '. @@ -445,7 +455,8 @@ EOF # wildmat-exact and RFC 3501 (IMAP) ATOM-CHAR. # Leave out a few chars likely to cause problems or conflicts: # '|', '<', '>', ';', '#', '$', '&', - } elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]!) { + } elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]! || + $ngname eq '') { delete $ibx->{newsgroup}; warn "newsgroup name invalid: `$ngname'\n"; } else { @@ -454,6 +465,13 @@ EOF $self->{-by_newsgroup}->{$ngname} = $ibx; } } + unless (defined $ibx->{newsgroup}) { # for ->eidx_key + my $abs = rel2abs_collapsed($dir); + if ($abs ne $dir) { + warn "W: `$dir' canonicalized to `$abs'\n"; + $ibx->{inboxdir} = $abs; + } + } $self->{-by_name}->{$name} = $ibx; if ($ibx->{obfuscate}) { $ibx->{-no_obfuscate} = $self->{-no_obfuscate}; diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index b82d0546..c4b429df 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -72,11 +72,6 @@ sub attach_inbox { warn "W: skipping $key (no UIDVALIDITY)\n"; return; } - my $ibxdir = File::Spec->canonpath($ibx->{inboxdir}); - if ($ibxdir ne $ibx->{inboxdir}) { - warn "W: `$ibx->{inboxdir}' canonicalized to `$ibxdir'\n"; - $ibx->{inboxdir} = $ibxdir; - } $self->{ibx_map}->{$key} //= do { push @{$self->{ibx_list}}, $ibx; $ibx; diff --git a/script/public-inbox-convert b/script/public-inbox-convert index fbd527a6..800c364c 100755 --- a/script/public-inbox-convert +++ b/script/public-inbox-convert @@ -75,7 +75,7 @@ if ($opt->{'index'}) { } local %ENV = (%$env, %ENV) if $env; my $new = { %$old }; -$new->{inboxdir} = PublicInbox::Admin::rel2abs_collapsed($new_dir); +$new->{inboxdir} = $cfg->rel2abs_collapsed($new_dir); $new->{version} = 2; $new = PublicInbox::InboxWritable->new($new, { nproc => $opt->{jobs} }); $new->{-no_fsync} = 1 if !$opt->{fsync}; diff --git a/script/public-inbox-init b/script/public-inbox-init index eb605a51..afaa4c12 100755 --- a/script/public-inbox-init +++ b/script/public-inbox-init @@ -138,7 +138,7 @@ close($fh) or die "failed to close $pi_config_tmp: $!\n"; my $pfx = "publicinbox.$name"; my @x = (qw/git config/, "--file=$pi_config_tmp"); -PublicInbox::Admin::rel2abs_collapsed($inboxdir); +PublicInbox::Config::rel2abs_collapsed($inboxdir); die "`\\n' not allowed in `$inboxdir'\n" if index($inboxdir, "\n") >= 0; if (-f "$inboxdir/inbox.lock") { -- cgit v1.2.3-24-ge0c7 From 427b4fbbc68e4e03b20d66062dd47a0213e18390 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 21 Dec 2020 07:51:21 +0000 Subject: searchidx: rename get_val to int_val and return IV Values can be strings in Xapian, although we currently use integer values exclusively. Give the wrapper a more appropriate name in case we start using string columns. For future-proofing, we'll now return `undef' on missing columns and coerce the return value to an IV (integer value) to save memory, as sortable_unserialise returns a PV (pointer value) scalar despite it existing to support numeric values. --- lib/PublicInbox/SearchIdx.pm | 9 +++++---- t/search.t | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index b731f698..cf2c2c55 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -501,17 +501,18 @@ sub remove_eidx_info { $self->{xdb}->replace_document($docid, $doc); } -sub get_val ($$) { +sub int_val ($$) { my ($doc, $col) = @_; - sortable_unserialise($doc->get_value($col)); + my $val = $doc->get_value($col) or return; # undefined is '' in Xapian + sortable_unserialise($val) + 0; # PV => IV conversion } sub smsg_from_doc ($) { my ($doc) = @_; my $data = $doc->get_data or return; my $smsg = bless {}, 'PublicInbox::Smsg'; - $smsg->{ts} = get_val($doc, PublicInbox::Search::TS()); - my $dt = get_val($doc, PublicInbox::Search::DT()); + $smsg->{ts} = int_val($doc, PublicInbox::Search::TS()); + my $dt = int_val($doc, PublicInbox::Search::DT()); my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt); $smsg->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy); $smsg->load_from_data($data); diff --git a/t/search.t b/t/search.t index da9acb07..11143204 100644 --- a/t/search.t +++ b/t/search.t @@ -332,13 +332,13 @@ $ibx->with_umask(sub { like($smsg->{to}, qr/\blist\@example\.com\b/, 'to appears'); my $doc = $m->get_document; my $col = PublicInbox::Search::BYTES(); - my $bytes = PublicInbox::SearchIdx::get_val($doc, $col); + my $bytes = PublicInbox::SearchIdx::int_val($doc, $col); like($bytes, qr/\A[0-9]+\z/, '$bytes stored as digit'); ok($bytes > 0, '$bytes is > 0'); is($bytes, $smsg->{bytes}, 'bytes Xapian value matches Over'); $col = PublicInbox::Search::UID(); - my $uid = PublicInbox::SearchIdx::get_val($doc, $col); + my $uid = PublicInbox::SearchIdx::int_val($doc, $col); is($uid, $smsg->{num}, 'UID column matches {num}'); is($uid, $m->get_docid, 'UID column matches docid'); } -- cgit v1.2.3-24-ge0c7 From c13272432ad28adb506faf6fb9121569cf5ec710 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 21 Dec 2020 07:51:22 +0000 Subject: extsearch*: drop unnecessary path canonicalization Unlike inboxdir, the canonical-ness of -extindex paths is not relevant at the moment, and may never be relevant at all. So don't mislead others into thinking these paths being canonicalized matters. --- lib/PublicInbox/ExtSearch.pm | 2 -- lib/PublicInbox/ExtSearchIdx.pm | 2 -- 2 files changed, 4 deletions(-) diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm index 2a560935..a2b97798 100644 --- a/lib/PublicInbox/ExtSearch.pm +++ b/lib/PublicInbox/ExtSearch.pm @@ -9,7 +9,6 @@ use strict; use v5.10.1; use PublicInbox::Over; use PublicInbox::Inbox; -use File::Spec (); use PublicInbox::MiscSearch; use DBI qw(:sql_types); # SQL_BLOB @@ -18,7 +17,6 @@ use parent qw(PublicInbox::Search); sub new { my (undef, $topdir) = @_; - $topdir = File::Spec->canonpath($topdir); bless { topdir => $topdir, # xpfx => 'ei15' diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index c4b429df..f04e0443 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -30,13 +30,11 @@ use PublicInbox::V2Writable; use PublicInbox::InboxWritable; use PublicInbox::ContentHash qw(content_hash); use PublicInbox::Eml; -use File::Spec; use PublicInbox::DS qw(now); use DBI qw(:sql_types); # SQL_BLOB sub new { my (undef, $dir, $opt) = @_; - $dir = File::Spec->canonpath($dir); my $l = $opt->{indexlevel} // 'full'; $l !~ $PublicInbox::SearchIdx::INDEXLEVELS and die "invalid indexlevel=$l\n"; -- cgit v1.2.3-24-ge0c7 From 2adbbf87748719b422ba08d674ccb35637cb76b5 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 21 Dec 2020 22:10:55 +0000 Subject: support multiple CODE_URLs public-inbox.org will expire in a few years, so ensure Tor .onions can be known before then. --- Documentation/mknews.perl | 8 ++++---- README | 1 + lib/PublicInbox/Unsubscribe.pm | 9 ++++++--- lib/PublicInbox/WwwStream.pm | 3 ++- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/Documentation/mknews.perl b/Documentation/mknews.perl index d87c2609..a11dd5f0 100755 --- a/Documentation/mknews.perl +++ b/Documentation/mknews.perl @@ -119,10 +119,10 @@ sub html_start { } sub html_end { - print $out < -EOF + for (@$PublicInbox::WwwStream::CODE_URL) { + print $out " git clone $_\n" or die; + } + print $out "\n" or die; } sub atom_start { diff --git a/README b/README index ae428bcf..6396373f 100644 --- a/README +++ b/README @@ -94,6 +94,7 @@ AGPL source code is available via git: git clone https://public-inbox.org/public-inbox.git git clone https://repo.or.cz/public-inbox.git + torsocks git clone http://ou63pmih66umazou.onion/public-inbox.git torsocks git clone http://hjrcffqmbrq6wope.onion/public-inbox See below for contact info. diff --git a/lib/PublicInbox/Unsubscribe.pm b/lib/PublicInbox/Unsubscribe.pm index b0d3220c..ae0b0679 100644 --- a/lib/PublicInbox/Unsubscribe.pm +++ b/lib/PublicInbox/Unsubscribe.pm @@ -12,7 +12,8 @@ use warnings; use Crypt::CBC; use Plack::Util; use MIME::Base64 qw(decode_base64url); -my $CODE_URL = 'https://public-inbox.org/public-inbox.git'; +my @CODE_URL = qw(http://ou63pmih66umazou.onion/public-inbox.git + https://public-inbox.org/public-inbox.git); my @CT_HTML = ('Content-Type', 'text/html; charset=UTF-8'); sub new { @@ -38,13 +39,15 @@ sub new { my $unsubscribe = $opt{unsubscribe} or die "`unsubscribe' callback not given\n"; + my $code_url = $opt{code_url} || \@CODE_URL; + $code_url = [ $code_url ] if ref($code_url) ne 'ARRAY'; bless { pi_cfg => $opt{pi_config}, # PublicInbox::Config owner_email => $opt{owner_email}, cipher => $cipher, unsubscribe => $unsubscribe, contact => qq($e), - code_url => $opt{code_url} || $CODE_URL, + code_url => $code_url, confirm => $opt{confirm}, }, $class; } @@ -138,7 +141,7 @@ sub r { "$title
".
 		join("\n", "$title\n", @body) . '

'. "
This page is available under AGPL-3.0+\n" .
-		"git clone $self->{code_url}\n" .
+		join('', map { "git clone $_\n" } @{$self->{code_url}}) .
 		qq(Email $self->{contact} if you have any questions).
 		'
' ] ]; diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm index 49940262..9ba8fa11 100644 --- a/lib/PublicInbox/WwwStream.pm +++ b/lib/PublicInbox/WwwStream.pm @@ -12,7 +12,8 @@ our @EXPORT_OK = qw(html_oneshot); use bytes (); # length use PublicInbox::Hval qw(ascii_html prurl ts2str); our $TOR_URL = 'https://www.torproject.org/'; -our $CODE_URL = 'https://public-inbox.org/public-inbox.git'; +our $CODE_URL = [ qw(http://ou63pmih66umazou.onion/public-inbox.git + https://public-inbox.org/public-inbox.git) ]; sub base_url ($) { my $ctx = shift; -- cgit v1.2.3-24-ge0c7 From 1350f5ab09f72c75ac2cd6c88f6a2b9e198fef55 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Tue, 22 Dec 2020 18:18:10 +0100 Subject: public-inbox-v[12]-format.pod: make lexgrog happy The Debian package linter (lintian) emits the following warning: W: bad-whatis-entry N: N: A manual page should start with a NAME section, which lists the N: program name and a brief description. The NAME section is used to N: generate a database that can be queried by commands like apropos and N: whatis. You are seeing this tag because lexgrog was unable to parse N: the NAME section. N: N: Manual pages for multiple programs, functions, or files should list N: each separated by a comma and a space, followed by \- and a common N: description. N: N: Listed items may not contain any spaces. A manual page for a two-level N: command such as fs listacl must look like fs_listacl so the list is N: read correctly. N: N: Refer to the lexgrog(1) manual page, the groff_man(7) manual page, and N: the groff_mdoc(7) manual page for details. N: N: Severity: warning N: N: Check: documentation/manual N: N: Renamed from: manpage-has-bad-whatis-entry N: for public-inbox-v1-format and public-inbox-v2-format. Adapt the descriptions to make lexgrog and so lintian happy. --- Documentation/public-inbox-v1-format.pod | 2 +- Documentation/public-inbox-v2-format.pod | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/public-inbox-v1-format.pod b/Documentation/public-inbox-v1-format.pod index e5b1dd06..da19d2c9 100644 --- a/Documentation/public-inbox-v1-format.pod +++ b/Documentation/public-inbox-v1-format.pod @@ -2,7 +2,7 @@ =head1 NAME -public-inbox v1 git repository and tree description (aka "ssoma") +public-inbox-v1-format - git repository and tree description (aka "ssoma") =head1 DESCRIPTION diff --git a/Documentation/public-inbox-v2-format.pod b/Documentation/public-inbox-v2-format.pod index d6282cb4..3c89f13e 100644 --- a/Documentation/public-inbox-v2-format.pod +++ b/Documentation/public-inbox-v2-format.pod @@ -2,7 +2,7 @@ =head1 NAME -public-inbox v2 format description +public-inbox-v2-format - structure of public inbox v2 archives =head1 DESCRIPTION -- cgit v1.2.3-24-ge0c7 From 949e8b4a65a2dbb99d8923ebb4715a8724ca8bf2 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 22 Dec 2020 06:01:44 +0000 Subject: admin: resolve inboxes to absolute paths for index Some of my ancient v1-only scripts called public-inbox-index to operate on GIT_DIR: GIT_DIR=/path/to/foo.git public-inbox-index This change ensures they keep working, otherwise "." will be passed to the --git-dir= switch of git(1) because that's the default directory if no inboxes are specified on the command-line. Fixes: 9fcce78e40b0a7c6 ("script/public-inbox-*: favor caller-provided pathnames") --- lib/PublicInbox/Admin.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index c972fb68..9a86d206 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -127,7 +127,7 @@ sub resolve_inboxes ($;$$) { for (my $i = 0; $i <= $#dirs; $i++) { my $dir = $dirs[$i]; my @st = stat($dir) or die "stat($dir): $!\n"; - $dir = resolve_inboxdir($dir, \(my $ver)); + $dir = $dirs[$i] = resolve_inboxdir($dir, \(my $ver)); if ($ver >= $min_ver) { $s2i{"$st[0]\0$st[1]"} //= $i; } else { -- cgit v1.2.3-24-ge0c7 From 37526779e4c544b134af74f43d6c26569cb8e466 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 22 Dec 2020 05:04:16 +0000 Subject: wwwstream: show relative coderepo URLs correctly Trying to link "foo.git" relative to the current URL usually does not provide correct results, so prefix it by going into the parent directory if an absolute (or protocol-relative) URL is not supplied. --- lib/PublicInbox/WwwStream.pm | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm index 9ba8fa11..958251a3 100644 --- a/lib/PublicInbox/WwwStream.pm +++ b/lib/PublicInbox/WwwStream.pm @@ -81,13 +81,17 @@ sub coderepos ($) { my ($ctx) = @_; my $cr = $ctx->{ibx}->{coderepo} // return (); my $cfg = $ctx->{www}->{pi_cfg}; + my $upfx = ($ctx->{-upfx} // ''). '../'; my @ret; for my $cr_name (@$cr) { my $urls = $cfg->{"coderepo.$cr_name.cgiturl"} // next; $ret[0] //= <{env}, $u)); $ret[0] .= qq(\n\t$u); } -- cgit v1.2.3-24-ge0c7 From 7e2bbc90bcc458f91d29c6847245857d7bb8fb60 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:44 +0000 Subject: miscsearch: load Xapian at initialization We need Xapian bindings loaded before calling (Search::)Xapian::Database->new --- lib/PublicInbox/MiscSearch.pm | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm index f2e31443..de587d35 100644 --- a/lib/PublicInbox/MiscSearch.pm +++ b/lib/PublicInbox/MiscSearch.pm @@ -23,6 +23,7 @@ my %PROB_PREFIX = ( sub new { my ($class, $dir) = @_; + PublicInbox::Search::load_xapian(); bless { xdb => $PublicInbox::Search::X{Database}->new($dir) }, $class; -- cgit v1.2.3-24-ge0c7 From 052b9762f82c119f272c3ab8334773bd9c1c5b25 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:45 +0000 Subject: xt: add create-many-inboxes helper test I've been using something like this to mock out thousands of inboxes for testing. --- MANIFEST | 1 + xt/create-many-inboxes.t | 99 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 xt/create-many-inboxes.t diff --git a/MANIFEST b/MANIFEST index ac442606..a4cdedff 100644 --- a/MANIFEST +++ b/MANIFEST @@ -394,6 +394,7 @@ t/x-unknown-alpine.eml t/xcpdb-reshard.t xt/cmp-msgstr.t xt/cmp-msgview.t +xt/create-many-inboxes.t xt/eml_check_limits.t xt/git-http-backend.t xt/git_async_cmp.t diff --git a/xt/create-many-inboxes.t b/xt/create-many-inboxes.t new file mode 100644 index 00000000..c92643b2 --- /dev/null +++ b/xt/create-many-inboxes.t @@ -0,0 +1,99 @@ +#!perl -w +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ +use strict; +use Test::More; +use PublicInbox::TestCommon; +use PublicInbox::Eml; +use File::Path qw(mkpath); +use IO::Handle (); # autoflush +use POSIX qw(_exit); +use Cwd qw(getcwd abs_path); +use File::Spec; +my $many_root = $ENV{TEST_MANY_ROOT} or + plan skip_all => 'TEST_MANY_ROOT not defined'; +my $cwd = getcwd(); +mkpath($many_root); +-d $many_root or BAIL_OUT "$many_root: $!"; +$many_root = abs_path($many_root); +$many_root =~ m!\A\Q$cwd\E/! and BAIL_OUT "$many_root must not be in $cwd"; +require_git 2.6; +require_mods(qw(DBD::SQLite Search::Xapian)); +use_ok 'PublicInbox::V2Writable'; +my $nr_inbox = $ENV{NR_INBOX} // 10; +my $nproc = $ENV{NPROC} || PublicInbox::V2Writable::detect_nproc() || 2; +my $indexlevel = $ENV{TEST_INDEXLEVEL} // 'basic'; +diag "NR_INBOX=$nr_inbox NPROC=$nproc TEST_INDEXLEVEL=$indexlevel"; +diag "TEST_MANY_ROOT=$many_root"; +my $level_cfg = $indexlevel eq 'full' ? '' : "\tindexlevel = $indexlevel\n"; +my $pfx = "$many_root/$nr_inbox-$indexlevel"; +mkpath($pfx); +open my $cfg_fh, '>>', "$pfx/config" or BAIL_OUT $!; +$cfg_fh->autoflush(1); +my $v2_init_add = sub { + my ($i) = @_; + my $ibx = PublicInbox::Inbox->new({ + inboxdir => "$pfx/test-$i", + name => "test-$i", + newsgroup => "inbox.comp.test.foo.test-$i", + address => [ "test-$i\@example.com" ], + url => [ "//example.com/test-$i" ], + version => 2, + }); + $ibx->{indexlevel} = $indexlevel if $level_cfg ne ''; + my $entry = <{name}"] + address = $ibx->{-primary_address} + url = $ibx->{url}->[0] + newsgroup = $ibx->{newsgroup} + inboxdir = $ibx->{inboxdir} +EOF + $entry .= $level_cfg; + print $cfg_fh $entry or die $!; + my $v2w = PublicInbox::V2Writable->new($ibx, { nproc => 0 }); + $v2w->init_inbox(0); + $v2w->add(PublicInbox::Eml->new(< +To: test-$i\@example.com +Message-ID: <20101002-000000-$i\@example.com> +Subject: hello world $i + +hi +EOM + $v2w->done; +}; + +my @children; +for my $i (1..$nproc) { + my ($r, $w); + pipe($r, $w) or BAIL_OUT $!; + my $pid = fork; + if ($pid == 0) { + close $w; + while (my $i = <$r>) { + chomp $i; + $v2_init_add->($i); + } + _exit(0); + } + defined $pid or BAIL_OUT "fork: $!"; + close $r or BAIL_OUT $!; + push @children, [ $w, $pid ]; + $w->autoflush(1); +} + +for my $i (0..$nr_inbox) { + print { $children[$i % @children]->[0] } "$i\n" or BAIL_OUT $!; +} + +for my $c (@children) { + close $c->[0] or BAIL_OUT "close $!"; +} +my $i = 0; +for my $c (@children) { + my $pid = waitpid($c->[1], 0); + is($?, 0, ++$i.' exited ok'); +} +ok(close($cfg_fh), 'config written'); +done_testing; -- cgit v1.2.3-24-ge0c7 From fbc71f023f2356b2e88a8cfeb738a945085ac9a2 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:46 +0000 Subject: inbox: git_epoch: correct false comment The original comment hasn't been true since PublicInbox::Git->modified was changed to use cat_async blob responses. In any case, manifest.js.gz generation already cleans up per-epoch git processes used for ->modified. --- lib/PublicInbox/Inbox.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 863a5de4..ec8469e3 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -132,7 +132,7 @@ sub git_epoch { return unless -d $git_dir; my $g = PublicInbox::Git->new($git_dir); $g->{-httpbackend_limiter} = $self->{-httpbackend_limiter}; - # no cleanup needed, we never cat-file off this, only clone + # caller must manually cleanup when done $g; }; } -- cgit v1.2.3-24-ge0c7 From 361cdfd0ae67d52d8a589b4ddc6e7fa94d8a9c8d Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:47 +0000 Subject: inboxwritable: _init_v1: set created_at ASAP This ensures we have UIDVALIDITY to index earlier rather than later for v1 inboxes, matching v2 behavior. --- lib/PublicInbox/InboxWritable.pm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index c0e88f3d..69275bb0 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -46,12 +46,13 @@ sub _init_v1 { require PublicInbox::Msgmap; my $sidx = PublicInbox::SearchIdx->new($self, 1); # just create $sidx->begin_txn_lazy; + my $mm = PublicInbox::Msgmap->new($self->{inboxdir}, 1); if (defined $skip_artnum) { - my $mm = PublicInbox::Msgmap->new($self->{inboxdir}, 1); $mm->{dbh}->begin_work; $mm->skip_artnum($skip_artnum); $mm->{dbh}->commit; } + undef $mm; # ->created_at set $sidx->commit_txn_lazy; } else { open my $fh, '>>', "$self->{inboxdir}/ssoma.lock" or -- cgit v1.2.3-24-ge0c7 From 4a2e89007cb7b62151cb1869e49b27ebacfc27eb Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:48 +0000 Subject: miscsearch: index UIDVALIDITY, use as startup cache This brings -nntpd startup time down from ~35s to ~5s with 50K inboxes. Further improvements ought to be possible with deeper changes to MiscIdx, since -mda having to load every inbox seems unreasonable; but this general change is fairly unintrusive. --- lib/PublicInbox/ExtSearchIdx.pm | 22 +++++++++------- lib/PublicInbox/MiscIdx.pm | 26 +++++++++++++------ lib/PublicInbox/MiscSearch.pm | 56 ++++++++++++++++++++++++++++++++++++++--- lib/PublicInbox/NNTPD.pm | 4 ++- lib/PublicInbox/Search.pm | 9 ++++++- lib/PublicInbox/SearchIdx.pm | 7 ------ t/search.t | 4 +-- 7 files changed, 96 insertions(+), 32 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index f04e0443..9d64ff5a 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -61,16 +61,20 @@ sub new { sub attach_inbox { my ($self, $ibx) = @_; - my $key = $ibx->eidx_key; - if (!$ibx->over || !$ibx->mm) { - warn "W: skipping $key (unindexed)\n"; - return; - } - if (!defined($ibx->uidvalidity)) { - warn "W: skipping $key (no UIDVALIDITY)\n"; - return; + my $ekey = $ibx->eidx_key; + my $misc = $self->{misc}; + if ($misc && $misc->inbox_data($ibx)) { # all good if already indexed + } else { + if (!$ibx->over || !$ibx->mm) { + warn "W: skipping $ekey (unindexed)\n"; + return; + } + if (!defined($ibx->uidvalidity)) { + warn "W: skipping $ekey (no UIDVALIDITY)\n"; + return; + } } - $self->{ibx_map}->{$key} //= do { + $self->{ibx_map}->{$ekey} //= do { push @{$self->{ibx_list}}, $ibx; $ibx; } diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm index 64591d05..a04dd1c5 100644 --- a/lib/PublicInbox/MiscIdx.pm +++ b/lib/PublicInbox/MiscIdx.pm @@ -21,6 +21,7 @@ use Carp qw(croak); use File::Path (); use PublicInbox::MiscSearch; use PublicInbox::Config; +my $json; sub new { my ($class, $eidx) = @_; @@ -30,6 +31,7 @@ sub new { nodatacow_dir($mi_dir); my $flags = $PublicInbox::SearchIdx::DB_CREATE_OR_OPEN; $flags |= $PublicInbox::SearchIdx::DB_NO_SYNC if $eidx->{-no_fsync}; + $json //= PublicInbox::Config::json(); bless { mi_dir => $mi_dir, flags => $flags, @@ -91,17 +93,27 @@ EOF $xdb->delete_document($_) for @drop; # just in case my $doc = $PublicInbox::Search::X{Document}->new; + term_generator($self)->set_document($doc); - # allow sorting by modified + # allow sorting by modified and uidvalidity (created at) add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified); + add_val($doc, $PublicInbox::MiscSearch::UIDVALIDITY, $ibx->uidvalidity); - $doc->add_boolean_term('Q'.$eidx_key); - $doc->add_boolean_term('T'.'inbox'); - term_generator($self)->set_document($doc); + $doc->add_boolean_term('Q'.$eidx_key); # uniQue id + $doc->add_boolean_term('T'.'inbox'); # Type + + if (defined($ibx->{newsgroup}) && $ibx->nntp_usable) { + $doc->add_boolean_term('T'.'newsgroup'); # additional Type + } + + # force reread from disk, {description} could be loaded from {misc} + delete $ibx->{description}; + my $desc = $ibx->description; # description = S/Subject (or title) # address = A/Author - index_text($self, $ibx->description, 1, 'S'); + index_text($self, $desc, 1, 'S'); + index_text($self, $ibx->{name}, 1, 'XNAME'); my %map = ( address => 'A', listid => 'XLISTID', @@ -113,10 +125,8 @@ EOF index_text($self, $v, 1, $pfx); } } - index_text($self, $ibx->{name}, 1, 'XNAME'); my $data = {}; if (defined(my $max = $ibx->max_git_epoch)) { # v2 - my $desc = $ibx->description; my $pfx = "/$ibx->{name}/git/"; for my $epoch (0..$max) { my $git = $ibx->git_epoch($epoch) or return; @@ -130,7 +140,7 @@ EOF $ent->{git_dir} = $ibx->{inboxdir}; $data->{"/$ibx->{name}"} = $ent; } - $doc->set_data(PublicInbox::Config::json()->encode($data)); + $doc->set_data($json->encode($data)); if (defined $docid) { $xdb->replace_document($docid, $doc); } else { diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm index de587d35..c6ce255f 100644 --- a/lib/PublicInbox/MiscSearch.pm +++ b/lib/PublicInbox/MiscSearch.pm @@ -5,10 +5,12 @@ package PublicInbox::MiscSearch; use strict; use v5.10.1; -use PublicInbox::Search qw(retry_reopen); +use PublicInbox::Search qw(retry_reopen int_val); +my $json; # Xapian value columns: our $MODIFIED = 0; +our $UIDVALIDITY = 1; # (created time) # avoid conflicting with message Search::prob_prefix for UI/UX reasons my %PROB_PREFIX = ( @@ -24,6 +26,7 @@ my %PROB_PREFIX = ( sub new { my ($class, $dir) = @_; PublicInbox::Search::load_xapian(); + $json //= PublicInbox::Config::json(); bless { xdb => $PublicInbox::Search::X{Database}->new($dir) }, $class; @@ -120,11 +123,13 @@ sub newsgroup_matches { sub ibx_data_once { my ($self, $ibx) = @_; my $xdb = $self->{xdb}; - my $eidx_key = $ibx->eidx_key; # may be {inboxdir}, so private - my $head = $xdb->postlist_begin('Q'.$eidx_key); - my $tail = $xdb->postlist_end('Q'.$eidx_key); + my $term = 'Q'.$ibx->eidx_key; # may be {inboxdir}, so private + my $head = $xdb->postlist_begin($term); + my $tail = $xdb->postlist_end($term); if ($head != $tail) { my $doc = $xdb->get_document($head->get_docid); + $ibx->{uidvalidity} //= int_val($doc, $UIDVALIDITY); + $ibx->{-modified} = int_val($doc, $MODIFIED); $doc->get_data; } else { undef; @@ -136,4 +141,47 @@ sub inbox_data { retry_reopen($self, \&ibx_data_once, $ibx); } +sub ibx_cache_load { + my ($doc, $cache) = @_; + my $end = $doc->termlist_end; + my $cur = $doc->termlist_begin; + $cur->skip_to('Q'); + return if $cur == $end; + my $eidx_key = $cur->get_termname; + $eidx_key =~ s/\AQ// or return; # expired + my $ce = $cache->{$eidx_key} = {}; + $ce->{uidvalidity} = int_val($doc, $UIDVALIDITY); + $ce->{-modified} = int_val($doc, $MODIFIED); + $ce->{description} = do { + # extract description from manifest.js.gz epoch description + my $d; + my $data = $json->decode($doc->get_data); + for (values %$data) { + $d = $_->{description} // next; + $d =~ s/ \[epoch [0-9]+\]\z// or next; + last; + } + $d; + } +} + +sub _nntpd_cache_load { # retry_reopen callback + my ($self) = @_; + my $opt = { limit => $self->{xdb}->get_doccount * 10, relevance => -1 }; + my $mset = mset($self, 'type:newsgroup type:inbox', $opt); + my $cache = {}; + for my $it ($mset->items) { + ibx_cache_load($it->get_document, $cache); + } + $cache +} + +# returns { newsgroup => $cache_entry } mapping, $cache_entry contains +# anything which may trigger seeks at startup, currently: description, +# -modified, and uidvalidity. +sub nntpd_cache_load { + my ($self) = @_; + retry_reopen($self, \&_nntpd_cache_load); +} + 1; diff --git a/lib/PublicInbox/NNTPD.pm b/lib/PublicInbox/NNTPD.pm index 7f9a1d58..6907a03c 100644 --- a/lib/PublicInbox/NNTPD.pm +++ b/lib/PublicInbox/NNTPD.pm @@ -36,10 +36,12 @@ sub refresh_groups { my ($self, $sig) = @_; my $pi_cfg = $sig ? PublicInbox::Config->new : $self->{pi_cfg}; my $groups = $pi_cfg->{-by_newsgroup}; # filled during each_inbox + my $cache = eval { $pi_cfg->ALL->misc->nntpd_cache_load } // {}; $pi_cfg->each_inbox(sub { my ($ibx) = @_; my $ngname = $ibx->{newsgroup} // return; - if ($ibx->nntp_usable) { + my $ce = $cache->{$ngname}; + if (($ce and (%$ibx = (%$ibx, %$ce))) || $ibx->nntp_usable) { # only valid if msgmap and over works # preload to avoid fragmentation: $ibx->description; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index b1d38fb9..05c679c9 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -6,7 +6,7 @@ package PublicInbox::Search; use strict; use parent qw(Exporter); -our @EXPORT_OK = qw(retry_reopen); +our @EXPORT_OK = qw(retry_reopen int_val); use List::Util qw(max); # values for searching, changing the numeric value breaks @@ -91,6 +91,7 @@ sub load_xapian () { 1 : Search::Xapian::ENQ_ASCENDING(); *sortable_serialise = $x.'::sortable_serialise'; + *sortable_unserialise = $x.'::sortable_unserialise'; # n.b. FLAG_PURE_NOT is expensive not suitable for a public # website as it could become a denial-of-service vector # FLAG_PHRASE also seems to cause performance problems chert @@ -436,4 +437,10 @@ sub help { \@ret; } +sub int_val ($$) { + my ($doc, $col) = @_; + my $val = $doc->get_value($col) or return; # undefined is '' in Xapian + sortable_unserialise($val) + 0; # PV => IV conversion +} + 1; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index cf2c2c55..d1b0c724 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -106,7 +106,6 @@ sub load_xapian_writable () { } eval 'require '.$X->{WritableDatabase} or die; *sortable_serialise = $xap.'::sortable_serialise'; - *sortable_unserialise = $xap.'::sortable_unserialise'; $DB_CREATE_OR_OPEN = eval($xap.'::DB_CREATE_OR_OPEN()'); $DB_OPEN = eval($xap.'::DB_OPEN()'); my $ver = (eval($xap.'::major_version()') << 16) | @@ -501,12 +500,6 @@ sub remove_eidx_info { $self->{xdb}->replace_document($docid, $doc); } -sub int_val ($$) { - my ($doc, $col) = @_; - my $val = $doc->get_value($col) or return; # undefined is '' in Xapian - sortable_unserialise($val) + 0; # PV => IV conversion -} - sub smsg_from_doc ($) { my ($doc) = @_; my $data = $doc->get_data or return; diff --git a/t/search.t b/t/search.t index 11143204..3754717d 100644 --- a/t/search.t +++ b/t/search.t @@ -332,13 +332,13 @@ $ibx->with_umask(sub { like($smsg->{to}, qr/\blist\@example\.com\b/, 'to appears'); my $doc = $m->get_document; my $col = PublicInbox::Search::BYTES(); - my $bytes = PublicInbox::SearchIdx::int_val($doc, $col); + my $bytes = PublicInbox::Search::int_val($doc, $col); like($bytes, qr/\A[0-9]+\z/, '$bytes stored as digit'); ok($bytes > 0, '$bytes is > 0'); is($bytes, $smsg->{bytes}, 'bytes Xapian value matches Over'); $col = PublicInbox::Search::UID(); - my $uid = PublicInbox::SearchIdx::int_val($doc, $col); + my $uid = PublicInbox::Search::int_val($doc, $col); is($uid, $smsg->{num}, 'UID column matches {num}'); is($uid, $m->get_docid, 'UID column matches docid'); } -- cgit v1.2.3-24-ge0c7 From 6f9b927bf1fc5e84b92532477b275a45cd30cb01 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:49 +0000 Subject: extsearchidx: close SQLite handles after attaching This is needed to prevent us from running out of FDs when indexing many inboxes. Perhaps checking these on attach_inbox is unnecessary and may be removed entirely down the line. --- lib/PublicInbox/ExtSearchIdx.pm | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 9d64ff5a..fb627089 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -65,11 +65,14 @@ sub attach_inbox { my $misc = $self->{misc}; if ($misc && $misc->inbox_data($ibx)) { # all good if already indexed } else { - if (!$ibx->over || !$ibx->mm) { + my @sqlite = ($ibx->over, $ibx->mm); + my $uidvalidity = $ibx->uidvalidity; + $ibx->{mm} = $ibx->{over} = undef; + if (scalar(@sqlite) != 2) { warn "W: skipping $ekey (unindexed)\n"; return; } - if (!defined($ibx->uidvalidity)) { + if (!defined($uidvalidity)) { warn "W: skipping $ekey (no UIDVALIDITY)\n"; return; } -- cgit v1.2.3-24-ge0c7 From 83a03a80c7392148fa65143f60fb16d15cf19006 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:50 +0000 Subject: config: _fill: inbox name extraction optimization Using substr() instead of a string copy + s// substitution here reduces ->fill_all from 4.00s to 3.88s with 50K inboxes on my workstation. --- lib/PublicInbox/Config.pm | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 577337dc..cd8957a1 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -424,9 +424,7 @@ EOF } } - my $name = $pfx; - $name =~ s/\Apublicinbox\.//; - + my $name = substr($pfx, length('publicinbox.')); if (!valid_inbox_name($name)) { warn "invalid inbox name: '$name'\n"; return; -- cgit v1.2.3-24-ge0c7 From 80424a6a72dbe2b1ea7ba9bfba8f273e4da385a7 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:51 +0000 Subject: config: git_config_dump: pre-compile RE for split It appears the Perl split() operator is not optimized for fixed strings at all. With this change, PublicInbox::Config->new (w/o ->fill_all) time is reduced from 1.81s to 1.22s on a config file with 50K inboxes. --- lib/PublicInbox/Config.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index cd8957a1..4d143c6e 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -165,7 +165,7 @@ sub git_config_dump { return {} unless -e $file; my $cmd = [ qw(git config -z -l --includes), "--file=$file" ]; my $fh = popen_rd($cmd); - my $rv = config_fh_parse($fh, "\0", "\n"); + my $rv = config_fh_parse($fh, "\0", qr/\n/); close $fh or die "failed to close (@$cmd) pipe: $?"; $rv; } -- cgit v1.2.3-24-ge0c7 From a05445fb400108e60ede7d377cf3b26a0392eb24 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:52 +0000 Subject: config: config_fh_parse: micro-optimize We can avoid a slow regexp capture and instead and rely on rindex + substr to extract the section from the config file. Then we use the defined-or-assignment (//=) operator combined with the documented return value of `push' to ensure @section_order is unique without repeating a hash lookup. Finally, we avoid short-lived variables inside the loop and declare them subroutine-wide to knock a teeny bit of allocation time. Combined, these optimizations bring the ~1.22s PublicInbox::Config->new time down to ~0.98s with 50K inboxes. --- lib/PublicInbox/Config.pm | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 4d143c6e..60107d45 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -132,20 +132,15 @@ sub default_file { sub config_fh_parse ($$$) { my ($fh, $rs, $fs) = @_; - my %rv; - my (%section_seen, @section_order); + my (%rv, %section_seen, @section_order, $line, $k, $v, $section, $cur); local $/ = $rs; - while (defined(my $line = <$fh>)) { + while (defined($line = <$fh>)) { # performance critical with giant configs chomp $line; - my ($k, $v) = split($fs, $line, 2); - my ($section) = ($k =~ /\A(\S+)\.[^\.]+\z/); - unless (defined $section_seen{$section}) { - $section_seen{$section} = 1; - push @section_order, $section; - } + ($k, $v) = split($fs, $line, 2); + $section = substr($k, 0, rindex($k, '.')); + $section_seen{$section} //= push(@section_order, $section); - my $cur = $rv{$k}; - if (defined $cur) { + if (defined($cur = $rv{$k})) { if (ref($cur) eq "ARRAY") { push @$cur, $v; } else { -- cgit v1.2.3-24-ge0c7 From 74e705e24dffaf4dcb3952910678df7529783762 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:53 +0000 Subject: config: config_fh_parse: micro-optimize harder Instead of relying on split() and a regexp, we'll drop split() entirely and rely on index() + two substr() calls to operate on fixed strings. This brings PublicInbox::Config->new time down from 0.98s down to 0.84s. --- lib/PublicInbox/Config.pm | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 60107d45..21f2161a 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -132,13 +132,14 @@ sub default_file { sub config_fh_parse ($$$) { my ($fh, $rs, $fs) = @_; - my (%rv, %section_seen, @section_order, $line, $k, $v, $section, $cur); + my (%rv, %seen, @section_order, $line, $k, $v, $section, $cur, $i); local $/ = $rs; - while (defined($line = <$fh>)) { # performance critical with giant configs - chomp $line; - ($k, $v) = split($fs, $line, 2); + while (defined($line = <$fh>)) { # perf critical with giant configs + $i = index($line, $fs); + $k = substr($line, 0, $i); + $v = substr($line, $i + 1, -1); # chop off $fs $section = substr($k, 0, rindex($k, '.')); - $section_seen{$section} //= push(@section_order, $section); + $seen{$section} //= push(@section_order, $section); if (defined($cur = $rv{$k})) { if (ref($cur) eq "ARRAY") { @@ -160,7 +161,7 @@ sub git_config_dump { return {} unless -e $file; my $cmd = [ qw(git config -z -l --includes), "--file=$file" ]; my $fh = popen_rd($cmd); - my $rv = config_fh_parse($fh, "\0", qr/\n/); + my $rv = config_fh_parse($fh, "\0", "\n"); close $fh or die "failed to close (@$cmd) pipe: $?"; $rv; } -- cgit v1.2.3-24-ge0c7 From d6d3159d1ae25f67c09dd189e6df36795a3b8bfa Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 23:02:55 +0000 Subject: index: update [extindex "all"] by default, support -E In most cases, this ensures users will only have to opt-in to using -extindex once and won't have to issue extra commands to keep external indices up-to-date when using public-inbox-index. Since we support arbitrary numbers of external indices for ease-of-development, we'll support repeating "-E" ("--update-extindex=") in case users want to test changes in parallel. --- Documentation/public-inbox-index.pod | 19 ++++++++++++++- script/public-inbox-index | 47 ++++++++++++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod index 0848e860..2d5df930 100644 --- a/Documentation/public-inbox-index.pod +++ b/Documentation/public-inbox-index.pod @@ -162,6 +162,23 @@ See L for description and caveats. Available in public-inbox 1.6.0+. +=item --update-extindex=EXTINDEX, -E + +Update the given external index (L. +Either the configured section name (e.g. C) or a directory name +may be specified. + +Defaults to C if C<[extindex "all"]> is configured, +otherwise no external indices are updated. + +May be specified multiple times in rare cases where multiple +external indices are configured. + +=item --no-update-extindex + +Do not update the C external index by default. This negates +all uses of C<-E> / C<--update-extindex=> on the command-line. + =back =head1 FILES @@ -297,4 +314,4 @@ License: AGPL-3.0+ L =head1 SEE ALSO -L, L +L, L, L diff --git a/script/public-inbox-index b/script/public-inbox-index index 8a61817c..f10bb5ad 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -17,7 +17,7 @@ options: --no-fsync speed up indexing, risk corruption on power outage -L LEVEL `basic', `medium', or `full' (default: full) - -E EIDX update EIDX (e.g. `all') + -E EXTINDEX update extindex (default: `all') --all index all configured inboxes --compact | -c run public-inbox-compact(1) after indexing --sequential-shard index Xapian shards sequentially for slow storage @@ -32,12 +32,16 @@ options: BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes) See public-inbox-index(1) man page for full documentation. EOF -my $opt = { quiet => -1, compact => 0, max_size => undef, fsync => 1 }; +my $opt = { + quiet => -1, compact => 0, max_size => undef, fsync => 1, + 'update-extindex' => [], # ":s@" optional arg sets '' if no arg given +}; GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune fsync|sync! xapian_only|xapian-only indexlevel|index-level|L=s max_size|max-size=s batch_size|batch-size=s sequential_shard|seq-shard|sequential-shard + no-update-extindex update-extindex|E=s@ skip-docdata all help|h)) or die $help; if ($opt->{help}) { print $help; exit 0 }; @@ -56,7 +60,31 @@ my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg); PublicInbox::Admin::require_or_die('-index'); unless (@ibxs) { print STDERR $help; exit 1 } +my (@eidx_dir, %eidx_seen); +my $update_extindex = $opt->{'update-extindex'}; +if (!scalar(@$update_extindex) && (my $ALL = $cfg->ALL)) { + # extindex and normal inboxes may have different owners + push(@$update_extindex, 'all') if -w $ALL->{topdir}; +} +@$update_extindex = () if $opt->{'no-update-extindex'}; +if (scalar @$update_extindex) { + PublicInbox::Admin::require_or_die('-search'); + require PublicInbox::ExtSearchIdx; +} +for my $ei_name (@$update_extindex) { + my $es = $cfg->lookup_ei($ei_name); + my $topdir; + if (!$es && -d $ei_name) { # allow dirname or config section name + $topdir = $ei_name; + } elsif ($es) { + $topdir = $es->{topdir}; + } else { + die "extindex `$ei_name' not configured or found\n"; + } + $eidx_seen{$topdir} //= push(@eidx_dir, $topdir); +} my $mods = {}; +my @eidx_unconfigured; foreach my $ibx (@ibxs) { # detect_indexlevel may also set $ibx->{-skip_docdata} my $detected = PublicInbox::Admin::detect_indexlevel($ibx); @@ -64,7 +92,14 @@ foreach my $ibx (@ibxs) { $ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ? 'full' : $detected); PublicInbox::Admin::scan_ibx_modules($mods, $ibx); + if (@eidx_dir && $ibx->{-unconfigured}) { + push @eidx_unconfigured, " $ibx->{inboxdir}\n"; + } } +warn <{compact} = 0 if !$mods->{'Search::Xapian'}; @@ -96,4 +131,12 @@ EOL local $copt->{jobs} = 0 if $ibx_opt->{sequential_shard}; PublicInbox::Xapcmd::run($ibx, 'compact', $copt); } + next if $ibx->{-unconfigured}; + last if $ibx_opt->{quit}; + for my $dir (@eidx_dir) { + my $eidx = PublicInbox::ExtSearchIdx->new($dir); + $eidx->attach_inbox($ibx); + $eidx->eidx_sync($ibx_opt); + last if $ibx_opt->{quit}; + } } -- cgit v1.2.3-24-ge0c7 From b3cf37096874c6c80ef554e5153e5d995c72ab95 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 24 Dec 2020 10:09:18 +0000 Subject: inboxwritable: delay umask_prepare calls This simplifies all ->with_umask callers and opens the door for further optimizations to delay/elide process spawning. --- lib/PublicInbox/ExtSearchIdx.pm | 2 -- lib/PublicInbox/InboxWritable.pm | 6 ++---- lib/PublicInbox/SearchIdx.pm | 1 - lib/PublicInbox/V2Writable.pm | 3 --- lib/PublicInbox/Xapcmd.pm | 1 - script/public-inbox-convert | 1 - 6 files changed, 2 insertions(+), 12 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index fb627089..c43a6c5e 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -934,7 +934,6 @@ sub idx_init { # similar to V2Writable PublicInbox::V2Writable::write_alternates($info_dir, $mode, $o); } $self->parallel_init($self->{indexlevel}); - $self->umask_prepare; $self->with_umask(\&_idx_init, $self, $opt); $self->{oidx}->begin_lazy; $self->{oidx}->eidx_prep; @@ -943,7 +942,6 @@ sub idx_init { # similar to V2Writable no warnings 'once'; *done = \&PublicInbox::V2Writable::done; -*umask_prepare = \&PublicInbox::InboxWritable::umask_prepare; *with_umask = \&PublicInbox::InboxWritable::with_umask; *parallel_init = \&PublicInbox::V2Writable::parallel_init; *nproc_shards = \&PublicInbox::V2Writable::nproc_shards; diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index 69275bb0..31eb3f15 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -65,7 +65,6 @@ sub init_inbox { if ($self->version == 1) { my $dir = assert_usable_dir($self); PublicInbox::Import::init_bare($dir); - $self->umask_prepare; $self->with_umask(\&_init_v1, $self, $skip_artnum); } else { my $v2w = importer($self); @@ -260,7 +259,7 @@ sub _umask_for { sub with_umask { my ($self, $cb, @arg) = @_; - my $old = umask $self->{umask}; + my $old = umask($self->{umask} //= umask_prepare($self)); my $rv = eval { $cb->(@arg) }; my $err = $@; umask $old; @@ -271,8 +270,7 @@ sub with_umask { sub umask_prepare { my ($self) = @_; my $perm = _git_config_perm($self); - my $umask = _umask_for($perm); - $self->{umask} = $umask; + _umask_for($perm); } sub cleanup ($) { diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index d1b0c724..c8e309fc 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -67,7 +67,6 @@ sub new { $self->{-set_skip_docdata_once} = 1; $self->{-skip_docdata} = 1; } - $ibx->umask_prepare; if ($version == 1) { $self->{lock_path} = "$inboxdir/ssoma.lock"; my $dir = $self->xdir; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 3e3b275f..531a72b2 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -97,8 +97,6 @@ sub new { die "$dir does not exist\n"; } } - $v2ibx->umask_prepare; - my $xpfx = "$dir/xap" . PublicInbox::Search::SCHEMA_VERSION; my $self = { ibx => $v2ibx, @@ -320,7 +318,6 @@ sub idx_init { $ibx->git->cleanup; parallel_init($self, $ibx->{indexlevel}); - $ibx->umask_prepare; $ibx->with_umask(\&_idx_init, $self, $opt); } diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm index 4f77ef25..ca2345f7 100644 --- a/lib/PublicInbox/Xapcmd.pm +++ b/lib/PublicInbox/Xapcmd.pm @@ -270,7 +270,6 @@ sub run { local %SIG = %SIG; setup_signals(); - $ibx->umask_prepare; $ibx->with_umask(\&_run, $ibx, $cb, $opt); } diff --git a/script/public-inbox-convert b/script/public-inbox-convert index 800c364c..e6ee6529 100755 --- a/script/public-inbox-convert +++ b/script/public-inbox-convert @@ -80,7 +80,6 @@ $new->{version} = 2; $new = PublicInbox::InboxWritable->new($new, { nproc => $opt->{jobs} }); $new->{-no_fsync} = 1 if !$opt->{fsync}; my $v2w; -$old->umask_prepare; sub link_or_copy ($$) { my ($src, $dst) = @_; -- cgit v1.2.3-24-ge0c7 From e8d6c34c749d1b0fd1dc1278cd4a2a310b31a9ac Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 24 Dec 2020 10:09:19 +0000 Subject: index: support --fast-noop / -F switch Note: I'm not sure if it's worth documenting and supporting this long-term. We can can avoid taking locks for invocations of "index --all" and rely on high-resolution ctime (struct timespec st_ctim) comparisons of msgmap.sqlite3 and the packed-refs + refs/heads directory of the newest epoch. This cuts public-inbox-index invocations with "--all --no-update-extindex -L basic" down from 0.92s to 0.31s. The change with "-L medium" or "-L full" and (default) non-zero jobs is even more drastic, reducing a 12-13s no-op invocation down to the same 0.31s --- lib/PublicInbox/V2Writable.pm | 14 +++++++++++--- script/public-inbox-index | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 531a72b2..2b849ddf 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -1351,11 +1351,19 @@ sub index_sync { $opt //= {}; return xapian_only($self, $opt) if $opt->{xapian_only}; - my $pr = $opt->{-progress}; my $epoch_max; - my $latest = $self->{ibx}->git_dir_latest(\$epoch_max); - return unless defined $latest; + my $latest = $self->{ibx}->git_dir_latest(\$epoch_max) // return; + if ($opt->{'fast-noop'}) { # nanosecond (st_ctim) comparison + use Time::HiRes qw(stat); + if (my @mm = stat("$self->{ibx}->{inboxdir}/msgmap.sqlite3")) { + my $c = $mm[10]; # 10 = ctime (nsec NV) + my @hd = stat("$latest/refs/heads"); + my @pr = stat("$latest/packed-refs"); + return if $c > ($hd[10] // 0) && $c > ($pr[10] // 0); + } + } + my $pr = $opt->{-progress}; my $seq = $opt->{sequential_shard}; my $art_beg; # the NNTP article number we start xapian_only at my $idxlevel = $self->{ibx}->{indexlevel}; diff --git a/script/public-inbox-index b/script/public-inbox-index index f10bb5ad..91afac88 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -42,7 +42,7 @@ GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune batch_size|batch-size=s sequential_shard|seq-shard|sequential-shard no-update-extindex update-extindex|E=s@ - skip-docdata all help|h)) + fast-noop|F skip-docdata all help|h)) or die $help; if ($opt->{help}) { print $help; exit 0 }; die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0; -- cgit v1.2.3-24-ge0c7 From b2e8536cd607d71182d6228c629bca12017ce34c Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 01:44:38 +0000 Subject: init: use the return value of rel2abs_collapsed :x Fixes: 9fcce78e40b0a7c6 ("script/public-inbox-*: favor caller-provided pathnames") --- script/public-inbox-init | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/public-inbox-init b/script/public-inbox-init index afaa4c12..6d538e43 100755 --- a/script/public-inbox-init +++ b/script/public-inbox-init @@ -138,7 +138,7 @@ close($fh) or die "failed to close $pi_config_tmp: $!\n"; my $pfx = "publicinbox.$name"; my @x = (qw/git config/, "--file=$pi_config_tmp"); -PublicInbox::Config::rel2abs_collapsed($inboxdir); +$inboxdir = PublicInbox::Config::rel2abs_collapsed($inboxdir); die "`\\n' not allowed in `$inboxdir'\n" if index($inboxdir, "\n") >= 0; if (-f "$inboxdir/inbox.lock") { -- cgit v1.2.3-24-ge0c7 From 4af931f9ad100b0eca5729e2b1c56b844cf1a1c8 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 25 Dec 2020 10:21:09 +0000 Subject: index: disable --fast-noop on --reindex These options make no sense when used together, just inform the user and move on since it's probably harmless to continue. --- script/public-inbox-index | 3 +++ 1 file changed, 3 insertions(+) diff --git a/script/public-inbox-index b/script/public-inbox-index index 91afac88..87893ef1 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -49,6 +49,9 @@ die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0; if ($opt->{xapian_only} && !$opt->{reindex}) { die "--xapian-only requires --reindex\n"; } +if ($opt->{reindex} && delete($opt->{'fast-noop'})) { + warn "--fast-noop ignored with --reindex\n"; +} # require lazily to speed up --help require PublicInbox::Admin; -- cgit v1.2.3-24-ge0c7 From 672d146577305baa7f508bd2e33212bba6fdb800 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 25 Dec 2020 10:21:10 +0000 Subject: extsearchidx: delay SQLite availability checks This will make attach_inbox faster for no-op calls. It also helps us avoid races in case msgmap or over.sqlite3 gets unlinked while -extindex is running. --- lib/PublicInbox/ExtSearchIdx.pm | 57 ++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index c43a6c5e..386e1cee 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -61,23 +61,7 @@ sub new { sub attach_inbox { my ($self, $ibx) = @_; - my $ekey = $ibx->eidx_key; - my $misc = $self->{misc}; - if ($misc && $misc->inbox_data($ibx)) { # all good if already indexed - } else { - my @sqlite = ($ibx->over, $ibx->mm); - my $uidvalidity = $ibx->uidvalidity; - $ibx->{mm} = $ibx->{over} = undef; - if (scalar(@sqlite) != 2) { - warn "W: skipping $ekey (unindexed)\n"; - return; - } - if (!defined($uidvalidity)) { - warn "W: skipping $ekey (no UIDVALIDITY)\n"; - return; - } - } - $self->{ibx_map}->{$ekey} //= do { + $self->{ibx_map}->{$ibx->eidx_key} //= do { push @{$self->{ibx_list}}, $ibx; $ibx; } @@ -281,29 +265,36 @@ sub last_commits { $heads; } +sub _ibx_index_reject ($) { + my ($ibx) = @_; + $ibx->mm // return 'unindexed, no msgmap.sqlite3'; + $ibx->uidvalidity // return 'no UIDVALIDITY'; + $ibx->over // return 'unindexed, no over.sqlite3'; + undef; +} + sub _sync_inbox ($$$) { my ($self, $sync, $ibx) = @_; + my $ekey = $ibx->eidx_key; + if (defined(my $err = _ibx_index_reject($ibx))) { + return "W: skipping $ekey ($err)"; + } $sync->{ibx} = $ibx; $sync->{nr} = \(my $nr = 0); my $v = $ibx->version; - my $ekey = $ibx->eidx_key; if ($v == 2) { $sync->{epoch_max} = $ibx->max_git_epoch // return; sync_prepare($self, $sync); # or return # TODO: once MiscIdx is stable } elsif ($v == 1) { my $uv = $ibx->uidvalidity; my $lc = $self->{oidx}->eidx_meta("lc-v1:$ekey//$uv"); - my $head = $ibx->mm->last_commit; - unless (defined $head) { - warn "E: $ibx->{inboxdir} is not indexed\n"; - return; - } + my $head = $ibx->mm->last_commit // + return "E: $ibx->{inboxdir} is not indexed"; my $stk = prepare_stack($sync, $lc ? "$lc..$head" : $head); my $unit = { stack => $stk, git => $ibx->git }; push @{$sync->{todo}}, $unit; } else { - warn "E: $ekey unsupported inbox version (v$v)\n"; - return; + return "E: $ekey unsupported inbox version (v$v)"; } for my $unit (@{delete($sync->{todo}) // []}) { last if $sync->{quit}; @@ -311,6 +302,7 @@ sub _sync_inbox ($$$) { } $self->{midx}->index_ibx($ibx) unless $sync->{quit}; $ibx->git->cleanup; # done with this inbox, now + undef; } sub gc_unref_doc ($$$$) { @@ -787,9 +779,14 @@ DELETE FROM xref3 WHERE ibx_id = ? AND xnum = ? AND oidbin = ? sub _reindex_inbox ($$$) { my ($self, $sync, $ibx) = @_; - local $self->{current_info} = $ibx->eidx_key; - _reindex_check_unseen($self, $sync, $ibx); - _reindex_check_stale($self, $sync, $ibx) unless $sync->{quit}; + my $ekey = $ibx->eidx_key; + local $self->{current_info} = $ekey; + if (defined(my $err = _ibx_index_reject($ibx))) { + warn "W: cannot reindex $ekey ($err)\n"; + } else { + _reindex_check_unseen($self, $sync, $ibx); + _reindex_check_stale($self, $sync, $ibx) unless $sync->{quit}; + } delete @$ibx{qw(over mm search git)}; # won't need these for a bit } @@ -847,7 +844,9 @@ sub eidx_sync { # main entry point # don't use $_ here, it'll get clobbered by reindex_checkpoint for my $ibx (@{$self->{ibx_list}}) { last if $sync->{quit}; - _sync_inbox($self, $sync, $ibx); + my $err = _sync_inbox($self, $sync, $ibx); + delete @$ibx{qw(mm over)}; + warn $err, "\n" if defined($err); } $self->{oidx}->rethread_done($opt) unless $sync->{quit}; eidxq_process($self, $sync) unless $sync->{quit}; -- cgit v1.2.3-24-ge0c7 From 14e606423429d6121c295c2bc0599fe1bf66b07c Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 25 Dec 2020 10:21:11 +0000 Subject: extsearchidx: close DB handles after use if FD constrained Most distros ship with low RLIMIT_NOFILE limits and surprises may lurk for admins who configure many inboxes. Keep FD usage under control to avoid EMFILE errors at inopportune times during reindex. From what I can tell, this is the only place where extindex can have unpredictable FD growth when there's thousands of inboxes, and it's in an extremely rare code path. --- lib/PublicInbox/ExtSearchIdx.pm | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 386e1cee..3f197973 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -393,6 +393,32 @@ sub _ibx_for ($$$) { $self->{ibx_list}->[$pos] // die "BUG: ibx for $smsg->{blob} not mapped" } +sub _fd_constrained ($) { + my ($self) = @_; + $self->{-fd_constrained} //= do { + my $soft; + if (eval { require BSD::Resource; 1 }) { + my $NOFILE = BSD::Resource::RLIMIT_NOFILE(); + ($soft, undef) = BSD::Resource::getrlimit($NOFILE); + } else { + chomp($soft = `sh -c 'ulimit -n'`); + } + if (defined($soft)) { + my $want = scalar(@{$self->{ibx_list}}) + 64; # estimate + my $ret = $want > $soft; + if ($ret) { + warn <{sync}; @@ -429,11 +455,16 @@ sub _reindex_finalize ($$$) { my $x = pop(@$ary) // die "BUG: #$docid {by_chash} empty"; $x->{num} = delete($x->{xnum}) // die '{xnum} unset'; $ibx = _ibx_for($self, $sync, $x); - my $e = $ibx->over->get_art($x->{num}); - $e->{blob} eq $x->{blob} or die <over) { + my $e = $over->get_art($x->{num}); + $e->{blob} eq $x->{blob} or die <{blob} != $e->{blob} (${\$ibx->eidx_key}:$e->{num}); EOF - push @todo, $ibx, $e; + push @todo, $ibx, $e; + $over->dbh_close if _fd_constrained($self); + } else { + die "$ibx->{inboxdir}: over.sqlite3 unusable: $!\n"; + } } undef $by_chash; while (my ($ibx, $e) = splice(@todo, 0, 2)) { -- cgit v1.2.3-24-ge0c7 From fb4dd7fdeeed8478cda9b7e63e56564da8cbdacf Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 25 Dec 2020 10:21:12 +0000 Subject: index: do not attach inbox to extindex unless updated We'll count the number of log changes (regardless of index or unindex) and only attach inboxes to ExtSearchIdx objects when they get new work. We'll also reduce lock bouncing and only update external indices after all per-inbox indexing is done. This also updates existing v2 indexing/unindexing callers to be more consistent and ensures unindex log entries update per-inbox last commit information. --- lib/PublicInbox/Admin.pm | 1 + lib/PublicInbox/SearchIdx.pm | 2 ++ lib/PublicInbox/V2Writable.pm | 26 +++++++++++++++++++------- script/public-inbox-index | 23 ++++++++++++++--------- 4 files changed, 36 insertions(+), 16 deletions(-) diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index 9a86d206..b468108e 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -271,6 +271,7 @@ EOM $idx = PublicInbox::SearchIdx->new($ibx, 1); } $idx->index_sync($opt); + $idx->{nidx} // 0; # returns number processed } sub progress_prepare ($) { diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index c8e309fc..b3361e05 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -615,6 +615,7 @@ sub index_both { # git->cat_async callback $smsg->{num} = index_mm($self, $eml, $oid, $sync) or die "E: could not generate NNTP article number for $oid"; add_message($self, $eml, $smsg, $sync); + ++$self->{nidx}; my $cur_cmt = $sync->{cur_cmt} // die 'BUG: {cur_cmt} missing'; ${$sync->{latest_cmt}} = $cur_cmt; } @@ -629,6 +630,7 @@ sub unindex_both { # git->cat_async callback if (defined(my $cur_cmt = $sync->{cur_cmt})) { ${$sync->{latest_cmt}} = $cur_cmt; } + ++$self->{nidx}; } sub with_umask { diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 2b849ddf..ca52874b 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -891,12 +891,22 @@ sub reindex_checkpoint ($$) { $mm_tmp->atfork_parent if $mm_tmp; } +sub index_finalize ($$) { + my ($arg, $index) = @_; + ++$arg->{self}->{nidx}; + if (defined(my $cur = $arg->{cur_cmt})) { + ${$arg->{latest_cmt}} = $cur; + } elsif ($index) { + die 'BUG: {cur_cmt} missing'; + } # else { unindexing @leftovers doesn't set {cur_cmt} +} + sub index_oid { # cat_async callback my ($bref, $oid, $type, $size, $arg) = @_; - return if is_bad_blob($oid, $type, $size, $arg->{oid}); + is_bad_blob($oid, $type, $size, $arg->{oid}) and + return index_finalize($arg, 1); # size == 0 purged returns here my $self = $arg->{self}; local $self->{current_info} = "$self->{current_info} $oid"; - return if $size == 0; # purged my ($num, $mid0); my $eml = PublicInbox::Eml->new($$bref); my $mids = mids($eml); @@ -967,7 +977,7 @@ sub index_oid { # cat_async callback if (do_idx($self, $bref, $eml, $smsg)) { ${$arg->{need_checkpoint}} = 1; } - ${$arg->{latest_cmt}} = $arg->{cur_cmt} // die 'BUG: {cur_cmt} missing'; + index_finalize($arg, 1); } # only update last_commit for $i on reindex iff newer than current @@ -1157,11 +1167,12 @@ sub unindex_oid_aux ($$$) { } sub unindex_oid ($$;$) { # git->cat_async callback - my ($bref, $oid, $type, $size, $sync) = @_; - return if is_bad_blob($oid, $type, $size, $sync->{oid}); - my $self = $sync->{self}; + my ($bref, $oid, $type, $size, $arg) = @_; + is_bad_blob($oid, $type, $size, $arg->{oid}) and + return index_finalize($arg, 0); + my $self = $arg->{self}; local $self->{current_info} = "$self->{current_info} $oid"; - my $unindexed = $sync->{in_unindex} ? $sync->{unindexed} : undef; + my $unindexed = $arg->{in_unindex} ? $arg->{unindexed} : undef; my $mm = $self->{mm}; my $mids = mids(PublicInbox::Eml->new($bref)); undef $$bref; @@ -1186,6 +1197,7 @@ sub unindex_oid ($$;$) { # git->cat_async callback } unindex_oid_aux($self, $oid, $mid); } + index_finalize($arg, 0); } sub git { $_[0]->{ibx}->git } diff --git a/script/public-inbox-index b/script/public-inbox-index index 87893ef1..a17bf615 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -63,7 +63,7 @@ my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg); PublicInbox::Admin::require_or_die('-index'); unless (@ibxs) { print STDERR $help; exit 1 } -my (@eidx_dir, %eidx_seen); +my (@eidx, %eidx_seen); my $update_extindex = $opt->{'update-extindex'}; if (!scalar(@$update_extindex) && (my $ALL = $cfg->ALL)) { # extindex and normal inboxes may have different owners @@ -84,7 +84,8 @@ for my $ei_name (@$update_extindex) { } else { die "extindex `$ei_name' not configured or found\n"; } - $eidx_seen{$topdir} //= push(@eidx_dir, $topdir); + $eidx_seen{$topdir} //= + push(@eidx, PublicInbox::ExtSearchIdx->new($topdir)); } my $mods = {}; my @eidx_unconfigured; @@ -95,7 +96,7 @@ foreach my $ibx (@ibxs) { $ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ? 'full' : $detected); PublicInbox::Admin::scan_ibx_modules($mods, $ibx); - if (@eidx_dir && $ibx->{-unconfigured}) { + if (@eidx && $ibx->{-unconfigured}) { push @eidx_unconfigured, " $ibx->{inboxdir}\n"; } } @@ -128,18 +129,22 @@ publicInbox.$ibx->{name}.indexSequentialShard not boolean EOL $ibx_opt = { %$opt, sequential_shard => $v }; } - PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt); + my $nidx = PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt); last if $ibx_opt->{quit}; if (my $copt = $opt->{compact_opt}) { local $copt->{jobs} = 0 if $ibx_opt->{sequential_shard}; PublicInbox::Xapcmd::run($ibx, 'compact', $copt); } - next if $ibx->{-unconfigured}; last if $ibx_opt->{quit}; - for my $dir (@eidx_dir) { - my $eidx = PublicInbox::ExtSearchIdx->new($dir); + next if $ibx->{-unconfigured} || !$nidx; + for my $eidx (@eidx) { $eidx->attach_inbox($ibx); - $eidx->eidx_sync($ibx_opt); - last if $ibx_opt->{quit}; } } +$opt->{-no_fsync} = 1 if !$opt->{fsync}; +my $pr = $opt->{-progress}; +for my $eidx (@eidx) { + $pr->("indexing $eidx->{topdir} ...\n") if $pr; + $eidx->eidx_sync($opt); + last if $opt->{quit}; +} -- cgit v1.2.3-24-ge0c7 From 66518051763825d491d0c1df6837d4266edc180a Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 25 Dec 2020 10:21:13 +0000 Subject: index: fix --no-fsync flag propagation to extindex Negation in flag names are confusing, but trying to deviate from the DB_NO_SYNC name used by Xapian is also confusing. --- lib/PublicInbox/ExtSearchIdx.pm | 2 +- script/public-inbox-index | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 3f197973..e7fdae48 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -54,7 +54,7 @@ sub new { }, __PACKAGE__; $self->{shards} = $self->count_shards || nproc_shards($opt->{creat}); my $oidx = PublicInbox::OverIdx->new("$self->{xpfx}/over.sqlite3"); - $oidx->{-no_fsync} = 1 if $opt->{-no_fsync}; + $self->{-no_fsync} = $oidx->{-no_fsync} = 1 if !$opt->{fsync}; $self->{oidx} = $oidx; $self } diff --git a/script/public-inbox-index b/script/public-inbox-index index a17bf615..c68f9224 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -85,7 +85,7 @@ for my $ei_name (@$update_extindex) { die "extindex `$ei_name' not configured or found\n"; } $eidx_seen{$topdir} //= - push(@eidx, PublicInbox::ExtSearchIdx->new($topdir)); + push(@eidx, PublicInbox::ExtSearchIdx->new($topdir, $opt)); } my $mods = {}; my @eidx_unconfigured; @@ -141,7 +141,6 @@ EOL $eidx->attach_inbox($ibx); } } -$opt->{-no_fsync} = 1 if !$opt->{fsync}; my $pr = $opt->{-progress}; for my $eidx (@eidx) { $pr->("indexing $eidx->{topdir} ...\n") if $pr; -- cgit v1.2.3-24-ge0c7 From 9a17bc230973405b0656bf05fb76902d46627ffa Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 25 Dec 2020 10:21:14 +0000 Subject: v2writable: don't verify tip if reindexing We only rely on git-rev-parse to resolve symbolic names ("HEAD") to a SHA-* git commit ID. We'll assume any git commit IDs we get from SQLite DBs are valid and let "git-log" fail if it isn't. --- lib/PublicInbox/V2Writable.pm | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index ca52874b..f20b5c7f 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -1104,12 +1104,14 @@ sub sync_prepare ($$) { -d $git_dir or next; # missing epochs are fine my $git = PublicInbox::Git->new($git_dir); my $unit = { git => $git, epoch => $i }; + my $tip; if ($reindex_heads) { - $head = $reindex_heads->[$i] or next; + $tip = $head = $reindex_heads->[$i] or next; + } else { + $tip = $git->qx(qw(rev-parse -q --verify), $head); + next if $?; # new repo + chomp $tip; } - chomp(my $tip = $git->qx(qw(rev-parse -q --verify), $head)); - next if $?; # new repo - my $range = log_range($sync, $unit, $tip) or next; # can't use 'rev-list --count' if we use --diff-filter $pr->("$pfx $i.git counting $range ... ") if $pr; -- cgit v1.2.3-24-ge0c7 From 451ffd3068017ac1ca8bb0b454a65a7f2a3bf407 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 25 Dec 2020 10:21:15 +0000 Subject: index: filter out indexlevel=basic from extindex extindex users will likely want to use indexlevel=basic for per-inbox indices, however extindex itself doesn't support basic index level (yet?). Let's ensure we don't trip up extindex users who specify "-L basic" on the -index command-line. --- script/public-inbox-index | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/script/public-inbox-index b/script/public-inbox-index index c68f9224..0fdfddc0 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -84,8 +84,10 @@ for my $ei_name (@$update_extindex) { } else { die "extindex `$ei_name' not configured or found\n"; } + my $o = { %$opt }; + delete $o->{indexlevel} if ($o->{indexlevel}//'') eq 'basic'; $eidx_seen{$topdir} //= - push(@eidx, PublicInbox::ExtSearchIdx->new($topdir, $opt)); + push(@eidx, PublicInbox::ExtSearchIdx->new($topdir, $o)); } my $mods = {}; my @eidx_unconfigured; -- cgit v1.2.3-24-ge0c7 From 5e05c2eb58a450849f1826f3d02ed62b814b6617 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 05:59:22 +0000 Subject: inboxidle: clue users into resolving ENOSPC from inotify It may not be obvious to users a ENOSPC error is from hitting a (tunable) kernel-imposed limit on inotify watches, and not some storage device running out of space. Give them a hint here to reduce our own support burden. --- lib/PublicInbox/InboxIdle.pm | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/PublicInbox/InboxIdle.pm b/lib/PublicInbox/InboxIdle.pm index f1cbc012..84b6d26f 100644 --- a/lib/PublicInbox/InboxIdle.pm +++ b/lib/PublicInbox/InboxIdle.pm @@ -39,6 +39,11 @@ sub in2_arm ($$) { # PublicInbox::Config::each_inbox callback $self->{on_unlock}->{$w->name} = $ibx; } else { warn "E: ".ref($inot)."->watch($lock, IN_MODIFY) failed: $!\n"; + if ($!{ENOSPC} && $^O eq 'linux') { + warn <<""; +I: consider increasing /proc/sys/fs/inotify/max_user_watches + + } } # TODO: detect deleted packs (and possibly other files) -- cgit v1.2.3-24-ge0c7 From 10bf54305da8422d9ece6b809996092c1c4b1786 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 09:34:39 +0000 Subject: inboxidle: avoid needless syscalls on refresh We don't have to replace a bunch of existing watches with identical new ones. On Linux with Linux::Inotify2 installed, this avoids a storm of inotify_add_watch(2) and inotify_rm_watch(2) syscalls on SIGHUP with -imapd and "-extindex --watch" --- lib/PublicInbox/InboxIdle.pm | 15 ++++++++++++--- t/imapd.t | 26 +++++--------------------- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/lib/PublicInbox/InboxIdle.pm b/lib/PublicInbox/InboxIdle.pm index 84b6d26f..508007d7 100644 --- a/lib/PublicInbox/InboxIdle.pm +++ b/lib/PublicInbox/InboxIdle.pm @@ -24,17 +24,26 @@ sub in2_arm ($$) { # PublicInbox::Config::each_inbox callback my $dir = $ibx->{inboxdir}; my $inot = $self->{inot}; my $cur = $self->{pathmap}->{$dir} //= []; + my $lock = "$dir/".($ibx->version >= 2 ? 'inbox.lock' : 'ssoma.lock'); # transfer old subscriptions to the current inbox, cancel the old watch - if (my $old_ibx = $cur->[0]) { + my $old_ibx = $cur->[0]; + $cur->[0] = $ibx; + if ($old_ibx) { $ibx->{unlock_subs} and die "BUG: $dir->{unlock_subs} should not exist"; $ibx->{unlock_subs} = $old_ibx->{unlock_subs}; + + # Linux::Inotify2::Watch::name matches if watches are the + # same, no point in replacing a watch of the same name + if ($cur->[1]->name eq $lock) { + $self->{on_unlock}->{$lock} = $ibx; + return; + } + # rare, name changed (v1 inbox converted to v2) $cur->[1]->cancel; # Linux::Inotify2::Watch::cancel } - $cur->[0] = $ibx; - my $lock = "$dir/".($ibx->version >= 2 ? 'inbox.lock' : 'ssoma.lock'); if (my $w = $cur->[1] = $inot->watch($lock, $IN_MODIFY)) { $self->{on_unlock}->{$w->name} = $ibx; } else { diff --git a/t/imapd.t b/t/imapd.t index 43ec200c..63a86e71 100644 --- a/t/imapd.t +++ b/t/imapd.t @@ -296,27 +296,11 @@ $pi_cfg->each_inbox(sub { # ensure IDLE persists across HUP, w/o extra watches or FDs $td->kill('HUP') or BAIL_OUT "failed to kill -imapd: $!"; - SKIP: { - skip 'no inotify fdinfo (or support)', 2 if !@ino_info; - my (@tmp, %prev); - local $/ = "\n"; - my $end = time + 5; - until (time > $end) { - select undef, undef, undef, 0.01; - open my $fh, '<', $ino_fdinfo or - BAIL_OUT "$ino_fdinfo: $!"; - %prev = map { $_ => 1 } @ino_info; - @tmp = grep(/^inotify wd:/, <$fh>); - if (scalar(@tmp) == scalar(@ino_info)) { - delete @prev{@tmp}; - last if scalar(keys(%prev)) == @ino_info; - } - } - is(scalar @tmp, scalar @ino_info, - 'old inotify watches replaced'); - is(scalar keys %prev, scalar @ino_info, - 'no previous watches overlap'); - }; + for my $n (1..2) { # kick the event loop so we know HUP is done + my $m = $imap_client->new(%mic_opt); + ok($m->login && $m->IsAuthenticated && $m->logout, + "connection $n works after HUP"); + } open($fh, '<', 't/data/0001.patch') or BAIL_OUT("open: $!"); run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or -- cgit v1.2.3-24-ge0c7 From b5e960f50289434025f5904c8c1311e4c8a02b82 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 08:12:52 +0000 Subject: inbox: name variable for values loop iterator ->on_inbox_unlock callbacks could clobber $_, and this seems to fix a problem with -extindex --watch failing to index some inboxes after SIGHUP reload. --- lib/PublicInbox/Inbox.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index ec8469e3..1b9b56ff 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -414,8 +414,8 @@ sub on_unlock { my ($self) = @_; check_inodes($self); my $subs = $self->{unlock_subs} or return; - for (values %$subs) { - eval { $_->on_inbox_unlock($self) }; + for my $obj (values %$subs) { + eval { $obj->on_inbox_unlock($self) }; warn "E: $@ ($self->{inboxdir})\n" if $@; } } -- cgit v1.2.3-24-ge0c7 From 0b018bebe7d8ee807ab07b570cf33669da4875b0 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 01:44:36 +0000 Subject: default to CORE::warn in $SIG{__WARN__} handlers As with CORE::die and $SIG{__DIE__}, it turns out CORE::warn is safe to use inside $SIG{__WARN__} handlers without triggering infinite recursion. So fall back to reusing CORE::warn instead of creating a new sub. --- lib/PublicInbox/Admin.pm | 2 +- lib/PublicInbox/ExtSearchIdx.pm | 2 +- lib/PublicInbox/InboxWritable.pm | 2 +- lib/PublicInbox/Watch.pm | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index b468108e..d414e4e2 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -241,7 +241,7 @@ sub index_inbox { } local %SIG = %SIG; setup_signals(\&index_terminate, $ibx); - my $warn_cb = $SIG{__WARN__} // sub { print STDERR @_ }; + my $warn_cb = $SIG{__WARN__} // \&CORE::warn; my $idx = { current_info => $ibx->{inboxdir} }; my $warn_ignore = PublicInbox::InboxWritable->can('warn_ignore'); local $SIG{__WARN__} = sub { diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index e7fdae48..64ebf6db 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -841,7 +841,7 @@ sub eidx_reindex { sub eidx_sync { # main entry point my ($self, $opt) = @_; - my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; local $self->{current_info} = ''; local $SIG{__WARN__} = sub { $warn_cb->($self->{current_info}, ': ', @_); diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index 31eb3f15..b1d5caf5 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -292,7 +292,7 @@ sub warn_ignore { # this expects to be RHS in this assignment: "local $SIG{__WARN__} = ..." sub warn_ignore_cb { - my $cb = $SIG{__WARN__} // sub { print STDERR @_ }; + my $cb = $SIG{__WARN__} // \&CORE::warn; sub { return if warn_ignore(@_); $cb->(@_); diff --git a/lib/PublicInbox/Watch.pm b/lib/PublicInbox/Watch.pm index e1246096..bc296e01 100644 --- a/lib/PublicInbox/Watch.pm +++ b/lib/PublicInbox/Watch.pm @@ -217,7 +217,7 @@ sub _try_path { warn "unmappable dir: $1\n"; return; } - my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; local $SIG{__WARN__} = sub { my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : ''; $warn_cb->($pfx, "path: $path\n", @_); @@ -467,7 +467,7 @@ sub imap_fetch_all ($$$) { my $key = $req; $key =~ s/\.PEEK//; my ($uids, $batch); - my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; local $SIG{__WARN__} = sub { my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : ''; $batch //= '?'; @@ -929,7 +929,7 @@ sub nntp_fetch_all ($$$) { $beg = $l_art + 1; warn "I: $url fetching ARTICLE $beg..$end\n"; - my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; my ($err, $art); local $SIG{__WARN__} = sub { my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : ''; -- cgit v1.2.3-24-ge0c7 From 382a2bdd54cfb6c28a935c2b9fe4f1b1c2f469c4 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 12:30:35 +0000 Subject: t/config: test --get-urlmatch for git <2.26 While git 1.8.5 learned --get-urlmatch, git did not learn to match URLs against wildcards until 2.26. So only depend on 1.8.5 for this test since 2.26 is too new. Reported-by: Ali Alnubani Link: https://public-inbox.org/meta/DM6PR12MB49106F8E3BD697B63B943A22DADB0@DM6PR12MB4910.namprd12.prod.outlook.com/ Tested-by: Ali Alnubani --- t/config.t | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/t/config.t b/t/config.t index 99a7fef4..7fb44acc 100644 --- a/t/config.t +++ b/t/config.t @@ -234,12 +234,13 @@ EOF } SKIP: { + # XXX wildcard match requires git 2.26+ require_git('1.8.5', 2) or skip 'git 1.8.5+ required for --url-match', 2; my $f = "$tmpdir/urlmatch"; open my $fh, '>', $f or BAIL_OUT $!; print $fh < Date: Sat, 26 Dec 2020 12:25:42 +0000 Subject: eml: fix undefined vars on Link: https://public-inbox.org/meta/DM6PR12MB49106F8E3BD697B63B943A22DADB0@DM6PR12MB4910.namprd12.prod.outlook.com/ Tested-by: Ali Alnubani --- lib/PublicInbox/Eml.pm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/PublicInbox/Eml.pm b/lib/PublicInbox/Eml.pm index 571edc5c..4d3fffc0 100644 --- a/lib/PublicInbox/Eml.pm +++ b/lib/PublicInbox/Eml.pm @@ -378,7 +378,9 @@ sub header_str_set { header_set($self, $name, @vals); } -sub mhdr_decode ($) { eval { $MIME_Header->decode($_[0]) } // $_[0] } +sub mhdr_decode ($) { + eval { $MIME_Header->decode($_[0], Encode::FB_DEFAULT) } // $_[0]; +} sub filename { my $dis = header_raw($_[0], 'Content-Disposition'); -- cgit v1.2.3-24-ge0c7 From 1d96509a3f59c38394d2f3ac4323dc54c74dc202 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 01:44:37 +0000 Subject: extindex: --watch for inotify-based updates This reuses existing InboxIdle infrastructure to update external indices based on per-inbox updates. This is an alternative to auto-updating external indices via the -index command and also works with existing uses of -mda and public-inbox-watch. Using inotify (or EVFILT_VNODE) allows watching thousands of inboxes without having to scan every single one at every invocation. This is especially beneficial in cases where an external index is not writable to the users writing to per-inbox indices. --- lib/PublicInbox/ExtSearchIdx.pm | 126 +++++++++++++++++++++++++++++++++++++--- lib/PublicInbox/InboxIdle.pm | 8 ++- lib/PublicInbox/OverIdx.pm | 8 ++- lib/PublicInbox/V2Writable.pm | 2 +- script/public-inbox-extindex | 19 ++++-- 5 files changed, 146 insertions(+), 17 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 64ebf6db..53ff2ca1 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -630,7 +630,7 @@ sub eidxq_process ($$) { # for reindexing my $dbh = $self->{oidx}->dbh; my $tot = $dbh->selectrow_array('SELECT COUNT(*) FROM eidxq') or return; ${$sync->{nr}} = 0; - $sync->{-regen_fmt} = "%u/$tot\n"; + local $sync->{-regen_fmt} = "%u/$tot\n"; my $pr = $sync->{-opt}->{-progress}; if ($pr) { my $min = $dbh->selectrow_array('SELECT MIN(docid) FROM eidxq'); @@ -709,7 +709,8 @@ sub _reindex_check_unseen ($$$) { my $msgs; my $pr = $sync->{-opt}->{-progress}; my $ekey = $ibx->eidx_key; - $sync->{-regen_fmt} = "$ekey checking unseen %u/".$ibx->over->max."\n"; + local $sync->{-regen_fmt} = + "$ekey checking unseen %u/".$ibx->over->max."\n"; ${$sync->{nr}} = 0; while (scalar(@{$msgs = $ibx->over->query_xover($beg, $end)})) { @@ -752,7 +753,7 @@ sub _reindex_check_stale ($$$) { my $pr = $sync->{-opt}->{-progress}; my $fetching; my $ekey = $ibx->eidx_key; - $sync->{-regen_fmt} = + local $sync->{-regen_fmt} = "$ekey check stale/missing %u/".$ibx->over->max."\n"; ${$sync->{nr}} = 0; do { @@ -838,6 +839,13 @@ sub eidx_reindex { eidxq_process($self, $sync) unless $sync->{quit}; } +sub sync_inbox { + my ($self, $sync, $ibx) = @_; + my $err = _sync_inbox($self, $sync, $ibx); + delete @$ibx{qw(mm over)}; + warn $err, "\n" if defined($err); +} + sub eidx_sync { # main entry point my ($self, $opt) = @_; @@ -868,22 +876,21 @@ sub eidx_sync { # main entry point $ibx->{-ibx_id} //= $self->{oidx}->ibx_id($ibx->eidx_key); } if (delete($opt->{reindex})) { - $sync->{checkpoint_unlocks} = 1; + local $sync->{checkpoint_unlocks} = 1; eidx_reindex($self, $sync); } # don't use $_ here, it'll get clobbered by reindex_checkpoint for my $ibx (@{$self->{ibx_list}}) { last if $sync->{quit}; - my $err = _sync_inbox($self, $sync, $ibx); - delete @$ibx{qw(mm over)}; - warn $err, "\n" if defined($err); + sync_inbox($self, $sync, $ibx); } $self->{oidx}->rethread_done($opt) unless $sync->{quit}; eidxq_process($self, $sync) unless $sync->{quit}; eidxq_release($self); - PublicInbox::V2Writable::done($self); + done($self); + $sync; # for eidx_watch } sub update_last_commit { # overrides V2Writable @@ -970,6 +977,109 @@ sub idx_init { # similar to V2Writable $self->{midx}->begin_txn; } +sub _watch_commit { # PublicInbox::DS::add_timer callback + my ($self) = @_; + delete $self->{-commit_timer}; + eidxq_process($self, $self->{-watch_sync}); + eidxq_release($self); + delete local $self->{-watch_sync}->{-regen_fmt}; + reindex_checkpoint($self, $self->{-watch_sync}); + + # call event_step => done unless commit_timer is armed + PublicInbox::DS::requeue($self); +} + +sub on_inbox_unlock { # called by PublicInbox::InboxIdle + my ($self, $ibx) = @_; + my $opt = $self->{-watch_sync}->{-opt}; + my $pr = $opt->{-progress}; + my $ekey = $ibx->eidx_key; + local $0 = "sync $ekey"; + $pr->("indexing $ekey\n") if $pr; + $self->idx_init($opt); + sync_inbox($self, $self->{-watch_sync}, $ibx); + $self->{-commit_timer} //= PublicInbox::DS::add_timer( + $opt->{'commit-interval'} // 10, + \&_watch_commit, $self); +} + +sub eidx_reload { # -extindex --watch SIGHUP handler + my ($self, $idler) = @_; + if ($self->{cfg}) { + my $pr = $self->{-watch_sync}->{-opt}->{-progress}; + $pr->('reloading ...') if $pr; + @{$self->{ibx_list}} = (); + %{$self->{ibx_map}} = (); + delete $self->{-watch_sync}->{id2pos}; + my $cfg = PublicInbox::Config->new; + attach_config($self, $cfg); + $idler->refresh($cfg); + $pr->(" done\n") if $pr; + } else { + warn "reload not supported without --all\n"; + } +} + +sub eidx_resync_start ($) { # -extindex --watch SIGUSR1 handler + my ($self) = @_; + $self->{-resync_queue} //= [ @{$self->{ibx_list}} ]; + PublicInbox::DS::requeue($self); # trigger our ->event_step +} + +sub event_step { # PublicInbox::DS::requeue callback + my ($self) = @_; + if (my $resync_queue = $self->{-resync_queue}) { + if (my $ibx = shift(@$resync_queue)) { + on_inbox_unlock($self, $ibx); + PublicInbox::DS::requeue($self); + } else { + delete $self->{-resync_queue}; + _watch_commit($self); + } + } else { + done($self) unless $self->{-commit_timer}; + } +} + +sub eidx_watch { # public-inbox-extindex --watch main loop + my ($self, $opt) = @_; + require PublicInbox::InboxIdle; + require PublicInbox::DS; + require PublicInbox::Syscall; + require PublicInbox::Sigfd; + my $idler = PublicInbox::InboxIdle->new($self->{cfg}); + if (!$self->{cfg}) { + $idler->watch_inbox($_) for @{$self->{ibx_list}}; + } + $_->subscribe_unlock(__PACKAGE__, $self) for @{$self->{ibx_list}}; + my $sync = eidx_sync($self, $opt); # initial sync + return if $sync->{quit}; + my $oldset = PublicInbox::Sigfd::block_signals(); + local $self->{current_info} = ''; + my $cb = $SIG{__WARN__} || \&CORE::warn; + local $SIG{__WARN__} = sub { $cb->($self->{current_info}, ': ', @_) }; + my $sig = { + HUP => sub { eidx_reload($self, $idler) }, + USR1 => sub { eidx_resync_start($self) }, + TSTP => sub { kill('STOP', $$) }, + }; + my $quit = PublicInbox::SearchIdx::quit_cb($sync); + $sig->{QUIT} = $sig->{INT} = $sig->{TERM} = $quit; + my $sigfd = PublicInbox::Sigfd->new($sig, + $PublicInbox::Syscall::SFD_NONBLOCK); + local %SIG = (%SIG, %$sig) if !$sigfd; + local $self->{-watch_sync} = $sync; # for ->on_inbox_unlock + if (!$sigfd) { + # wake up every second to accept signals if we don't + # have signalfd or IO::KQueue: + PublicInbox::Sigfd::sig_setmask($oldset); + PublicInbox::DS->SetLoopTimeout(1000); + } + PublicInbox::DS->SetPostLoopCallback(sub { !$sync->{quit} }); + PublicInbox::DS->EventLoop; # calls InboxIdle->event_step + done($self); +} + no warnings 'once'; *done = \&PublicInbox::V2Writable::done; *with_umask = \&PublicInbox::InboxWritable::with_umask; diff --git a/lib/PublicInbox/InboxIdle.pm b/lib/PublicInbox/InboxIdle.pm index 508007d7..35aed696 100644 --- a/lib/PublicInbox/InboxIdle.pm +++ b/lib/PublicInbox/InboxIdle.pm @@ -63,6 +63,9 @@ sub refresh { $pi_cfg->each_inbox(\&in2_arm, $self); } +# internal API for ease-of-use +sub watch_inbox { in2_arm($_[1], $_[0]) }; + sub new { my ($class, $pi_cfg) = @_; my $self = bless {}, $class; @@ -78,7 +81,7 @@ sub new { $self->{inot} = $inot; $self->{pathmap} = {}; # inboxdir => [ ibx, watch1, watch2, watch3...] $self->{on_unlock} = {}; # lock path => ibx - refresh($self, $pi_cfg); + refresh($self, $pi_cfg) if $pi_cfg; PublicInbox::FakeInotify::poll_once($self) if !$ino_cls; $self; } @@ -89,7 +92,8 @@ sub event_step { my @events = $self->{inot}->read; # Linux::Inotify2::read my $on_unlock = $self->{on_unlock}; for my $ev (@events) { - if (my $ibx = $on_unlock->{$ev->fullname}) { + my $fn = $ev->fullname // next; # cancelled + if (my $ibx = $on_unlock->{$fn}) { $ibx->on_unlock; } } diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 4a39bf53..dcc2cff3 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -473,10 +473,14 @@ sub dbh_close { sub create { my ($self) = @_; - unless (-r $self->{filename}) { + my $fn = $self->{filename} // do { + Carp::confess('BUG: no {filename}') unless $self->{dbh}; + return; + }; + unless (-r $fn) { require File::Path; require File::Basename; - File::Path::mkpath(File::Basename::dirname($self->{filename})); + File::Path::mkpath(File::Basename::dirname($fn)); } # create the DB: PublicInbox::Over::dbh($self); diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index f20b5c7f..567582c5 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -879,7 +879,7 @@ sub reindex_checkpoint ($$) { $self->done; # release lock } - if (my $pr = $sync->{-opt}->{-progress}) { + if (my $pr = $sync->{-regen_fmt} ? $sync->{-opt}->{-progress} : undef) { $pr->(sprintf($sync->{-regen_fmt}, ${$sync->{nr}})); } diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex index 17ad59fa..607baa3e 100644 --- a/script/public-inbox-extindex +++ b/script/public-inbox-extindex @@ -11,6 +11,7 @@ usage: public-inbox-extindex [options] EXTINDEX_DIR [INBOX_DIR] Create and update external (detached) search indices --no-fsync speed up indexing, risk corruption on power outage + --watch run persistently and watch for inbox updates -L LEVEL `medium', or `full' (default: full) --all index all configured inboxes --jobs=NUM set or disable parallelization (NUM=0) @@ -27,7 +28,7 @@ GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i fsync|sync! indexlevel|index-level|L=s max_size|max-size=s batch_size|batch-size=s - gc + gc commit-interval=i watch all help|h)) or die $help; if ($opt->{help}) { print $help; exit 0 }; @@ -41,7 +42,8 @@ my $cfg = PublicInbox::Config->new; my @ibxs; if ($opt->{gc}) { die "E: inbox paths must not be specified with --gc\n" if @ARGV; - die "E: --all not compatible --gc\n" if $opt->{all}; + die "E: --all not compatible with --gc\n" if $opt->{all}; + die "E: --watch is not compatible with --gc\n" if $opt->{watch}; } else { @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg); } @@ -56,6 +58,15 @@ if ($opt->{gc}) { $eidx->attach_config($cfg); $eidx->eidx_gc($opt); } else { - $eidx->attach_inbox($_) for @ibxs; - $eidx->eidx_sync($opt); + if ($opt->{all}) { + $eidx->attach_config($cfg); + } else { + $eidx->attach_inbox($_) for @ibxs; + } + if ($opt->{watch}) { + $cfg = undef; # save memory only after SIGHUP + $eidx->eidx_watch($opt); + } else { + $eidx->eidx_sync($opt); + } } -- cgit v1.2.3-24-ge0c7 From ae8df17135014a64a9f1def786f66c9c16b05fcf Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 10:16:21 +0000 Subject: extindex: various --watch signal handling fixes We need to clobber the SIGUSR1 resync queue on SIGHUP to invalidate old inbox objects. Furthermore, the lengthy initial scan needs to ignore signals intended for the event loop to avoid unexpected behavior. Finally, add some progress output to inform users on the terminal to inform users' of progress. --- lib/PublicInbox/ExtSearchIdx.pm | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 53ff2ca1..778154a5 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -1008,6 +1008,7 @@ sub eidx_reload { # -extindex --watch SIGHUP handler if ($self->{cfg}) { my $pr = $self->{-watch_sync}->{-opt}->{-progress}; $pr->('reloading ...') if $pr; + delete $self->{-resync_queue}; @{$self->{ibx_list}} = (); %{$self->{ibx_map}} = (); delete $self->{-watch_sync}->{id2pos}; @@ -1043,6 +1044,10 @@ sub event_step { # PublicInbox::DS::requeue callback sub eidx_watch { # public-inbox-extindex --watch main loop my ($self, $opt) = @_; + local %SIG = %SIG; + for my $sig (qw(HUP USR1 TSTP QUIT INT TERM)) { + $SIG{$sig} = sub { warn "SIG$sig ignored while scanning\n" }; + } require PublicInbox::InboxIdle; require PublicInbox::DS; require PublicInbox::Syscall; @@ -1052,6 +1057,8 @@ sub eidx_watch { # public-inbox-extindex --watch main loop $idler->watch_inbox($_) for @{$self->{ibx_list}}; } $_->subscribe_unlock(__PACKAGE__, $self) for @{$self->{ibx_list}}; + my $pr = $opt->{-progress}; + $pr->("performing initial scan ...\n") if $pr; my $sync = eidx_sync($self, $opt); # initial sync return if $sync->{quit}; my $oldset = PublicInbox::Sigfd::block_signals(); @@ -1067,7 +1074,7 @@ sub eidx_watch { # public-inbox-extindex --watch main loop $sig->{QUIT} = $sig->{INT} = $sig->{TERM} = $quit; my $sigfd = PublicInbox::Sigfd->new($sig, $PublicInbox::Syscall::SFD_NONBLOCK); - local %SIG = (%SIG, %$sig) if !$sigfd; + %SIG = (%SIG, %$sig) if !$sigfd; local $self->{-watch_sync} = $sync; # for ->on_inbox_unlock if (!$sigfd) { # wake up every second to accept signals if we don't @@ -1076,6 +1083,7 @@ sub eidx_watch { # public-inbox-extindex --watch main loop PublicInbox::DS->SetLoopTimeout(1000); } PublicInbox::DS->SetPostLoopCallback(sub { !$sync->{quit} }); + $pr->("initial scan complete, entering event loop\n") if $pr; PublicInbox::DS->EventLoop; # calls InboxIdle->event_step done($self); } -- cgit v1.2.3-24-ge0c7 From 41464d205ade16a5a847061fa2eb706a33b52a88 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 10:16:22 +0000 Subject: extindex: enable autoflush on STDOUT/STDERR With --watch, the output may be redirected to a pipe or socket which Perl may decide to buffer. Ensure Perl doesn't buffer these outputs since they can provide real-time status updates in response to signals or FS activity. --- script/public-inbox-extindex | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex index 607baa3e..17986f60 100644 --- a/script/public-inbox-extindex +++ b/script/public-inbox-extindex @@ -33,7 +33,9 @@ GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i or die $help; if ($opt->{help}) { print $help; exit 0 }; die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0; - +require IO::Handle; +STDOUT->autoflush(1); +STDERR->autoflush(1); # require lazily to speed up --help my $eidx_dir = shift(@ARGV) // die "E: $help"; local $SIG{USR1} = 'IGNORE'; # to be overridden in eidx_sync -- cgit v1.2.3-24-ge0c7 From 02aad3e340d1711359c4def6e91482140a989ce1 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 10:16:23 +0000 Subject: extindex: add undocumented --no-scan switch This makes diagnosing --watch problems easier when there's 50K inboxes by avoiding the lengthy scan (which is the reason --watch exists in the first place). --- lib/PublicInbox/ExtSearchIdx.pm | 8 +++++--- script/public-inbox-extindex | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 778154a5..07e64698 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -881,9 +881,11 @@ sub eidx_sync { # main entry point } # don't use $_ here, it'll get clobbered by reindex_checkpoint - for my $ibx (@{$self->{ibx_list}}) { - last if $sync->{quit}; - sync_inbox($self, $sync, $ibx); + if ($opt->{scan} // 1) { + for my $ibx (@{$self->{ibx_list}}) { + last if $sync->{quit}; + sync_inbox($self, $sync, $ibx); + } } $self->{oidx}->rethread_done($opt) unless $sync->{quit}; eidxq_process($self, $sync) unless $sync->{quit}; diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex index 17986f60..f4ffda4b 100644 --- a/script/public-inbox-extindex +++ b/script/public-inbox-extindex @@ -23,12 +23,12 @@ usage: public-inbox-extindex [options] EXTINDEX_DIR [INBOX_DIR] BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes) See public-inbox-extindex(1) man page for full documentation. EOF -my $opt = { quiet => -1, compact => 0, max_size => undef, fsync => 1 }; +my $opt = { quiet => -1, compact => 0, fsync => 1, scan => 1 }; GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i fsync|sync! indexlevel|index-level|L=s max_size|max-size=s batch_size|batch-size=s - gc commit-interval=i watch + gc commit-interval=i watch scan! all help|h)) or die $help; if ($opt->{help}) { print $help; exit 0 }; -- cgit v1.2.3-24-ge0c7 From e411f4465dd26d8b09d005224a8ead7056e6e532 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 10:16:24 +0000 Subject: extindex: allow using --all without EXTINDEX_DIR If "--all" is specified to index all inboxes, implicitly choose the configured [extindex "all"] external index since "--all" is incompatible with specifying inbox directories on the command-line. --- script/public-inbox-extindex | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex index f4ffda4b..5f27988f 100644 --- a/script/public-inbox-extindex +++ b/script/public-inbox-extindex @@ -6,7 +6,7 @@ use strict; use v5.10.1; use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); my $help = <= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0; require IO::Handle; STDOUT->autoflush(1); STDERR->autoflush(1); -# require lazily to speed up --help -my $eidx_dir = shift(@ARGV) // die "E: $help"; local $SIG{USR1} = 'IGNORE'; # to be overridden in eidx_sync +# require lazily to speed up --help require PublicInbox::Admin; my $cfg = PublicInbox::Config->new; +my $eidx_dir = shift(@ARGV); +unless (defined $eidx_dir) { + if ($opt->{all} && $cfg->ALL) { + $eidx_dir = $cfg->ALL->{topdir}; + } else { + die "E: $help"; + } +} my @ibxs; if ($opt->{gc}) { die "E: inbox paths must not be specified with --gc\n" if @ARGV; -- cgit v1.2.3-24-ge0c7 From d209c2064ebc8ccddc7f0da068c663fc08077334 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 11:01:41 +0000 Subject: extsearch: unconditionally reopen on access Since ExtSearch lacks the janky cleanup timer of PublicInbox::Inbox objects, its search results get stale. Reopen the Xapian DB on every ->search call for now, as reducing reopen calls doesn't seem worth the complexity. The Xapian::Database::reopen operation itself takes only ~50us on my old workstation with 3 shards totaling <200GB. Other parts of Xapian dominates the search time, so the reopen seems inconsequential with single-digit shard counts. --- lib/PublicInbox/ExtSearch.pm | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm index a2b97798..7c9586a6 100644 --- a/lib/PublicInbox/ExtSearch.pm +++ b/lib/PublicInbox/ExtSearch.pm @@ -29,8 +29,6 @@ sub misc { $self->{misc} //= PublicInbox::MiscSearch->new("$self->{xpfx}/misc"); } -sub search { $_[0] } # self - # overrides PublicInbox::Search::_xdb sub _xdb { my ($self) = @_; @@ -126,6 +124,6 @@ no warnings 'once'; *recent = \&PublicInbox::Inbox::recent; *max_git_epoch = *nntp_usable = *msg_by_path = \&mm; # undef -*isrch = *search; +*isrch = *search = \&PublicInbox::Search::reopen; 1; -- cgit v1.2.3-24-ge0c7 From 54b250c611c752538666f2bf6d361d3762a21781 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 11:01:42 +0000 Subject: miscsearch: take reopen from Search and use it As with ExtSearch, MiscSearch lacks a janky cleanup timer of PublicInbox::Inbox objects, leading to info about inboxes/newsgroups going stale. Fortunately, we don't use MiscSearch very heavily, yet. In the future, we may be able to detect new inboxes without having to SIGHUP or restart daemons using MiscSearch. --- lib/PublicInbox/MiscSearch.pm | 4 ++++ lib/PublicInbox/WwwListing.pm | 3 +++ 2 files changed, 7 insertions(+) diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm index c6ce255f..6683d564 100644 --- a/lib/PublicInbox/MiscSearch.pm +++ b/lib/PublicInbox/MiscSearch.pm @@ -73,6 +73,7 @@ sub misc_enquire_once { # retry_reopen callback sub mset { my ($self, $qs, $opt) = @_; $opt ||= {}; + reopen($self); my $qp = $self->{qp} //= mi_qp_new($self); $qs = 'type:inbox' if $qs eq ''; my $qr = $qp->parse_query($qs, $PublicInbox::Search::QP_FLAGS); @@ -184,4 +185,7 @@ sub nntpd_cache_load { retry_reopen($self, \&_nntpd_cache_load); } +no warnings 'once'; +*reopen = \&PublicInbox::Search::reopen; + 1; diff --git a/lib/PublicInbox/WwwListing.pm b/lib/PublicInbox/WwwListing.pm index fce0e530..4b3f1674 100644 --- a/lib/PublicInbox/WwwListing.pm +++ b/lib/PublicInbox/WwwListing.pm @@ -69,6 +69,9 @@ sub hide_key { 'www' } sub response { my ($class, $ctx) = @_; bless $ctx, $class; + if (my $ALL = $ctx->{www}->{pi_cfg}->ALL) { + $ALL->misc->reopen; + } my $re = $ctx->url_regexp or return $ctx->psgi_triple; my $iter = PublicInbox::ConfigIter->new($ctx->{www}->{pi_cfg}, \&list_match_i, $re, $ctx); -- cgit v1.2.3-24-ge0c7 From d0e74a3591d9e701af6fea30baacf2ddb51475d5 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 19:38:28 +0000 Subject: search: remove pointless {relevance} setting SearchView will set it to `undef', others will set the 'mset' option (for the ->mset method :P) to 2 which causes {relevance} to be ignored. And the 'mset' option is poorly named now that the message is named ->mset... --- lib/PublicInbox/Search.pm | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 05c679c9..ffd19a1f 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -287,7 +287,6 @@ sub mset { $opts ||= {}; my $qp = $self->{qp} //= qparse_new($self); my $query = $qp->parse_query($query_string, $self->{qp_flags}); - $opts->{relevance} = 1 unless exists $opts->{relevance}; _do_enquire($self, $query, $opts); } -- cgit v1.2.3-24-ge0c7 From 5f875446975b1473c1ffd7196e572e13d58ba56f Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 19:38:29 +0000 Subject: search: remove {mset} option for ->mset method The ->mset method always returns a Xapian mset nowadays, so naming a parameter {mset} is too confusing. As it does with MiscSearch, setting the {relevance} parameter to -1 now sorts by ascending docid order. -2 is now supported for descending docid order, too, since it may be useful for lei users. --- lib/PublicInbox/ExtMsg.pm | 2 +- lib/PublicInbox/IMAP.pm | 2 +- lib/PublicInbox/Isearch.pm | 2 +- lib/PublicInbox/Mbox.pm | 2 +- lib/PublicInbox/Search.pm | 24 +++++++++++++----------- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm index 6a173f67..4df885ab 100644 --- a/lib/PublicInbox/ExtMsg.pm +++ b/lib/PublicInbox/ExtMsg.pm @@ -33,7 +33,7 @@ sub search_partial ($$) { my ($ibx, $mid) = @_; return if length($mid) < $MIN_PARTIAL_LEN; my $srch = $ibx->search or return; # NOT ->isrch, we already try ->ALL - my $opt = { limit => PARTIAL_MAX, mset => 2 }; + my $opt = { limit => PARTIAL_MAX, relevance => -1 }; my @try = ("m:$mid*"); my $chop = $mid; if ($chop =~ s/(\W+)(\w*)\z//) { diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm index a3a10bde..2af5ab0c 100644 --- a/lib/PublicInbox/IMAP.pm +++ b/lib/PublicInbox/IMAP.pm @@ -1136,7 +1136,7 @@ sub search_common { my $srch = $self->{ibx}->isrch or return "$tag BAD search not available for mailbox\r\n"; my $opt = { - mset => 2, + relevance => -1, limit => UID_SLICE, uid_range => $range_info }; diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm index e362c80a..7ca2f9e4 100644 --- a/lib/PublicInbox/Isearch.pm +++ b/lib/PublicInbox/Isearch.pm @@ -61,7 +61,7 @@ sub mset_to_artnums { my $docids = PublicInbox::Search::mset_to_artnums($self->{es}, $mset); my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); my $qmarks = join(',', map { '?' } @$docids); - if ($opt && ($opt->{mset} // 0) == 2) { # opt->{mset} = 2 was used + if ($opt && ($opt->{relevance} // 0) == -1) { # -1 => ENQ_ASCENDING my $range = ''; my @r; if (my $r = $opt->{uid_range}) { diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index c8e4b406..83fa7d8a 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -235,7 +235,7 @@ sub mbox_all { my $over = $ctx->{ibx}->over or return PublicInbox::WWW::need($ctx, 'Overview'); - my $qopts = $ctx->{qopts} = { mset => 2 }; # order by docid + my $qopts = $ctx->{qopts} = { relevance => -1 }; # ORDER BY docid ASC $qopts->{thread} = 1 if $q->{t}; my $mset = $srch->mset($q_string, $qopts); $qopts->{offset} = $mset->size or diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index ffd19a1f..fb3e9975 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -58,7 +58,11 @@ our $QP_FLAGS; our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem Query); our $Xap; # 'Search::Xapian' or 'Xapian' our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') -our $ENQ_ASCENDING; + +# ENQ_DESCENDING and ENQ_ASCENDING weren't in SWIG Xapian.pm prior to 1.4.16, +# let's hope the ABI is stable +our $ENQ_DESCENDING = 0; +our $ENQ_ASCENDING = 1; sub load_xapian () { return 1 if defined $Xap; @@ -84,12 +88,6 @@ sub load_xapian () { 'NumberRangeProcessor' : 'NumberValueRangeProcessor'); $X{$_} = $Xap.'::'.$_ for (keys %X); - # ENQ_ASCENDING doesn't seem exported by SWIG Xapian.pm, - # so lets hope this part of the ABI is stable because it's - # just an integer: - $ENQ_ASCENDING = $x eq 'Xapian' ? - 1 : Search::Xapian::ENQ_ASCENDING(); - *sortable_serialise = $x.'::sortable_serialise'; *sortable_unserialise = $x.'::sortable_unserialise'; # n.b. FLAG_PURE_NOT is expensive not suitable for a public @@ -344,13 +342,17 @@ sub _enquire_once { # retry_reopen callback $enquire->set_query($query); $opts ||= {}; my $desc = !$opts->{asc}; - if (($opts->{mset} || 0) == 2) { # mset == 2: ORDER BY docid/UID + my $rel = $opts->{relevance} // 0; + if ($rel == -1) { # ORDER BY docid/UID + $enquire->set_weighting_scheme($X{BoolWeight}->new); $enquire->set_docid_order($ENQ_ASCENDING); + } elsif ($rel == 0) { + $enquire->set_sort_by_value_then_relevance(TS, $desc); + } elsif ($rel == -2) { $enquire->set_weighting_scheme($X{BoolWeight}->new); - } elsif ($opts->{relevance}) { + $enquire->set_docid_order($ENQ_DESCENDING); + } else { # rel > 0 $enquire->set_sort_by_relevance_then_value(TS, $desc); - } else { - $enquire->set_sort_by_value_then_relevance(TS, $desc); } # `mairix -t / --threads' or JMAP collapseThreads -- cgit v1.2.3-24-ge0c7 From 645ec505c03c86ed4500151737bcf3d636d9b18b Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 02:53:03 +0000 Subject: git: qx: avoid extra "local" for scalar context case We can use the ternary operator to avoid an early return, here --- lib/PublicInbox/Git.pm | 6 ++---- t/git.t | 1 + 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index 08406925..73dc7d3e 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -362,10 +362,8 @@ sub popen { sub qx { my ($self, @cmd) = @_; my $fh = $self->popen(@cmd); - local $/ = "\n"; - return <$fh> if wantarray; - local $/; - <$fh> + local $/ = wantarray ? "\n" : undef; + <$fh>; } # check_async and cat_async may trigger the other, so ensure they're diff --git a/t/git.t b/t/git.t index dfd7173a..dcd053c5 100644 --- a/t/git.t +++ b/t/git.t @@ -79,6 +79,7 @@ if (1) { my @ref = $gcf->qx(qw(cat-file blob), $buf); my $nl = scalar @ref; ok($nl > 1, "qx returned array length of $nl"); + is(join('', @ref), $ref, 'qx array and scalar context both work'); $gcf->qx(qw(repack -adq)); ok($gcf->packed_bytes > 0, 'packed size is positive'); -- cgit v1.2.3-24-ge0c7 From 0f545dd95ca88e69f011aa56d07494887e7df822 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 02:53:04 +0000 Subject: import: check for git->qx errors, clearer return values Those git commands can fail and git->qx will set $? when it fails. There's no need for the extra indirection of the @ret array, either. Improve git->qx coverage to check for $? while we're at it. --- lib/PublicInbox/Import.pm | 9 +++++---- t/git.t | 4 ++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 2cb4896a..e0a84bfd 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -48,7 +48,7 @@ sub gfi_start { return ($self->{in}, $self->{out}) if $self->{pid}; - my (@ret, $out_r, $out_w); + my ($in_r, $pid, $out_r, $out_w); pipe($out_r, $out_w) or die "pipe failed: $!"; $self->lock_acquire; @@ -56,27 +56,28 @@ sub gfi_start { my ($git, $ref) = @$self{qw(git ref)}; local $/ = "\n"; chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $ref)); + die "fatal: rev-parse --revs-only $ref: \$?=$?" if $?; if ($self->{path_type} ne '2/38' && $self->{tip}) { local $/ = "\0"; my @t = $git->qx(qw(ls-tree -r -z --name-only), $ref); + die "fatal: ls-tree -r -z --name-only $ref: \$?=$?" if $?; chomp @t; $self->{-tree} = { map { $_ => 1 } @t }; } my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(fast-import --quiet --done --date-format=raw)); - my ($in_r, $pid) = popen_rd(\@cmd, undef, { 0 => $out_r }); + ($in_r, $pid) = popen_rd(\@cmd, undef, { 0 => $out_r }); $out_w->autoflush(1); $self->{in} = $in_r; $self->{out} = $out_w; $self->{pid} = $pid; $self->{nchg} = 0; - @ret = ($in_r, $out_w); }; if ($@) { $self->lock_release; die $@; } - @ret; + ($in_r, $out_w); } sub wfail () { die "write to fast-import failed: $!" } diff --git a/t/git.t b/t/git.t index dcd053c5..2cfff248 100644 --- a/t/git.t +++ b/t/git.t @@ -76,13 +76,17 @@ if (1) { is(length($$x), $size, 'read correct number of bytes'); my $ref = $gcf->qx(qw(cat-file blob), $buf); + is($?, 0, 'no error on scalar success'); my @ref = $gcf->qx(qw(cat-file blob), $buf); + is($?, 0, 'no error on wantarray success'); my $nl = scalar @ref; ok($nl > 1, "qx returned array length of $nl"); is(join('', @ref), $ref, 'qx array and scalar context both work'); $gcf->qx(qw(repack -adq)); ok($gcf->packed_bytes > 0, 'packed size is positive'); + $gcf->qx(qw(rev-parse --verify bogus)); + isnt($?, 0, '$? set on failure'.$?); } SKIP: { -- cgit v1.2.3-24-ge0c7 From 522bb4a4973d4ac41b83be58dd3257e8cd038744 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 02:53:05 +0000 Subject: check defined return value for localized slurp errors Reading from regular files (even on STDIN) can fail when dealing with flakey storage. --- lib/PublicInbox/Gcf2.pm | 2 +- lib/PublicInbox/Inbox.pm | 9 +++------ script/public-inbox-edit | 3 ++- script/public-inbox-init | 6 +----- script/public-inbox-learn | 3 +-- script/public-inbox-purge | 2 +- 6 files changed, 9 insertions(+), 16 deletions(-) diff --git a/lib/PublicInbox/Gcf2.pm b/lib/PublicInbox/Gcf2.pm index 041dffe7..fe6afef2 100644 --- a/lib/PublicInbox/Gcf2.pm +++ b/lib/PublicInbox/Gcf2.pm @@ -35,7 +35,7 @@ BEGIN { if (open(my $fh, '<', $f)) { chomp($l, $c); local $/; - $c_src = <$fh>; + defined($c_src = <$fh>) or die "read $f: $!\n"; $CFG{LIBS} = $l; $CFG{CCFLAGSEX} = $c; last; diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 1b9b56ff..af6380a7 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -210,12 +210,9 @@ sub over { sub try_cat { my ($path) = @_; - my $rv = ''; - if (open(my $fh, '<', $path)) { - local $/; - $rv = <$fh>; - } - $rv; + open(my $fh, '<', $path) or return ''; + local $/; + <$fh> // ''; } sub cat_desc ($) { diff --git a/script/public-inbox-edit b/script/public-inbox-edit index a70614fc..81f023bc 100755 --- a/script/public-inbox-edit +++ b/script/public-inbox-edit @@ -183,7 +183,8 @@ retry_edit: # rename/relink $edit_fn open my $new_fh, '<', $edit_fn or die "can't read edited file ($edit_fn): $!\n"; - my $new_raw = do { local $/; <$new_fh> }; + defined(my $new_raw = do { local $/; <$new_fh> }) or die + "read $edit_fn: $!\n"; if (!$opt->{raw}) { # get rid of the From we added diff --git a/script/public-inbox-init b/script/public-inbox-init index 6d538e43..7ac77830 100755 --- a/script/public-inbox-init +++ b/script/public-inbox-init @@ -100,11 +100,7 @@ if (-e $pi_config) { defined $perm or die "(f)stat failed on $pi_config: $!\n"; chmod($perm & 07777, $fh) or die "(f)chmod failed on future $pi_config: $!\n"; - my $old; - { - local $/; - $old = <$oh>; - } + defined(my $old = do { local $/; <$oh> }) or die "read $pi_config: $!\n"; print $fh $old or die "failed to write: $!\n"; close $oh or die "failed to close $pi_config: $!\n"; diff --git a/script/public-inbox-learn b/script/public-inbox-learn index 9352c8ff..1731a4ba 100755 --- a/script/public-inbox-learn +++ b/script/public-inbox-learn @@ -39,8 +39,7 @@ my $spamc = PublicInbox::Spamcheck::Spamc->new; my $pi_cfg = PublicInbox::Config->new; my $err; my $mime = PublicInbox::Eml->new(do{ - local $/; - my $data = ; + defined(my $data = do { local $/; }) or die "read STDIN: $!\n"; $data =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; if ($train ne 'rm') { diff --git a/script/public-inbox-purge b/script/public-inbox-purge index 7bca11ea..52f1f18a 100755 --- a/script/public-inbox-purge +++ b/script/public-inbox-purge @@ -32,7 +32,7 @@ if ($opt->{help}) { print $help; exit 0 }; my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt); PublicInbox::AdminEdit::check_editable(\@ibxs); -my $data = do { local $/; }; +defined(my $data = do { local $/; }) or die "read STDIN: $!\n"; $data =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; my $n_purged = 0; -- cgit v1.2.3-24-ge0c7 From a7794134700c855a7a4fc030720ee8901b38a5f0 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 02:53:06 +0000 Subject: ds: simplify EventLoop implementation More importantly, make it easier-to-find the sub by avoiding runtime manipulation of subroutine names. There's no point in avoiding a potential call to _InitPoller in EventLoop since entering EventLoop is rare. On the contrary, PublicInbox::DS->new is called often and this change to avoid entering _InitPoller there may have more benefits (which may still be unmeasurable). --- lib/PublicInbox/DS.pm | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/lib/PublicInbox/DS.pm b/lib/PublicInbox/DS.pm index a02b3bb7..12df5919 100644 --- a/lib/PublicInbox/DS.pm +++ b/lib/PublicInbox/DS.pm @@ -50,7 +50,6 @@ our ( $PostLoopCallback, # subref to call at the end of each loop, if defined (global) $LoopTimeout, # timeout of event loop in milliseconds - $DoneInit, # if we've done the one-time module init yet @Timers, # timers $in_loop, ); @@ -75,12 +74,9 @@ sub Reset { @Timers = (); $PostLoopCallback = undef; - $DoneInit = 0; $_io = undef; # closes real $Epoll FD $Epoll = undef; # may call DSKQXS::DESTROY - - *EventLoop = *FirstTimeEventLoop; } =head2 C<< CLASS->SetLoopTimeout( $timeout ) >> @@ -137,14 +133,13 @@ sub set_cloexec ($) { fcntl($_io, F_SETFD, $fl | FD_CLOEXEC); } +# caller sets return value to $Epoll sub _InitPoller { - return if $DoneInit; - $DoneInit = 1; - if (PublicInbox::Syscall::epoll_defined()) { - $Epoll = epoll_create(); - set_cloexec($Epoll) if (defined($Epoll) && $Epoll >= 0); + my $fd = epoll_create(); + set_cloexec($fd) if (defined($fd) && $fd >= 0); + $fd; } else { my $cls; for (qw(DSKQXS DSPoll)) { @@ -152,9 +147,8 @@ sub _InitPoller last if eval "require $cls"; } $cls->import(qw(epoll_ctl epoll_wait)); - $Epoll = $cls->new; + $cls->new; } - *EventLoop = *EpollEventLoop; } =head2 C<< CLASS->EventLoop() >> @@ -163,13 +157,6 @@ Start processing IO events. In most daemon programs this never exits. See C below for how to exit the loop. =cut -sub FirstTimeEventLoop { - my $class = shift; - - _InitPoller(); - - EventLoop($class); -} sub now () { clock_gettime(CLOCK_MONOTONIC) } @@ -271,7 +258,8 @@ sub PostEventLoop () { $PostLoopCallback ? $PostLoopCallback->(\%DescriptorMap) : 1; } -sub EpollEventLoop { +sub EventLoop { + $Epoll //= _InitPoller(); local $in_loop = 1; do { my @events; @@ -330,8 +318,7 @@ sub new { $self->{sock} = $sock; my $fd = fileno($sock); - _InitPoller(); - + $Epoll //= _InitPoller(); retry: if (epoll_ctl($Epoll, EPOLL_CTL_ADD, $fd, $ev)) { if ($! == EINVAL && ($ev & EPOLLEXCLUSIVE)) { -- cgit v1.2.3-24-ge0c7 From 0f461dcd3317f44670e2fc50346f87ff41e80127 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 02:53:07 +0000 Subject: ds: flatten + reuse @events, epoll_wait style fixes Consistently returning the equivalent of pollfd.revents in a portable manner was never worth the effort for us, as we use the same ->event_step callback regardless of POLLIN/POLLOUT/POLLHUP. Being a Perl, @events knows it size and we don't have to return a maximum index for the caller to iterate on. We can also avoid redundant integer coercion ("+0") since we ensure everything is an IV in other places. Finally, vec() is preferable to ("\0" x $size) for resizing buffers because it only needs to write the extended portion and not overwrite the entire buffer. --- lib/PublicInbox/DS.pm | 20 +++++--------- lib/PublicInbox/DSKQXS.pm | 2 +- lib/PublicInbox/DSPoll.pm | 3 +-- lib/PublicInbox/Syscall.pm | 66 ++++++++++++++++++++++++++-------------------- t/ds-poll.t | 28 ++++++++++---------- t/epoll.t | 8 +++--- 6 files changed, 63 insertions(+), 64 deletions(-) diff --git a/lib/PublicInbox/DS.pm b/lib/PublicInbox/DS.pm index 12df5919..97a6f6ef 100644 --- a/lib/PublicInbox/DS.pm +++ b/lib/PublicInbox/DS.pm @@ -87,9 +87,7 @@ A timeout of 0 (zero) means poll forever. A timeout of -1 means poll and return immediately. =cut -sub SetLoopTimeout { - return $LoopTimeout = $_[1] + 0; -} +sub SetLoopTimeout { $LoopTimeout = $_[1] + 0 } =head2 C<< PublicInbox::DS::add_timer( $seconds, $coderef, $arg) >> @@ -200,12 +198,7 @@ sub RunTimers { my $timeout = int(($Timers[0][0] - $now) * 1000) + 1; # -1 is an infinite timeout, so prefer a real timeout - return $timeout if $LoopTimeout == -1; - - # otherwise pick the lower of our regular timeout and time until - # the next timer - return $LoopTimeout if $LoopTimeout < $timeout; - return $timeout; + ($LoopTimeout < 0 || $LoopTimeout >= $timeout) ? $timeout : $LoopTimeout; } # We can't use waitpid(-1) safely here since it can hit ``, system(), @@ -261,19 +254,18 @@ sub PostEventLoop () { sub EventLoop { $Epoll //= _InitPoller(); local $in_loop = 1; + my @events; do { - my @events; - my $i; my $timeout = RunTimers(); # get up to 1000 events - my $evcount = epoll_wait($Epoll, 1000, $timeout, \@events); - for ($i=0; $i<$evcount; $i++) { + epoll_wait($Epoll, 1000, $timeout, \@events); + for my $fd (@events) { # it's possible epoll_wait returned many events, including some at the end # that ones in the front triggered unregister-interest actions. if we # can't find the %sock entry, it's because we're no longer interested # in that event. - $DescriptorMap{$events[$i]->[0]}->event_step; + $DescriptorMap{$fd}->event_step; } } while (PostEventLoop()); _run_later(); diff --git a/lib/PublicInbox/DSKQXS.pm b/lib/PublicInbox/DSKQXS.pm index d1d3fe60..aa2c9168 100644 --- a/lib/PublicInbox/DSKQXS.pm +++ b/lib/PublicInbox/DSKQXS.pm @@ -134,7 +134,7 @@ sub epoll_wait { } } # caller only cares for $events[$i]->[0] - scalar(@$events); + $_ = $_->[0] for @$events; } # kqueue is close-on-fork (not exec), so we must not close it diff --git a/lib/PublicInbox/DSPoll.pm b/lib/PublicInbox/DSPoll.pm index 1d9b51d9..a218f695 100644 --- a/lib/PublicInbox/DSPoll.pm +++ b/lib/PublicInbox/DSPoll.pm @@ -45,14 +45,13 @@ sub epoll_wait { my $fd = $pset[$i++]; my $revents = $pset[$i++] or next; delete($self->{$fd}) if $self->{$fd} & EPOLLONESHOT; - push @$events, [ $fd ]; + push @$events, $fd; } my $nevents = scalar @$events; if ($n != $nevents) { warn "BUG? poll() returned $n, but got $nevents"; } } - $n; } 1; diff --git a/lib/PublicInbox/Syscall.pm b/lib/PublicInbox/Syscall.pm index e4f00a2a..c403f78a 100644 --- a/lib/PublicInbox/Syscall.pm +++ b/lib/PublicInbox/Syscall.pm @@ -227,38 +227,46 @@ sub epoll_ctl_mod8 { our $epoll_wait_events; our $epoll_wait_size = 0; sub epoll_wait_mod4 { - # resize our static buffer if requested size is bigger than we've ever done - if ($_[1] > $epoll_wait_size) { - $epoll_wait_size = $_[1]; - $epoll_wait_events = "\0" x 12 x $epoll_wait_size; - } - my $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0); - for (0..$ct-1) { - @{$_[3]->[$_]}[1,0] = unpack("LL", substr($epoll_wait_events, 12*$_, 8)); - } - return $ct; + my ($epfd, $maxevents, $timeout_msec, $events) = @_; + # resize our static buffer if maxevents bigger than we've ever done + if ($maxevents > $epoll_wait_size) { + $epoll_wait_size = $maxevents; + vec($epoll_wait_events, $maxevents * 12 * 8 - 1, 1) = 0; + } + @$events = (); + my $ct = syscall($SYS_epoll_wait, $epfd, $epoll_wait_events, + $maxevents, $timeout_msec); + for (0..$ct - 1) { + # 12-byte struct epoll_event + # 4 bytes uint32_t events mask (skipped, useless to us) + # 8 bytes: epoll_data_t union (first 4 bytes are the fd) + # So we skip the first 4 bytes and take the middle 4: + $events->[$_] = unpack('L', substr($epoll_wait_events, + 12 * $_ + 4, 4)); + } } sub epoll_wait_mod8 { - # resize our static buffer if requested size is bigger than we've ever done - if ($_[1] > $epoll_wait_size) { - $epoll_wait_size = $_[1]; - $epoll_wait_events = "\0" x 16 x $epoll_wait_size; - } - my $ct; - if ($no_deprecated) { - $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0, undef); - } else { - $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0); - } - for (0..$ct-1) { - # 16 byte epoll_event structs, with format: - # 4 byte mask [idx 1] - # 4 byte padding (we put it into idx 2, useless) - # 8 byte data (first 4 bytes are fd, into idx 0) - @{$_[3]->[$_]}[1,2,0] = unpack("LLL", substr($epoll_wait_events, 16*$_, 12)); - } - return $ct; + my ($epfd, $maxevents, $timeout_msec, $events) = @_; + + # resize our static buffer if maxevents bigger than we've ever done + if ($maxevents > $epoll_wait_size) { + $epoll_wait_size = $maxevents; + vec($epoll_wait_events, $maxevents * 16 * 8 - 1, 1) = 0; + } + @$events = (); + my $ct = syscall($SYS_epoll_wait, $epfd, $epoll_wait_events, + $maxevents, $timeout_msec, + $no_deprecated ? undef : ()); + for (0..$ct - 1) { + # 16-byte struct epoll_event + # 4 bytes uint32_t events mask (skipped, useless to us) + # 4 bytes padding (skipped, useless) + # 8 bytes epoll_data_t union (first 4 bytes are the fd) + # So skip the first 8 bytes, take 4, and ignore the last 4: + $events->[$_] = unpack('L', substr($epoll_wait_events, + 16 * $_ + 8, 4)); + } } sub signalfd ($$$) { diff --git a/t/ds-poll.t b/t/ds-poll.t index 3771059b..0ee57b69 100644 --- a/t/ds-poll.t +++ b/t/ds-poll.t @@ -16,35 +16,35 @@ pipe($r, $w) or die; pipe($x, $y) or die; is($p->epoll_ctl(EPOLL_CTL_ADD, fileno($r), EPOLLIN), 0, 'add EPOLLIN'); my $events = []; -my $n = $p->epoll_wait(9, 0, $events); +$p->epoll_wait(9, 0, $events); is_deeply($events, [], 'no events set'); -is($n, 0, 'nothing ready, yet'); is($p->epoll_ctl(EPOLL_CTL_ADD, fileno($w), EPOLLOUT|EPOLLONESHOT), 0, 'add EPOLLOUT|EPOLLONESHOT'); -$n = $p->epoll_wait(9, -1, $events); -is($n, 1, 'got POLLOUT event'); -is($events->[0]->[0], fileno($w), '$w ready'); +$p->epoll_wait(9, -1, $events); +is(scalar(@$events), 1, 'got POLLOUT event'); +is($events->[0], fileno($w), '$w ready'); -$n = $p->epoll_wait(9, 0, $events); -is($n, 0, 'nothing ready after oneshot'); +$p->epoll_wait(9, 0, $events); +is(scalar(@$events), 0, 'nothing ready after oneshot'); is_deeply($events, [], 'no events set after oneshot'); syswrite($w, '1') == 1 or die; for my $t (0..1) { - $n = $p->epoll_wait(9, $t, $events); - is($events->[0]->[0], fileno($r), "level-trigger POLLIN ready #$t"); - is($n, 1, "only event ready #$t"); + $p->epoll_wait(9, $t, $events); + is($events->[0], fileno($r), "level-trigger POLLIN ready #$t"); + is(scalar(@$events), 1, "only event ready #$t"); } syswrite($y, '1') == 1 or die; is($p->epoll_ctl(EPOLL_CTL_ADD, fileno($x), EPOLLIN|EPOLLONESHOT), 0, 'EPOLLIN|EPOLLONESHOT add'); -is($p->epoll_wait(9, -1, $events), 2, 'epoll_wait has 2 ready'); -my @fds = sort(map { $_->[0] } @$events); +$p->epoll_wait(9, -1, $events); +is(scalar @$events, 2, 'epoll_wait has 2 ready'); +my @fds = sort @$events; my @exp = sort((fileno($r), fileno($x))); is_deeply(\@fds, \@exp, 'got both ready FDs'); is($p->epoll_ctl(EPOLL_CTL_DEL, fileno($r), 0), 0, 'EPOLL_CTL_DEL OK'); -$n = $p->epoll_wait(9, 0, $events); -is($n, 0, 'nothing ready after EPOLL_CTL_DEL'); +$p->epoll_wait(9, 0, $events); +is(scalar @$events, 0, 'nothing ready after EPOLL_CTL_DEL'); done_testing; diff --git a/t/epoll.t b/t/epoll.t index b47650e3..a1e73e07 100644 --- a/t/epoll.t +++ b/t/epoll.t @@ -12,11 +12,11 @@ is(epoll_ctl($epfd, EPOLL_CTL_ADD, fileno($w), EPOLLOUT), 0, 'epoll_ctl socket EPOLLOUT'); my @events; -is(epoll_wait($epfd, 100, 10000, \@events), 1, 'epoll_wait returns'); +epoll_wait($epfd, 100, 10000, \@events); is(scalar(@events), 1, 'got one event'); -is($events[0]->[0], fileno($w), 'got expected FD'); -is($events[0]->[1], EPOLLOUT, 'got expected event'); +is($events[0], fileno($w), 'got expected FD'); close $w; -is(epoll_wait($epfd, 100, 0, \@events), 0, 'epoll_wait timeout'); +epoll_wait($epfd, 100, 0, \@events); +is(@events, 0, 'epoll_wait timeout'); done_testing; -- cgit v1.2.3-24-ge0c7