From f1f61f2e7ab16fdf4441e12b4c3cfb98d7e3e3c5 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 18 Dec 2020 20:53:09 +0000 Subject: wwwstream: linkify coderepo URLs It seems like a good idea to get more cgit visibility. --- lib/PublicInbox/WwwStream.pm | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm index 66e34a12..49940262 100644 --- a/lib/PublicInbox/WwwStream.pm +++ b/lib/PublicInbox/WwwStream.pm @@ -78,22 +78,20 @@ sub html_top ($) { sub coderepos ($) { my ($ctx) = @_; - my $ibx = $ctx->{ibx}; + my $cr = $ctx->{ibx}->{coderepo} // return (); + my $cfg = $ctx->{www}->{pi_cfg}; my @ret; - if (defined(my $cr = $ibx->{coderepo})) { - my $cfg = $ctx->{www}->{pi_cfg}; - my $env = $ctx->{env}; - for my $cr_name (@$cr) { - my $urls = $cfg->{"coderepo.$cr_name.cgiturl"}; - if ($urls) { - $ret[0] //= <{"coderepo.$cr_name.cgiturl"} // next; + $ret[0] //= <{env}, $u)); + $ret[0] .= qq(\n\t$u); } } - @ret; # may be empty + @ret; # may be empty, this sub is called as an arg for join() } sub code_footer ($) { -- cgit v1.2.3-24-ge0c7 From 9fcce78e40b0a7c61797be2aff6c5afeb474568e Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 20 Dec 2020 06:30:11 +0000 Subject: script/public-inbox-*: favor caller-provided pathnames We'll try to avoid calling Cwd::abs_path and use File::Spec->rel2abs instead, since abs_path will resolve symlinks the user specified on the command-line. Unfortunately, ->rel2abs still leaves "/.." and "/../" uncollapsed, so we still need to fall back to Cwd::abs_path in those cases. While we are at it, we'll also resolve inboxdir from deep inside v2 directories instead of misdetecting them as v1 bare git repos. In any case, stop matching directories by name and instead rely on the unique combination of st_dev + st_ino on stat() as we started doing in the extindex code. --- lib/PublicInbox/Admin.pm | 102 +++++++++++++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 34 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index 3977d812..ea82133a 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -6,15 +6,15 @@ package PublicInbox::Admin; use strict; use parent qw(Exporter); -use Cwd qw(abs_path); -use POSIX (); our @EXPORT_OK = qw(setup_signals); use PublicInbox::Config; use PublicInbox::Inbox; use PublicInbox::Spawn qw(popen_rd); +use File::Spec (); sub setup_signals { my ($cb, $arg) = @_; # optional + require POSIX; # we call exit() here instead of _exit() so DESTROY methods # get called (e.g. File::Temp::Dir and PublicInbox::Msgmap) @@ -27,21 +27,43 @@ sub setup_signals { }; } +# abs_path resolves symlinks, so we want to avoid it if rel2abs +# is sufficient and doesn't leave "/.." or "/../" +sub rel2abs_collapsed ($) { + my $p = File::Spec->rel2abs($_[0]); + return $p if substr($p, -3, 3) ne '/..' && index($p, '/../') < 0; # likely + require Cwd; + Cwd::abs_path($p); +} + sub resolve_inboxdir { my ($cd, $ver) = @_; - my $prefix = defined $cd ? $cd : './'; - if (-d $prefix && -f "$prefix/inbox.lock") { # v2 - $$ver = 2 if $ver; - return abs_path($prefix); + my $try = $cd // '.'; + my $root_dev_ino; + while (1) { # favor v2, first + if (-f "$try/inbox.lock") { + $$ver = 2 if $ver; + return rel2abs_collapsed($try); + } elsif (-d $try) { + my @try = stat _; + $root_dev_ino //= do { + my @root = stat('/') or die "stat /: $!\n"; + "$root[0]\0$root[1]"; + }; + last if "$try[0]\0$try[1]" eq $root_dev_ino; + $try .= '/..'; # continue, cd up + } else { + die "`$try' is not a directory\n"; + } } + # try v1 bare git dirs my $cmd = [ qw(git rev-parse --git-dir) ]; my $fh = popen_rd($cmd, undef, {-C => $cd}); my $dir = do { local $/; <$fh> }; - close $fh or die "error in ".join(' ', @$cmd)." (cwd:$cd): $!\n"; + close $fh or die "error in @$cmd (cwd:${\($cd // '.')}): $!\n"; chomp $dir; $$ver = 1 if $ver; - return abs_path($cd) if ($dir eq '.' && defined $cd); - abs_path($dir); + rel2abs_collapsed($dir eq '.' ? ($cd // $dir) : $dir); } # for unconfigured inboxes @@ -78,8 +100,8 @@ sub unconfigured_ibx ($$) { name => $name, address => [ "$name\@example.com" ], inboxdir => $dir, - # TODO: consumers may want to warn on this: - #-unconfigured => 1, + # consumers (-convert) warn on this: + -unconfigured => 1, }); } @@ -95,41 +117,53 @@ sub resolve_inboxes ($;$$) { } my $min_ver = $opt->{-min_inbox_version} || 0; + # lookup inboxes by st_dev + st_ino instead of {inboxdir} pathnames, + # pathnames are not unique due to symlinks and bind mounts my (@old, @ibxs); - my %dir2ibx; - my $all = $opt->{all} ? [] : undef; - if ($cfg) { + if ($opt->{all}) { $cfg->each_inbox(sub { my ($ibx) = @_; - my $path = abs_path($ibx->{inboxdir}); - if (defined($path)) { - $dir2ibx{$path} = $ibx; - push @$all, $ibx if $all; + if (-e $ibx->{inboxdir}) { + push(@ibxs, $ibx) if $ibx->version >= $min_ver; } else { - warn <{name} $ibx->{inboxdir}: $! -EOF + warn "W: $ibx->{name} $ibx->{inboxdir}: $!\n"; } }); - } - if ($all) { - @$all = grep { $_->version >= $min_ver } @$all; - @ibxs = @$all; } else { # directories specified on the command-line - my $i = 0; my @dirs = @$argv; push @dirs, '.' if !@dirs && $opt->{-use_cwd}; - foreach (@dirs) { - my $v; - my $dir = resolve_inboxdir($_, \$v); - if ($v < $min_ver) { + my %s2i; # "st_dev\0st_ino" => array index + for (my $i = 0; $i <= $#dirs; $i++) { + my $dir = $dirs[$i]; + my @st = stat($dir) or die "stat($dir): $!\n"; + $dir = resolve_inboxdir($dir, \(my $ver)); + if ($ver >= $min_ver) { + $s2i{"$st[0]\0$st[1]"} //= $i; + } else { push @old, $dir; - next; } - my $ibx = $dir2ibx{$dir} ||= unconfigured_ibx($dir, $i); - $i++; - push @ibxs, $ibx; } + my $done = \'done'; + eval { + $cfg->each_inbox(sub { + my ($ibx) = @_; + return if $ibx->version < $min_ver; + my $dir = $ibx->{inboxdir}; + if (my @s = stat $dir) { + my $i = delete($s2i{"$s[0]\0$s[1]"}) + // return; + $ibxs[$i] = $ibx; + die $done if !keys(%s2i); + } else { + warn "W: $ibx->{name} $dir: $!\n"; + } + }); + }; + die $@ if $@ && $@ ne $done; + for my $i (sort { $a <=> $b } values %s2i) { + $ibxs[$i] = unconfigured_ibx($dirs[$i], $i); + } + @ibxs = grep { defined } @ibxs; # duplicates are undef } if (@old) { die "-V$min_ver inboxes not supported by $0\n\t", -- cgit v1.2.3-24-ge0c7 From ab243aa2328e2fc4cf895c99c68345e57cc4653c Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 20 Dec 2020 06:30:12 +0000 Subject: inboxidle: remove needless check for {inboxdir} ->each_inbox will never attempt to iterate an object without {inboxdir}, and simplify + short-circuit the corresponding code --- lib/PublicInbox/Config.pm | 7 +++---- lib/PublicInbox/InboxIdle.pm | 7 +------ 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index cafd9c3b..199ce019 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -391,9 +391,9 @@ EOF } } - # backwards compatibility: - $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"}; - if (($ibx->{inboxdir} // '') =~ /\n/s) { + # "mainrepo" is backwards compatibility: + $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"} // return; + if ($ibx->{inboxdir} =~ /\n/s) { warn "E: `$ibx->{inboxdir}' must not contain `\\n'\n"; return; } @@ -415,7 +415,6 @@ EOF } } - return unless defined($ibx->{inboxdir}); my $name = $pfx; $name =~ s/\Apublicinbox\.//; diff --git a/lib/PublicInbox/InboxIdle.pm b/lib/PublicInbox/InboxIdle.pm index 2737bbbd..f1cbc012 100644 --- a/lib/PublicInbox/InboxIdle.pm +++ b/lib/PublicInbox/InboxIdle.pm @@ -7,7 +7,6 @@ package PublicInbox::InboxIdle; use strict; use parent qw(PublicInbox::DS); -use Cwd qw(abs_path); use PublicInbox::Syscall qw(EPOLLIN EPOLLET); my $IN_MODIFY = 0x02; # match Linux inotify my $ino_cls; @@ -22,11 +21,7 @@ require PublicInbox::In2Tie if $ino_cls; sub in2_arm ($$) { # PublicInbox::Config::each_inbox callback my ($ibx, $self) = @_; - my $dir = abs_path($ibx->{inboxdir}); - if (!defined($dir)) { - warn "W: $ibx->{inboxdir} not watched: $!\n"; - return; - } + my $dir = $ibx->{inboxdir}; my $inot = $self->{inot}; my $cur = $self->{pathmap}->{$dir} //= []; -- cgit v1.2.3-24-ge0c7 From ba958c1a44ca7e48a5790a2ecb2198636d6492db Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 20 Dec 2020 06:30:13 +0000 Subject: daemon: lazy load Cwd only for --daemonize users systemd users won't need it polluting the namespace; though other things are still likely to load it. --- lib/PublicInbox/Daemon.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm index a2171535..d1a42fc3 100644 --- a/lib/PublicInbox/Daemon.pm +++ b/lib/PublicInbox/Daemon.pm @@ -11,7 +11,6 @@ use IO::Socket; use POSIX qw(WNOHANG :signal_h); use Socket qw(IPPROTO_TCP SOL_SOCKET); sub SO_ACCEPTFILTER () { 0x1000 } -use Cwd qw/abs_path/; STDOUT->autoflush(1); STDERR->autoflush(1); use PublicInbox::DS qw(now); @@ -202,10 +201,11 @@ sub check_absolute ($$) { sub daemonize () { if ($daemonize) { + require Cwd; foreach my $i (0..$#ARGV) { my $arg = $ARGV[$i]; next unless -e $arg; - $ARGV[$i] = abs_path($arg); + $ARGV[$i] = Cwd::abs_path($arg); } check_absolute('stdout', $stdout); check_absolute('stderr', $stderr); -- cgit v1.2.3-24-ge0c7 From df708a4f7c4d62ef685ccf9868c78ff2709c1f40 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 20 Dec 2020 06:30:14 +0000 Subject: daemon: unconditionally call IO::Handle::blocking(0) IO::Handle::blocking will always return the initial value from the F_GETFL op and it won't issue F_SETFL if a socket is already non-blocking. --- lib/PublicInbox/Daemon.pm | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm index d1a42fc3..eeac3bd2 100644 --- a/lib/PublicInbox/Daemon.pm +++ b/lib/PublicInbox/Daemon.pm @@ -367,14 +367,12 @@ sub inherit ($) { foreach my $fd (3..$end) { my $s = IO::Handle->new_from_fd($fd, 'r'); if (my $k = sockname($s)) { - if ($s->blocking) { - $s->blocking(0); - warn <<""; + my $prev_was_blocking = $s->blocking(0); + warn <<"" if $prev_was_blocking; Inherited socket (fd=$fd) is blocking, making it non-blocking. Set 'NonBlocking = true' in the systemd.service unit to avoid stalled processes when multiple service instances start. - } $listener_names->{$k} = $s; push @rv, $s; } else { -- cgit v1.2.3-24-ge0c7 From e77241850763dea9995381f3c0c7b354aa791cc0 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 20 Dec 2020 06:30:15 +0000 Subject: daemon: kill_workers: eliminate unnecessary loop The `kill' perl op takes multiple PIDs, so there's no need to iterate through the %pids hash. --- lib/PublicInbox/Daemon.pm | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm index eeac3bd2..1762be0b 100644 --- a/lib/PublicInbox/Daemon.pm +++ b/lib/PublicInbox/Daemon.pm @@ -419,11 +419,8 @@ sub upgrade { # $_[0] = signal name or number (unused) } sub kill_workers ($) { - my ($s) = @_; - - while (my ($pid, $id) = each %pids) { - kill $s, $pid; - } + my ($sig) = @_; + kill $sig, keys(%pids); } sub upgrade_aborted ($) { -- cgit v1.2.3-24-ge0c7 From fb7d958dbc601f126dc391d915f5935d4cfc595e Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 20 Dec 2020 06:30:16 +0000 Subject: config: eliminate unnecessary join call up front We can rely on implicit join in string interpolation on die() iff needed. And just creating the arrayref up front to avoid an extra backslash seems nicer at the moment. --- lib/PublicInbox/Config.pm | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 199ce019..2f5c83cd 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -163,11 +163,10 @@ sub config_fh_parse ($$$) { sub git_config_dump { my ($file) = @_; return {} unless -e $file; - my @cmd = (qw/git config -z -l --includes/, "--file=$file"); - my $cmd = join(' ', @cmd); - my $fh = popen_rd(\@cmd); + my $cmd = [ qw(git config -z -l --includes), "--file=$file" ]; + my $fh = popen_rd($cmd); my $rv = config_fh_parse($fh, "\0", "\n"); - close $fh or die "failed to close ($cmd) pipe: $?"; + close $fh or die "failed to close (@$cmd) pipe: $?"; $rv; } -- cgit v1.2.3-24-ge0c7 From 3ce4c38119f13d419bb865a0aa9b66feff339308 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 21 Dec 2020 19:41:21 +0000 Subject: manifest.js.gz: fix per-inbox /$INBOX/manifest.js.gz /$INBOX/manifest.js.gz should not attempt to match every inbox in the domain (or every inbox); that is for /manifest.js.gz (without a /$INBOX prefix). Fixes: f303b4add8ea1883 ("wwwlisting: avoid hogging event loop") --- lib/PublicInbox/ManifestJsGz.pm | 7 +++++++ lib/PublicInbox/WWW.pm | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/PublicInbox/ManifestJsGz.pm b/lib/PublicInbox/ManifestJsGz.pm index 6d5b57ee..e02450fa 100644 --- a/lib/PublicInbox/ManifestJsGz.pm +++ b/lib/PublicInbox/ManifestJsGz.pm @@ -99,4 +99,11 @@ sub psgi_triple { 'Content-Length', bytes::length($out) ], [ $out ] ] } +sub per_inbox { + my ($ctx) = @_; + # only one inbox, slow is probably OK + slow_manifest_add($ctx, $ctx->{ibx}); + psgi_triple($ctx); +} + 1; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index a33d25ab..52630ae3 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -505,7 +505,7 @@ sub get_inbox_manifest ($$$) { my $r404 = invalid_inbox($ctx, $inbox); return $r404 if $r404; require PublicInbox::ManifestJsGz; - PublicInbox::ManifestJsGz->response($ctx); + PublicInbox::ManifestJsGz::per_inbox($ctx); } sub get_attach { -- cgit v1.2.3-24-ge0c7 From eea4ba7bac73b67ccd2cdd07946b176260ac5fac Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 21 Dec 2020 07:51:18 +0000 Subject: inbox: delay ->version detection Our read-only code won't need to know the version until an inbox is accessed. This is a small step towards eliminating many stat() calls on read-only daemon startup. --- lib/PublicInbox/Inbox.pm | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 8a3a0194..863a5de4 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -109,10 +109,6 @@ sub new { delete $opts->{feedmax}; } $opts->{nntpserver} ||= $pi_cfg->{'publicinbox.nntpserver'}; - my $dir = $opts->{inboxdir}; - if (defined $dir && -f "$dir/inbox.lock") { - $opts->{version} = 2; - } # allow any combination of multi-line or comma-delimited hide entries my $hide = {}; @@ -125,7 +121,9 @@ sub new { bless $opts, $class; } -sub version { $_[0]->{version} // 1 } +sub version { + $_[0]->{version} //= -f "$_[0]->{inboxdir}/inbox.lock" ? 2 : 1 +} sub git_epoch { my ($self, $epoch) = @_; # v2-only, callers always supply $epoch -- cgit v1.2.3-24-ge0c7 From bad84119fb0915abe3f19fe4fb9c34e24fe7e564 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 21 Dec 2020 07:51:19 +0000 Subject: isearch: use numeric sort for article numbers Perl sort is alphabetical by default and Xapian uses numeric document IDs, so sort must be told explicitly to use numeric comparisons even if the scalars are integer values (IV) internally. And eliminate extra hash marks ("#") since they're probably too noisy if there are many IDs. Note: I haven't seen this warning message in syslog, yet :> --- lib/PublicInbox/Isearch.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm index 8a1f257a..e362c80a 100644 --- a/lib/PublicInbox/Isearch.pm +++ b/lib/PublicInbox/Isearch.pm @@ -89,7 +89,7 @@ SELECT docid,xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks) } if (scalar keys %order) { warn "W: $self->{es}->{topdir} #", - join(', #', sort keys %order), + join(', ', sort { $a <=> $b } keys %order), " not mapped to `$self->{eidx_key}'\n"; warn "W: $self->{es}->{topdir} may need to be reindexed\n"; @xnums = grep { defined } @xnums; @@ -113,7 +113,7 @@ sub mset_to_smsg { } if (scalar keys %order) { warn "W: $ibx->{inboxdir} #", - join(', #', sort keys %order), + join(', ', sort { $a <=> $b } keys %order), " no longer valid\n"; warn "W: $self->{es}->{topdir} may need to be reindexed\n"; } -- cgit v1.2.3-24-ge0c7 From 3e9888ed30b7fe092b03789d19a8020d4bc0fb39 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 21 Dec 2020 07:51:20 +0000 Subject: use rel2abs_collapsed when loading Inbox objects We need to canonicalize paths for inboxes which do not have a newsgroup defined, otherwise ->eidx_key matches can fail in unexpected ways. --- lib/PublicInbox/Admin.pm | 11 +---------- lib/PublicInbox/Config.pm | 28 +++++++++++++++++++++++----- lib/PublicInbox/ExtSearchIdx.pm | 5 ----- 3 files changed, 24 insertions(+), 20 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index ea82133a..c972fb68 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -10,7 +10,7 @@ our @EXPORT_OK = qw(setup_signals); use PublicInbox::Config; use PublicInbox::Inbox; use PublicInbox::Spawn qw(popen_rd); -use File::Spec (); +*rel2abs_collapsed = \&PublicInbox::Config::rel2abs_collapsed; sub setup_signals { my ($cb, $arg) = @_; # optional @@ -27,15 +27,6 @@ sub setup_signals { }; } -# abs_path resolves symlinks, so we want to avoid it if rel2abs -# is sufficient and doesn't leave "/.." or "/../" -sub rel2abs_collapsed ($) { - my $p = File::Spec->rel2abs($_[0]); - return $p if substr($p, -3, 3) ne '/..' && index($p, '/../') < 0; # likely - require Cwd; - Cwd::abs_path($p); -} - sub resolve_inboxdir { my ($cd, $ver) = @_; my $try = $cd // '.'; diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 2f5c83cd..577337dc 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -368,6 +368,16 @@ sub git_bool { } } +# abs_path resolves symlinks, so we want to avoid it if rel2abs +# is sufficient and doesn't leave "/.." or "/../" +sub rel2abs_collapsed { + require File::Spec; + my $p = File::Spec->rel2abs($_[-1]); + return $p if substr($p, -3, 3) ne '/..' && index($p, '/../') < 0; + require Cwd; + Cwd::abs_path($p); +} + sub _fill { my ($self, $pfx) = @_; my $ibx = {}; @@ -391,9 +401,9 @@ EOF } # "mainrepo" is backwards compatibility: - $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"} // return; - if ($ibx->{inboxdir} =~ /\n/s) { - warn "E: `$ibx->{inboxdir}' must not contain `\\n'\n"; + my $dir = $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"} // return; + if (index($dir, "\n") >= 0) { + warn "E: `$dir' must not contain `\\n'\n"; return; } foreach my $k (qw(obfuscate)) { @@ -436,7 +446,7 @@ EOF $self->{-by_list_id}->{lc($list_id)} = $ibx; } } - if (my $ngname = $ibx->{newsgroup}) { + if (defined(my $ngname = $ibx->{newsgroup})) { if (ref($ngname)) { delete $ibx->{newsgroup}; warn 'multiple newsgroups not supported: '. @@ -445,7 +455,8 @@ EOF # wildmat-exact and RFC 3501 (IMAP) ATOM-CHAR. # Leave out a few chars likely to cause problems or conflicts: # '|', '<', '>', ';', '#', '$', '&', - } elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]!) { + } elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]! || + $ngname eq '') { delete $ibx->{newsgroup}; warn "newsgroup name invalid: `$ngname'\n"; } else { @@ -454,6 +465,13 @@ EOF $self->{-by_newsgroup}->{$ngname} = $ibx; } } + unless (defined $ibx->{newsgroup}) { # for ->eidx_key + my $abs = rel2abs_collapsed($dir); + if ($abs ne $dir) { + warn "W: `$dir' canonicalized to `$abs'\n"; + $ibx->{inboxdir} = $abs; + } + } $self->{-by_name}->{$name} = $ibx; if ($ibx->{obfuscate}) { $ibx->{-no_obfuscate} = $self->{-no_obfuscate}; diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index b82d0546..c4b429df 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -72,11 +72,6 @@ sub attach_inbox { warn "W: skipping $key (no UIDVALIDITY)\n"; return; } - my $ibxdir = File::Spec->canonpath($ibx->{inboxdir}); - if ($ibxdir ne $ibx->{inboxdir}) { - warn "W: `$ibx->{inboxdir}' canonicalized to `$ibxdir'\n"; - $ibx->{inboxdir} = $ibxdir; - } $self->{ibx_map}->{$key} //= do { push @{$self->{ibx_list}}, $ibx; $ibx; -- cgit v1.2.3-24-ge0c7 From 427b4fbbc68e4e03b20d66062dd47a0213e18390 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 21 Dec 2020 07:51:21 +0000 Subject: searchidx: rename get_val to int_val and return IV Values can be strings in Xapian, although we currently use integer values exclusively. Give the wrapper a more appropriate name in case we start using string columns. For future-proofing, we'll now return `undef' on missing columns and coerce the return value to an IV (integer value) to save memory, as sortable_unserialise returns a PV (pointer value) scalar despite it existing to support numeric values. --- lib/PublicInbox/SearchIdx.pm | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index b731f698..cf2c2c55 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -501,17 +501,18 @@ sub remove_eidx_info { $self->{xdb}->replace_document($docid, $doc); } -sub get_val ($$) { +sub int_val ($$) { my ($doc, $col) = @_; - sortable_unserialise($doc->get_value($col)); + my $val = $doc->get_value($col) or return; # undefined is '' in Xapian + sortable_unserialise($val) + 0; # PV => IV conversion } sub smsg_from_doc ($) { my ($doc) = @_; my $data = $doc->get_data or return; my $smsg = bless {}, 'PublicInbox::Smsg'; - $smsg->{ts} = get_val($doc, PublicInbox::Search::TS()); - my $dt = get_val($doc, PublicInbox::Search::DT()); + $smsg->{ts} = int_val($doc, PublicInbox::Search::TS()); + my $dt = int_val($doc, PublicInbox::Search::DT()); my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt); $smsg->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy); $smsg->load_from_data($data); -- cgit v1.2.3-24-ge0c7 From c13272432ad28adb506faf6fb9121569cf5ec710 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 21 Dec 2020 07:51:22 +0000 Subject: extsearch*: drop unnecessary path canonicalization Unlike inboxdir, the canonical-ness of -extindex paths is not relevant at the moment, and may never be relevant at all. So don't mislead others into thinking these paths being canonicalized matters. --- lib/PublicInbox/ExtSearch.pm | 2 -- lib/PublicInbox/ExtSearchIdx.pm | 2 -- 2 files changed, 4 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm index 2a560935..a2b97798 100644 --- a/lib/PublicInbox/ExtSearch.pm +++ b/lib/PublicInbox/ExtSearch.pm @@ -9,7 +9,6 @@ use strict; use v5.10.1; use PublicInbox::Over; use PublicInbox::Inbox; -use File::Spec (); use PublicInbox::MiscSearch; use DBI qw(:sql_types); # SQL_BLOB @@ -18,7 +17,6 @@ use parent qw(PublicInbox::Search); sub new { my (undef, $topdir) = @_; - $topdir = File::Spec->canonpath($topdir); bless { topdir => $topdir, # xpfx => 'ei15' diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index c4b429df..f04e0443 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -30,13 +30,11 @@ use PublicInbox::V2Writable; use PublicInbox::InboxWritable; use PublicInbox::ContentHash qw(content_hash); use PublicInbox::Eml; -use File::Spec; use PublicInbox::DS qw(now); use DBI qw(:sql_types); # SQL_BLOB sub new { my (undef, $dir, $opt) = @_; - $dir = File::Spec->canonpath($dir); my $l = $opt->{indexlevel} // 'full'; $l !~ $PublicInbox::SearchIdx::INDEXLEVELS and die "invalid indexlevel=$l\n"; -- cgit v1.2.3-24-ge0c7 From 2adbbf87748719b422ba08d674ccb35637cb76b5 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 21 Dec 2020 22:10:55 +0000 Subject: support multiple CODE_URLs public-inbox.org will expire in a few years, so ensure Tor .onions can be known before then. --- lib/PublicInbox/Unsubscribe.pm | 9 ++++++--- lib/PublicInbox/WwwStream.pm | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Unsubscribe.pm b/lib/PublicInbox/Unsubscribe.pm index b0d3220c..ae0b0679 100644 --- a/lib/PublicInbox/Unsubscribe.pm +++ b/lib/PublicInbox/Unsubscribe.pm @@ -12,7 +12,8 @@ use warnings; use Crypt::CBC; use Plack::Util; use MIME::Base64 qw(decode_base64url); -my $CODE_URL = 'https://public-inbox.org/public-inbox.git'; +my @CODE_URL = qw(http://ou63pmih66umazou.onion/public-inbox.git + https://public-inbox.org/public-inbox.git); my @CT_HTML = ('Content-Type', 'text/html; charset=UTF-8'); sub new { @@ -38,13 +39,15 @@ sub new { my $unsubscribe = $opt{unsubscribe} or die "`unsubscribe' callback not given\n"; + my $code_url = $opt{code_url} || \@CODE_URL; + $code_url = [ $code_url ] if ref($code_url) ne 'ARRAY'; bless { pi_cfg => $opt{pi_config}, # PublicInbox::Config owner_email => $opt{owner_email}, cipher => $cipher, unsubscribe => $unsubscribe, contact => qq($e), - code_url => $opt{code_url} || $CODE_URL, + code_url => $code_url, confirm => $opt{confirm}, }, $class; } @@ -138,7 +141,7 @@ sub r { "$title
".
 		join("\n", "$title\n", @body) . '

'. "
This page is available under AGPL-3.0+\n" .
-		"git clone $self->{code_url}\n" .
+		join('', map { "git clone $_\n" } @{$self->{code_url}}) .
 		qq(Email $self->{contact} if you have any questions).
 		'
' ] ]; diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm index 49940262..9ba8fa11 100644 --- a/lib/PublicInbox/WwwStream.pm +++ b/lib/PublicInbox/WwwStream.pm @@ -12,7 +12,8 @@ our @EXPORT_OK = qw(html_oneshot); use bytes (); # length use PublicInbox::Hval qw(ascii_html prurl ts2str); our $TOR_URL = 'https://www.torproject.org/'; -our $CODE_URL = 'https://public-inbox.org/public-inbox.git'; +our $CODE_URL = [ qw(http://ou63pmih66umazou.onion/public-inbox.git + https://public-inbox.org/public-inbox.git) ]; sub base_url ($) { my $ctx = shift; -- cgit v1.2.3-24-ge0c7 From 949e8b4a65a2dbb99d8923ebb4715a8724ca8bf2 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 22 Dec 2020 06:01:44 +0000 Subject: admin: resolve inboxes to absolute paths for index Some of my ancient v1-only scripts called public-inbox-index to operate on GIT_DIR: GIT_DIR=/path/to/foo.git public-inbox-index This change ensures they keep working, otherwise "." will be passed to the --git-dir= switch of git(1) because that's the default directory if no inboxes are specified on the command-line. Fixes: 9fcce78e40b0a7c6 ("script/public-inbox-*: favor caller-provided pathnames") --- lib/PublicInbox/Admin.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index c972fb68..9a86d206 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -127,7 +127,7 @@ sub resolve_inboxes ($;$$) { for (my $i = 0; $i <= $#dirs; $i++) { my $dir = $dirs[$i]; my @st = stat($dir) or die "stat($dir): $!\n"; - $dir = resolve_inboxdir($dir, \(my $ver)); + $dir = $dirs[$i] = resolve_inboxdir($dir, \(my $ver)); if ($ver >= $min_ver) { $s2i{"$st[0]\0$st[1]"} //= $i; } else { -- cgit v1.2.3-24-ge0c7 From 37526779e4c544b134af74f43d6c26569cb8e466 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 22 Dec 2020 05:04:16 +0000 Subject: wwwstream: show relative coderepo URLs correctly Trying to link "foo.git" relative to the current URL usually does not provide correct results, so prefix it by going into the parent directory if an absolute (or protocol-relative) URL is not supplied. --- lib/PublicInbox/WwwStream.pm | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm index 9ba8fa11..958251a3 100644 --- a/lib/PublicInbox/WwwStream.pm +++ b/lib/PublicInbox/WwwStream.pm @@ -81,13 +81,17 @@ sub coderepos ($) { my ($ctx) = @_; my $cr = $ctx->{ibx}->{coderepo} // return (); my $cfg = $ctx->{www}->{pi_cfg}; + my $upfx = ($ctx->{-upfx} // ''). '../'; my @ret; for my $cr_name (@$cr) { my $urls = $cfg->{"coderepo.$cr_name.cgiturl"} // next; $ret[0] //= <{env}, $u)); $ret[0] .= qq(\n\t$u); } -- cgit v1.2.3-24-ge0c7 From 7e2bbc90bcc458f91d29c6847245857d7bb8fb60 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:44 +0000 Subject: miscsearch: load Xapian at initialization We need Xapian bindings loaded before calling (Search::)Xapian::Database->new --- lib/PublicInbox/MiscSearch.pm | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm index f2e31443..de587d35 100644 --- a/lib/PublicInbox/MiscSearch.pm +++ b/lib/PublicInbox/MiscSearch.pm @@ -23,6 +23,7 @@ my %PROB_PREFIX = ( sub new { my ($class, $dir) = @_; + PublicInbox::Search::load_xapian(); bless { xdb => $PublicInbox::Search::X{Database}->new($dir) }, $class; -- cgit v1.2.3-24-ge0c7 From fbc71f023f2356b2e88a8cfeb738a945085ac9a2 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:46 +0000 Subject: inbox: git_epoch: correct false comment The original comment hasn't been true since PublicInbox::Git->modified was changed to use cat_async blob responses. In any case, manifest.js.gz generation already cleans up per-epoch git processes used for ->modified. --- lib/PublicInbox/Inbox.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 863a5de4..ec8469e3 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -132,7 +132,7 @@ sub git_epoch { return unless -d $git_dir; my $g = PublicInbox::Git->new($git_dir); $g->{-httpbackend_limiter} = $self->{-httpbackend_limiter}; - # no cleanup needed, we never cat-file off this, only clone + # caller must manually cleanup when done $g; }; } -- cgit v1.2.3-24-ge0c7 From 361cdfd0ae67d52d8a589b4ddc6e7fa94d8a9c8d Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:47 +0000 Subject: inboxwritable: _init_v1: set created_at ASAP This ensures we have UIDVALIDITY to index earlier rather than later for v1 inboxes, matching v2 behavior. --- lib/PublicInbox/InboxWritable.pm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index c0e88f3d..69275bb0 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -46,12 +46,13 @@ sub _init_v1 { require PublicInbox::Msgmap; my $sidx = PublicInbox::SearchIdx->new($self, 1); # just create $sidx->begin_txn_lazy; + my $mm = PublicInbox::Msgmap->new($self->{inboxdir}, 1); if (defined $skip_artnum) { - my $mm = PublicInbox::Msgmap->new($self->{inboxdir}, 1); $mm->{dbh}->begin_work; $mm->skip_artnum($skip_artnum); $mm->{dbh}->commit; } + undef $mm; # ->created_at set $sidx->commit_txn_lazy; } else { open my $fh, '>>', "$self->{inboxdir}/ssoma.lock" or -- cgit v1.2.3-24-ge0c7 From 4a2e89007cb7b62151cb1869e49b27ebacfc27eb Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:48 +0000 Subject: miscsearch: index UIDVALIDITY, use as startup cache This brings -nntpd startup time down from ~35s to ~5s with 50K inboxes. Further improvements ought to be possible with deeper changes to MiscIdx, since -mda having to load every inbox seems unreasonable; but this general change is fairly unintrusive. --- lib/PublicInbox/ExtSearchIdx.pm | 22 +++++++++------- lib/PublicInbox/MiscIdx.pm | 26 +++++++++++++------ lib/PublicInbox/MiscSearch.pm | 56 ++++++++++++++++++++++++++++++++++++++--- lib/PublicInbox/NNTPD.pm | 4 ++- lib/PublicInbox/Search.pm | 9 ++++++- lib/PublicInbox/SearchIdx.pm | 7 ------ 6 files changed, 94 insertions(+), 30 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index f04e0443..9d64ff5a 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -61,16 +61,20 @@ sub new { sub attach_inbox { my ($self, $ibx) = @_; - my $key = $ibx->eidx_key; - if (!$ibx->over || !$ibx->mm) { - warn "W: skipping $key (unindexed)\n"; - return; - } - if (!defined($ibx->uidvalidity)) { - warn "W: skipping $key (no UIDVALIDITY)\n"; - return; + my $ekey = $ibx->eidx_key; + my $misc = $self->{misc}; + if ($misc && $misc->inbox_data($ibx)) { # all good if already indexed + } else { + if (!$ibx->over || !$ibx->mm) { + warn "W: skipping $ekey (unindexed)\n"; + return; + } + if (!defined($ibx->uidvalidity)) { + warn "W: skipping $ekey (no UIDVALIDITY)\n"; + return; + } } - $self->{ibx_map}->{$key} //= do { + $self->{ibx_map}->{$ekey} //= do { push @{$self->{ibx_list}}, $ibx; $ibx; } diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm index 64591d05..a04dd1c5 100644 --- a/lib/PublicInbox/MiscIdx.pm +++ b/lib/PublicInbox/MiscIdx.pm @@ -21,6 +21,7 @@ use Carp qw(croak); use File::Path (); use PublicInbox::MiscSearch; use PublicInbox::Config; +my $json; sub new { my ($class, $eidx) = @_; @@ -30,6 +31,7 @@ sub new { nodatacow_dir($mi_dir); my $flags = $PublicInbox::SearchIdx::DB_CREATE_OR_OPEN; $flags |= $PublicInbox::SearchIdx::DB_NO_SYNC if $eidx->{-no_fsync}; + $json //= PublicInbox::Config::json(); bless { mi_dir => $mi_dir, flags => $flags, @@ -91,17 +93,27 @@ EOF $xdb->delete_document($_) for @drop; # just in case my $doc = $PublicInbox::Search::X{Document}->new; + term_generator($self)->set_document($doc); - # allow sorting by modified + # allow sorting by modified and uidvalidity (created at) add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified); + add_val($doc, $PublicInbox::MiscSearch::UIDVALIDITY, $ibx->uidvalidity); - $doc->add_boolean_term('Q'.$eidx_key); - $doc->add_boolean_term('T'.'inbox'); - term_generator($self)->set_document($doc); + $doc->add_boolean_term('Q'.$eidx_key); # uniQue id + $doc->add_boolean_term('T'.'inbox'); # Type + + if (defined($ibx->{newsgroup}) && $ibx->nntp_usable) { + $doc->add_boolean_term('T'.'newsgroup'); # additional Type + } + + # force reread from disk, {description} could be loaded from {misc} + delete $ibx->{description}; + my $desc = $ibx->description; # description = S/Subject (or title) # address = A/Author - index_text($self, $ibx->description, 1, 'S'); + index_text($self, $desc, 1, 'S'); + index_text($self, $ibx->{name}, 1, 'XNAME'); my %map = ( address => 'A', listid => 'XLISTID', @@ -113,10 +125,8 @@ EOF index_text($self, $v, 1, $pfx); } } - index_text($self, $ibx->{name}, 1, 'XNAME'); my $data = {}; if (defined(my $max = $ibx->max_git_epoch)) { # v2 - my $desc = $ibx->description; my $pfx = "/$ibx->{name}/git/"; for my $epoch (0..$max) { my $git = $ibx->git_epoch($epoch) or return; @@ -130,7 +140,7 @@ EOF $ent->{git_dir} = $ibx->{inboxdir}; $data->{"/$ibx->{name}"} = $ent; } - $doc->set_data(PublicInbox::Config::json()->encode($data)); + $doc->set_data($json->encode($data)); if (defined $docid) { $xdb->replace_document($docid, $doc); } else { diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm index de587d35..c6ce255f 100644 --- a/lib/PublicInbox/MiscSearch.pm +++ b/lib/PublicInbox/MiscSearch.pm @@ -5,10 +5,12 @@ package PublicInbox::MiscSearch; use strict; use v5.10.1; -use PublicInbox::Search qw(retry_reopen); +use PublicInbox::Search qw(retry_reopen int_val); +my $json; # Xapian value columns: our $MODIFIED = 0; +our $UIDVALIDITY = 1; # (created time) # avoid conflicting with message Search::prob_prefix for UI/UX reasons my %PROB_PREFIX = ( @@ -24,6 +26,7 @@ my %PROB_PREFIX = ( sub new { my ($class, $dir) = @_; PublicInbox::Search::load_xapian(); + $json //= PublicInbox::Config::json(); bless { xdb => $PublicInbox::Search::X{Database}->new($dir) }, $class; @@ -120,11 +123,13 @@ sub newsgroup_matches { sub ibx_data_once { my ($self, $ibx) = @_; my $xdb = $self->{xdb}; - my $eidx_key = $ibx->eidx_key; # may be {inboxdir}, so private - my $head = $xdb->postlist_begin('Q'.$eidx_key); - my $tail = $xdb->postlist_end('Q'.$eidx_key); + my $term = 'Q'.$ibx->eidx_key; # may be {inboxdir}, so private + my $head = $xdb->postlist_begin($term); + my $tail = $xdb->postlist_end($term); if ($head != $tail) { my $doc = $xdb->get_document($head->get_docid); + $ibx->{uidvalidity} //= int_val($doc, $UIDVALIDITY); + $ibx->{-modified} = int_val($doc, $MODIFIED); $doc->get_data; } else { undef; @@ -136,4 +141,47 @@ sub inbox_data { retry_reopen($self, \&ibx_data_once, $ibx); } +sub ibx_cache_load { + my ($doc, $cache) = @_; + my $end = $doc->termlist_end; + my $cur = $doc->termlist_begin; + $cur->skip_to('Q'); + return if $cur == $end; + my $eidx_key = $cur->get_termname; + $eidx_key =~ s/\AQ// or return; # expired + my $ce = $cache->{$eidx_key} = {}; + $ce->{uidvalidity} = int_val($doc, $UIDVALIDITY); + $ce->{-modified} = int_val($doc, $MODIFIED); + $ce->{description} = do { + # extract description from manifest.js.gz epoch description + my $d; + my $data = $json->decode($doc->get_data); + for (values %$data) { + $d = $_->{description} // next; + $d =~ s/ \[epoch [0-9]+\]\z// or next; + last; + } + $d; + } +} + +sub _nntpd_cache_load { # retry_reopen callback + my ($self) = @_; + my $opt = { limit => $self->{xdb}->get_doccount * 10, relevance => -1 }; + my $mset = mset($self, 'type:newsgroup type:inbox', $opt); + my $cache = {}; + for my $it ($mset->items) { + ibx_cache_load($it->get_document, $cache); + } + $cache +} + +# returns { newsgroup => $cache_entry } mapping, $cache_entry contains +# anything which may trigger seeks at startup, currently: description, +# -modified, and uidvalidity. +sub nntpd_cache_load { + my ($self) = @_; + retry_reopen($self, \&_nntpd_cache_load); +} + 1; diff --git a/lib/PublicInbox/NNTPD.pm b/lib/PublicInbox/NNTPD.pm index 7f9a1d58..6907a03c 100644 --- a/lib/PublicInbox/NNTPD.pm +++ b/lib/PublicInbox/NNTPD.pm @@ -36,10 +36,12 @@ sub refresh_groups { my ($self, $sig) = @_; my $pi_cfg = $sig ? PublicInbox::Config->new : $self->{pi_cfg}; my $groups = $pi_cfg->{-by_newsgroup}; # filled during each_inbox + my $cache = eval { $pi_cfg->ALL->misc->nntpd_cache_load } // {}; $pi_cfg->each_inbox(sub { my ($ibx) = @_; my $ngname = $ibx->{newsgroup} // return; - if ($ibx->nntp_usable) { + my $ce = $cache->{$ngname}; + if (($ce and (%$ibx = (%$ibx, %$ce))) || $ibx->nntp_usable) { # only valid if msgmap and over works # preload to avoid fragmentation: $ibx->description; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index b1d38fb9..05c679c9 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -6,7 +6,7 @@ package PublicInbox::Search; use strict; use parent qw(Exporter); -our @EXPORT_OK = qw(retry_reopen); +our @EXPORT_OK = qw(retry_reopen int_val); use List::Util qw(max); # values for searching, changing the numeric value breaks @@ -91,6 +91,7 @@ sub load_xapian () { 1 : Search::Xapian::ENQ_ASCENDING(); *sortable_serialise = $x.'::sortable_serialise'; + *sortable_unserialise = $x.'::sortable_unserialise'; # n.b. FLAG_PURE_NOT is expensive not suitable for a public # website as it could become a denial-of-service vector # FLAG_PHRASE also seems to cause performance problems chert @@ -436,4 +437,10 @@ sub help { \@ret; } +sub int_val ($$) { + my ($doc, $col) = @_; + my $val = $doc->get_value($col) or return; # undefined is '' in Xapian + sortable_unserialise($val) + 0; # PV => IV conversion +} + 1; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index cf2c2c55..d1b0c724 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -106,7 +106,6 @@ sub load_xapian_writable () { } eval 'require '.$X->{WritableDatabase} or die; *sortable_serialise = $xap.'::sortable_serialise'; - *sortable_unserialise = $xap.'::sortable_unserialise'; $DB_CREATE_OR_OPEN = eval($xap.'::DB_CREATE_OR_OPEN()'); $DB_OPEN = eval($xap.'::DB_OPEN()'); my $ver = (eval($xap.'::major_version()') << 16) | @@ -501,12 +500,6 @@ sub remove_eidx_info { $self->{xdb}->replace_document($docid, $doc); } -sub int_val ($$) { - my ($doc, $col) = @_; - my $val = $doc->get_value($col) or return; # undefined is '' in Xapian - sortable_unserialise($val) + 0; # PV => IV conversion -} - sub smsg_from_doc ($) { my ($doc) = @_; my $data = $doc->get_data or return; -- cgit v1.2.3-24-ge0c7 From 6f9b927bf1fc5e84b92532477b275a45cd30cb01 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:49 +0000 Subject: extsearchidx: close SQLite handles after attaching This is needed to prevent us from running out of FDs when indexing many inboxes. Perhaps checking these on attach_inbox is unnecessary and may be removed entirely down the line. --- lib/PublicInbox/ExtSearchIdx.pm | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 9d64ff5a..fb627089 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -65,11 +65,14 @@ sub attach_inbox { my $misc = $self->{misc}; if ($misc && $misc->inbox_data($ibx)) { # all good if already indexed } else { - if (!$ibx->over || !$ibx->mm) { + my @sqlite = ($ibx->over, $ibx->mm); + my $uidvalidity = $ibx->uidvalidity; + $ibx->{mm} = $ibx->{over} = undef; + if (scalar(@sqlite) != 2) { warn "W: skipping $ekey (unindexed)\n"; return; } - if (!defined($ibx->uidvalidity)) { + if (!defined($uidvalidity)) { warn "W: skipping $ekey (no UIDVALIDITY)\n"; return; } -- cgit v1.2.3-24-ge0c7 From 83a03a80c7392148fa65143f60fb16d15cf19006 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:50 +0000 Subject: config: _fill: inbox name extraction optimization Using substr() instead of a string copy + s// substitution here reduces ->fill_all from 4.00s to 3.88s with 50K inboxes on my workstation. --- lib/PublicInbox/Config.pm | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 577337dc..cd8957a1 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -424,9 +424,7 @@ EOF } } - my $name = $pfx; - $name =~ s/\Apublicinbox\.//; - + my $name = substr($pfx, length('publicinbox.')); if (!valid_inbox_name($name)) { warn "invalid inbox name: '$name'\n"; return; -- cgit v1.2.3-24-ge0c7 From 80424a6a72dbe2b1ea7ba9bfba8f273e4da385a7 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:51 +0000 Subject: config: git_config_dump: pre-compile RE for split It appears the Perl split() operator is not optimized for fixed strings at all. With this change, PublicInbox::Config->new (w/o ->fill_all) time is reduced from 1.81s to 1.22s on a config file with 50K inboxes. --- lib/PublicInbox/Config.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index cd8957a1..4d143c6e 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -165,7 +165,7 @@ sub git_config_dump { return {} unless -e $file; my $cmd = [ qw(git config -z -l --includes), "--file=$file" ]; my $fh = popen_rd($cmd); - my $rv = config_fh_parse($fh, "\0", "\n"); + my $rv = config_fh_parse($fh, "\0", qr/\n/); close $fh or die "failed to close (@$cmd) pipe: $?"; $rv; } -- cgit v1.2.3-24-ge0c7 From a05445fb400108e60ede7d377cf3b26a0392eb24 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:52 +0000 Subject: config: config_fh_parse: micro-optimize We can avoid a slow regexp capture and instead and rely on rindex + substr to extract the section from the config file. Then we use the defined-or-assignment (//=) operator combined with the documented return value of `push' to ensure @section_order is unique without repeating a hash lookup. Finally, we avoid short-lived variables inside the loop and declare them subroutine-wide to knock a teeny bit of allocation time. Combined, these optimizations bring the ~1.22s PublicInbox::Config->new time down to ~0.98s with 50K inboxes. --- lib/PublicInbox/Config.pm | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 4d143c6e..60107d45 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -132,20 +132,15 @@ sub default_file { sub config_fh_parse ($$$) { my ($fh, $rs, $fs) = @_; - my %rv; - my (%section_seen, @section_order); + my (%rv, %section_seen, @section_order, $line, $k, $v, $section, $cur); local $/ = $rs; - while (defined(my $line = <$fh>)) { + while (defined($line = <$fh>)) { # performance critical with giant configs chomp $line; - my ($k, $v) = split($fs, $line, 2); - my ($section) = ($k =~ /\A(\S+)\.[^\.]+\z/); - unless (defined $section_seen{$section}) { - $section_seen{$section} = 1; - push @section_order, $section; - } + ($k, $v) = split($fs, $line, 2); + $section = substr($k, 0, rindex($k, '.')); + $section_seen{$section} //= push(@section_order, $section); - my $cur = $rv{$k}; - if (defined $cur) { + if (defined($cur = $rv{$k})) { if (ref($cur) eq "ARRAY") { push @$cur, $v; } else { -- cgit v1.2.3-24-ge0c7 From 74e705e24dffaf4dcb3952910678df7529783762 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 23 Dec 2020 08:38:53 +0000 Subject: config: config_fh_parse: micro-optimize harder Instead of relying on split() and a regexp, we'll drop split() entirely and rely on index() + two substr() calls to operate on fixed strings. This brings PublicInbox::Config->new time down from 0.98s down to 0.84s. --- lib/PublicInbox/Config.pm | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 60107d45..21f2161a 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -132,13 +132,14 @@ sub default_file { sub config_fh_parse ($$$) { my ($fh, $rs, $fs) = @_; - my (%rv, %section_seen, @section_order, $line, $k, $v, $section, $cur); + my (%rv, %seen, @section_order, $line, $k, $v, $section, $cur, $i); local $/ = $rs; - while (defined($line = <$fh>)) { # performance critical with giant configs - chomp $line; - ($k, $v) = split($fs, $line, 2); + while (defined($line = <$fh>)) { # perf critical with giant configs + $i = index($line, $fs); + $k = substr($line, 0, $i); + $v = substr($line, $i + 1, -1); # chop off $fs $section = substr($k, 0, rindex($k, '.')); - $section_seen{$section} //= push(@section_order, $section); + $seen{$section} //= push(@section_order, $section); if (defined($cur = $rv{$k})) { if (ref($cur) eq "ARRAY") { @@ -160,7 +161,7 @@ sub git_config_dump { return {} unless -e $file; my $cmd = [ qw(git config -z -l --includes), "--file=$file" ]; my $fh = popen_rd($cmd); - my $rv = config_fh_parse($fh, "\0", qr/\n/); + my $rv = config_fh_parse($fh, "\0", "\n"); close $fh or die "failed to close (@$cmd) pipe: $?"; $rv; } -- cgit v1.2.3-24-ge0c7 From b3cf37096874c6c80ef554e5153e5d995c72ab95 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 24 Dec 2020 10:09:18 +0000 Subject: inboxwritable: delay umask_prepare calls This simplifies all ->with_umask callers and opens the door for further optimizations to delay/elide process spawning. --- lib/PublicInbox/ExtSearchIdx.pm | 2 -- lib/PublicInbox/InboxWritable.pm | 6 ++---- lib/PublicInbox/SearchIdx.pm | 1 - lib/PublicInbox/V2Writable.pm | 3 --- lib/PublicInbox/Xapcmd.pm | 1 - 5 files changed, 2 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index fb627089..c43a6c5e 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -934,7 +934,6 @@ sub idx_init { # similar to V2Writable PublicInbox::V2Writable::write_alternates($info_dir, $mode, $o); } $self->parallel_init($self->{indexlevel}); - $self->umask_prepare; $self->with_umask(\&_idx_init, $self, $opt); $self->{oidx}->begin_lazy; $self->{oidx}->eidx_prep; @@ -943,7 +942,6 @@ sub idx_init { # similar to V2Writable no warnings 'once'; *done = \&PublicInbox::V2Writable::done; -*umask_prepare = \&PublicInbox::InboxWritable::umask_prepare; *with_umask = \&PublicInbox::InboxWritable::with_umask; *parallel_init = \&PublicInbox::V2Writable::parallel_init; *nproc_shards = \&PublicInbox::V2Writable::nproc_shards; diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index 69275bb0..31eb3f15 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -65,7 +65,6 @@ sub init_inbox { if ($self->version == 1) { my $dir = assert_usable_dir($self); PublicInbox::Import::init_bare($dir); - $self->umask_prepare; $self->with_umask(\&_init_v1, $self, $skip_artnum); } else { my $v2w = importer($self); @@ -260,7 +259,7 @@ sub _umask_for { sub with_umask { my ($self, $cb, @arg) = @_; - my $old = umask $self->{umask}; + my $old = umask($self->{umask} //= umask_prepare($self)); my $rv = eval { $cb->(@arg) }; my $err = $@; umask $old; @@ -271,8 +270,7 @@ sub with_umask { sub umask_prepare { my ($self) = @_; my $perm = _git_config_perm($self); - my $umask = _umask_for($perm); - $self->{umask} = $umask; + _umask_for($perm); } sub cleanup ($) { diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index d1b0c724..c8e309fc 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -67,7 +67,6 @@ sub new { $self->{-set_skip_docdata_once} = 1; $self->{-skip_docdata} = 1; } - $ibx->umask_prepare; if ($version == 1) { $self->{lock_path} = "$inboxdir/ssoma.lock"; my $dir = $self->xdir; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 3e3b275f..531a72b2 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -97,8 +97,6 @@ sub new { die "$dir does not exist\n"; } } - $v2ibx->umask_prepare; - my $xpfx = "$dir/xap" . PublicInbox::Search::SCHEMA_VERSION; my $self = { ibx => $v2ibx, @@ -320,7 +318,6 @@ sub idx_init { $ibx->git->cleanup; parallel_init($self, $ibx->{indexlevel}); - $ibx->umask_prepare; $ibx->with_umask(\&_idx_init, $self, $opt); } diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm index 4f77ef25..ca2345f7 100644 --- a/lib/PublicInbox/Xapcmd.pm +++ b/lib/PublicInbox/Xapcmd.pm @@ -270,7 +270,6 @@ sub run { local %SIG = %SIG; setup_signals(); - $ibx->umask_prepare; $ibx->with_umask(\&_run, $ibx, $cb, $opt); } -- cgit v1.2.3-24-ge0c7 From e8d6c34c749d1b0fd1dc1278cd4a2a310b31a9ac Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 24 Dec 2020 10:09:19 +0000 Subject: index: support --fast-noop / -F switch Note: I'm not sure if it's worth documenting and supporting this long-term. We can can avoid taking locks for invocations of "index --all" and rely on high-resolution ctime (struct timespec st_ctim) comparisons of msgmap.sqlite3 and the packed-refs + refs/heads directory of the newest epoch. This cuts public-inbox-index invocations with "--all --no-update-extindex -L basic" down from 0.92s to 0.31s. The change with "-L medium" or "-L full" and (default) non-zero jobs is even more drastic, reducing a 12-13s no-op invocation down to the same 0.31s --- lib/PublicInbox/V2Writable.pm | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 531a72b2..2b849ddf 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -1351,11 +1351,19 @@ sub index_sync { $opt //= {}; return xapian_only($self, $opt) if $opt->{xapian_only}; - my $pr = $opt->{-progress}; my $epoch_max; - my $latest = $self->{ibx}->git_dir_latest(\$epoch_max); - return unless defined $latest; + my $latest = $self->{ibx}->git_dir_latest(\$epoch_max) // return; + if ($opt->{'fast-noop'}) { # nanosecond (st_ctim) comparison + use Time::HiRes qw(stat); + if (my @mm = stat("$self->{ibx}->{inboxdir}/msgmap.sqlite3")) { + my $c = $mm[10]; # 10 = ctime (nsec NV) + my @hd = stat("$latest/refs/heads"); + my @pr = stat("$latest/packed-refs"); + return if $c > ($hd[10] // 0) && $c > ($pr[10] // 0); + } + } + my $pr = $opt->{-progress}; my $seq = $opt->{sequential_shard}; my $art_beg; # the NNTP article number we start xapian_only at my $idxlevel = $self->{ibx}->{indexlevel}; -- cgit v1.2.3-24-ge0c7 From 672d146577305baa7f508bd2e33212bba6fdb800 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 25 Dec 2020 10:21:10 +0000 Subject: extsearchidx: delay SQLite availability checks This will make attach_inbox faster for no-op calls. It also helps us avoid races in case msgmap or over.sqlite3 gets unlinked while -extindex is running. --- lib/PublicInbox/ExtSearchIdx.pm | 57 ++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 29 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index c43a6c5e..386e1cee 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -61,23 +61,7 @@ sub new { sub attach_inbox { my ($self, $ibx) = @_; - my $ekey = $ibx->eidx_key; - my $misc = $self->{misc}; - if ($misc && $misc->inbox_data($ibx)) { # all good if already indexed - } else { - my @sqlite = ($ibx->over, $ibx->mm); - my $uidvalidity = $ibx->uidvalidity; - $ibx->{mm} = $ibx->{over} = undef; - if (scalar(@sqlite) != 2) { - warn "W: skipping $ekey (unindexed)\n"; - return; - } - if (!defined($uidvalidity)) { - warn "W: skipping $ekey (no UIDVALIDITY)\n"; - return; - } - } - $self->{ibx_map}->{$ekey} //= do { + $self->{ibx_map}->{$ibx->eidx_key} //= do { push @{$self->{ibx_list}}, $ibx; $ibx; } @@ -281,29 +265,36 @@ sub last_commits { $heads; } +sub _ibx_index_reject ($) { + my ($ibx) = @_; + $ibx->mm // return 'unindexed, no msgmap.sqlite3'; + $ibx->uidvalidity // return 'no UIDVALIDITY'; + $ibx->over // return 'unindexed, no over.sqlite3'; + undef; +} + sub _sync_inbox ($$$) { my ($self, $sync, $ibx) = @_; + my $ekey = $ibx->eidx_key; + if (defined(my $err = _ibx_index_reject($ibx))) { + return "W: skipping $ekey ($err)"; + } $sync->{ibx} = $ibx; $sync->{nr} = \(my $nr = 0); my $v = $ibx->version; - my $ekey = $ibx->eidx_key; if ($v == 2) { $sync->{epoch_max} = $ibx->max_git_epoch // return; sync_prepare($self, $sync); # or return # TODO: once MiscIdx is stable } elsif ($v == 1) { my $uv = $ibx->uidvalidity; my $lc = $self->{oidx}->eidx_meta("lc-v1:$ekey//$uv"); - my $head = $ibx->mm->last_commit; - unless (defined $head) { - warn "E: $ibx->{inboxdir} is not indexed\n"; - return; - } + my $head = $ibx->mm->last_commit // + return "E: $ibx->{inboxdir} is not indexed"; my $stk = prepare_stack($sync, $lc ? "$lc..$head" : $head); my $unit = { stack => $stk, git => $ibx->git }; push @{$sync->{todo}}, $unit; } else { - warn "E: $ekey unsupported inbox version (v$v)\n"; - return; + return "E: $ekey unsupported inbox version (v$v)"; } for my $unit (@{delete($sync->{todo}) // []}) { last if $sync->{quit}; @@ -311,6 +302,7 @@ sub _sync_inbox ($$$) { } $self->{midx}->index_ibx($ibx) unless $sync->{quit}; $ibx->git->cleanup; # done with this inbox, now + undef; } sub gc_unref_doc ($$$$) { @@ -787,9 +779,14 @@ DELETE FROM xref3 WHERE ibx_id = ? AND xnum = ? AND oidbin = ? sub _reindex_inbox ($$$) { my ($self, $sync, $ibx) = @_; - local $self->{current_info} = $ibx->eidx_key; - _reindex_check_unseen($self, $sync, $ibx); - _reindex_check_stale($self, $sync, $ibx) unless $sync->{quit}; + my $ekey = $ibx->eidx_key; + local $self->{current_info} = $ekey; + if (defined(my $err = _ibx_index_reject($ibx))) { + warn "W: cannot reindex $ekey ($err)\n"; + } else { + _reindex_check_unseen($self, $sync, $ibx); + _reindex_check_stale($self, $sync, $ibx) unless $sync->{quit}; + } delete @$ibx{qw(over mm search git)}; # won't need these for a bit } @@ -847,7 +844,9 @@ sub eidx_sync { # main entry point # don't use $_ here, it'll get clobbered by reindex_checkpoint for my $ibx (@{$self->{ibx_list}}) { last if $sync->{quit}; - _sync_inbox($self, $sync, $ibx); + my $err = _sync_inbox($self, $sync, $ibx); + delete @$ibx{qw(mm over)}; + warn $err, "\n" if defined($err); } $self->{oidx}->rethread_done($opt) unless $sync->{quit}; eidxq_process($self, $sync) unless $sync->{quit}; -- cgit v1.2.3-24-ge0c7 From 14e606423429d6121c295c2bc0599fe1bf66b07c Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 25 Dec 2020 10:21:11 +0000 Subject: extsearchidx: close DB handles after use if FD constrained Most distros ship with low RLIMIT_NOFILE limits and surprises may lurk for admins who configure many inboxes. Keep FD usage under control to avoid EMFILE errors at inopportune times during reindex. From what I can tell, this is the only place where extindex can have unpredictable FD growth when there's thousands of inboxes, and it's in an extremely rare code path. --- lib/PublicInbox/ExtSearchIdx.pm | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 386e1cee..3f197973 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -393,6 +393,32 @@ sub _ibx_for ($$$) { $self->{ibx_list}->[$pos] // die "BUG: ibx for $smsg->{blob} not mapped" } +sub _fd_constrained ($) { + my ($self) = @_; + $self->{-fd_constrained} //= do { + my $soft; + if (eval { require BSD::Resource; 1 }) { + my $NOFILE = BSD::Resource::RLIMIT_NOFILE(); + ($soft, undef) = BSD::Resource::getrlimit($NOFILE); + } else { + chomp($soft = `sh -c 'ulimit -n'`); + } + if (defined($soft)) { + my $want = scalar(@{$self->{ibx_list}}) + 64; # estimate + my $ret = $want > $soft; + if ($ret) { + warn <{sync}; @@ -429,11 +455,16 @@ sub _reindex_finalize ($$$) { my $x = pop(@$ary) // die "BUG: #$docid {by_chash} empty"; $x->{num} = delete($x->{xnum}) // die '{xnum} unset'; $ibx = _ibx_for($self, $sync, $x); - my $e = $ibx->over->get_art($x->{num}); - $e->{blob} eq $x->{blob} or die <over) { + my $e = $over->get_art($x->{num}); + $e->{blob} eq $x->{blob} or die <{blob} != $e->{blob} (${\$ibx->eidx_key}:$e->{num}); EOF - push @todo, $ibx, $e; + push @todo, $ibx, $e; + $over->dbh_close if _fd_constrained($self); + } else { + die "$ibx->{inboxdir}: over.sqlite3 unusable: $!\n"; + } } undef $by_chash; while (my ($ibx, $e) = splice(@todo, 0, 2)) { -- cgit v1.2.3-24-ge0c7 From fb4dd7fdeeed8478cda9b7e63e56564da8cbdacf Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 25 Dec 2020 10:21:12 +0000 Subject: index: do not attach inbox to extindex unless updated We'll count the number of log changes (regardless of index or unindex) and only attach inboxes to ExtSearchIdx objects when they get new work. We'll also reduce lock bouncing and only update external indices after all per-inbox indexing is done. This also updates existing v2 indexing/unindexing callers to be more consistent and ensures unindex log entries update per-inbox last commit information. --- lib/PublicInbox/Admin.pm | 1 + lib/PublicInbox/SearchIdx.pm | 2 ++ lib/PublicInbox/V2Writable.pm | 26 +++++++++++++++++++------- 3 files changed, 22 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index 9a86d206..b468108e 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -271,6 +271,7 @@ EOM $idx = PublicInbox::SearchIdx->new($ibx, 1); } $idx->index_sync($opt); + $idx->{nidx} // 0; # returns number processed } sub progress_prepare ($) { diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index c8e309fc..b3361e05 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -615,6 +615,7 @@ sub index_both { # git->cat_async callback $smsg->{num} = index_mm($self, $eml, $oid, $sync) or die "E: could not generate NNTP article number for $oid"; add_message($self, $eml, $smsg, $sync); + ++$self->{nidx}; my $cur_cmt = $sync->{cur_cmt} // die 'BUG: {cur_cmt} missing'; ${$sync->{latest_cmt}} = $cur_cmt; } @@ -629,6 +630,7 @@ sub unindex_both { # git->cat_async callback if (defined(my $cur_cmt = $sync->{cur_cmt})) { ${$sync->{latest_cmt}} = $cur_cmt; } + ++$self->{nidx}; } sub with_umask { diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 2b849ddf..ca52874b 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -891,12 +891,22 @@ sub reindex_checkpoint ($$) { $mm_tmp->atfork_parent if $mm_tmp; } +sub index_finalize ($$) { + my ($arg, $index) = @_; + ++$arg->{self}->{nidx}; + if (defined(my $cur = $arg->{cur_cmt})) { + ${$arg->{latest_cmt}} = $cur; + } elsif ($index) { + die 'BUG: {cur_cmt} missing'; + } # else { unindexing @leftovers doesn't set {cur_cmt} +} + sub index_oid { # cat_async callback my ($bref, $oid, $type, $size, $arg) = @_; - return if is_bad_blob($oid, $type, $size, $arg->{oid}); + is_bad_blob($oid, $type, $size, $arg->{oid}) and + return index_finalize($arg, 1); # size == 0 purged returns here my $self = $arg->{self}; local $self->{current_info} = "$self->{current_info} $oid"; - return if $size == 0; # purged my ($num, $mid0); my $eml = PublicInbox::Eml->new($$bref); my $mids = mids($eml); @@ -967,7 +977,7 @@ sub index_oid { # cat_async callback if (do_idx($self, $bref, $eml, $smsg)) { ${$arg->{need_checkpoint}} = 1; } - ${$arg->{latest_cmt}} = $arg->{cur_cmt} // die 'BUG: {cur_cmt} missing'; + index_finalize($arg, 1); } # only update last_commit for $i on reindex iff newer than current @@ -1157,11 +1167,12 @@ sub unindex_oid_aux ($$$) { } sub unindex_oid ($$;$) { # git->cat_async callback - my ($bref, $oid, $type, $size, $sync) = @_; - return if is_bad_blob($oid, $type, $size, $sync->{oid}); - my $self = $sync->{self}; + my ($bref, $oid, $type, $size, $arg) = @_; + is_bad_blob($oid, $type, $size, $arg->{oid}) and + return index_finalize($arg, 0); + my $self = $arg->{self}; local $self->{current_info} = "$self->{current_info} $oid"; - my $unindexed = $sync->{in_unindex} ? $sync->{unindexed} : undef; + my $unindexed = $arg->{in_unindex} ? $arg->{unindexed} : undef; my $mm = $self->{mm}; my $mids = mids(PublicInbox::Eml->new($bref)); undef $$bref; @@ -1186,6 +1197,7 @@ sub unindex_oid ($$;$) { # git->cat_async callback } unindex_oid_aux($self, $oid, $mid); } + index_finalize($arg, 0); } sub git { $_[0]->{ibx}->git } -- cgit v1.2.3-24-ge0c7 From 66518051763825d491d0c1df6837d4266edc180a Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 25 Dec 2020 10:21:13 +0000 Subject: index: fix --no-fsync flag propagation to extindex Negation in flag names are confusing, but trying to deviate from the DB_NO_SYNC name used by Xapian is also confusing. --- lib/PublicInbox/ExtSearchIdx.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 3f197973..e7fdae48 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -54,7 +54,7 @@ sub new { }, __PACKAGE__; $self->{shards} = $self->count_shards || nproc_shards($opt->{creat}); my $oidx = PublicInbox::OverIdx->new("$self->{xpfx}/over.sqlite3"); - $oidx->{-no_fsync} = 1 if $opt->{-no_fsync}; + $self->{-no_fsync} = $oidx->{-no_fsync} = 1 if !$opt->{fsync}; $self->{oidx} = $oidx; $self } -- cgit v1.2.3-24-ge0c7 From 9a17bc230973405b0656bf05fb76902d46627ffa Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 25 Dec 2020 10:21:14 +0000 Subject: v2writable: don't verify tip if reindexing We only rely on git-rev-parse to resolve symbolic names ("HEAD") to a SHA-* git commit ID. We'll assume any git commit IDs we get from SQLite DBs are valid and let "git-log" fail if it isn't. --- lib/PublicInbox/V2Writable.pm | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index ca52874b..f20b5c7f 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -1104,12 +1104,14 @@ sub sync_prepare ($$) { -d $git_dir or next; # missing epochs are fine my $git = PublicInbox::Git->new($git_dir); my $unit = { git => $git, epoch => $i }; + my $tip; if ($reindex_heads) { - $head = $reindex_heads->[$i] or next; + $tip = $head = $reindex_heads->[$i] or next; + } else { + $tip = $git->qx(qw(rev-parse -q --verify), $head); + next if $?; # new repo + chomp $tip; } - chomp(my $tip = $git->qx(qw(rev-parse -q --verify), $head)); - next if $?; # new repo - my $range = log_range($sync, $unit, $tip) or next; # can't use 'rev-list --count' if we use --diff-filter $pr->("$pfx $i.git counting $range ... ") if $pr; -- cgit v1.2.3-24-ge0c7 From 5e05c2eb58a450849f1826f3d02ed62b814b6617 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 05:59:22 +0000 Subject: inboxidle: clue users into resolving ENOSPC from inotify It may not be obvious to users a ENOSPC error is from hitting a (tunable) kernel-imposed limit on inotify watches, and not some storage device running out of space. Give them a hint here to reduce our own support burden. --- lib/PublicInbox/InboxIdle.pm | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'lib') diff --git a/lib/PublicInbox/InboxIdle.pm b/lib/PublicInbox/InboxIdle.pm index f1cbc012..84b6d26f 100644 --- a/lib/PublicInbox/InboxIdle.pm +++ b/lib/PublicInbox/InboxIdle.pm @@ -39,6 +39,11 @@ sub in2_arm ($$) { # PublicInbox::Config::each_inbox callback $self->{on_unlock}->{$w->name} = $ibx; } else { warn "E: ".ref($inot)."->watch($lock, IN_MODIFY) failed: $!\n"; + if ($!{ENOSPC} && $^O eq 'linux') { + warn <<""; +I: consider increasing /proc/sys/fs/inotify/max_user_watches + + } } # TODO: detect deleted packs (and possibly other files) -- cgit v1.2.3-24-ge0c7 From 10bf54305da8422d9ece6b809996092c1c4b1786 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 09:34:39 +0000 Subject: inboxidle: avoid needless syscalls on refresh We don't have to replace a bunch of existing watches with identical new ones. On Linux with Linux::Inotify2 installed, this avoids a storm of inotify_add_watch(2) and inotify_rm_watch(2) syscalls on SIGHUP with -imapd and "-extindex --watch" --- lib/PublicInbox/InboxIdle.pm | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/InboxIdle.pm b/lib/PublicInbox/InboxIdle.pm index 84b6d26f..508007d7 100644 --- a/lib/PublicInbox/InboxIdle.pm +++ b/lib/PublicInbox/InboxIdle.pm @@ -24,17 +24,26 @@ sub in2_arm ($$) { # PublicInbox::Config::each_inbox callback my $dir = $ibx->{inboxdir}; my $inot = $self->{inot}; my $cur = $self->{pathmap}->{$dir} //= []; + my $lock = "$dir/".($ibx->version >= 2 ? 'inbox.lock' : 'ssoma.lock'); # transfer old subscriptions to the current inbox, cancel the old watch - if (my $old_ibx = $cur->[0]) { + my $old_ibx = $cur->[0]; + $cur->[0] = $ibx; + if ($old_ibx) { $ibx->{unlock_subs} and die "BUG: $dir->{unlock_subs} should not exist"; $ibx->{unlock_subs} = $old_ibx->{unlock_subs}; + + # Linux::Inotify2::Watch::name matches if watches are the + # same, no point in replacing a watch of the same name + if ($cur->[1]->name eq $lock) { + $self->{on_unlock}->{$lock} = $ibx; + return; + } + # rare, name changed (v1 inbox converted to v2) $cur->[1]->cancel; # Linux::Inotify2::Watch::cancel } - $cur->[0] = $ibx; - my $lock = "$dir/".($ibx->version >= 2 ? 'inbox.lock' : 'ssoma.lock'); if (my $w = $cur->[1] = $inot->watch($lock, $IN_MODIFY)) { $self->{on_unlock}->{$w->name} = $ibx; } else { -- cgit v1.2.3-24-ge0c7 From b5e960f50289434025f5904c8c1311e4c8a02b82 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 08:12:52 +0000 Subject: inbox: name variable for values loop iterator ->on_inbox_unlock callbacks could clobber $_, and this seems to fix a problem with -extindex --watch failing to index some inboxes after SIGHUP reload. --- lib/PublicInbox/Inbox.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index ec8469e3..1b9b56ff 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -414,8 +414,8 @@ sub on_unlock { my ($self) = @_; check_inodes($self); my $subs = $self->{unlock_subs} or return; - for (values %$subs) { - eval { $_->on_inbox_unlock($self) }; + for my $obj (values %$subs) { + eval { $obj->on_inbox_unlock($self) }; warn "E: $@ ($self->{inboxdir})\n" if $@; } } -- cgit v1.2.3-24-ge0c7 From 0b018bebe7d8ee807ab07b570cf33669da4875b0 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 01:44:36 +0000 Subject: default to CORE::warn in $SIG{__WARN__} handlers As with CORE::die and $SIG{__DIE__}, it turns out CORE::warn is safe to use inside $SIG{__WARN__} handlers without triggering infinite recursion. So fall back to reusing CORE::warn instead of creating a new sub. --- lib/PublicInbox/Admin.pm | 2 +- lib/PublicInbox/ExtSearchIdx.pm | 2 +- lib/PublicInbox/InboxWritable.pm | 2 +- lib/PublicInbox/Watch.pm | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index b468108e..d414e4e2 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -241,7 +241,7 @@ sub index_inbox { } local %SIG = %SIG; setup_signals(\&index_terminate, $ibx); - my $warn_cb = $SIG{__WARN__} // sub { print STDERR @_ }; + my $warn_cb = $SIG{__WARN__} // \&CORE::warn; my $idx = { current_info => $ibx->{inboxdir} }; my $warn_ignore = PublicInbox::InboxWritable->can('warn_ignore'); local $SIG{__WARN__} = sub { diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index e7fdae48..64ebf6db 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -841,7 +841,7 @@ sub eidx_reindex { sub eidx_sync { # main entry point my ($self, $opt) = @_; - my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; local $self->{current_info} = ''; local $SIG{__WARN__} = sub { $warn_cb->($self->{current_info}, ': ', @_); diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index 31eb3f15..b1d5caf5 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -292,7 +292,7 @@ sub warn_ignore { # this expects to be RHS in this assignment: "local $SIG{__WARN__} = ..." sub warn_ignore_cb { - my $cb = $SIG{__WARN__} // sub { print STDERR @_ }; + my $cb = $SIG{__WARN__} // \&CORE::warn; sub { return if warn_ignore(@_); $cb->(@_); diff --git a/lib/PublicInbox/Watch.pm b/lib/PublicInbox/Watch.pm index e1246096..bc296e01 100644 --- a/lib/PublicInbox/Watch.pm +++ b/lib/PublicInbox/Watch.pm @@ -217,7 +217,7 @@ sub _try_path { warn "unmappable dir: $1\n"; return; } - my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; local $SIG{__WARN__} = sub { my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : ''; $warn_cb->($pfx, "path: $path\n", @_); @@ -467,7 +467,7 @@ sub imap_fetch_all ($$$) { my $key = $req; $key =~ s/\.PEEK//; my ($uids, $batch); - my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; local $SIG{__WARN__} = sub { my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : ''; $batch //= '?'; @@ -929,7 +929,7 @@ sub nntp_fetch_all ($$$) { $beg = $l_art + 1; warn "I: $url fetching ARTICLE $beg..$end\n"; - my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; my ($err, $art); local $SIG{__WARN__} = sub { my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : ''; -- cgit v1.2.3-24-ge0c7 From 46bd595f57cc3d425754b0d0770c125616e74448 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 12:25:42 +0000 Subject: eml: fix undefined vars on Link: https://public-inbox.org/meta/DM6PR12MB49106F8E3BD697B63B943A22DADB0@DM6PR12MB4910.namprd12.prod.outlook.com/ Tested-by: Ali Alnubani --- lib/PublicInbox/Eml.pm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/PublicInbox/Eml.pm b/lib/PublicInbox/Eml.pm index 571edc5c..4d3fffc0 100644 --- a/lib/PublicInbox/Eml.pm +++ b/lib/PublicInbox/Eml.pm @@ -378,7 +378,9 @@ sub header_str_set { header_set($self, $name, @vals); } -sub mhdr_decode ($) { eval { $MIME_Header->decode($_[0]) } // $_[0] } +sub mhdr_decode ($) { + eval { $MIME_Header->decode($_[0], Encode::FB_DEFAULT) } // $_[0]; +} sub filename { my $dis = header_raw($_[0], 'Content-Disposition'); -- cgit v1.2.3-24-ge0c7 From 1d96509a3f59c38394d2f3ac4323dc54c74dc202 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 01:44:37 +0000 Subject: extindex: --watch for inotify-based updates This reuses existing InboxIdle infrastructure to update external indices based on per-inbox updates. This is an alternative to auto-updating external indices via the -index command and also works with existing uses of -mda and public-inbox-watch. Using inotify (or EVFILT_VNODE) allows watching thousands of inboxes without having to scan every single one at every invocation. This is especially beneficial in cases where an external index is not writable to the users writing to per-inbox indices. --- lib/PublicInbox/ExtSearchIdx.pm | 126 +++++++++++++++++++++++++++++++++++++--- lib/PublicInbox/InboxIdle.pm | 8 ++- lib/PublicInbox/OverIdx.pm | 8 ++- lib/PublicInbox/V2Writable.pm | 2 +- 4 files changed, 131 insertions(+), 13 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 64ebf6db..53ff2ca1 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -630,7 +630,7 @@ sub eidxq_process ($$) { # for reindexing my $dbh = $self->{oidx}->dbh; my $tot = $dbh->selectrow_array('SELECT COUNT(*) FROM eidxq') or return; ${$sync->{nr}} = 0; - $sync->{-regen_fmt} = "%u/$tot\n"; + local $sync->{-regen_fmt} = "%u/$tot\n"; my $pr = $sync->{-opt}->{-progress}; if ($pr) { my $min = $dbh->selectrow_array('SELECT MIN(docid) FROM eidxq'); @@ -709,7 +709,8 @@ sub _reindex_check_unseen ($$$) { my $msgs; my $pr = $sync->{-opt}->{-progress}; my $ekey = $ibx->eidx_key; - $sync->{-regen_fmt} = "$ekey checking unseen %u/".$ibx->over->max."\n"; + local $sync->{-regen_fmt} = + "$ekey checking unseen %u/".$ibx->over->max."\n"; ${$sync->{nr}} = 0; while (scalar(@{$msgs = $ibx->over->query_xover($beg, $end)})) { @@ -752,7 +753,7 @@ sub _reindex_check_stale ($$$) { my $pr = $sync->{-opt}->{-progress}; my $fetching; my $ekey = $ibx->eidx_key; - $sync->{-regen_fmt} = + local $sync->{-regen_fmt} = "$ekey check stale/missing %u/".$ibx->over->max."\n"; ${$sync->{nr}} = 0; do { @@ -838,6 +839,13 @@ sub eidx_reindex { eidxq_process($self, $sync) unless $sync->{quit}; } +sub sync_inbox { + my ($self, $sync, $ibx) = @_; + my $err = _sync_inbox($self, $sync, $ibx); + delete @$ibx{qw(mm over)}; + warn $err, "\n" if defined($err); +} + sub eidx_sync { # main entry point my ($self, $opt) = @_; @@ -868,22 +876,21 @@ sub eidx_sync { # main entry point $ibx->{-ibx_id} //= $self->{oidx}->ibx_id($ibx->eidx_key); } if (delete($opt->{reindex})) { - $sync->{checkpoint_unlocks} = 1; + local $sync->{checkpoint_unlocks} = 1; eidx_reindex($self, $sync); } # don't use $_ here, it'll get clobbered by reindex_checkpoint for my $ibx (@{$self->{ibx_list}}) { last if $sync->{quit}; - my $err = _sync_inbox($self, $sync, $ibx); - delete @$ibx{qw(mm over)}; - warn $err, "\n" if defined($err); + sync_inbox($self, $sync, $ibx); } $self->{oidx}->rethread_done($opt) unless $sync->{quit}; eidxq_process($self, $sync) unless $sync->{quit}; eidxq_release($self); - PublicInbox::V2Writable::done($self); + done($self); + $sync; # for eidx_watch } sub update_last_commit { # overrides V2Writable @@ -970,6 +977,109 @@ sub idx_init { # similar to V2Writable $self->{midx}->begin_txn; } +sub _watch_commit { # PublicInbox::DS::add_timer callback + my ($self) = @_; + delete $self->{-commit_timer}; + eidxq_process($self, $self->{-watch_sync}); + eidxq_release($self); + delete local $self->{-watch_sync}->{-regen_fmt}; + reindex_checkpoint($self, $self->{-watch_sync}); + + # call event_step => done unless commit_timer is armed + PublicInbox::DS::requeue($self); +} + +sub on_inbox_unlock { # called by PublicInbox::InboxIdle + my ($self, $ibx) = @_; + my $opt = $self->{-watch_sync}->{-opt}; + my $pr = $opt->{-progress}; + my $ekey = $ibx->eidx_key; + local $0 = "sync $ekey"; + $pr->("indexing $ekey\n") if $pr; + $self->idx_init($opt); + sync_inbox($self, $self->{-watch_sync}, $ibx); + $self->{-commit_timer} //= PublicInbox::DS::add_timer( + $opt->{'commit-interval'} // 10, + \&_watch_commit, $self); +} + +sub eidx_reload { # -extindex --watch SIGHUP handler + my ($self, $idler) = @_; + if ($self->{cfg}) { + my $pr = $self->{-watch_sync}->{-opt}->{-progress}; + $pr->('reloading ...') if $pr; + @{$self->{ibx_list}} = (); + %{$self->{ibx_map}} = (); + delete $self->{-watch_sync}->{id2pos}; + my $cfg = PublicInbox::Config->new; + attach_config($self, $cfg); + $idler->refresh($cfg); + $pr->(" done\n") if $pr; + } else { + warn "reload not supported without --all\n"; + } +} + +sub eidx_resync_start ($) { # -extindex --watch SIGUSR1 handler + my ($self) = @_; + $self->{-resync_queue} //= [ @{$self->{ibx_list}} ]; + PublicInbox::DS::requeue($self); # trigger our ->event_step +} + +sub event_step { # PublicInbox::DS::requeue callback + my ($self) = @_; + if (my $resync_queue = $self->{-resync_queue}) { + if (my $ibx = shift(@$resync_queue)) { + on_inbox_unlock($self, $ibx); + PublicInbox::DS::requeue($self); + } else { + delete $self->{-resync_queue}; + _watch_commit($self); + } + } else { + done($self) unless $self->{-commit_timer}; + } +} + +sub eidx_watch { # public-inbox-extindex --watch main loop + my ($self, $opt) = @_; + require PublicInbox::InboxIdle; + require PublicInbox::DS; + require PublicInbox::Syscall; + require PublicInbox::Sigfd; + my $idler = PublicInbox::InboxIdle->new($self->{cfg}); + if (!$self->{cfg}) { + $idler->watch_inbox($_) for @{$self->{ibx_list}}; + } + $_->subscribe_unlock(__PACKAGE__, $self) for @{$self->{ibx_list}}; + my $sync = eidx_sync($self, $opt); # initial sync + return if $sync->{quit}; + my $oldset = PublicInbox::Sigfd::block_signals(); + local $self->{current_info} = ''; + my $cb = $SIG{__WARN__} || \&CORE::warn; + local $SIG{__WARN__} = sub { $cb->($self->{current_info}, ': ', @_) }; + my $sig = { + HUP => sub { eidx_reload($self, $idler) }, + USR1 => sub { eidx_resync_start($self) }, + TSTP => sub { kill('STOP', $$) }, + }; + my $quit = PublicInbox::SearchIdx::quit_cb($sync); + $sig->{QUIT} = $sig->{INT} = $sig->{TERM} = $quit; + my $sigfd = PublicInbox::Sigfd->new($sig, + $PublicInbox::Syscall::SFD_NONBLOCK); + local %SIG = (%SIG, %$sig) if !$sigfd; + local $self->{-watch_sync} = $sync; # for ->on_inbox_unlock + if (!$sigfd) { + # wake up every second to accept signals if we don't + # have signalfd or IO::KQueue: + PublicInbox::Sigfd::sig_setmask($oldset); + PublicInbox::DS->SetLoopTimeout(1000); + } + PublicInbox::DS->SetPostLoopCallback(sub { !$sync->{quit} }); + PublicInbox::DS->EventLoop; # calls InboxIdle->event_step + done($self); +} + no warnings 'once'; *done = \&PublicInbox::V2Writable::done; *with_umask = \&PublicInbox::InboxWritable::with_umask; diff --git a/lib/PublicInbox/InboxIdle.pm b/lib/PublicInbox/InboxIdle.pm index 508007d7..35aed696 100644 --- a/lib/PublicInbox/InboxIdle.pm +++ b/lib/PublicInbox/InboxIdle.pm @@ -63,6 +63,9 @@ sub refresh { $pi_cfg->each_inbox(\&in2_arm, $self); } +# internal API for ease-of-use +sub watch_inbox { in2_arm($_[1], $_[0]) }; + sub new { my ($class, $pi_cfg) = @_; my $self = bless {}, $class; @@ -78,7 +81,7 @@ sub new { $self->{inot} = $inot; $self->{pathmap} = {}; # inboxdir => [ ibx, watch1, watch2, watch3...] $self->{on_unlock} = {}; # lock path => ibx - refresh($self, $pi_cfg); + refresh($self, $pi_cfg) if $pi_cfg; PublicInbox::FakeInotify::poll_once($self) if !$ino_cls; $self; } @@ -89,7 +92,8 @@ sub event_step { my @events = $self->{inot}->read; # Linux::Inotify2::read my $on_unlock = $self->{on_unlock}; for my $ev (@events) { - if (my $ibx = $on_unlock->{$ev->fullname}) { + my $fn = $ev->fullname // next; # cancelled + if (my $ibx = $on_unlock->{$fn}) { $ibx->on_unlock; } } diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 4a39bf53..dcc2cff3 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -473,10 +473,14 @@ sub dbh_close { sub create { my ($self) = @_; - unless (-r $self->{filename}) { + my $fn = $self->{filename} // do { + Carp::confess('BUG: no {filename}') unless $self->{dbh}; + return; + }; + unless (-r $fn) { require File::Path; require File::Basename; - File::Path::mkpath(File::Basename::dirname($self->{filename})); + File::Path::mkpath(File::Basename::dirname($fn)); } # create the DB: PublicInbox::Over::dbh($self); diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index f20b5c7f..567582c5 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -879,7 +879,7 @@ sub reindex_checkpoint ($$) { $self->done; # release lock } - if (my $pr = $sync->{-opt}->{-progress}) { + if (my $pr = $sync->{-regen_fmt} ? $sync->{-opt}->{-progress} : undef) { $pr->(sprintf($sync->{-regen_fmt}, ${$sync->{nr}})); } -- cgit v1.2.3-24-ge0c7 From ae8df17135014a64a9f1def786f66c9c16b05fcf Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 10:16:21 +0000 Subject: extindex: various --watch signal handling fixes We need to clobber the SIGUSR1 resync queue on SIGHUP to invalidate old inbox objects. Furthermore, the lengthy initial scan needs to ignore signals intended for the event loop to avoid unexpected behavior. Finally, add some progress output to inform users on the terminal to inform users' of progress. --- lib/PublicInbox/ExtSearchIdx.pm | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 53ff2ca1..778154a5 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -1008,6 +1008,7 @@ sub eidx_reload { # -extindex --watch SIGHUP handler if ($self->{cfg}) { my $pr = $self->{-watch_sync}->{-opt}->{-progress}; $pr->('reloading ...') if $pr; + delete $self->{-resync_queue}; @{$self->{ibx_list}} = (); %{$self->{ibx_map}} = (); delete $self->{-watch_sync}->{id2pos}; @@ -1043,6 +1044,10 @@ sub event_step { # PublicInbox::DS::requeue callback sub eidx_watch { # public-inbox-extindex --watch main loop my ($self, $opt) = @_; + local %SIG = %SIG; + for my $sig (qw(HUP USR1 TSTP QUIT INT TERM)) { + $SIG{$sig} = sub { warn "SIG$sig ignored while scanning\n" }; + } require PublicInbox::InboxIdle; require PublicInbox::DS; require PublicInbox::Syscall; @@ -1052,6 +1057,8 @@ sub eidx_watch { # public-inbox-extindex --watch main loop $idler->watch_inbox($_) for @{$self->{ibx_list}}; } $_->subscribe_unlock(__PACKAGE__, $self) for @{$self->{ibx_list}}; + my $pr = $opt->{-progress}; + $pr->("performing initial scan ...\n") if $pr; my $sync = eidx_sync($self, $opt); # initial sync return if $sync->{quit}; my $oldset = PublicInbox::Sigfd::block_signals(); @@ -1067,7 +1074,7 @@ sub eidx_watch { # public-inbox-extindex --watch main loop $sig->{QUIT} = $sig->{INT} = $sig->{TERM} = $quit; my $sigfd = PublicInbox::Sigfd->new($sig, $PublicInbox::Syscall::SFD_NONBLOCK); - local %SIG = (%SIG, %$sig) if !$sigfd; + %SIG = (%SIG, %$sig) if !$sigfd; local $self->{-watch_sync} = $sync; # for ->on_inbox_unlock if (!$sigfd) { # wake up every second to accept signals if we don't @@ -1076,6 +1083,7 @@ sub eidx_watch { # public-inbox-extindex --watch main loop PublicInbox::DS->SetLoopTimeout(1000); } PublicInbox::DS->SetPostLoopCallback(sub { !$sync->{quit} }); + $pr->("initial scan complete, entering event loop\n") if $pr; PublicInbox::DS->EventLoop; # calls InboxIdle->event_step done($self); } -- cgit v1.2.3-24-ge0c7 From 02aad3e340d1711359c4def6e91482140a989ce1 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 10:16:23 +0000 Subject: extindex: add undocumented --no-scan switch This makes diagnosing --watch problems easier when there's 50K inboxes by avoiding the lengthy scan (which is the reason --watch exists in the first place). --- lib/PublicInbox/ExtSearchIdx.pm | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 778154a5..07e64698 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -881,9 +881,11 @@ sub eidx_sync { # main entry point } # don't use $_ here, it'll get clobbered by reindex_checkpoint - for my $ibx (@{$self->{ibx_list}}) { - last if $sync->{quit}; - sync_inbox($self, $sync, $ibx); + if ($opt->{scan} // 1) { + for my $ibx (@{$self->{ibx_list}}) { + last if $sync->{quit}; + sync_inbox($self, $sync, $ibx); + } } $self->{oidx}->rethread_done($opt) unless $sync->{quit}; eidxq_process($self, $sync) unless $sync->{quit}; -- cgit v1.2.3-24-ge0c7 From d209c2064ebc8ccddc7f0da068c663fc08077334 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 11:01:41 +0000 Subject: extsearch: unconditionally reopen on access Since ExtSearch lacks the janky cleanup timer of PublicInbox::Inbox objects, its search results get stale. Reopen the Xapian DB on every ->search call for now, as reducing reopen calls doesn't seem worth the complexity. The Xapian::Database::reopen operation itself takes only ~50us on my old workstation with 3 shards totaling <200GB. Other parts of Xapian dominates the search time, so the reopen seems inconsequential with single-digit shard counts. --- lib/PublicInbox/ExtSearch.pm | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm index a2b97798..7c9586a6 100644 --- a/lib/PublicInbox/ExtSearch.pm +++ b/lib/PublicInbox/ExtSearch.pm @@ -29,8 +29,6 @@ sub misc { $self->{misc} //= PublicInbox::MiscSearch->new("$self->{xpfx}/misc"); } -sub search { $_[0] } # self - # overrides PublicInbox::Search::_xdb sub _xdb { my ($self) = @_; @@ -126,6 +124,6 @@ no warnings 'once'; *recent = \&PublicInbox::Inbox::recent; *max_git_epoch = *nntp_usable = *msg_by_path = \&mm; # undef -*isrch = *search; +*isrch = *search = \&PublicInbox::Search::reopen; 1; -- cgit v1.2.3-24-ge0c7 From 54b250c611c752538666f2bf6d361d3762a21781 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 11:01:42 +0000 Subject: miscsearch: take reopen from Search and use it As with ExtSearch, MiscSearch lacks a janky cleanup timer of PublicInbox::Inbox objects, leading to info about inboxes/newsgroups going stale. Fortunately, we don't use MiscSearch very heavily, yet. In the future, we may be able to detect new inboxes without having to SIGHUP or restart daemons using MiscSearch. --- lib/PublicInbox/MiscSearch.pm | 4 ++++ lib/PublicInbox/WwwListing.pm | 3 +++ 2 files changed, 7 insertions(+) (limited to 'lib') diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm index c6ce255f..6683d564 100644 --- a/lib/PublicInbox/MiscSearch.pm +++ b/lib/PublicInbox/MiscSearch.pm @@ -73,6 +73,7 @@ sub misc_enquire_once { # retry_reopen callback sub mset { my ($self, $qs, $opt) = @_; $opt ||= {}; + reopen($self); my $qp = $self->{qp} //= mi_qp_new($self); $qs = 'type:inbox' if $qs eq ''; my $qr = $qp->parse_query($qs, $PublicInbox::Search::QP_FLAGS); @@ -184,4 +185,7 @@ sub nntpd_cache_load { retry_reopen($self, \&_nntpd_cache_load); } +no warnings 'once'; +*reopen = \&PublicInbox::Search::reopen; + 1; diff --git a/lib/PublicInbox/WwwListing.pm b/lib/PublicInbox/WwwListing.pm index fce0e530..4b3f1674 100644 --- a/lib/PublicInbox/WwwListing.pm +++ b/lib/PublicInbox/WwwListing.pm @@ -69,6 +69,9 @@ sub hide_key { 'www' } sub response { my ($class, $ctx) = @_; bless $ctx, $class; + if (my $ALL = $ctx->{www}->{pi_cfg}->ALL) { + $ALL->misc->reopen; + } my $re = $ctx->url_regexp or return $ctx->psgi_triple; my $iter = PublicInbox::ConfigIter->new($ctx->{www}->{pi_cfg}, \&list_match_i, $re, $ctx); -- cgit v1.2.3-24-ge0c7 From d0e74a3591d9e701af6fea30baacf2ddb51475d5 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 19:38:28 +0000 Subject: search: remove pointless {relevance} setting SearchView will set it to `undef', others will set the 'mset' option (for the ->mset method :P) to 2 which causes {relevance} to be ignored. And the 'mset' option is poorly named now that the message is named ->mset... --- lib/PublicInbox/Search.pm | 1 - 1 file changed, 1 deletion(-) (limited to 'lib') diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 05c679c9..ffd19a1f 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -287,7 +287,6 @@ sub mset { $opts ||= {}; my $qp = $self->{qp} //= qparse_new($self); my $query = $qp->parse_query($query_string, $self->{qp_flags}); - $opts->{relevance} = 1 unless exists $opts->{relevance}; _do_enquire($self, $query, $opts); } -- cgit v1.2.3-24-ge0c7 From 5f875446975b1473c1ffd7196e572e13d58ba56f Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 19:38:29 +0000 Subject: search: remove {mset} option for ->mset method The ->mset method always returns a Xapian mset nowadays, so naming a parameter {mset} is too confusing. As it does with MiscSearch, setting the {relevance} parameter to -1 now sorts by ascending docid order. -2 is now supported for descending docid order, too, since it may be useful for lei users. --- lib/PublicInbox/ExtMsg.pm | 2 +- lib/PublicInbox/IMAP.pm | 2 +- lib/PublicInbox/Isearch.pm | 2 +- lib/PublicInbox/Mbox.pm | 2 +- lib/PublicInbox/Search.pm | 24 +++++++++++++----------- 5 files changed, 17 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm index 6a173f67..4df885ab 100644 --- a/lib/PublicInbox/ExtMsg.pm +++ b/lib/PublicInbox/ExtMsg.pm @@ -33,7 +33,7 @@ sub search_partial ($$) { my ($ibx, $mid) = @_; return if length($mid) < $MIN_PARTIAL_LEN; my $srch = $ibx->search or return; # NOT ->isrch, we already try ->ALL - my $opt = { limit => PARTIAL_MAX, mset => 2 }; + my $opt = { limit => PARTIAL_MAX, relevance => -1 }; my @try = ("m:$mid*"); my $chop = $mid; if ($chop =~ s/(\W+)(\w*)\z//) { diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm index a3a10bde..2af5ab0c 100644 --- a/lib/PublicInbox/IMAP.pm +++ b/lib/PublicInbox/IMAP.pm @@ -1136,7 +1136,7 @@ sub search_common { my $srch = $self->{ibx}->isrch or return "$tag BAD search not available for mailbox\r\n"; my $opt = { - mset => 2, + relevance => -1, limit => UID_SLICE, uid_range => $range_info }; diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm index e362c80a..7ca2f9e4 100644 --- a/lib/PublicInbox/Isearch.pm +++ b/lib/PublicInbox/Isearch.pm @@ -61,7 +61,7 @@ sub mset_to_artnums { my $docids = PublicInbox::Search::mset_to_artnums($self->{es}, $mset); my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); my $qmarks = join(',', map { '?' } @$docids); - if ($opt && ($opt->{mset} // 0) == 2) { # opt->{mset} = 2 was used + if ($opt && ($opt->{relevance} // 0) == -1) { # -1 => ENQ_ASCENDING my $range = ''; my @r; if (my $r = $opt->{uid_range}) { diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index c8e4b406..83fa7d8a 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -235,7 +235,7 @@ sub mbox_all { my $over = $ctx->{ibx}->over or return PublicInbox::WWW::need($ctx, 'Overview'); - my $qopts = $ctx->{qopts} = { mset => 2 }; # order by docid + my $qopts = $ctx->{qopts} = { relevance => -1 }; # ORDER BY docid ASC $qopts->{thread} = 1 if $q->{t}; my $mset = $srch->mset($q_string, $qopts); $qopts->{offset} = $mset->size or diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index ffd19a1f..fb3e9975 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -58,7 +58,11 @@ our $QP_FLAGS; our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem Query); our $Xap; # 'Search::Xapian' or 'Xapian' our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') -our $ENQ_ASCENDING; + +# ENQ_DESCENDING and ENQ_ASCENDING weren't in SWIG Xapian.pm prior to 1.4.16, +# let's hope the ABI is stable +our $ENQ_DESCENDING = 0; +our $ENQ_ASCENDING = 1; sub load_xapian () { return 1 if defined $Xap; @@ -84,12 +88,6 @@ sub load_xapian () { 'NumberRangeProcessor' : 'NumberValueRangeProcessor'); $X{$_} = $Xap.'::'.$_ for (keys %X); - # ENQ_ASCENDING doesn't seem exported by SWIG Xapian.pm, - # so lets hope this part of the ABI is stable because it's - # just an integer: - $ENQ_ASCENDING = $x eq 'Xapian' ? - 1 : Search::Xapian::ENQ_ASCENDING(); - *sortable_serialise = $x.'::sortable_serialise'; *sortable_unserialise = $x.'::sortable_unserialise'; # n.b. FLAG_PURE_NOT is expensive not suitable for a public @@ -344,13 +342,17 @@ sub _enquire_once { # retry_reopen callback $enquire->set_query($query); $opts ||= {}; my $desc = !$opts->{asc}; - if (($opts->{mset} || 0) == 2) { # mset == 2: ORDER BY docid/UID + my $rel = $opts->{relevance} // 0; + if ($rel == -1) { # ORDER BY docid/UID + $enquire->set_weighting_scheme($X{BoolWeight}->new); $enquire->set_docid_order($ENQ_ASCENDING); + } elsif ($rel == 0) { + $enquire->set_sort_by_value_then_relevance(TS, $desc); + } elsif ($rel == -2) { $enquire->set_weighting_scheme($X{BoolWeight}->new); - } elsif ($opts->{relevance}) { + $enquire->set_docid_order($ENQ_DESCENDING); + } else { # rel > 0 $enquire->set_sort_by_relevance_then_value(TS, $desc); - } else { - $enquire->set_sort_by_value_then_relevance(TS, $desc); } # `mairix -t / --threads' or JMAP collapseThreads -- cgit v1.2.3-24-ge0c7 From 645ec505c03c86ed4500151737bcf3d636d9b18b Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 02:53:03 +0000 Subject: git: qx: avoid extra "local" for scalar context case We can use the ternary operator to avoid an early return, here --- lib/PublicInbox/Git.pm | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index 08406925..73dc7d3e 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -362,10 +362,8 @@ sub popen { sub qx { my ($self, @cmd) = @_; my $fh = $self->popen(@cmd); - local $/ = "\n"; - return <$fh> if wantarray; - local $/; - <$fh> + local $/ = wantarray ? "\n" : undef; + <$fh>; } # check_async and cat_async may trigger the other, so ensure they're -- cgit v1.2.3-24-ge0c7 From 0f545dd95ca88e69f011aa56d07494887e7df822 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 02:53:04 +0000 Subject: import: check for git->qx errors, clearer return values Those git commands can fail and git->qx will set $? when it fails. There's no need for the extra indirection of the @ret array, either. Improve git->qx coverage to check for $? while we're at it. --- lib/PublicInbox/Import.pm | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 2cb4896a..e0a84bfd 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -48,7 +48,7 @@ sub gfi_start { return ($self->{in}, $self->{out}) if $self->{pid}; - my (@ret, $out_r, $out_w); + my ($in_r, $pid, $out_r, $out_w); pipe($out_r, $out_w) or die "pipe failed: $!"; $self->lock_acquire; @@ -56,27 +56,28 @@ sub gfi_start { my ($git, $ref) = @$self{qw(git ref)}; local $/ = "\n"; chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $ref)); + die "fatal: rev-parse --revs-only $ref: \$?=$?" if $?; if ($self->{path_type} ne '2/38' && $self->{tip}) { local $/ = "\0"; my @t = $git->qx(qw(ls-tree -r -z --name-only), $ref); + die "fatal: ls-tree -r -z --name-only $ref: \$?=$?" if $?; chomp @t; $self->{-tree} = { map { $_ => 1 } @t }; } my @cmd = ('git', "--git-dir=$git->{git_dir}", qw(fast-import --quiet --done --date-format=raw)); - my ($in_r, $pid) = popen_rd(\@cmd, undef, { 0 => $out_r }); + ($in_r, $pid) = popen_rd(\@cmd, undef, { 0 => $out_r }); $out_w->autoflush(1); $self->{in} = $in_r; $self->{out} = $out_w; $self->{pid} = $pid; $self->{nchg} = 0; - @ret = ($in_r, $out_w); }; if ($@) { $self->lock_release; die $@; } - @ret; + ($in_r, $out_w); } sub wfail () { die "write to fast-import failed: $!" } -- cgit v1.2.3-24-ge0c7 From 522bb4a4973d4ac41b83be58dd3257e8cd038744 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 02:53:05 +0000 Subject: check defined return value for localized slurp errors Reading from regular files (even on STDIN) can fail when dealing with flakey storage. --- lib/PublicInbox/Gcf2.pm | 2 +- lib/PublicInbox/Inbox.pm | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Gcf2.pm b/lib/PublicInbox/Gcf2.pm index 041dffe7..fe6afef2 100644 --- a/lib/PublicInbox/Gcf2.pm +++ b/lib/PublicInbox/Gcf2.pm @@ -35,7 +35,7 @@ BEGIN { if (open(my $fh, '<', $f)) { chomp($l, $c); local $/; - $c_src = <$fh>; + defined($c_src = <$fh>) or die "read $f: $!\n"; $CFG{LIBS} = $l; $CFG{CCFLAGSEX} = $c; last; diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 1b9b56ff..af6380a7 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -210,12 +210,9 @@ sub over { sub try_cat { my ($path) = @_; - my $rv = ''; - if (open(my $fh, '<', $path)) { - local $/; - $rv = <$fh>; - } - $rv; + open(my $fh, '<', $path) or return ''; + local $/; + <$fh> // ''; } sub cat_desc ($) { -- cgit v1.2.3-24-ge0c7 From a7794134700c855a7a4fc030720ee8901b38a5f0 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 02:53:06 +0000 Subject: ds: simplify EventLoop implementation More importantly, make it easier-to-find the sub by avoiding runtime manipulation of subroutine names. There's no point in avoiding a potential call to _InitPoller in EventLoop since entering EventLoop is rare. On the contrary, PublicInbox::DS->new is called often and this change to avoid entering _InitPoller there may have more benefits (which may still be unmeasurable). --- lib/PublicInbox/DS.pm | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/DS.pm b/lib/PublicInbox/DS.pm index a02b3bb7..12df5919 100644 --- a/lib/PublicInbox/DS.pm +++ b/lib/PublicInbox/DS.pm @@ -50,7 +50,6 @@ our ( $PostLoopCallback, # subref to call at the end of each loop, if defined (global) $LoopTimeout, # timeout of event loop in milliseconds - $DoneInit, # if we've done the one-time module init yet @Timers, # timers $in_loop, ); @@ -75,12 +74,9 @@ sub Reset { @Timers = (); $PostLoopCallback = undef; - $DoneInit = 0; $_io = undef; # closes real $Epoll FD $Epoll = undef; # may call DSKQXS::DESTROY - - *EventLoop = *FirstTimeEventLoop; } =head2 C<< CLASS->SetLoopTimeout( $timeout ) >> @@ -137,14 +133,13 @@ sub set_cloexec ($) { fcntl($_io, F_SETFD, $fl | FD_CLOEXEC); } +# caller sets return value to $Epoll sub _InitPoller { - return if $DoneInit; - $DoneInit = 1; - if (PublicInbox::Syscall::epoll_defined()) { - $Epoll = epoll_create(); - set_cloexec($Epoll) if (defined($Epoll) && $Epoll >= 0); + my $fd = epoll_create(); + set_cloexec($fd) if (defined($fd) && $fd >= 0); + $fd; } else { my $cls; for (qw(DSKQXS DSPoll)) { @@ -152,9 +147,8 @@ sub _InitPoller last if eval "require $cls"; } $cls->import(qw(epoll_ctl epoll_wait)); - $Epoll = $cls->new; + $cls->new; } - *EventLoop = *EpollEventLoop; } =head2 C<< CLASS->EventLoop() >> @@ -163,13 +157,6 @@ Start processing IO events. In most daemon programs this never exits. See C below for how to exit the loop. =cut -sub FirstTimeEventLoop { - my $class = shift; - - _InitPoller(); - - EventLoop($class); -} sub now () { clock_gettime(CLOCK_MONOTONIC) } @@ -271,7 +258,8 @@ sub PostEventLoop () { $PostLoopCallback ? $PostLoopCallback->(\%DescriptorMap) : 1; } -sub EpollEventLoop { +sub EventLoop { + $Epoll //= _InitPoller(); local $in_loop = 1; do { my @events; @@ -330,8 +318,7 @@ sub new { $self->{sock} = $sock; my $fd = fileno($sock); - _InitPoller(); - + $Epoll //= _InitPoller(); retry: if (epoll_ctl($Epoll, EPOLL_CTL_ADD, $fd, $ev)) { if ($! == EINVAL && ($ev & EPOLLEXCLUSIVE)) { -- cgit v1.2.3-24-ge0c7 From 0f461dcd3317f44670e2fc50346f87ff41e80127 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Dec 2020 02:53:07 +0000 Subject: ds: flatten + reuse @events, epoll_wait style fixes Consistently returning the equivalent of pollfd.revents in a portable manner was never worth the effort for us, as we use the same ->event_step callback regardless of POLLIN/POLLOUT/POLLHUP. Being a Perl, @events knows it size and we don't have to return a maximum index for the caller to iterate on. We can also avoid redundant integer coercion ("+0") since we ensure everything is an IV in other places. Finally, vec() is preferable to ("\0" x $size) for resizing buffers because it only needs to write the extended portion and not overwrite the entire buffer. --- lib/PublicInbox/DS.pm | 20 +++++--------- lib/PublicInbox/DSKQXS.pm | 2 +- lib/PublicInbox/DSPoll.pm | 3 +-- lib/PublicInbox/Syscall.pm | 66 ++++++++++++++++++++++++++-------------------- 4 files changed, 45 insertions(+), 46 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/DS.pm b/lib/PublicInbox/DS.pm index 12df5919..97a6f6ef 100644 --- a/lib/PublicInbox/DS.pm +++ b/lib/PublicInbox/DS.pm @@ -87,9 +87,7 @@ A timeout of 0 (zero) means poll forever. A timeout of -1 means poll and return immediately. =cut -sub SetLoopTimeout { - return $LoopTimeout = $_[1] + 0; -} +sub SetLoopTimeout { $LoopTimeout = $_[1] + 0 } =head2 C<< PublicInbox::DS::add_timer( $seconds, $coderef, $arg) >> @@ -200,12 +198,7 @@ sub RunTimers { my $timeout = int(($Timers[0][0] - $now) * 1000) + 1; # -1 is an infinite timeout, so prefer a real timeout - return $timeout if $LoopTimeout == -1; - - # otherwise pick the lower of our regular timeout and time until - # the next timer - return $LoopTimeout if $LoopTimeout < $timeout; - return $timeout; + ($LoopTimeout < 0 || $LoopTimeout >= $timeout) ? $timeout : $LoopTimeout; } # We can't use waitpid(-1) safely here since it can hit ``, system(), @@ -261,19 +254,18 @@ sub PostEventLoop () { sub EventLoop { $Epoll //= _InitPoller(); local $in_loop = 1; + my @events; do { - my @events; - my $i; my $timeout = RunTimers(); # get up to 1000 events - my $evcount = epoll_wait($Epoll, 1000, $timeout, \@events); - for ($i=0; $i<$evcount; $i++) { + epoll_wait($Epoll, 1000, $timeout, \@events); + for my $fd (@events) { # it's possible epoll_wait returned many events, including some at the end # that ones in the front triggered unregister-interest actions. if we # can't find the %sock entry, it's because we're no longer interested # in that event. - $DescriptorMap{$events[$i]->[0]}->event_step; + $DescriptorMap{$fd}->event_step; } } while (PostEventLoop()); _run_later(); diff --git a/lib/PublicInbox/DSKQXS.pm b/lib/PublicInbox/DSKQXS.pm index d1d3fe60..aa2c9168 100644 --- a/lib/PublicInbox/DSKQXS.pm +++ b/lib/PublicInbox/DSKQXS.pm @@ -134,7 +134,7 @@ sub epoll_wait { } } # caller only cares for $events[$i]->[0] - scalar(@$events); + $_ = $_->[0] for @$events; } # kqueue is close-on-fork (not exec), so we must not close it diff --git a/lib/PublicInbox/DSPoll.pm b/lib/PublicInbox/DSPoll.pm index 1d9b51d9..a218f695 100644 --- a/lib/PublicInbox/DSPoll.pm +++ b/lib/PublicInbox/DSPoll.pm @@ -45,14 +45,13 @@ sub epoll_wait { my $fd = $pset[$i++]; my $revents = $pset[$i++] or next; delete($self->{$fd}) if $self->{$fd} & EPOLLONESHOT; - push @$events, [ $fd ]; + push @$events, $fd; } my $nevents = scalar @$events; if ($n != $nevents) { warn "BUG? poll() returned $n, but got $nevents"; } } - $n; } 1; diff --git a/lib/PublicInbox/Syscall.pm b/lib/PublicInbox/Syscall.pm index e4f00a2a..c403f78a 100644 --- a/lib/PublicInbox/Syscall.pm +++ b/lib/PublicInbox/Syscall.pm @@ -227,38 +227,46 @@ sub epoll_ctl_mod8 { our $epoll_wait_events; our $epoll_wait_size = 0; sub epoll_wait_mod4 { - # resize our static buffer if requested size is bigger than we've ever done - if ($_[1] > $epoll_wait_size) { - $epoll_wait_size = $_[1]; - $epoll_wait_events = "\0" x 12 x $epoll_wait_size; - } - my $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0); - for (0..$ct-1) { - @{$_[3]->[$_]}[1,0] = unpack("LL", substr($epoll_wait_events, 12*$_, 8)); - } - return $ct; + my ($epfd, $maxevents, $timeout_msec, $events) = @_; + # resize our static buffer if maxevents bigger than we've ever done + if ($maxevents > $epoll_wait_size) { + $epoll_wait_size = $maxevents; + vec($epoll_wait_events, $maxevents * 12 * 8 - 1, 1) = 0; + } + @$events = (); + my $ct = syscall($SYS_epoll_wait, $epfd, $epoll_wait_events, + $maxevents, $timeout_msec); + for (0..$ct - 1) { + # 12-byte struct epoll_event + # 4 bytes uint32_t events mask (skipped, useless to us) + # 8 bytes: epoll_data_t union (first 4 bytes are the fd) + # So we skip the first 4 bytes and take the middle 4: + $events->[$_] = unpack('L', substr($epoll_wait_events, + 12 * $_ + 4, 4)); + } } sub epoll_wait_mod8 { - # resize our static buffer if requested size is bigger than we've ever done - if ($_[1] > $epoll_wait_size) { - $epoll_wait_size = $_[1]; - $epoll_wait_events = "\0" x 16 x $epoll_wait_size; - } - my $ct; - if ($no_deprecated) { - $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0, undef); - } else { - $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0); - } - for (0..$ct-1) { - # 16 byte epoll_event structs, with format: - # 4 byte mask [idx 1] - # 4 byte padding (we put it into idx 2, useless) - # 8 byte data (first 4 bytes are fd, into idx 0) - @{$_[3]->[$_]}[1,2,0] = unpack("LLL", substr($epoll_wait_events, 16*$_, 12)); - } - return $ct; + my ($epfd, $maxevents, $timeout_msec, $events) = @_; + + # resize our static buffer if maxevents bigger than we've ever done + if ($maxevents > $epoll_wait_size) { + $epoll_wait_size = $maxevents; + vec($epoll_wait_events, $maxevents * 16 * 8 - 1, 1) = 0; + } + @$events = (); + my $ct = syscall($SYS_epoll_wait, $epfd, $epoll_wait_events, + $maxevents, $timeout_msec, + $no_deprecated ? undef : ()); + for (0..$ct - 1) { + # 16-byte struct epoll_event + # 4 bytes uint32_t events mask (skipped, useless to us) + # 4 bytes padding (skipped, useless) + # 8 bytes epoll_data_t union (first 4 bytes are the fd) + # So skip the first 8 bytes, take 4, and ignore the last 4: + $events->[$_] = unpack('L', substr($epoll_wait_events, + 16 * $_ + 8, 4)); + } } sub signalfd ($$$) { -- cgit v1.2.3-24-ge0c7