diff options
Diffstat (limited to 'lib')
196 files changed, 14343 insertions, 6164 deletions
diff --git a/lib/PublicInbox/Address.pm b/lib/PublicInbox/Address.pm index 2c9c4395..3a59945c 100644 --- a/lib/PublicInbox/Address.pm +++ b/lib/PublicInbox/Address.pm @@ -1,9 +1,8 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::Address; -use strict; -use v5.10.1; -use parent 'Exporter'; +use v5.12; +use parent qw(Exporter); our @EXPORT_OK = qw(pairs); sub xs_emails { @@ -20,8 +19,11 @@ sub xs_names { } sub xs_pairs { # for JMAP, RFC 8621 section 4.1.2.3 - [ map { # LHS (name) may be undef - [ $_->phrase // $_->comment, $_->address ] + [ map { # LHS (name) may be undef if there's an address + my @p = ($_->phrase // $_->comment, $_->address); + # show original if totally bogus: + $p[0] = $_->original unless defined $p[1]; + \@p; } parse_email_addresses($_[0]) ]; } @@ -31,6 +33,7 @@ eval { *emails = \&xs_emails; *names = \&xs_names; *pairs = \&xs_pairs; + *objects = sub { Email::Address::XS->parse(@_) }; }; if ($@) { @@ -38,6 +41,7 @@ if ($@) { *emails = \&PublicInbox::AddressPP::emails; *names = \&PublicInbox::AddressPP::names; *pairs = \&PublicInbox::AddressPP::pairs; + *objects = \&PublicInbox::AddressPP::objects; } 1; diff --git a/lib/PublicInbox/AddressPP.pm b/lib/PublicInbox/AddressPP.pm index 6a3ae4fe..65ba36a9 100644 --- a/lib/PublicInbox/AddressPP.pm +++ b/lib/PublicInbox/AddressPP.pm @@ -1,7 +1,8 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::AddressPP; use strict; +use v5.10.1; # TODO check regexps for unicode_strings compat # very loose regexes, here. We don't need RFC-compliance, # just enough to make thing sanely displayable and pass to git @@ -56,4 +57,13 @@ sub pairs { # for JMAP, RFC 8621 section 4.1.2.3 } emails($s) ]; } +# Mail::Address->name is inconsistent with Email::Address::XS, so we're +# doing our own thing, here: +sub objects { map { bless $_, __PACKAGE__ } @{pairs($_[0])} } + +# OO API for objects() results +sub user { (split(/@/, $_[0]->[1]))[0] } +sub host { (split(/@/, $_[0]->[1]))[1] } +sub name { $_[0]->[0] // user($_[0]) } + 1; diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index 11ea8f83..a1b1fc07 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -1,15 +1,15 @@ -# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # common stuff for administrative command-line tools # Unstable internal API package PublicInbox::Admin; -use strict; +use v5.12; use parent qw(Exporter); -our @EXPORT_OK = qw(setup_signals); +our @EXPORT_OK = qw(setup_signals fmt_localtime); use PublicInbox::Config; use PublicInbox::Inbox; -use PublicInbox::Spawn qw(popen_rd); +use PublicInbox::Spawn qw(run_qx); use PublicInbox::Eml; *rel2abs_collapsed = \&PublicInbox::Config::rel2abs_collapsed; @@ -28,12 +28,12 @@ sub setup_signals { }; } -sub resolve_eidxdir { - my ($cd) = @_; +sub resolve_any_idxdir ($$) { + my ($cd, $lock_bn) = @_; my $try = $cd // '.'; my $root_dev_ino; - while (1) { # favor v2, first - if (-f "$try/ei.lock") { + while (1) { + if (-f "$try/$lock_bn") { # inbox.lock, ei.lock, cidx.lock return rel2abs_collapsed($try); } elsif (-d $try) { my @try = stat _; @@ -49,61 +49,47 @@ sub resolve_eidxdir { } } +sub resolve_eidxdir ($) { resolve_any_idxdir($_[0], 'ei.lock') } +sub resolve_cidxdir ($) { resolve_any_idxdir($_[0], 'cidx.lock') } + sub resolve_inboxdir { my ($cd, $ver) = @_; - my $try = $cd // '.'; - my $root_dev_ino; - while (1) { # favor v2, first - if (-f "$try/inbox.lock") { - $$ver = 2 if $ver; - return rel2abs_collapsed($try); - } elsif (-d $try) { - my @try = stat _; - $root_dev_ino //= do { - my @root = stat('/') or die "stat /: $!\n"; - "$root[0]\0$root[1]"; - }; - last if "$try[0]\0$try[1]" eq $root_dev_ino; - $try .= '/..'; # continue, cd up - } else { - die "`$try' is not a directory\n"; - } - } - # try v1 bare git dirs - my $cmd = [ qw(git rev-parse --git-dir) ]; - my $fh = popen_rd($cmd, undef, {-C => $cd}); - my $dir = do { local $/; <$fh> }; - close $fh or die "error in @$cmd (cwd:${\($cd // '.')}): $!\n"; - chomp $dir; - $$ver = 1 if $ver; - rel2abs_collapsed($dir eq '.' ? ($cd // $dir) : $dir); + my $dir; + if (defined($dir = resolve_any_idxdir($cd, 'inbox.lock'))) { # try v2 + $$ver = 2 if $ver; + } elsif (defined($dir = resolve_git_dir($cd))) { # try v1 + $$ver = 1 if $ver; + } # else: not an inbox at all + $dir; } -# for unconfigured inboxes -sub detect_indexlevel ($) { - my ($ibx) = @_; - - my $over = $ibx->over; - my $srch = $ibx->search; - delete @$ibx{qw(over search)}; # don't leave open FDs lying around - - # brand new or never before indexed inboxes default to full - return 'full' unless $over; - my $l = 'basic'; - return $l unless $srch; - if (my $xdb = $srch->xdb) { - $l = 'full'; - my $m = $xdb->get_metadata('indexlevel'); - if ($m eq 'medium') { - $l = $m; - } elsif ($m ne '') { - warn <<""; -$ibx->{inboxdir} has unexpected indexlevel in Xapian: $m +sub valid_pwd { + my $pwd = $ENV{PWD} // return; + my @st_pwd = stat $pwd or return; + my @st_cwd = stat '.' or die "stat(.): $!"; + "@st_pwd[1,0]" eq "@st_cwd[1,0]" ? $pwd : undef; +} - } - $ibx->{-skip_docdata} = 1 if $xdb->get_metadata('skip_docdata'); +sub resolve_git_dir { + my ($cd) = @_; # cd may be `undef' for cwd + # try v1 bare git dirs + my $pwd = valid_pwd(); + my $env; + defined($pwd) && substr($cd // '/', 0, 1) ne '/' and + $env->{PWD} = "$pwd/$cd"; + my $cmd = [ qw(git rev-parse --git-dir) ]; + my $dir = run_qx($cmd, $env, { -C => $cd }); + die "error in @$cmd (cwd:${\($cd // '.')}): $?\n" if $?; + chomp $dir; + # --absolute-git-dir requires git v2.13.0+, and we want to + # respect symlinks when $ENV{PWD} if $ENV{PWD} ne abs_path('.') + # since we store absolute GIT_DIR paths in cindex. + if (substr($dir, 0, 1) ne '/') { + substr($cd // '/', 0, 1) eq '/' or + $cd = File::Spec->rel2abs($cd, $pwd); + $dir = rel2abs_collapsed($dir, $cd); } - $l; + $dir; } sub unconfigured_ibx ($$) { @@ -128,12 +114,22 @@ sub resolve_inboxes ($;$$) { $cfg or die "--all specified, but $cfgfile not readable\n"; @$argv and die "--all specified, but directories specified\n"; } - my (@old, @ibxs, @eidx); + my (@old, @ibxs, @eidx, @cidx); + if ($opt->{-cidx_ok}) { + require PublicInbox::CodeSearchIdx; + @$argv = grep { + if (defined(my $d = resolve_cidxdir($_))) { + push @cidx, PublicInbox::CodeSearchIdx->new( + $d, $opt); + undef; + } else { + 1; + } + } @$argv; + } if ($opt->{-eidx_ok}) { require PublicInbox::ExtSearchIdx; - my $i = -1; @$argv = grep { - $i++; if (defined(my $ei = resolve_eidxdir($_))) { $ei = PublicInbox::ExtSearchIdx->new($ei, $opt); push @eidx, $ei; @@ -155,6 +151,7 @@ sub resolve_inboxes ($;$$) { warn "W: $ibx->{name} $ibx->{inboxdir}: $!\n"; } }); + # TODO: no way to configure cindex in config file, yet } else { # directories specified on the command-line my @dirs = @$argv; push @dirs, '.' if !@dirs && $opt->{-use_cwd}; @@ -195,7 +192,8 @@ sub resolve_inboxes ($;$$) { die "-V$min_ver inboxes not supported by $0\n\t", join("\n\t", @old), "\n"; } - $opt->{-eidx_ok} ? (\@ibxs, \@eidx) : @ibxs; + ($opt->{-eidx_ok} || $opt->{-cidx_ok}) ? (\@ibxs, \@eidx, \@cidx) + : @ibxs; } my @base_mod = (); @@ -203,13 +201,13 @@ my @over_mod = qw(DBD::SQLite DBI); my %mod_groups = ( -index => [ @base_mod, @over_mod ], -base => \@base_mod, - -search => [ @base_mod, @over_mod, 'Search::Xapian' ], + -search => [ @base_mod, @over_mod, 'Xapian' ], ); sub scan_ibx_modules ($$) { my ($mods, $ibx) = @_; if (!$ibx->{indexlevel} || $ibx->{indexlevel} ne 'basic') { - $mods->{'Search::Xapian'} = 1; + $mods->{'Xapian'} = 1; } else { $mods->{$_} = 1 foreach @over_mod; } @@ -221,10 +219,10 @@ sub check_require { while (my $mod = shift @mods) { if (my $groups = $mod_groups{$mod}) { push @mods, @$groups; - } elsif ($mod eq 'Search::Xapian') { + } elsif ($mod eq 'Xapian') { require PublicInbox::Search; PublicInbox::Search::load_xapian() or - $err->{'Search::Xapian || Xapian'} = $@; + $err->{'Xapian || Search::Xapian'} = $@; } else { eval "require $mod"; $err->{$mod} = $@ if $@; @@ -383,4 +381,12 @@ sub do_chdir ($) { } } +sub fmt_localtime ($) { + require POSIX; + my @lt = localtime $_[0]; + my (undef, $M, $H, $d, $m, $Y) = @lt; + sprintf('%u-%02u-%02u % 2u:%02u ', $Y + 1900, $m + 1, $d, $H, $M) + .POSIX::strftime('%z', @lt); +} + 1; diff --git a/lib/PublicInbox/AdminEdit.pm b/lib/PublicInbox/AdminEdit.pm index c8c3d3e8..654141a7 100644 --- a/lib/PublicInbox/AdminEdit.pm +++ b/lib/PublicInbox/AdminEdit.pm @@ -19,11 +19,11 @@ sub check_editable ($) { } # Undefined indexlevel, so `full'... - # Search::Xapian exists and the DB can be read, at least, fine + # Xapian exists and the DB can be read, at least, fine $ibx->search and next; # it's possible for a Xapian directory to exist, - # but Search::Xapian to go missing/broken. + # but Xapian to go missing/broken. # Make sure it's purged in that case: $ibx->over or die "no over.sqlite3 in $ibx->{inboxdir}\n"; diff --git a/lib/PublicInbox/Aspawn.pm b/lib/PublicInbox/Aspawn.pm new file mode 100644 index 00000000..49f8651a --- /dev/null +++ b/lib/PublicInbox/Aspawn.pm @@ -0,0 +1,34 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# async system()/qx() which takes callback +package PublicInbox::Aspawn; +use v5.12; +use parent qw(Exporter); +use PublicInbox::DS qw(awaitpid); +use PublicInbox::Spawn qw(spawn); +our @EXPORT_OK = qw(run_await); + +sub _await_cb { # awaitpid cb + my ($pid, $cmd, $env, $opt, $cb, @args) = @_; + PublicInbox::Spawn::read_out_err($opt); + if ($? && !$opt->{quiet}) { + my ($status, $sig) = ($? >> 8, $? & 127); + my $msg = ''; + $msg .= " (-C=$opt->{-C})" if defined $opt->{-C}; + $msg .= " status=$status" if $status; + $msg .= " signal=$sig" if $sig; + warn "E: @$cmd", $msg, "\n"; + } + $cb->($pid, $cmd, $env, $opt, @args) if $cb; +} + +sub run_await { + my ($cmd, $env, $opt, $cb, @args) = @_; + $opt->{1} //= \(my $out); + my $pid = spawn($cmd, $env, $opt); + awaitpid($pid, \&_await_cb, $cmd, $env, $opt, $cb, @args); + awaitpid($pid); # synchronous for non-$in_loop +} + +1; diff --git a/lib/PublicInbox/AutoReap.pm b/lib/PublicInbox/AutoReap.pm index 23ecce77..ae4984b8 100644 --- a/lib/PublicInbox/AutoReap.pm +++ b/lib/PublicInbox/AutoReap.pm @@ -3,8 +3,7 @@ # automatically kill + reap children when this goes out-of-scope package PublicInbox::AutoReap; -use v5.10.1; -use strict; +use v5.12; sub new { my (undef, $pid, $cb) = @_; @@ -21,8 +20,8 @@ sub join { my $pid = delete $self->{pid} or return; $self->{cb}->() if defined $self->{cb}; CORE::kill($sig, $pid) if defined $sig; - my $ret = waitpid($pid, 0) // die "waitpid($pid): $!"; - $ret == $pid or die "BUG: waitpid($pid) != $ret"; + my $r = waitpid($pid, 0); + $r == $pid or die "BUG? waitpid($pid) => $r (\$?=$? \$!=$!)"; } sub DESTROY { diff --git a/lib/PublicInbox/Cgit.pm b/lib/PublicInbox/Cgit.pm index cc729aa2..78fc9ca0 100644 --- a/lib/PublicInbox/Cgit.pm +++ b/lib/PublicInbox/Cgit.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # wrapper for cgit(1) and git-http-backend(1) for browsing and @@ -6,7 +6,8 @@ # directive to be set in the public-inbox config file. package PublicInbox::Cgit; -use strict; +use v5.12; +use parent qw(PublicInbox::WwwCoderepo); use PublicInbox::GitHTTPBackend; use PublicInbox::Git; # not bothering with Exporter for a one-off @@ -40,10 +41,9 @@ sub locate_cgit ($) { if (defined($cgit_bin) && $cgit_bin =~ m!\A(.+?)/[^/]+\z!) { unshift @dirs, $1 if -d $1; } - foreach my $d (@dirs) { - my $f = "$d/cgit.css"; - next unless -f $f; - $cgit_data = $d; + for (@dirs) { + next unless -f "$_/cgit.css"; + $cgit_data = $_; last; } } @@ -53,29 +53,18 @@ sub locate_cgit ($) { sub new { my ($class, $pi_cfg) = @_; my ($cgit_bin, $cgit_data) = locate_cgit($pi_cfg); - # TODO: support gitweb and other repository viewers? - if (defined(my $cgitrc = $pi_cfg->{-cgitrc_unparsed})) { - $pi_cfg->parse_cgitrc($cgitrc, 0); - } + $cgit_bin // return; # fall back in WWW->cgit my $self = bless { cmd => [ $cgit_bin ], cgit_data => $cgit_data, pi_cfg => $pi_cfg, + cgitrc => $pi_cfg->{'publicinbox.cgitrc'} // $ENV{CGIT_CONFIG}, }, $class; # some cgit repos may not be mapped to inboxes, so ensure those exist: - my $code_repos = $pi_cfg->{-code_repos}; - foreach my $k (keys %$pi_cfg) { - $k =~ /\Acoderepo\.(.+)\.dir\z/ or next; - my $dir = $pi_cfg->{$k}; - $code_repos->{$1} ||= $pi_cfg->fill_code_repo($1); - } - while (my ($nick, $repo) = each %$code_repos) { - $self->{"\0$nick"} = $repo; - } - my $cgit_static = $pi_cfg->{-cgit_static}; - my $static = join('|', map { quotemeta $_ } keys %$cgit_static); - $self->{static} = qr/\A($static)\z/; + PublicInbox::WwwCoderepo::prepare_coderepos($self); + my $s = join('|', map { quotemeta } keys %{$pi_cfg->{-cgit_static}}); + $self->{static} = qr/\A($s)\z/; $self; } @@ -96,14 +85,14 @@ my @PASS_ENV = qw( my $parse_cgi_headers = \&PublicInbox::GitHTTPBackend::parse_cgi_headers; sub call { - my ($self, $env) = @_; + my ($self, $env, $ctx) = @_; # $ctx is optional, used by WWW my $path_info = $env->{PATH_INFO}; my $cgit_data; # handle requests without spawning cgit iff possible: if ($path_info =~ m!\A/(.+?)/($PublicInbox::GitHTTPBackend::ANY)\z!ox) { my ($nick, $path) = ($1, $2); - if (my PublicInbox::Git $git = $self->{"\0$nick"}) { + if (my $git = $self->{pi_cfg}->get_coderepo($nick)) { return serve($env, $git, $path); } } elsif ($path_info =~ m!$self->{static}! && @@ -112,17 +101,14 @@ sub call { return PublicInbox::WwwStatic::response($env, [], $f); } - my $cgi_env = { PATH_INFO => $path_info }; - foreach (@PASS_ENV) { - defined(my $v = $env->{$_}) or next; - $cgi_env->{$_} = $v; - } - $cgi_env->{'HTTPS'} = 'on' if $env->{'psgi.url_scheme'} eq 'https'; + my %cgi_env = (CGIT_CONFIG => $self->{cgitrc}, PATH_INFO => $path_info); + @cgi_env{@PASS_ENV} = @$env{@PASS_ENV}; # spawn ignores undef vals + $cgi_env{HTTPS} = 'on' if $env->{'psgi.url_scheme'} eq 'https'; my $rdr = input_prepare($env) or return r(500); - my $qsp = PublicInbox::Qspawn->new($self->{cmd}, $cgi_env, $rdr); + my $qsp = PublicInbox::Qspawn->new($self->{cmd}, \%cgi_env, $rdr); my $limiter = $self->{pi_cfg}->limiter('-cgit'); - $qsp->psgi_return($env, $limiter, $parse_cgi_headers); + $qsp->psgi_yield($env, $limiter, $parse_cgi_headers, $ctx); } 1; diff --git a/lib/PublicInbox/CidxComm.pm b/lib/PublicInbox/CidxComm.pm new file mode 100644 index 00000000..80a235e9 --- /dev/null +++ b/lib/PublicInbox/CidxComm.pm @@ -0,0 +1,28 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Waits for initial comm(1) output for PublicInbox::CodeSearchIdx. +# The initial output from `comm' can take a while to generate because +# it needs to wait on: +# `git cat-file --batch-all-objects --batch-check --unordered | sort' +# We still rely on blocking reads, here, since comm should be fast once +# it's seeing input. (`--unordered | sort' is intentional for HDDs) +package PublicInbox::CidxComm; +use v5.12; +use parent qw(PublicInbox::DS); +use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); + +sub new { + my ($cls, $rd, $cidx, $drs) = @_; + my $self = bless { cidx => $cidx, drs => $drs }, $cls; + $self->SUPER::new($rd, EPOLLIN|EPOLLONESHOT); +} + +sub event_step { + my ($self) = @_; + my $rd = $self->{sock} // return warn('BUG?: no {sock}'); + $self->close; # EPOLL_CTL_DEL + delete($self->{cidx})->cidx_read_comm($rd, delete $self->{drs}); +} + +1; diff --git a/lib/PublicInbox/CidxLogP.pm b/lib/PublicInbox/CidxLogP.pm new file mode 100644 index 00000000..5ea675bf --- /dev/null +++ b/lib/PublicInbox/CidxLogP.pm @@ -0,0 +1,28 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Waits for initial `git log -p' output for PublicInbox::CodeSearchIdx. +# The initial output from `git log -p' can take a while to generate, +# CodeSearchIdx can process prune work while it's happening. Once +# `git log -p' starts generating output, it should be able to keep +# up with Xapian indexing, so we still rely on blocking reads to simplify +# cidx_read_log_p +package PublicInbox::CidxLogP; +use v5.12; +use parent qw(PublicInbox::DS); +use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); + +sub new { + my ($cls, $rd, $cidx, $git, $roots) = @_; + my $self = bless { cidx => $cidx, git => $git, roots => $roots }, $cls; + $self->SUPER::new($rd, EPOLLIN|EPOLLONESHOT); +} + +sub event_step { + my ($self) = @_; + my $rd = $self->{sock} // return warn('BUG?: no {sock}'); + $self->close; # EPOLL_CTL_DEL + delete($self->{cidx})->cidx_read_log_p($self, $rd); +} + +1; diff --git a/lib/PublicInbox/CidxXapHelperAux.pm b/lib/PublicInbox/CidxXapHelperAux.pm new file mode 100644 index 00000000..91c9b021 --- /dev/null +++ b/lib/PublicInbox/CidxXapHelperAux.pm @@ -0,0 +1,48 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Intended for PublicInbox::DS::event_loop for -cindex --associate, +# this reports auxilliary status while dumping +package PublicInbox::CidxXapHelperAux; +use v5.12; +use parent qw(PublicInbox::DS); +use PublicInbox::Syscall qw(EPOLLIN); + +# rpipe connects to req->fp[1] in xap_helper.h +sub new { + my ($cls, $rpipe, $cidx, $pfx) = @_; + my $self = bless { cidx => $cidx, pfx => $pfx }, $cls; + $rpipe->blocking(0); + $self->SUPER::new($rpipe, EPOLLIN); +} + +sub event_step { + my ($self) = @_; # xap_helper.h is line-buffered + my $buf = delete($self->{buf}) // ''; + my $n = sysread($self->{sock}, $buf, 65536, length($buf)); + if (!defined($n)) { + return if $!{EAGAIN}; + die "sysread: $!"; + } + my $pfx = $self->{pfx}; + if ($n == 0) { + warn "BUG? $pfx buf=$buf" if $buf ne ''; + if (delete $self->{cidx}->{PENDING}->{$pfx}) { + warn "BUG? $pfx did not get mset.size"; + $self->{cidx}->index_next; + } + return $self->close; + } + my @lines = split(/^/m, $buf); + $self->{buf} = pop @lines if substr($lines[-1], -1) ne "\n"; + for my $l (@lines) { + if ($l =~ /\Amset\.size=[0-9]+ nr_out=[0-9]+\n\z/) { + delete $self->{cidx}->{PENDING}->{$pfx}; + $self->{cidx}->index_next; + } + chomp $l; + $self->{cidx}->progress("$pfx $l"); + } +} + +1; diff --git a/lib/PublicInbox/CmdIPC4.pm b/lib/PublicInbox/CmdIPC4.pm index e368d032..fc77bd03 100644 --- a/lib/PublicInbox/CmdIPC4.pm +++ b/lib/PublicInbox/CmdIPC4.pm @@ -7,31 +7,44 @@ package PublicInbox::CmdIPC4; use v5.12; use Socket qw(SOL_SOCKET SCM_RIGHTS); + +sub sendmsg_retry ($) { + return 1 if $!{EINTR}; + return unless ($!{ENOMEM} || $!{ENOBUFS} || $!{ETOOMANYREFS}); + return if --$_[0] < 0; + warn "# sleeping on sendmsg: $! ($_[0] tries left)\n"; + select(undef, undef, undef, 0.1); + 1; +} + BEGIN { eval { require Socket::MsgHdr; # XS no warnings 'once'; # any number of FDs per-sendmsg(2) + buffer -*send_cmd4 = sub ($$$$) { # (sock, fds, buf, flags) = @_; - my ($sock, $fds, undef, $flags) = @_; +*send_cmd4 = sub ($$$$;$) { # (sock, fds, buf, flags) = @_; + my ($sock, $fds, undef, $flags, $tries) = @_; + $tries //= 50; my $mh = Socket::MsgHdr->new(buf => $_[2]); $mh->cmsghdr(SOL_SOCKET, SCM_RIGHTS, pack('i' x scalar(@$fds), @$fds)); my $s; - my $try = 0; do { $s = Socket::MsgHdr::sendmsg($sock, $mh, $flags); - } while (!defined($s) && - ($!{ENOBUFS} || $!{ENOMEM} || $!{ETOOMANYREFS}) && - (++$try < 50) && - warn "sleeping on sendmsg: $! (#$try)\n" && - select(undef, undef, undef, 0.1) == 0); + } while (!defined($s) && sendmsg_retry($tries)); $s; }; *recv_cmd4 = sub ($$$) { my ($s, undef, $len) = @_; # $_[1] = destination buffer my $mh = Socket::MsgHdr->new(buflen => $len, controllen => 256); - my $r = Socket::MsgHdr::recvmsg($s, $mh, 0) // return (undef); + my $r; + do { + $r = Socket::MsgHdr::recvmsg($s, $mh, 0); + } while (!defined($r) && $!{EINTR}); + if (!defined($r)) { + $_[1] = ''; + return (undef); + } $_[1] = $mh->buf; return () if $r == 0; my (undef, undef, $data) = $mh->cmsghdr; diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm new file mode 100644 index 00000000..e5fa4480 --- /dev/null +++ b/lib/PublicInbox/CodeSearch.pm @@ -0,0 +1,372 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# read-only external index for coderepos +# currently, it only indexes commits and repository metadata +# (pathname, root commits); not blob contents +package PublicInbox::CodeSearch; +use v5.12; +use parent qw(PublicInbox::Search); +use PublicInbox::Config; +use PublicInbox::Search qw(retry_reopen int_val xap_terms); +use PublicInbox::Compat qw(uniqstr); +use Compress::Zlib qw(uncompress); +use constant { + AT => 0, # author time YYYYMMDDHHMMSS, dt: for mail) + CT => 1, # commit time (Unix time stamp, like TS/rt: in mail) + CIDX_SCHEMA_VER => 1, # brand new schema for code search + # for repos (`Tr'), CT(col=1) is used for the latest tip commit time + # in refs/{heads,tags}. AT(col=0) may be used to store disk usage + # in the future, but disk usage calculation is espensive w/ alternates +}; +our @CODE_NRP; +our @CODE_VMAP = ( + [ AT, 'd:' ], # mairix compat + [ AT, 'dt:' ], # public-inbox mail compat + [ CT, 'ct:' ], +); + +# note: the non-X term prefix allocations are shared with Xapian omega, +# see xapian-applications/omega/docs/termprefixes.rst +# bool_pfx_internal: +# type => 'T', # 'c' - commit, 'r' - repo GIT_DIR +# tags are not indexed, only normal branches (refs/heads/*), not hidden +# 'P' # (pathname) GIT_DIR # uniq +# 'G' # (group) root commit (may have multiple roots) +my %bool_pfx_external = ( + oid => 'Q', # type:commit - git OID hex (40|64)-byte SHA-(1|256) + # type:repo - rel2abs_collapsed(GIT_DIR) + parent => 'XP', + %PublicInbox::Search::PATCH_BOOL_COMMON, +); + +my %prob_prefix = ( # copied from PublicInbox::Search + # do we care about committer? or partial commit OID via Xapian? + # o => 'XQ', # 'oid:' (bool) is exact, 'o:' (prob) can do partial + %PublicInbox::Search::PATCH_PROB_COMMON, + + # default: + '' => 'S A XQUOT XFN ' . $PublicInbox::Search::NON_QUOTED_BODY +); + +sub new { + my ($cls, $dir, $cfg) = @_; + # can't have a PublicInbox::Config here due to circular refs + bless { topdir => $dir, xpfx => "$dir/cidx".CIDX_SCHEMA_VER, + -cfg_f => $cfg->{-f} }, $cls; +} + +sub join_data_key ($) { "join:$_[0]->{-cfg_f}" } + +sub join_data { + my ($self) = @_; + my $key = join_data_key($self); + my $cur = $self->xdb->get_metadata($key) or return; + $cur = eval { PublicInbox::Config::json()->decode(uncompress($cur)) }; + warn "E: $@ (corrupt metadata in `$key' key?)" if $@; + my @m = grep { ref($cur->{$_}) ne 'ARRAY' } qw(ekeys roots ibx2root); + if (@m) { + warn <<EOM; +W: $self->{topdir} join data for $self->{-cfg_f} missing: @m +EOM + undef; + } elsif (@{$cur->{ekeys}} < @{$cur->{ibx2root}}) { + warn <<EOM; +W: $self->{topdir} join data for $self->{-cfg_f} mismatched ekeys and ibx2root +EOM + undef; + } else { + $cur; + } +} + +sub qparse_new ($) { + my ($self) = @_; + my $qp = $self->qp_init_common; + my $cb = $qp->can('add_valuerangeprocessor') // + $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ + if (!@CODE_NRP) { + @CODE_NRP = map { + $PublicInbox::Search::NVRP->new(@$_) + } @CODE_VMAP; + } + $cb->($qp, $_) for @CODE_NRP; + while (my ($name, $pfx) = each %bool_pfx_external) { + $qp->add_boolean_prefix($name, $_) for split(/ /, $pfx); + } + while (my ($name, $pfx) = each %prob_prefix) { + $qp->add_prefix($name, $_) for split(/ /, $pfx); + } + $qp; +} + +sub generate_cxx () { # generates snippet for xap_helper.h + my $ret = <<EOM; +# line ${\__LINE__} "${\__FILE__}" +static NRP *code_nrp[${\scalar(@CODE_VMAP)}]; +static void code_nrp_init(void) +{ +EOM + for (0..$#CODE_VMAP) { + my $x = $CODE_VMAP[$_]; + $ret .= qq{\tcode_nrp[$_] = new NRP($x->[0], "$x->[1]");\n} + } +$ret .= <<EOM; +} + +# line ${\__LINE__} "${\__FILE__}" +static void qp_init_code_search(Xapian::QueryParser *qp) +{ + for (size_t i = 0; i < MY_ARRAY_SIZE(code_nrp); i++) + qp->ADD_RP(code_nrp[i]); +EOM + for my $name (sort keys %bool_pfx_external) { + for (split(/ /, $bool_pfx_external{$name})) { + $ret .= qq{\tqp->add_boolean_prefix("$name", "$_");\n} + } + } + for my $name (sort keys %prob_prefix) { + for (split(/ /, $prob_prefix{$name})) { + $ret .= qq{\tqp->add_prefix("$name", "$_");\n} + } + } + $ret .= "}\n"; +} + +# returns a Xapian::Query to filter by roots +sub roots_filter { # retry_reopen callback + my ($self, $git_dir) = @_; + my $xdb = $self->xdb; + my $P = 'P'.$git_dir; + my ($cur, $end) = ($xdb->postlist_begin($P), $xdb->postlist_end($P)); + if ($cur == $end) { + warn "W: $git_dir not indexed?\n"; + return; + } + my @roots = xap_terms('G', $xdb, $cur->get_docid); + if (!@roots) { + warn "W: $git_dir has no root commits?\n"; + return; + } + my $q = $PublicInbox::Search::X{Query}->new('G'.shift(@roots)); + for my $r (@roots) { + $q = $PublicInbox::Search::X{Query}->new( + PublicInbox::Search::OP_OR(), + $q, 'G'.$r); + } + $q; +} + +sub mset { + my ($self, $qry_str, $opt) = @_; + my $qp = $self->{qp} //= qparse_new($self); + my $qry = $qp->parse_query($qry_str, $self->{qp_flags}); + + # limit to commits with shared roots + if (defined(my $git_dir = $opt->{git_dir})) { + my $rf = retry_reopen($self, \&roots_filter, $git_dir) + or return; + + $qry = $PublicInbox::Search::X{Query}->new( + PublicInbox::Search::OP_FILTER(), + $qry, $rf); + } + + # we only want commits: + $qry = $PublicInbox::Search::X{Query}->new( + PublicInbox::Search::OP_FILTER(), + $qry, 'T'.'c'); + $self->do_enquire($qry, $opt, CT); +} + +sub roots2paths { # for diagnostics + my ($self) = @_; + my $cur = $self->xdb->allterms_begin('G'); + my $end = $self->{xdb}->allterms_end('G'); + my $qrepo = $PublicInbox::Search::X{Query}->new('T'.'r'); + my $enq = $PublicInbox::Search::X{Enquire}->new($self->{xdb}); + $enq->set_weighting_scheme($PublicInbox::Search::X{BoolWeight}->new); + $enq->set_docid_order($PublicInbox::Search::ENQ_ASCENDING); + my %ret; + for (; $cur != $end; $cur++) { + my $G_oidhex = $cur->get_termname; + my $qry = $PublicInbox::Search::X{Query}->new( + PublicInbox::Search::OP_FILTER(), + $qrepo, $G_oidhex); + $enq->set_query($qry); + my ($size, $off, $lim) = (0, 0, 100000); + my $dirs = $ret{substr($G_oidhex, 1)} = []; + do { + my $mset = $enq->get_mset($off += $size, $lim); + for my $x ($mset->items) { + push @$dirs, xap_terms('P', $x->get_document); + } + $size = $mset->size; + } while ($size); + @$dirs = sort(uniqstr(@$dirs)); + } + \%ret; +} + +sub docids_of_git_dir ($$) { + my ($self, $git_dir) = @_; + my @ids = $self->docids_by_postlist('P'.$git_dir); + warn <<"" if @ids > 1; +BUG: (non-fatal) $git_dir indexed multiple times in $self->{topdir} + + @ids; +} + +sub root_oids ($$) { + my ($self, $git_dir) = @_; + my @ids = docids_of_git_dir $self, $git_dir or warn <<""; +BUG? (non-fatal) `$git_dir' not indexed in $self->{topdir} + + my @ret = map { xap_terms('G', $self->xdb, $_) } @ids; + @ret = uniqstr(@ret) if @ids > 1; + @ret; +} + +sub paths2roots { + my ($self, $paths) = @_; + my %ret; + if ($paths) { + for my $p (keys %$paths) { @{$ret{$p}} = root_oids($self, $p) } + } else { + my $tmp = roots2paths($self); + for my $root_oidhex (keys %$tmp) { + my $paths = delete $tmp->{$root_oidhex}; + push @{$ret{$_}}, $root_oidhex for @$paths; + } + @$_ = sort(@$_) for values %ret; + } + \%ret; +} + +sub load_ct { # retry_reopen cb + my ($self, $git_dir) = @_; + my @ids = docids_of_git_dir $self, $git_dir or return; + for (@ids) { + my $doc = $self->get_doc($_) // next; + return int_val($doc, CT); + } +} + +sub load_commit_times { # each_cindex callback + my ($self, $todo) = @_; # todo = [ [ time, git ], [ time, git ] ...] + my (@pending, $rec, $ct); + while ($rec = shift @$todo) { + $ct = $self->retry_reopen(\&load_ct, $rec->[1]->{git_dir}); + if (defined $ct) { + $rec->[0] = $ct; + } else { # may be in another cindex: + push @pending, $rec; + } + } + @$todo = @pending; +} + +sub load_coderepos { # each_cindex callback + my ($self, $pi_cfg) = @_; + my $name = $self->{name}; + my $cfg_f = $pi_cfg->{-f}; + my $lpfx = $self->{localprefix} or return warn <<EOM; +W: cindex.$name.localprefix unset in $cfg_f, ignoring cindex.$name +EOM + my $lre = join('|', map { $_ .= '/'; tr!/!/!s; quotemeta } @$lpfx); + $lre = qr!\A(?:$lre)!; + my $coderepos = $pi_cfg->{-coderepos}; + my $nick_pfx = $name eq '' ? '' : "$name/"; + my %dir2cr; + for my $p ($self->all_terms('P')) { + my $nick = $p; + $nick =~ s!$lre!$nick_pfx!s or next; + $dir2cr{$p} = $coderepos->{$nick} //= do { + my $git = PublicInbox::Git->new($p); + my %dedupe = ($nick => undef); + ($git->{nick}) = keys %dedupe; # for git->pub_urls + $git; + }; + } + my $jd = $self->retry_reopen(\&join_data, $self) or return warn <<EOM; +W: cindex.$name.topdir=$self->{topdir} has no usable join data for $cfg_f +EOM + my ($ekeys, $roots, $ibx2root) = @$jd{qw(ekeys roots ibx2root)}; + my $roots2paths = roots2paths($self); + my %dedupe; # 50x alloc reduction w/ lore + gko mirror (Mar 2024) + for my $root_offs (@$ibx2root) { + my $ekey = shift(@$ekeys) // die 'BUG: {ekeys} empty'; + scalar(@$root_offs) or next; + my $ibx = $pi_cfg->lookup_eidx_key($ekey) // do { + warn "W: `$ekey' gone from $cfg_f\n"; + next; + }; + my $gits = $ibx->{-repo_objs} //= []; + my $cr_score = $ibx->{-cr_score} //= {}; + my %ibx_p2g = map { $_->{git_dir} => $_ } @$gits; + my $ibx2self; # cindex has an association w/ inbox? + for (@$root_offs) { # sorted by $nr descending + my ($nr, $root_off) = @$_; + my $root_oid = $roots->[$root_off] // do { + warn <<EOM; +BUG: root #$root_off invalid in join data for `$ekey' with $cfg_f +EOM + next; + }; + my $git_dirs = $roots2paths->{$root_oid}; + my @gits = map { $dir2cr{$_} // () } @$git_dirs; + $cr_score->{$_->{nick}} //= $nr for @gits; + @$git_dirs = grep { !$ibx_p2g{$_} } @$git_dirs; + # @$git_dirs or warn "W: no matches for $root_oid\n"; + for (@$git_dirs) { + if (my $git = $dir2cr{$_}) { + $ibx_p2g{$_} = $git; + $ibx2self = 1; + if (!$ibx->{-hide_www}) { + # don't stringify $nr directly + # to avoid long-lived PV + my $k = ($nr + 0)."\0". + ($ibx + 0); + my $s = $dedupe{$k} //= + [ $nr, $ibx->{name} ]; + push @{$git->{ibx_score}}, $s; + } + push @$gits, $git; + } else { + warn <<EOM; +W: no coderepo available for $_ (localprefix=@$lpfx) +EOM + } + } + } + if (@$gits) { + push @{$ibx->{-csrch}}, $self if $ibx2self; + } else { + delete $ibx->{-repo_objs}; + delete $ibx->{-cr_score}; + } + } + for my $git (values %dir2cr) { + my $s = $git->{ibx_score}; + @$s = sort { $b->[0] <=> $a->[0] } @$s if $s; + } + my $ALL = $pi_cfg->ALL or return; + my @alls_gits = sort { + scalar @{$b->{ibx_score} // []} <=> + scalar @{$a->{ibx_score} // []} + } values %$coderepos; + my $gits = $ALL->{-repo_objs} //= []; + push @$gits, @alls_gits; + my $cr_score = $ALL->{-cr_score} //= {}; + $cr_score->{$_->{nick}} //= scalar(@{$_->{ibx_score}//[]}) for @$gits; +} + +sub repos_sorted { + my $pi_cfg = shift; + my @recs = map { [ 0, $_ ] } @_; # PublicInbox::Git objects + my @todo = @recs; + $pi_cfg->each_cindex(\&load_commit_times, \@todo); + @recs = sort { $b->[0] <=> $a->[0] } @recs; # sort by commit time +} + +1; diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm new file mode 100644 index 00000000..6d777bf6 --- /dev/null +++ b/lib/PublicInbox/CodeSearchIdx.pm @@ -0,0 +1,1391 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# indexer for git coderepos, just commits and repo paths for now +# this stores normalized absolute paths of indexed GIT_DIR inside +# the DB itself and is designed to handle forks by designating roots +# At minimum, it needs to have the pathnames of all git repos in +# memory at runtime. --join also requires all inbox pathnames to +# be in memory (as it happens when loaded from ~/.public-inbox/config). +# +# Unlike mail search, docid isn't tied to NNTP artnum or IMAP UID, +# there's no serial number dependency at all. The first 32-bits of +# the commit SHA-(1|256) is used to select a shard. +# +# We shard repos using the first 32-bits of sha256($ABS_GIT_DIR) +# +# --join associates root commits of coderepos to inboxes based on prefixes. +# +# Internally, each inbox is assigned a non-negative integer index ($IBX_OFF), +# and each root commit object ID (SHA-1/SHA-256 hex) is also assigned +# a non-negative integer index ($ROOT_COMMIT_OID_ID). +# +# join dumps to 2 intermediate files in $TMPDIR: +# +# * to_root_off - each line is of the format: +# +# $PFX @ROOT_COMMIT_OID_OFFS +# +# * to_ibx_off - each line is of the format: +# +# $PFX @IBX_OFFS +# +# $IBX_OFFS is a comma-delimited list of integers ($IBX_ID) +# The $IBX_OFF here is ephemeral (per-join_data) and NOT related to +# the `ibx_off' column of `over.sqlite3' for extindex. +# @ROOT_COMMIT_OID_OFFS is space-delimited +# In both cases, $PFX is typically the value of the 7-(hex)char dfpost +# XDFPOST but it can be configured to use any combination of patchid, +# dfpre, dfpost or dfblob. +# +# WARNING: this is vulnerable to arbitrary memory usage attacks if we +# attempt to index or join against malicious coderepos with +# thousands/millions of root commits. Most coderepos have only one +# root commit, some have several: git.git currently has 7, +# torvalds/linux.git has 4. +# --max-size= is required to keep memory usage reasonable for gigantic +# commits. +# +# See PublicInbox::CodeSearch (read-only API) for more +package PublicInbox::CodeSearchIdx; +use v5.12; +# parent order matters, we want ->DESTROY from IPC, not SearchIdx +use parent qw(PublicInbox::CodeSearch PublicInbox::IPC PublicInbox::SearchIdx); +use PublicInbox::DS qw(awaitpid); +use PublicInbox::PktOp; +use PublicInbox::IPC qw(nproc_shards); +use POSIX qw(WNOHANG SEEK_SET strftime); +use File::Path (); +use File::Spec (); +use List::Util qw(max); +use PublicInbox::SHA qw(sha256_hex sha_all); +use PublicInbox::Search qw(xap_terms); +use PublicInbox::SearchIdx qw(add_val); +use PublicInbox::Config qw(glob2re rel2abs_collapsed); +use PublicInbox::Spawn qw(which spawn popen_rd); +use PublicInbox::OnDestroy; +use PublicInbox::CidxLogP; +use PublicInbox::CidxComm; +use PublicInbox::Git qw(%OFMT2HEXLEN); +use PublicInbox::Compat qw(uniqstr); +use PublicInbox::Aspawn qw(run_await); +use Compress::Zlib qw(compress); +use Carp qw(croak); +use Time::Local qw(timegm); +use autodie qw(close pipe open sysread seek sysseek send); +our $DO_QUIT = 15; # signal number +our ( + $LIVE_JOBS, # integer + $GITS_NR, # number of coderepos + $MY_SIG, # like %SIG + $SIGSET, + $TXN_BYTES, # number of bytes in current shard transaction + $BATCH_BYTES, + @RDONLY_XDB, # Xapian::Database + @IDX_SHARDS, # clones of self + $MAX_SIZE, + $REINDEX, # PublicInbox::SharedKV + @GIT_DIR_GONE, # [ git_dir1, git_dir2 ] + $PRUNE_DONE, # marks off prune completions + $NCHANGE, # current number of changes + $NPROC, + $XHC, # XapClient + $REPO_CTX, # current repo being indexed in shards + $IDXQ, # PublicInbox::Git object arrayref + $SCANQ, # PublicInbox::Git object arrayref + %ALT_FH, # hexlen => tmp IO for TMPDIR git alternates + $TMPDIR, # File::Temp->newdir object for prune + @PRUNEQ, # GIT_DIRs to prepare for pruning + %TODO, @IBXQ, @IBX, + @JOIN, # join(1) command for --join + $CMD_ENV, # env for awk(1), comm(1), sort(1) commands during prune + @AWK, @COMM, @SORT, # awk(1), comm(1), sort(1) commands + %JOIN, # CLI --join= suboptions + @JOIN_PFX, # any combination of XDFID, XDFPRE, XDFPOST + @JOIN_DT, # YYYYmmddHHMMSS for dt: + $QRY_STR, # common query string for both code and inbox associations + $DUMP_IBX_WPIPE, # goes to sort(1) + $ANY_SHARD, # shard round-robin for scan fingerprinting + @OFF2ROOT, + $GIT_VER, + @NO_ABBREV, +); + +# stop walking history if we see >$SEEN_MAX existing commits, this assumes +# branches don't diverge by more than this number of commits... +# git walks commits quickly if it doesn't have to read trees +our $SEEN_MAX = 100000; + +# window for commits/emails to determine a inbox <-> coderepo association +my $JOIN_WINDOW = 50000; + +our @PRUNE_BATCH = qw(cat-file --batch-all-objects --batch-check); + +# TODO: do we care about committer name + email? or tree OID? +my @FMT = qw(H P ct an ae at s b); # (b)ody must be last + +# git log --stdin buffers all commits before emitting, thus --reverse +# doesn't incur extra overhead. We use --reverse to keep Xapian docids +# increasing so we may be able to avoid sorting results in some cases +my @LOG_STDIN = (qw(log --no-decorate --no-color --no-notes -p --stat -M + --reverse --stdin --no-walk=unsorted), '--pretty=format:%n%x00'. + join('%n', map { "%$_" } @FMT)); + +sub new { + my (undef, $dir, $opt) = @_; + my $l = $opt->{indexlevel} // 'full'; + $l !~ $PublicInbox::SearchIdx::INDEXLEVELS and + die "invalid indexlevel=$l\n"; + $l eq 'basic' and die "E: indexlevel=basic not supported\n"; + my $self = bless { + xpfx => "$dir/cidx". PublicInbox::CodeSearch::CIDX_SCHEMA_VER, + cidx_dir => $dir, + creat => 1, # TODO: get rid of this, should be implicit + transact_bytes => 0, # for checkpoint + total_bytes => 0, # for lock_release + current_info => '', + parallel => 1, + -opt => $opt, + lock_path => "$dir/cidx.lock", + }, __PACKAGE__; + $self->{nshard} = count_shards($self) || + nproc_shards({nproc => $opt->{jobs}}); + $self->{-no_fsync} = 1 if !$opt->{fsync}; + $self->{-dangerous} = 1 if $opt->{dangerous}; + $self; +} + +# This is similar to uniq(1) on the first column, but combines the +# contents of subsequent columns using $OFS. +our @UNIQ_FOLD = ($^X, $^W ? ('-w') : (), qw(-MList::Util=uniq -ane), <<'EOM'); +BEGIN { $ofs = $ENV{OFS} // ','; $apfx = '' } +if ($F[0] eq $apfx) { + shift @F; + push @ids, @F; +} else { + print $apfx.' '.join($ofs, uniq(@ids))."\n" if @ids; + ($apfx, @ids) = @F; +} +END { print $apfx.' '.join($ofs, uniq(@ids))."\n" if @ids } +EOM + +# TODO: may be used for reshard/compact +sub count_shards { scalar($_[0]->xdb_shards_flat) } + +sub update_commit ($$$) { + my ($self, $cmt, $roots) = @_; # fields from @FMT + my $x = 'Q'.$cmt->{H}; + my ($docid, @extra) = sort { $a <=> $b } $self->docids_by_postlist($x); + @extra and warn "W: $cmt->{H} indexed multiple times, pruning ", + join(', ', map { "#$_" } @extra), "\n"; + $self->{xdb}->delete_document($_) for @extra; + my $doc = $PublicInbox::Search::X{Document}->new; + $doc->add_boolean_term($x); + $doc->add_boolean_term('G'.$_) for @$roots; + $doc->add_boolean_term('XP'.$_) for split(/ /, $cmt->{P}); + $doc->add_boolean_term('T'.'c'); + + # Author-Time is compatible with dt: for mail search schema_version=15 + add_val($doc, PublicInbox::CodeSearch::AT, + POSIX::strftime('%Y%m%d%H%M%S', gmtime($cmt->{at}))); + + # Commit-Time is the fallback used by rt: (TS) for mail search: + add_val($doc, PublicInbox::CodeSearch::CT, $cmt->{ct}); + + $self->term_generator->set_document($doc); + + # email address is always indexed with positional data for usability + $self->index_phrase("$cmt->{an} <$cmt->{ae}>", 1, 'A'); + + $x = $cmt->{'s'}; + $self->index_text($x, 1, 'S') if $x =~ /\S/s; + $doc->set_data($x); # subject is the first (and currently only) line + + $x = delete $cmt->{b}; + $self->index_body_text($doc, \$x) if $x =~ /\S/s; + defined($docid) ? $self->{xdb}->replace_document($docid, $doc) : + $self->{xdb}->add_document($doc); +} + +sub progress { + my ($self, @msg) = @_; + my $pr = $self->{-opt}->{-progress} or return; + $pr->($self->{git} ? ("$self->{git}->{git_dir}: ") : (), @msg, "\n"); +} + +sub check_objfmt_status ($$$) { + my ($git, $chld_err, $fmt) = @_; + my ($status, $sig) = ($chld_err >> 8, $chld_err & 127); + if (!$sig && $status == 1) { # unset, default is '' (SHA-1) + $fmt = 'sha1'; + } elsif (!$sig && $status == 0) { + chomp($fmt ||= 'sha1'); + } + $fmt // warn("git --git-dir=$git->{git_dir} config \$?=$chld_err"); + $fmt; +} + +sub store_repo { # wq_io_do, sends docid back + my ($self, $repo) = @_; + my $op_p = delete($self->{0}) // die 'BUG: no {0} op_p'; + my $git = bless $repo, 'PublicInbox::Git'; + my $rd = $git->popen(qw(config extensions.objectFormat)); + $self->begin_txn_lazy; + $self->{xdb}->delete_document($_) for @{$repo->{to_delete}}; + my $doc = $PublicInbox::Search::X{Document}->new; + add_val($doc, PublicInbox::CodeSearch::CT, $repo->{ct}); + $doc->add_boolean_term("P$repo->{git_dir}"); + $doc->add_boolean_term('T'.'r'); + $doc->add_boolean_term('G'.$_) for @{$repo->{roots}}; + $doc->set_data($repo->{fp}); # \n delimited + my $fmt = readline($rd); + $rd->close; + $fmt = check_objfmt_status $git, $?, $fmt; + $OFMT2HEXLEN{$fmt} // warn <<EOM; # store unknown formats anyways +E: unknown extensions.objectFormat=$fmt in $repo->{git_dir} +EOM + $doc->add_boolean_term('H'.$fmt); + my $did = $repo->{docid}; + $did ? $self->{xdb}->replace_document($did, $doc) + : ($did = $self->{xdb}->add_document($doc)); + send($op_p, "repo_stored $did", 0); +} + +sub cidx_ckpoint ($;$) { + my ($self, $msg) = @_; + progress($self, $msg) if defined($msg); + $TXN_BYTES = $BATCH_BYTES; # reset + return if $PublicInbox::Search::X{CLOEXEC_UNSET}; + $self->commit_txn_lazy; + $self->begin_txn_lazy; +} + +sub truncate_cmt ($$) { + my ($cmt) = @_; # _[1] is $buf (giant) + my ($orig_len, $len); + $len = $orig_len = length($_[1]); + @$cmt{@FMT} = split(/\n/, $_[1], scalar(@FMT)); + undef $_[1]; + $len -= length($cmt->{b}); + + # try to keep the commit message body. + # n.b. this diffstat split may be unreliable but it's not worth + # perfection for giant commits: + my ($bdy) = split(/^---\n/sm, delete($cmt->{b}), 2); + if (($len + length($bdy)) <= $MAX_SIZE) { + $len += length($bdy); + $cmt->{b} = $bdy; + warn <<EOM; +W: $cmt->{H}: truncated body ($orig_len => $len bytes) +W: to be under --max-size=$MAX_SIZE +EOM + } else { + $cmt->{b} = ''; + warn <<EOM; +W: $cmt->{H}: deleted body ($orig_len => $len bytes) +W: to be under --max-size=$MAX_SIZE +EOM + } + $len; +} + +sub cidx_reap_log { # awaitpid cb + my ($pid, $cmd, $self, $op_p) = @_; + if (!$? || ($DO_QUIT && (($? & 127) == $DO_QUIT || + ($? & 127) == POSIX::SIGPIPE))) { + send($op_p, "shard_done $self->{shard}", 0); + } else { + warn "W: @$cmd (\$?=$?)\n"; + $self->{xdb}->cancel_transaction; + } +} + +sub shard_index { # via wq_io_do in IDX_SHARDS + my ($self, $git, $roots) = @_; + + my $in = delete($self->{0}) // die 'BUG: no {0} input'; + my $op_p = delete($self->{1}) // die 'BUG: no {1} op_p'; + sysseek($in, 0, SEEK_SET); + my $cmd = $git->cmd(@NO_ABBREV, @LOG_STDIN); + my $rd = popen_rd($cmd, undef, { 0 => $in }, + \&cidx_reap_log, $cmd, $self, $op_p); + PublicInbox::CidxLogP->new($rd, $self, $git, $roots); + # CidxLogP->event_step will call cidx_read_log_p once there's input +} + +# sharded reader for `git log --pretty=format: --stdin' +sub cidx_read_log_p { + my ($self, $log_p, $rd) = @_; + my $git = delete $log_p->{git} // die 'BUG: no {git}'; + local $self->{current_info} = "$git->{git_dir} [$self->{shard}]"; + my $roots = delete $log_p->{roots} // die 'BUG: no {roots}'; + # local-ized in parent before fork + $TXN_BYTES = $BATCH_BYTES; + local $self->{git} = $git; # for patchid + return if $DO_QUIT; + my $nr = 0; + + # a patch may have \0, see c4201214cbf10636e2c1ab9131573f735b42c8d4 + # in linux.git, so we use $/ = "\n\0" to check end-of-patch + my $FS = "\n\0"; + my $len; + my $cmt = {}; + local $/ = $FS; + my $buf = <$rd> // return; # leading $FS + $buf eq $FS or die "BUG: not LF-NUL: $buf\n"; + $self->begin_txn_lazy; + while (!$DO_QUIT && defined($buf = <$rd>)) { + chomp($buf); + $/ = "\n"; + $len = length($buf); + if (defined($MAX_SIZE) && $len > $MAX_SIZE) { + $len = truncate_cmt($cmt, $buf); + } else { + @$cmt{@FMT} = split(/\n/, $buf, scalar(@FMT)); + } + if (($TXN_BYTES -= $len) <= 0) { + cidx_ckpoint($self, "[$self->{shard}] $nr"); + $TXN_BYTES -= $len; # len may be huge, >TXN_BYTES; + } + update_commit($self, $cmt, $roots); + ++$nr; + cidx_ckpoint($self, "[$self->{shard}] $nr") if $TXN_BYTES <= 0; + $/ = $FS; + } + # return and wait for cidx_reap_log +} + +sub shard_done { # called via PktOp on shard_index completion + my ($self, $repo_ctx, $on_destroy, $n) = @_; + $repo_ctx->{shard_ok}->{$n} = 1; +} + +sub repo_stored { + my ($self, $repo_ctx, $drs, $did) = @_; + # check @IDX_SHARDS instead of DO_QUIT to avoid wasting prior work + # because shard_commit is fast + return unless @IDX_SHARDS; + $did > 0 or die "BUG: $repo_ctx->{repo}->{git_dir}: docid=$did"; + my ($c, $p) = PublicInbox::PktOp->pair; + $c->{ops}->{shard_done} = [ $self, $repo_ctx, + on_destroy(\&next_repos, $repo_ctx, $drs)]; + # shard_done fires when all shards are committed + my @active = keys %{$repo_ctx->{active}}; + $IDX_SHARDS[$_]->wq_io_do('shard_commit', [ $p->{op_p} ]) for @active; +} + +sub prune_done { # called via prune_do completion + my ($self, $drs, $n) = @_; + return if $DO_QUIT || !$PRUNE_DONE; + die "BUG: \$PRUNE_DONE->[$n] already defined" if $PRUNE_DONE->[$n]; + $PRUNE_DONE->[$n] = 1; + if (grep(defined, @$PRUNE_DONE) == @IDX_SHARDS) { + progress($self, 'prune done'); + index_next($self); # may kick dump_roots_start + } +} + +sub seen ($$) { + my ($xdb, $q) = @_; # $q = "Q$COMMIT_HASH" + for (1..100) { + my $ret = eval { + $xdb->postlist_begin($q) != $xdb->postlist_end($q); + }; + return $ret unless $@; + if (ref($@) =~ /\bDatabaseModifiedError\b/) { + $xdb->reopen; + } else { + Carp::croak($@); + } + } + Carp::croak('too many Xapian DB modifications in progress'); +} + +# used to select the shard for a GIT_DIR +sub git_dir_hash ($) { hex(substr(sha256_hex($_[0]), 0, 8)) } + +sub _cb { # run_await cb + my ($pid, $cmd, undef, $opt, $cb, $self, $git, @arg) = @_; + return if $DO_QUIT; + return $cb->($opt, $self, $git, @arg) if $opt->{quiet}; + $? ? ($git->{-cidx_err} = warn("W: @$cmd (\$?=$?)\n")) : + $cb->($opt, $self, $git, @arg); +} + +sub run_git { + my ($cmd, $opt, $cb, $self, $git, @arg) = @_; + run_await($git->cmd(@$cmd), undef, $opt, \&_cb, $cb, $self, $git, @arg) +} + +# this is different from the grokmirror-compatible fingerprint since we +# only care about --heads (branches) and --tags, and not even their names +sub fp_start ($$) { + my ($self, $git) = @_; + return if $DO_QUIT; + open my $refs, '+>', undef; + $git->{-repo}->{refs} = $refs; + my ($c, $p) = PublicInbox::PktOp->pair; + my $next_on_err = on_destroy \&index_next, $self; + $c->{ops}->{fp_done} = [ $self, $git, $next_on_err ]; + $IDX_SHARDS[++$ANY_SHARD % scalar(@IDX_SHARDS)]->wq_io_do('fp_async', + [ $p->{op_p}, $refs ], $git->{git_dir}) +} + +sub fp_async { # via wq_io_do in worker + my ($self, $git_dir) = @_; + my $op_p = delete $self->{0} // die 'BUG: no {0} op_p'; + my $refs = delete $self->{1} // die 'BUG: no {1} refs'; + my $git = PublicInbox::Git->new($git_dir); + run_git([qw(show-ref --heads --tags --hash)], { 1 => $refs }, + \&fp_async_done, $self, $git, $op_p); +} + +sub fp_async_done { # run_git cb from worker + my ($opt, $self, $git, $op_p) = @_; + my $refs = delete $opt->{1} // 'BUG: no {-repo}->{refs}'; + sysseek($refs, 0, SEEK_SET); + send($op_p, 'fp_done '.sha_all(256, $refs)->hexdigest, 0); +} + +sub fp_done { # called parent via PktOp by fp_async_done + my ($self, $git, $next_on_err, $hex) = @_; + $next_on_err->cancel; + return if $DO_QUIT; + $git->{-repo}->{fp} = $hex; + my $n = git_dir_hash($git->{git_dir}) % scalar(@RDONLY_XDB); + my $shard = bless { %$self, shard => $n }, ref($self); + $git->{-repo}->{shard_n} = $n; + delete @$shard{qw(lockfh lock_path)}; + local $shard->{xdb} = $RDONLY_XDB[$n] // die "BUG: shard[$n] undef"; + $shard->retry_reopen(\&check_existing, $self, $git); +} + +sub check_existing { # retry_reopen callback + my ($shard, $self, $git) = @_; + my @docids = $shard->docids_of_git_dir($git->{git_dir}); + my $docid = shift(@docids) // return prep_repo($self, $git); # new repo + my $doc = $shard->get_doc($docid) // + die "BUG: no #$docid ($git->{git_dir})"; + my $old_fp = $REINDEX ? "\0invalid" : $doc->get_data; + if ($old_fp eq $git->{-repo}->{fp}) { # no change + delete $git->{-repo}; + return index_next($self); + } + $git->{-repo}->{docid} = $docid; + if (@docids) { + warn "BUG: $git->{git_dir} indexed multiple times, culling\n"; + $git->{-repo}->{to_delete} = \@docids; # XXX needed? + } + prep_repo($self, $git); +} + +sub partition_refs ($$$) { + my ($self, $git, $refs) = @_; # show-ref --heads --tags --hash output + sysseek($refs, 0, SEEK_SET); + my $rfh = $git->popen(qw(rev-list --stdin), undef, { 0 => $refs }); + my $seen = 0; + my @shard_in = map { + $_->reopen; + open my $fh, '+>', undef; + $fh; + } @RDONLY_XDB; + + my $n0 = $NCHANGE; + while (defined(my $cmt = <$rfh>)) { + chomp $cmt; + my $n = hex(substr($cmt, 0, 8)) % scalar(@RDONLY_XDB); + if ($REINDEX && $REINDEX->set_maybe(pack('H*', $cmt), '')) { + say { $shard_in[$n] } $cmt; + ++$NCHANGE; + } elsif (seen($RDONLY_XDB[$n], 'Q'.$cmt)) { + last if ++$seen > $SEEN_MAX; + } else { + say { $shard_in[$n] } $cmt; + ++$NCHANGE; + $seen = 0; + } + if ($DO_QUIT) { + $rfh->close; + return (); + } + } + $rfh->close; + return () if $DO_QUIT; + if (!$? || (($? & 127) == POSIX::SIGPIPE && $seen > $SEEN_MAX)) { + my $n = $NCHANGE - $n0; + progress($self, "$git->{git_dir}: $n commits") if $n; + return @shard_in; + } + die "git --git-dir=$git->{git_dir} rev-list: \$?=$?\n"; +} + +sub shard_commit { # via wq_io_do + my ($self) = @_; + my $op_p = delete($self->{0}) // die 'BUG: no {0} op_p'; + $self->commit_txn_lazy; + send($op_p, "shard_done $self->{shard}", 0); +} + +sub dump_roots_start { + my ($self, $do_join) = @_; + return if $DO_QUIT; + $XHC //= PublicInbox::XapClient::start_helper("-j$NPROC"); + $do_join // die 'BUG: no $do_join'; + progress($self, 'dumping IDs from coderepos'); + local $self->{xdb}; + @OFF2ROOT = $self->all_terms('G'); + my $root2id = "$TMPDIR/root2id"; + open my $fh, '>', $root2id; + my $nr = -1; + for (@OFF2ROOT) { print $fh $_, "\0", ++$nr, "\0" } # mmap-friendly + close $fh; + # dump_roots | sort -k1,1 | OFS=' ' uniq_fold >to_root_off + my ($sort_opt, $fold_opt); + pipe(local $sort_opt->{0}, my $sort_w); + pipe(local $fold_opt->{0}, local $sort_opt->{1}); + my @sort = (@SORT, '-k1,1'); + my $dst = "$TMPDIR/to_root_off"; + open $fold_opt->{1}, '>', $dst; + my $fold_env = { %$CMD_ENV, OFS => ' ' }; + run_await(\@sort, $CMD_ENV, $sort_opt, \&cmd_done, $do_join); + run_await(\@UNIQ_FOLD, $fold_env, $fold_opt, \&cmd_done, $do_join); + my $window = $JOIN{window} // $JOIN_WINDOW; + my @m = $window <= 0 ? () : ('-m', $window); + my @arg = ((map { ('-A', $_) } @JOIN_PFX), '-c', + @m, $root2id, $QRY_STR); + for my $d ($self->shard_dirs) { + pipe(my $err_r, my $err_w); + $XHC->mkreq([$sort_w, $err_w], qw(dump_roots -d), $d, @arg); + my $desc = "dump_roots $d"; + $self->{PENDING}->{$desc} = $do_join; + PublicInbox::CidxXapHelperAux->new($err_r, $self, $desc); + } + progress($self, 'waiting on dump_roots sort'); +} + +sub dump_ibx { # sends to xap_helper.h + my ($self, $ibx_off) = @_; + my $ibx = $IBX[$ibx_off] // die "BUG: no IBX[$ibx_off]"; + my $ekey = $ibx->eidx_key; + my $srch = $ibx->isrch or return warn <<EOM; +W: $ekey not indexed for search +EOM + # note: we don't send `-m MAX' to dump_ibx since we have to + # post-filter non-patch messages for now... + my @cmd = ('dump_ibx', $srch->xh_args, + (map { ('-A', $_) } @JOIN_PFX), $ibx_off, $QRY_STR); + pipe(my $r, my $w); + $XHC->mkreq([$DUMP_IBX_WPIPE, $w], @cmd); + $self->{PENDING}->{$ekey} = $TODO{do_join}; + PublicInbox::CidxXapHelperAux->new($r, $self, $ekey); +} + +sub dump_ibx_start { + my ($self, $do_join) = @_; + return if $DO_QUIT; + $XHC //= PublicInbox::XapClient::start_helper("-j$NPROC"); + my ($sort_opt, $fold_opt); + pipe(local $sort_opt->{0}, $DUMP_IBX_WPIPE); + pipe(local $fold_opt->{0}, local $sort_opt->{1}); + my @sort = (@SORT, '-k1,1'); # sort only on JOIN_PFX + # pipeline: dump_ibx | sort -k1,1 | uniq_fold >to_ibx_off + open $fold_opt->{1}, '>', "$TMPDIR/to_ibx_off"; + run_await(\@sort, $CMD_ENV, $sort_opt, \&cmd_done, $do_join); + run_await(\@UNIQ_FOLD, $CMD_ENV, $fold_opt, \&cmd_done, $do_join); +} + +sub index_next ($) { + my ($self) = @_; + return if $DO_QUIT; + if ($IDXQ && @$IDXQ) { + index_repo($self, shift @$IDXQ); + } elsif ($SCANQ && @$SCANQ) { + fp_start $self, shift @$SCANQ; + } elsif ($TMPDIR) { + delete $TODO{dump_roots_start}; + delete $TODO{dump_ibx_start}; # runs OnDestroy once + return dump_ibx($self, shift @IBXQ) if @IBXQ; + undef $DUMP_IBX_WPIPE; # done dumping inboxes + delete $TODO{do_join}; + } + # else: wait for shards_active (post_loop_do) callback +} + +sub next_repos { # OnDestroy cb + my ($repo_ctx, $drs) = @_; + my ($self, $repo, $active) = @$repo_ctx{qw(self repo active)}; + progress($self, "$repo->{git_dir}: done"); + return if $DO_QUIT || !$REPO_CTX; + my $n = grep { ! $repo_ctx->{shard_ok}->{$_} } keys %$active; + die "E: $repo->{git_dir} $n shards failed" if $n; + $REPO_CTX == $repo_ctx or die "BUG: $REPO_CTX != $repo_ctx"; + $REPO_CTX = undef; + index_next($self); +} + +sub index_done { # OnDestroy cb called when done indexing each code repo + my ($repo_ctx, $drs) = @_; + return if $DO_QUIT; + my ($self, $repo, $active) = @$repo_ctx{qw(self repo active)}; + # $active may be undef here, but it's fine to vivify + my $n = grep { ! $repo_ctx->{shard_ok}->{$_} } keys %$active; + die "E: $repo->{git_dir} $n shards failed" if $n; + $repo_ctx->{shard_ok} = {}; # reset for future shard_done + $n = $repo->{shard_n}; + $repo_ctx->{active}->{$n} = undef; # may vivify $repo_ctx->{active} + my ($c, $p) = PublicInbox::PktOp->pair; + $c->{ops}->{repo_stored} = [ $self, $repo_ctx, $drs ]; + $IDX_SHARDS[$n]->wq_io_do('store_repo', [ $p->{op_p} ], $repo); + # repo_stored will fire once store_repo is done +} + +sub index_repo { + my ($self, $git) = @_; + return if $DO_QUIT; + my $repo = $git->{-repo} // die 'BUG: no {-repo}'; + return index_next($self) if $git->{-cidx_err}; + if (!defined($repo->{ct})) { + warn "W: $git->{git_dir} has no commits, skipping\n"; + return index_next($self); + } + return push(@$IDXQ, $git) if $REPO_CTX; # busy + delete $git->{-repo}; + my $roots_fh = delete $repo->{roots_fh} // die 'BUG: no {roots_fh}'; + seek($roots_fh, 0, SEEK_SET); + chomp(my @roots = PublicInbox::IO::read_all $roots_fh); + if (!@roots) { + warn("E: $git->{git_dir} has no root commits\n"); + return index_next($self); + } + $repo->{roots} = \@roots; + local $self->{current_info} = $git->{git_dir}; + my @shard_in = partition_refs($self, $git, delete($repo->{refs})); + $repo->{git_dir} = $git->{git_dir}; + my $repo_ctx = $REPO_CTX = { self => $self, repo => $repo }; + delete $git->{-cidx_gits_fini}; # may fire gits_fini + my $drs = delete $git->{-cidx_dump_roots_start}; + my $index_done = on_destroy \&index_done, $repo_ctx, $drs; + my ($c, $p) = PublicInbox::PktOp->pair; + $c->{ops}->{shard_done} = [ $self, $repo_ctx, $index_done ]; + for my $n (0..$#shard_in) { + $shard_in[$n]->flush or die "flush shard[$n]: $!"; + -s $shard_in[$n] or next; + last if $DO_QUIT; + $IDX_SHARDS[$n]->wq_io_do('shard_index', + [ $shard_in[$n], $p->{op_p} ], + $git, \@roots); + $repo_ctx->{active}->{$n} = undef; + } + # shard_done fires when shard_index is done +} + +sub ct_fini { # run_git cb + my ($opt, $self, $git, $index_repo) = @_; + my ($ct) = split(/\s+/, ${$opt->{1}}); # drop TZ + LF + $git->{-repo}->{ct} = $ct + 0; +} + +# TODO: also index gitweb.owner and the full fingerprint for grokmirror? +sub prep_repo ($$) { + my ($self, $git) = @_; + return if $DO_QUIT; + my $index_repo = on_destroy \&index_repo, $self, $git; + my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}'; + sysseek($refs, 0, SEEK_SET); + open my $roots_fh, '+>', undef; + $git->{-repo}->{roots_fh} = $roots_fh; + run_git([ qw(rev-list --stdin --max-parents=0) ], + { 0 => $refs, 1 => $roots_fh }, \&PublicInbox::Config::noop, + $self, $git, $index_repo); + run_git([ qw[for-each-ref --sort=-committerdate + --format=%(committerdate:raw) --count=1 + refs/heads/ refs/tags/] ], undef, # capture like qx + \&ct_fini, $self, $git, $index_repo); +} + +# for PublicInbox::SearchIdx `git patch-id' call and with_umask +sub git { $_[0]->{git} } + +sub load_existing ($) { # for -u/--update + my ($self) = @_; + my $dirs = $self->{git_dirs} //= []; + if ($self->{-opt}->{update} || $self->{-opt}->{prune}) { + local $self->{xdb}; + $self->xdb or + die "E: $self->{cidx_dir} non-existent for --update\n"; + my @cur = grep { + if (-e $_) { + 1; + } else { + push @GIT_DIR_GONE, $_; + undef; + } + } $self->all_terms('P'); + if (@GIT_DIR_GONE && !$self->{-opt}->{prune}) { + warn "W: the following repos no longer exist:\n", + (map { "W:\t$_\n" } @GIT_DIR_GONE), + "W: use --prune to remove them from ", + $self->{cidx_dir}, "\n"; + } + push @$dirs, @cur; + } + @$dirs = uniqstr @$dirs; +} + +# SIG handlers: +sub shard_quit { $DO_QUIT = POSIX->can("SIG$_[0]")->() } +sub shard_usr1 { $TXN_BYTES = -1 } + +sub cidx_init ($) { + my ($self) = @_; + my $dir = $self->{cidx_dir}; + unless (-d $dir) { + warn "# creating $dir\n" if !$self->{-opt}->{quiet}; + File::Path::mkpath($dir); + } + $self->lock_acquire; + my @shards; + my $l = $self->{indexlevel} //= $self->{-opt}->{indexlevel}; + + for my $n (0..($self->{nshard} - 1)) { + my $shard = bless { %$self, shard => $n }, ref($self); + delete @$shard{qw(lockfh lock_path)}; + my $xdb = $shard->idx_acquire; + if (!$n) { + if (($l // '') eq 'medium') { + $xdb->set_metadata('indexlevel', $l); + } elsif (($l // '') eq 'full') { + $xdb->set_metadata('indexlevel', ''); # unset + } + $l ||= $xdb->get_metadata('indexlevel') || 'full'; + } + $shard->{indexlevel} = $l; + $shard->idx_release; + $shard->wq_workers_start("cidx shard[$n]", 1, $SIGSET, { + siblings => \@shards, # for ipc_atfork_child + }, \&shard_done_wait, $self); + push @shards, $shard; + } + $self->{indexlevel} //= $l; + # this warning needs to happen after idx_acquire + state $once; + warn <<EOM if $PublicInbox::Search::X{CLOEXEC_UNSET} && !$once++; +W: Xapian v1.2.21..v1.2.24 were missing close-on-exec on OFD locks, +W: memory usage may be high for large indexing runs +EOM + @shards; +} + +# called when all git coderepos are done +sub gits_fini { + undef $GITS_NR; + PublicInbox::DS::enqueue_reap(); # kick @post_loop_do +} + +sub scan_git_dirs ($) { + my ($self) = @_; + @$SCANQ = () unless $self->{-opt}->{scan}; + $GITS_NR = @$SCANQ or return; + my $gits_fini = on_destroy \&gits_fini; + $_->{-cidx_gits_fini} = $gits_fini for @$SCANQ; + if (my $drs = $TODO{dump_roots_start}) { + $_->{-cidx_dump_roots_start} = $drs for @$SCANQ; + } + progress($self, "scanning $GITS_NR code repositories..."); +} + +sub prune_init { # via wq_io_do in IDX_SHARDS + my ($self) = @_; + $self->{nr_prune} = 0; + $TXN_BYTES = $BATCH_BYTES; + $self->begin_txn_lazy; +} + +sub prune_one { # via wq_io_do in IDX_SHARDS + my ($self, $term) = @_; + my @docids = $self->docids_by_postlist($term); + for (@docids) { + $TXN_BYTES -= $self->{xdb}->get_doclength($_) * 42; + $self->{xdb}->delete_document($_); + } + ++$self->{nr_prune}; + $TXN_BYTES < 0 and + cidx_ckpoint($self, "prune [$self->{shard}] $self->{nr_prune}"); +} + +sub prune_commit { # via wq_io_do in IDX_SHARDS + my ($self) = @_; + my $prune_op_p = delete $self->{0} // die 'BUG: no {0} op_p'; + my $nr = delete $self->{nr_prune} // die 'BUG: nr_prune undef'; + cidx_ckpoint($self, "prune [$self->{shard}] $nr done") if $nr; + send($prune_op_p, "prune_done $self->{shard}", 0); +} + +sub shards_active { # post_loop_do + return if $DO_QUIT; + return if grep(defined, $PRUNE_DONE, $SCANQ, $IDXQ) != 3; + return 1 if grep(defined, @$PRUNE_DONE) != @IDX_SHARDS; + return 1 if $GITS_NR || scalar(@$IDXQ) || $REPO_CTX; + return 1 if @IBXQ || keys(%TODO); + for my $s (grep { $_->{-wq_s1} } @IDX_SHARDS) { + $s->{-cidx_quit} = 1 if defined($s->{-wq_s1}); + $s->wq_close; # may recurse via awaitpid outside of event_loop + } + scalar(grep { $_->{-cidx_quit} } @IDX_SHARDS); +} + +# signal handlers +sub kill_shards { $_->wq_kill(@_) for (@IDX_SHARDS) } + +sub parent_quit { + $DO_QUIT = POSIX->can("SIG$_[0]")->(); + $XHC = 0; # stops the process + kill_shards(@_); + warn "# SIG$_[0] received, quitting...\n"; +} + +sub prep_umask ($) { + my ($self) = @_; + if ($self->{-cidx_internal}) { # respect core.sharedRepository + @{$self->{git_dirs}} == 1 or die 'BUG: only for GIT_DIR'; + local $self->{git} = + PublicInbox::Git->new($self->{git_dirs}->[0]); + $self->with_umask; + } elsif (-d $self->{cidx_dir}) { # respect existing perms + my @st = stat(_); + my $um = (~$st[2] & 0777); + $self->{umask} = $um; # for SearchIdx->with_umask + umask == $um or progress($self, 'using umask from ', + $self->{cidx_dir}, ': ', + sprintf('0%03o', $um)); + on_destroy \&CORE::umask, umask($um); + } else { + $self->{umask} = umask; # for SearchIdx->with_umask + undef; + } +} + +sub prep_alternate_end ($$) { + my ($objdir, $fmt) = @_; + my $hexlen = $OFMT2HEXLEN{$fmt} // return warn <<EOM; +E: ignoring objdir=$objdir, unknown extensions.objectFormat=$fmt +EOM + unless ($ALT_FH{$hexlen}) { + require PublicInbox::Import; + my $git_dir = "$TMPDIR/hexlen$hexlen.git"; + PublicInbox::Import::init_bare($git_dir, 'cidx-all', $fmt); + open $ALT_FH{$hexlen}, '>', "$git_dir/objects/info/alternates"; + } + say { $ALT_FH{$hexlen} } $objdir; +} + +sub store_objfmt { # via wq_do - make early cidx users happy + my ($self, $docid, $git_dir, $fmt) = @_; + $self->begin_txn_lazy; + my $doc = $self->get_doc($docid) // return + warn "BUG? #$docid for $git_dir missing"; + my @p = xap_terms('P', $doc) or return + warn "BUG? #$docid for $git_dir has no P(ath)"; + @p == 1 or return warn "BUG? #$docid $git_dir multi: @p"; + $p[0] eq $git_dir or return warn "BUG? #$docid $git_dir != @p"; + $doc->add_boolean_term('H'.$fmt); + $self->{xdb}->replace_document($docid, $doc); + # wait for prune_commit to commit... +} + +# TODO: remove prep_alternate_read and store_objfmt 1-2 years after 2.0 is out +# they are for compatibility with pre-release indices +sub prep_alternate_read { # run_git cb for config extensions.objectFormat + my ($opt, $self, $git, $objdir, $docid, $shard_n, $run_prune) = @_; + return if $DO_QUIT; + my $chld_err = $?; + prep_alternate_start($self, shift(@PRUNEQ), $run_prune) if @PRUNEQ; + my $fmt = check_objfmt_status $git, $chld_err, ${$opt->{1}}; + $IDX_SHARDS[$shard_n]->wq_do('store_objfmt', # async + $docid, $git->{git_dir}, $fmt); + prep_alternate_end $objdir, $fmt; +} + +sub prep_alternate_start { + my ($self, $git, $run_prune) = @_; + local $self->{xdb}; + my ($o, $n, @ids, @fmt); +start: + $o = $git->git_path('objects'); + while (!-d $o) { + $git = shift(@PRUNEQ) // return; + $o = $git->git_path('objects'); + } + $n = git_dir_hash($git->{git_dir}) % scalar(@RDONLY_XDB); + $self->{xdb} = $RDONLY_XDB[$n] // croak("BUG: no shard[$n]"); + @ids = $self->docids_by_postlist('P'.$git->{git_dir}); + @fmt = @ids ? xap_terms('H', $self->{xdb}, $ids[0]) : (); + @fmt > 1 and warn "BUG? multi `H' for shard[$n] #$ids[0]: @fmt"; + + if (@fmt) { # cache hit + prep_alternate_end $o, $fmt[0]; + $git = shift(@PRUNEQ) and goto start; + } else { # compatibility w/ early cidx format + run_git([qw(config extensions.objectFormat)], { quiet => 1 }, + \&prep_alternate_read, $self, $git, $o, $ids[0], $n, + $run_prune); + } +} + +sub cmd_done { # run_await cb for sort, xapian-delve, sed failures + my ($pid, $cmd, undef, undef, $run_on_destroy) = @_; + $? and die "fatal: @$cmd (\$?=$?)\n"; + # $run_on_destroy calls do_join() or run_prune() +} + +sub current_join_data ($) { + my ($self) = @_; + local $self->{xdb} = $RDONLY_XDB[0] // die 'BUG: shard[0] undef'; + # we support multiple PI_CONFIG files for a cindex: + $self->join_data; +} + +# combined previously stored stats with new +sub score_old_join_data ($$$) { + my ($self, $score, $ekeys_new) = @_; + my $old = ($JOIN{reset} ? undef : current_join_data($self)) or return; + progress($self, 'merging old join data...'); + my ($ekeys_old, $roots_old, $ibx2root_old) = + @$old{qw(ekeys roots ibx2root)}; + # score: "ibx_off root_off" => nr + my $i = -1; + my %root2id_new = map { $_ => ++$i } @OFF2ROOT; + $i = -1; + my %ekey2id_new = map { $_ => ++$i } @$ekeys_new; + for my $ibx_off_old (0..$#$ibx2root_old) { + my $root_offs_old = $ibx2root_old->[$ibx_off_old]; + my $ekey = $ekeys_old->[$ibx_off_old] // do { + warn "W: no ibx #$ibx_off_old in old join data\n"; + next; + }; + my $ibx_off_new = $ekey2id_new{$ekey} // do { + warn "W: `$ekey' no longer exists\n"; + next; + }; + for (@$root_offs_old) { + my ($nr, $rid_old) = @$_; + my $root_old = $roots_old->[$rid_old] // do { + warn "W: no root #$rid_old in old data\n"; + next; + }; + my $rid_new = $root2id_new{$root_old} // do { + warn "W: root `$root_old' no longer exists\n"; + next; + }; + $score->{"$ibx_off_new $rid_new"} += $nr; + } + } +} + +sub metadata_set { # via wq_do + my ($self, $key, $val, $commit) = @_; + $self->begin_txn_lazy; + $self->{xdb}->set_metadata($key, $val); + $self->commit_txn_lazy if $commit || defined(wantarray); +} + +# runs once all inboxes and shards are dumped via OnDestroy +sub do_join { + my ($self) = @_; + return if $DO_QUIT; + $XHC = 0; # should not be recreated again + @IDX_SHARDS or return warn("# aborting on no shards\n"); + unlink("$TMPDIR/root2id"); + my @pending = keys %{$self->{PENDING}}; + die "BUG: pending=@pending jobs not done\n" if @pending; + progress($self, 'joining...'); + my @join = (@JOIN, 'to_ibx_off', 'to_root_off'); + if (my $time = which('time')) { unshift @join, $time }; + my $rd = popen_rd(\@join, $CMD_ENV, { -C => "$TMPDIR" }); + my %score; + while (<$rd>) { # PFX ibx_offs root_off + chop eq "\n" or die "no newline from @join: <$_>"; + my (undef, $ibx_offs, @root_offs) = split / /, $_; + for my $ibx_off (split(/,/, $ibx_offs)) { + ++$score{"$ibx_off $_"} for @root_offs; + } + } + $rd->close or die "fatal: @join failed: \$?=$?"; + my $nr = scalar(keys %score) or do { + delete $TODO{joining}; + return progress($self, 'no potential new pairings'); + }; + progress($self, "$nr potential new pairings..."); + my @ekeys = map { $_->eidx_key } @IBX; + score_old_join_data($self, \%score, \@ekeys); + my $new; + while (my ($k, $nr) = each %score) { + my ($ibx_off, $root_off) = split(/ /, $k); + my ($ekey, $root) = ($ekeys[$ibx_off], $OFF2ROOT[$root_off]); + progress($self, "$ekey => $root has $nr matches"); + push @{$new->{ibx2root}->[$ibx_off]}, [ $nr, $root_off ]; + } + for my $ary (values %$new) { # sort by nr (largest first) + for (@$ary) { @$_ = sort { $b->[0] <=> $a->[0] } @$_ } + } + $new->{ekeys} = \@ekeys; + $new->{roots} = \@OFF2ROOT; + $new->{dt} = \@JOIN_DT; + $new = compress(PublicInbox::Config::json()->encode($new)); + my $key = $self->join_data_key; + my $wait = $IDX_SHARDS[0]->wq_do('metadata_set', $key, $new); + delete $TODO{joining}; +} + +sub require_progs { + my $op = shift; + while (my ($x, $argv) = splice(@_, 0, 2)) { + my $e = $x; + $e =~ tr/a-z-/A-Z_/; + my $c = $ENV{$e} // $x; + $argv->[0] //= which($c) // die "E: `$x' required for --$op\n"; + } +} + +sub init_join_postfork ($) { + my ($self) = @_; + return unless $self->{-opt}->{join}; + require_progs('join', join => \@JOIN); + my $d2 = '([0-9]{2})'; + my $dt_re = qr!([0-9]{4})$d2$d2$d2$d2$d2!; + if (my $cur = $JOIN{reset} ? undef : current_join_data($self)) { + if (($cur->{dt}->[1] // '') =~ m!\A$dt_re\z!o) { + my ($Y, $m, $d, $H, $M, $S) = ($1, $2, $3, $4, $5, $6); + my $t = timegm($S, $M, $H, $d, $m - 1, $Y); + $t = strftime('%Y%m%d%H%M%S', gmtime($t + 1)); + $JOIN{dt} //= "$t.."; + } else { + warn <<EOM; +BUG?: previous --join invocation did not store usable `dt' key +EOM + } + } + if ($JOIN{aggressive}) { + $JOIN{window} //= -1; + $JOIN{dt} //= '..1.month.ago'; + } + $QRY_STR = $JOIN{dt} // '1.year.ago..'; + index($QRY_STR, '..') >= 0 or die "E: dt:$QRY_STR is not a range\n"; + # Account for send->apply delay (torvalds/linux.git mean is ~20 days + # from Author to CommitDate in cases where CommitDate > AuthorDate + $QRY_STR .= '1.month.ago' if $QRY_STR =~ /\.\.\z/; + @{$self->{git_dirs} // []} or die "E: no coderepos to join\n"; + @IBX or die "E: no inboxes to join\n"; + my $approx_git = PublicInbox::Git->new($self->{git_dirs}->[0]); # ugh + substr($QRY_STR, 0, 0) = 'dt:'; + $self->query_approxidate($approx_git, $QRY_STR); # in-place + ($JOIN_DT[1]) = ($QRY_STR =~ /\.\.([0-9]{14})\z/); # YYYYmmddHHMMSS + ($JOIN_DT[0]) = ($QRY_STR =~ /\Adt:([0-9]{14})/); # YYYYmmddHHMMSS + $JOIN_DT[0] //= '19700101'.'000000'; # git uses unsigned times + $TODO{do_join} = on_destroy \&do_join, $self; + $TODO{joining} = 1; # keep shards_active() happy + $TODO{dump_ibx_start} = on_destroy \&dump_ibx_start, + $self, $TODO{do_join}; + $TODO{dump_roots_start} = on_destroy \&dump_roots_start, + $self, $TODO{do_join}; + progress($self, "will join in $QRY_STR date range..."); + my $id = -1; + @IBXQ = map { ++$id } @IBX; +} + +sub init_prune ($) { + my ($self) = @_; + return (@$PRUNE_DONE = map { 1 } @IDX_SHARDS) if !$self->{-opt}->{prune}; + + # Dealing with millions of commits here at once, so use faster tools. + # xapian-delve is nearly an order-of-magnitude faster than Xapian Perl + # bindings. sed/awk are faster than Perl for simple stream ops, and + # sort+comm are more memory-efficient with gigantic lists. + # pipeline: delve | sed | sort >indexed_commits + my @delve = (undef, qw(-A Q -1)); + my @sed = (undef, '-ne', 's/^Q//p'); + @COMM = (undef, qw(-2 -3 indexed_commits -)); + @AWK = (undef, '$2 == "commit" { print $1 }'); # --batch-check output + require_progs('prune', 'xapian-delve' => \@delve, sed => \@sed, + comm => \@COMM, awk => \@AWK); + for (0..$#IDX_SHARDS) { push @delve, "$self->{xpfx}/$_" } + my $run_prune = on_destroy \&run_prune, $self, $TODO{dump_roots_start}; + my ($sort_opt, $sed_opt, $delve_opt); + pipe(local $sed_opt->{0}, local $delve_opt->{1}); + pipe(local $sort_opt->{0}, local $sed_opt->{1}); + open($sort_opt->{1}, '+>', "$TMPDIR/indexed_commits"); + run_await([@SORT, '-u'], $CMD_ENV, $sort_opt, \&cmd_done, $run_prune); + run_await(\@sed, $CMD_ENV, $sed_opt, \&cmd_done, $run_prune); + run_await(\@delve, undef, $delve_opt, \&cmd_done, $run_prune); + @PRUNEQ = @$SCANQ; + for (1..$LIVE_JOBS) { + prep_alternate_start($self, shift(@PRUNEQ) // last, $run_prune); + } +} + +sub dump_git_commits { # run_await cb + my ($pid, $cmd, undef, $batch_opt, $self) = @_; + (defined($pid) && $?) and die "E: @$cmd \$?=$?"; + return if $DO_QUIT; + my ($hexlen) = keys(%ALT_FH) or return; # done, DESTROY batch_opt->{1} + close(delete $ALT_FH{$hexlen}); # flushes `say' buffer + progress($self, "preparing $hexlen-byte hex OID commits for prune..."); + my $g = PublicInbox::Git->new("$TMPDIR/hexlen$hexlen.git"); + run_await($g->cmd(@PRUNE_BATCH), undef, $batch_opt, + \&dump_git_commits, $self); +} + +sub run_prune { # OnDestroy when `git config extensions.objectFormat' are done + my ($self, $drs) = @_; + return if $DO_QUIT; + # setup the following pipeline: ( + # git --git-dir=hexlen40.git cat-file \ + # --batch-all-objects --batch-check && + # git --git-dir=hexlen64.git cat-file \ + # --batch-all-objects --batch-check + # ) | awk | sort | comm | cidx_read_comm() + my ($awk_opt, $sort_opt, $batch_opt); + my $comm_opt = { -C => "$TMPDIR" }; + pipe(local $awk_opt->{0}, $batch_opt->{1}); + pipe(local $sort_opt->{0}, local $awk_opt->{1}); + pipe(local $comm_opt->{0}, local $sort_opt->{1}); + run_await(\@AWK, $CMD_ENV, $awk_opt, \&cmd_done); + run_await([@SORT, '-u'], $CMD_ENV, $sort_opt, \&cmd_done); + my $comm_rd = popen_rd(\@COMM, $CMD_ENV, $comm_opt, \&cmd_done, \@COMM); + PublicInbox::CidxComm->new($comm_rd, $self, $drs); # ->cidx_read_comm + push @PRUNE_BATCH, '--buffer' if $GIT_VER ge v2.6; + + # Yes, we pipe --unordered git output to sort(1) because sorting + # inside git leads to orders-of-magnitude slowdowns on rotational + # storage. GNU sort(1) also works well on larger-than-memory + # datasets, and it's not worth eliding sort(1) for old git. + push @PRUNE_BATCH, '--unordered' if $GIT_VER ge v2.19; + warn(sprintf(<<EOM, $GIT_VER)) if $GIT_VER lt v2.19; +W: git v2.19+ recommended for high-latency storage (have git v%vd) +EOM + dump_git_commits(undef, undef, undef, $batch_opt, $self); +} + +sub cidx_read_comm { # via PublicInbox::CidxComm::event_step + my ($self, $comm_rd, $drs) = @_; + return if $DO_QUIT; + progress($self, 'starting prune...'); + $_->wq_do('prune_init') for @IDX_SHARDS; + while (defined(my $cmt = <$comm_rd>)) { + chop($cmt) eq "\n" or die "BUG: no LF in comm output ($cmt)"; + my $n = hex(substr($cmt, 0, 8)) % scalar(@IDX_SHARDS); + $IDX_SHARDS[$n]->wq_do('prune_one', 'Q'.$cmt); + last if $DO_QUIT; + } + for my $git_dir (@GIT_DIR_GONE) { + my $n = git_dir_hash($git_dir) % scalar(@IDX_SHARDS); + $IDX_SHARDS[$n]->wq_do('prune_one', 'P'.$git_dir); + last if $DO_QUIT; + } + my ($c, $p) = PublicInbox::PktOp->pair; + $c->{ops}->{prune_done} = [ $self, $drs ]; + $_->wq_io_do('prune_commit', [ $p->{op_p} ]) for @IDX_SHARDS; +} + +sub init_join_prefork ($) { + my ($self) = @_; + my $subopt = $self->{-opt}->{join} // return; + %JOIN = map { + my ($k, $v) = split /:/, $_, 2; + $k => $v // 1; + } split(/,/, join(',', @$subopt)); + require PublicInbox::CidxXapHelperAux; + require PublicInbox::XapClient; + my @unknown; + my $pfx = $JOIN{prefixes} // 'dfpost7'; + for my $p (split /\+/, $pfx) { + my $n = ''; + $p =~ s/([0-9]+)\z// and $n = $1; + my $v = $PublicInbox::Search::PATCH_BOOL_COMMON{$p} // + push(@unknown, $p); + push(@JOIN_PFX, map { $_.$n } split(/ /, $v)); + } + @unknown and die <<EOM; +E: --join=prefixes= contains unsupported prefixes: @unknown +EOM + @JOIN_PFX = uniqstr @JOIN_PFX; + my %incl = map { + if (-f "$_/inbox.lock" || -d "$_/public-inbox") { + rel2abs_collapsed($_) => undef; + } else { + warn "W: `$_' is not a public inbox, skipping\n"; + (); + } + } (@{$self->{-opt}->{include} // []}); + my $all = $self->{-opt}->{all}; + if (my $only = $self->{-opt}->{only}) { + die <<'' if $all; +E: --all is incompatible with --only + + $incl{rel2abs_collapsed($_)} = undef for @$only; + } + unless (keys(%incl)) { + $all = 1; + warn <<EOM unless $self->{opt}->{quiet}; +# --all implied since no inboxes were specified with --only or --include +EOM + } + $self->{-opt}->{-pi_cfg}->each_inbox(\&_prep_ibx, $self, \%incl, $all); + my $nr = scalar(@IBX) or die "E: no inboxes to join with\n"; + progress($self, "will join with $nr inboxes in ", + $self->{-opt}->{-pi_cfg}->{-f}, " using: $pfx"); +} + +sub _prep_ibx { # each_inbox callback + my ($ibx, $self, $incl, $all) = @_; + ($all || exists($incl->{$ibx->{inboxdir}})) and push @IBX, $ibx; +} + +sub show_json { # for diagnostics (unstable output) + my ($self) = @_; + my $s = $self->{-opt}->{show} or return; # for diagnostics + local $self->{xdb}; + my %ret; + my @todo = @$s; + while (defined(my $f = shift @todo)) { + if ($f =~ /\A(?:roots2paths|paths2roots|join_data)\z/) { + $ret{$f} = $self->$f; + } elsif ($f eq '') { # default --show (no args) + push @todo, qw(roots2paths join_data); + } else { + warn "E: cannot show `$f'\n"; + } + } + my $json = ref(PublicInbox::Config::json())->new; + $json->utf8->canonical->pretty; # n.b. FS pathnames may not be UTF-8... + say $json->encode(\%ret); +} + +sub do_inits { # called via PublicInbox::DS::add_timer + my ($self) = @_; + grep !!$_, @{$self->{-opt}}{qw(scan prune)} and + @$SCANQ = map PublicInbox::Git->new($_), @{$self->{git_dirs}}; + init_join_postfork $self; + init_prune $self; + scan_git_dirs $self; + my $max = $TODO{do_join} ? max($LIVE_JOBS, $NPROC) : $LIVE_JOBS; + index_next($self) for (1..$max); +} + +sub cidx_run { # main entry point + my ($self) = @_; + my $restore_umask = prep_umask($self); + local $SIGSET = PublicInbox::DS::block_signals( + POSIX::SIGTSTP, POSIX::SIGCONT); + my $restore = on_destroy \&PublicInbox::DS::sig_setmask, $SIGSET; + local $PRUNE_DONE = []; + local $IDXQ = []; + local $SCANQ = []; + local ($DO_QUIT, $REINDEX, $TXN_BYTES, @GIT_DIR_GONE, @PRUNEQ, + $REPO_CTX, %ALT_FH, $TMPDIR, @AWK, @COMM, $CMD_ENV, + %TODO, @IBXQ, @IBX, @JOIN, %JOIN, @JOIN_PFX, @NO_ABBREV, + @JOIN_DT, $DUMP_IBX_WPIPE, @OFF2ROOT, $XHC, @SORT, $GITS_NR); + local $BATCH_BYTES = $self->{-opt}->{batch_size} // + $PublicInbox::SearchIdx::BATCH_BYTES; + local $MAX_SIZE = $self->{-opt}->{max_size}; + local $self->{PENDING} = {}; # used by PublicInbox::CidxXapHelperAux + my $cfg = $self->{-opt}->{-pi_cfg} // die 'BUG: -pi_cfg unset'; + $self->{-cfg_f} = $cfg->{-f} = rel2abs_collapsed($cfg->{-f}); + local $GIT_VER = PublicInbox::Git::git_version(); + @NO_ABBREV = ('-c', 'core.abbrev='.($GIT_VER lt v2.31.0 ? 40 : 'no')); + if (grep { $_ } @{$self->{-opt}}{qw(prune join)}) { + require File::Temp; + $TMPDIR = File::Temp->newdir('cidx-all-git-XXXX', TMPDIR => 1); + $CMD_ENV = { TMPDIR => "$TMPDIR", LC_ALL => 'C', LANG => 'C' }; + require_progs('(prune|join)', sort => \@SORT); + for (qw(parallel compress-program buffer-size)) { # GNU sort + my $v = $self->{-opt}->{"sort-$_"}; + push @SORT, "--$_=$v" if defined $v; + } + ($self->{-opt}->{prune} && $GIT_VER le v2.6) and + die "W: --prune requires git v2.6+\n"; + init_join_prefork($self) + } + local @IDX_SHARDS = cidx_init($self); # forks workers + local $ANY_SHARD = -1; + local $self->{current_info} = ''; + local $MY_SIG = { + CHLD => \&PublicInbox::DS::enqueue_reap, + USR1 => \&kill_shards, + }; + local @PRUNE_BATCH = @PRUNE_BATCH; + $MY_SIG->{$_} = \&parent_quit for qw(TERM QUIT INT); + my $cb = $SIG{__WARN__} || \&CORE::warn; + local $SIG{__WARN__} = sub { + my $m = shift @_; + $self->{current_info} eq '' or + $m =~ s/\A(#?\s*)/$1$self->{current_info}: /; + $cb->($m, @_); + }; + load_existing($self) unless $self->{-cidx_internal}; + if ($self->{-opt}->{reindex}) { + require PublicInbox::SharedKV; + $REINDEX = PublicInbox::SharedKV->new; + delete $REINDEX->{lock_path}; + $REINDEX->dbh; + } + my @nc = grep { File::Spec->canonpath($_) ne $_ } @{$self->{git_dirs}}; + if (@nc) { + warn "E: BUG? paths in $self->{cidx_dir} not canonicalized:\n"; + for my $d (@{$self->{git_dirs}}) { + my $c = File::Spec->canonpath($_); + warn "E: $d => $c\n"; + $d = $c; + } + warn "E: canonicalized and attempting to continue\n"; + } + if (defined(my $excl = $self->{-opt}->{exclude})) { + my $re = '(?:'.join('\\z|', map { + glob2re($_) // qr/\A\Q$_\E/ + } @$excl).'\\z)'; + my @excl; + @{$self->{git_dirs}} = grep { + $_ =~ /$re/ ? (push(@excl, $_), 0) : 1; + } @{$self->{git_dirs}}; + warn("# excluding $_\n") for @excl; + @GIT_DIR_GONE = uniqstr @GIT_DIR_GONE, @excl; + } + local $NCHANGE = 0; + local $NPROC = PublicInbox::IPC::detect_nproc(); + local $LIVE_JOBS = $self->{-opt}->{jobs} || $NPROC || 2; + local @RDONLY_XDB = $self->xdb_shards_flat; + PublicInbox::DS::add_timer(0, \&do_inits, $self); + + # FreeBSD ignores/discards SIGCHLD while signals are blocked and + # EVFILT_SIGNAL is inactive, so we pretend we have a SIGCHLD pending + PublicInbox::DS::enqueue_reap(); + + local @PublicInbox::DS::post_loop_do = (\&shards_active); + PublicInbox::DS::event_loop($MY_SIG, $SIGSET); + $self->lock_release(!!$NCHANGE); + show_json($self); +} + +sub ipc_atfork_child { # @IDX_SHARDS + my ($self) = @_; + $self->SUPER::ipc_atfork_child; + $SIG{USR1} = \&shard_usr1; + $SIG{$_} = \&shard_quit for qw(INT TERM QUIT); + my $x = delete $self->{siblings} // die 'BUG: no {siblings}'; + $_->wq_close for @$x; + undef; +} + +sub shard_done_wait { # awaitpid cb via ipc_worker_reap + my ($pid, $shard, $self) = @_; + my $quit_req = delete($shard->{-cidx_quit}); + return if $DO_QUIT; + if ($? == 0) { # success + $quit_req // warn 'BUG: {-cidx_quit} unset'; + } else { + warn "PID:$pid $shard->{shard} exited with \$?=$?\n"; + ++$self->{shard_err} if defined($self->{shard_err}); + } +} + +1; diff --git a/lib/PublicInbox/Compat.pm b/lib/PublicInbox/Compat.pm new file mode 100644 index 00000000..78cba90e --- /dev/null +++ b/lib/PublicInbox/Compat.pm @@ -0,0 +1,24 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# compatibility code for old Perl and standard modules, mainly +# List::Util but maybe other stuff +package PublicInbox::Compat; +use v5.12; +use parent qw(Exporter); +require List::Util; + +our @EXPORT_OK = qw(uniqstr); + +# uniqstr is in List::Util 1.45+, which means Perl 5.26+; +# so maybe 2030 for us since we need to support enterprise distros. +# We can use uniqstr everywhere in our codebase and don't need +# to account for special cases of `uniqnum' nor `uniq' in List::Util +# even if they make more sense in some contexts +no warnings 'once'; +*uniqstr = List::Util->can('uniqstr') // sub (@) { + my %seen; + grep { !$seen{$_}++ } @_; +}; + +1; diff --git a/lib/PublicInbox/CompressNoop.pm b/lib/PublicInbox/CompressNoop.pm index e3301473..5135299f 100644 --- a/lib/PublicInbox/CompressNoop.pm +++ b/lib/PublicInbox/CompressNoop.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Provide the same methods as Compress::Raw::Zlib::Deflate but @@ -10,7 +10,7 @@ use Compress::Raw::Zlib qw(Z_OK); sub new { bless \(my $self), __PACKAGE__ } sub deflate { # ($self, $input, $output) - $_[2] .= $_[1]; + $_[2] .= ref($_[1]) ? ${$_[1]} : $_[1]; Z_OK; } diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index ad8b8e9d..d6300610 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -10,8 +10,10 @@ package PublicInbox::Config; use strict; use v5.10.1; +use parent qw(Exporter); +our @EXPORT_OK = qw(glob2re rel2abs_collapsed); use PublicInbox::Inbox; -use PublicInbox::Spawn qw(popen_rd); +use PublicInbox::Spawn qw(popen_rd run_qx); our $LD_PRELOAD = $ENV{LD_PRELOAD}; # only valid at startup our $DEDUPE; # set to {} to dedupe or clear cache @@ -20,23 +22,16 @@ sub _array ($) { ref($_[0]) eq 'ARRAY' ? $_[0] : [ $_[0] ] } # returns key-value pairs of config directives in a hash # if keys may be multi-value, the value is an array ref containing all values sub new { - my ($class, $file, $errfh) = @_; + my ($class, $file, $lei) = @_; $file //= default_file(); - my $self; - my $set_dedupe; - if (ref($file) eq 'SCALAR') { # used by some tests - open my $fh, '<', $file or die; # PerlIO::scalar - $self = config_fh_parse($fh, "\n", '='); - bless $self, $class; - } else { - if (-f $file && $DEDUPE) { - $file = rel2abs_collapsed($file); - $self = $DEDUPE->{$file} and return $self; - $set_dedupe = 1; - } - $self = git_config_dump($class, $file, $errfh); - $self->{'-f'} = $file; - } + my ($self, $set_dedupe); + if (-f $file && $DEDUPE) { + $file = rel2abs_collapsed($file); + $self = $DEDUPE->{$file} and return $self; + $set_dedupe = 1; + } + $self = git_config_dump($class, $file, $lei); + $self->{-f} = $file; # caches $self->{-by_addr} = {}; $self->{-by_list_id} = {}; @@ -45,8 +40,7 @@ sub new { $self->{-by_eidx_key} = {}; $self->{-no_obfuscate} = {}; $self->{-limiters} = {}; - $self->{-code_repos} = {}; # nick => PublicInbox::Git object - $self->{-cgitrc_unparsed} = $self->{'publicinbox.cgitrc'}; + $self->{-coderepos} = {}; # nick => PublicInbox::Git object if (my $no = delete $self->{'publicinbox.noobfuscate'}) { $no = _array($no); @@ -130,9 +124,9 @@ sub lookup_newsgroup { sub limiter { my ($self, $name) = @_; $self->{-limiters}->{$name} //= do { - require PublicInbox::Qspawn; + require PublicInbox::Limiter; my $max = $self->{"publicinboxlimiter.$name.max"} || 1; - my $limiter = PublicInbox::Qspawn::Limiter->new($max); + my $limiter = PublicInbox::Limiter->new($max); $limiter->setup_rlimit($name, $self); $limiter; }; @@ -150,8 +144,11 @@ sub config_fh_parse ($$$) { local $/ = $rs; while (defined($line = <$fh>)) { # perf critical with giant configs $i = index($line, $fs); + # $i may be -1 if $fs not found and it's a key-only entry + # (meaning boolean true). Either way the -1 will drop the + # $rs either from $k or $v. $k = substr($line, 0, $i); - $v = substr($line, $i + 1, -1); # chop off $fs + $v = $i >= 0 ? substr($line, $i + 1, -1) : 1; $section = substr($k, 0, rindex($k, '.')); $seen{$section} //= push(@section_order, $section); @@ -170,13 +167,34 @@ sub config_fh_parse ($$$) { \%rv; } +sub tmp_cmd_opt ($$) { + my ($env, $opt) = @_; + # quiet global and system gitconfig if supported by installed git, + # but normally harmless if too noisy (NOGLOBAL no longer exists) + $env->{GIT_CONFIG_NOSYSTEM} = 1; + $env->{GIT_CONFIG_GLOBAL} = '/dev/null'; # git v2.32+ + $opt->{-C} = '/'; # avoid $worktree/.git/config on MOST systems :P +} + sub git_config_dump { - my ($class, $file, $errfh) = @_; - return bless {}, $class unless -e $file; - my $cmd = [ qw(git config -z -l --includes), "--file=$file" ]; - my $fh = popen_rd($cmd, undef, { 2 => $errfh // 2 }); + my ($class, $file, $lei) = @_; + my @opt_c = map { ('-c', $_) } @{$lei->{opt}->{c} // []}; + $file = undef if !-e $file; + # XXX should we set {-f} if !-e $file? + return bless {}, $class if (!@opt_c && !defined($file)); + my %env; + my $opt = { 2 => $lei->{2} // 2 }; + if (@opt_c) { + unshift(@opt_c, '-c', "include.path=$file") if defined($file); + tmp_cmd_opt(\%env, $opt); + } + my @cmd = ('git', @opt_c, qw(config -z -l --includes)); + push(@cmd, '-f', $file) if !@opt_c && defined($file); + my $fh = popen_rd(\@cmd, \%env, $opt); my $rv = config_fh_parse($fh, "\0", "\n"); - close $fh or die "@$cmd failed: \$?=$?\n"; + $fh->close or die "@cmd failed: \$?=$?\n"; + $rv->{-opt_c} = \@opt_c if @opt_c; # for ->urlmatch + $rv->{-f} = $file; bless $rv, $class; } @@ -227,7 +245,6 @@ sub cgit_repo_merge ($$$) { $rel =~ s!/?\.git\z!!; } $self->{"coderepo.$rel.dir"} //= $path; - $self->{"coderepo.$rel.cgiturl"} //= _array($rel); } sub is_git_dir ($) { @@ -263,10 +280,11 @@ sub scan_tree_coderepo ($$) { scan_path_coderepo($self, $path, $path); } -sub scan_projects_coderepo ($$$) { - my ($self, $list, $path) = @_; - open my $fh, '<', $list or do { - warn "failed to open cgit projectlist=$list: $!\n"; +sub scan_projects_coderepo ($$) { + my ($self, $path) = @_; + my $l = $self->{-cgit_project_list} // die 'BUG: no cgit_project_list'; + open my $fh, '<', $l or do { + warn "failed to open cgit project-list=$l: $!\n"; return; }; while (<$fh>) { @@ -275,8 +293,20 @@ sub scan_projects_coderepo ($$$) { } } +sub apply_cgit_scan_path { + my ($self, @paths) = @_; + @paths or @paths = @{$self->{-cgit_scan_path}}; + if (defined($self->{-cgit_project_list})) { + for my $p (@paths) { scan_projects_coderepo($self, $p) } + } else { + for my $p (@paths) { scan_tree_coderepo($self, $p) } + } +} + sub parse_cgitrc { my ($self, $cgitrc, $nesting) = @_; + $cgitrc //= $self->{'publicinbox.cgitrc'} // + $ENV{CGIT_CONFIG} // return; if ($nesting == 0) { # defaults: my %s = map { $_ => 1 } qw(/cgit.css /cgit.png @@ -316,34 +346,41 @@ sub parse_cgitrc { my ($k, $v) = ($1, $2); $k =~ tr/-/_/; $self->{"-cgit_$k"} = $v; + delete $self->{-cgit_scan_path} if $k eq 'project_list'; } elsif (m!\Ascan-path=(.+)\z!) { - if (defined(my $list = $self->{-cgit_project_list})) { - scan_projects_coderepo($self, $list, $1); - } else { - scan_tree_coderepo($self, $1); - } + # this depends on being after project-list in the + # config file, just like cgit.c + push @{$self->{-cgit_scan_path}}, $1; + apply_cgit_scan_path($self, $1); } elsif (m!\A(?:css|favicon|logo|repo\.logo)=(/.+)\z!) { # absolute paths for static files via PublicInbox::Cgit $self->{-cgit_static}->{$1} = 1; + } elsif (s!\Asnapshots=\s*!!) { + $self->{'coderepo.snapshots'} = $_; } } cgit_repo_merge($self, $repo->{dir}, $repo) if $repo; } +sub valid_dir ($$) { + my $dir = get_1($_[0], $_[1]) // return; + index($dir, "\n") < 0 ? $dir : do { + warn "E: `$_[1]=$dir' must not contain `\\n'\n"; + undef; + } +} + # parse a code repo, only git is supported at the moment -sub fill_code_repo { +sub fill_coderepo { my ($self, $nick) = @_; my $pfx = "coderepo.$nick"; - my $dir = $self->{"$pfx.dir"} // do { # aka "GIT_DIR" - warn "$pfx.dir unset\n"; - return; - }; - my $git = PublicInbox::Git->new($dir); + my $git = PublicInbox::Git->new(valid_dir($self, "$pfx.dir") // return); if (defined(my $cgits = $self->{"$pfx.cgiturl"})) { $git->{cgit_url} = $cgits = _array($cgits); $self->{"$pfx.cgiturl"} = $cgits; } - + my %dedupe = ($nick => undef); + ($git->{nick}) = keys %dedupe; $git; } @@ -368,7 +405,7 @@ sub git_bool { # is sufficient and doesn't leave "/.." or "/../" sub rel2abs_collapsed { require File::Spec; - my $p = File::Spec->rel2abs($_[-1]); + my $p = File::Spec->rel2abs(@_); return $p if substr($p, -3, 3) ne '/..' && index($p, '/../') < 0; require Cwd; Cwd::abs_path($p); @@ -384,11 +421,12 @@ sub get_1 { sub repo_objs { my ($self, $ibxish) = @_; - my $ibx_code_repos = $ibxish->{coderepo} // return; $ibxish->{-repo_objs} // do { - my $code_repos = $self->{-code_repos}; + my $ibx_coderepos = $ibxish->{coderepo} // return; + parse_cgitrc($self, undef, 0); + my $coderepos = $self->{-coderepos}; my @repo_objs; - for my $nick (@$ibx_code_repos) { + for my $nick (@$ibx_coderepos) { my @parts = split(m!/!, $nick); for (@parts) { @parts = () unless valid_foo_name($_); @@ -397,12 +435,16 @@ sub repo_objs { warn "invalid coderepo name: `$nick'\n"; next; } - my $repo = $code_repos->{$nick} //= - fill_code_repo($self, $nick); - push @repo_objs, $repo if $repo; + my $repo = $coderepos->{$nick} //= + fill_coderepo($self, $nick); + $repo ? push(@repo_objs, $repo) : + warn("coderepo.$nick.dir unset\n"); } if (scalar @repo_objs) { - $ibxish ->{-repo_objs} = \@repo_objs; + for (@repo_objs) { + push @{$_->{ibx_names}}, $ibxish->{name}; + } + $ibxish->{-repo_objs} = \@repo_objs; } else { delete $ibxish->{coderepo}; } @@ -417,18 +459,15 @@ sub _fill_ibx { my $v = $self->{"$pfx.$k"}; $ibx->{$k} = $v if defined $v; } - for my $k (qw(filter inboxdir newsgroup replyto httpbackendmax feedmax + for my $k (qw(filter newsgroup replyto httpbackendmax feedmax indexlevel indexsequentialshard boost)) { my $v = get_1($self, "$pfx.$k") // next; $ibx->{$k} = $v; } # "mainrepo" is backwards compatibility: - my $dir = $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"} // return; - if (index($dir, "\n") >= 0) { - warn "E: `$dir' must not contain `\\n'\n"; - return; - } + my $dir = $ibx->{inboxdir} = valid_dir($self, "$pfx.inboxdir") // + valid_dir($self, "$pfx.mainrepo") // return; for my $k (qw(obfuscate)) { my $v = $self->{"$pfx.$k"} // next; if (defined(my $bval = git_bool($v))) { @@ -448,7 +487,8 @@ sub _fill_ibx { } return unless valid_foo_name($name, 'publicinbox'); - $ibx->{name} = $name; + my %dedupe = ($name => undef); + ($ibx->{name}) = keys %dedupe; # used as a key everywhere $ibx->{-pi_cfg} = $self; $ibx = PublicInbox::Inbox->new($ibx); foreach (@{$ibx->{address}}) { @@ -476,9 +516,16 @@ sub _fill_ibx { delete $ibx->{newsgroup}; warn "newsgroup name invalid: `$ngname'\n"; } else { + my $lc = $ibx->{newsgroup} = lc $ngname; + warn <<EOM if $lc ne $ngname; +W: newsgroup=`$ngname' lowercased to `$lc' +EOM # PublicInbox::NNTPD does stricter ->nntp_usable # checks, keep this lean for startup speed - $self->{-by_newsgroup}->{$ngname} = $ibx; + my $cur = $self->{-by_newsgroup}->{$lc} //= $ibx; + warn <<EOM if $cur != $ibx; +W: newsgroup=`$lc' is used by both `$cur->{name}' and `$ibx->{name}' +EOM } } unless (defined $ibx->{newsgroup}) { # for ->eidx_key @@ -498,19 +545,18 @@ sub _fill_ibx { require PublicInbox::Isearch; $ibx->{isrch} = PublicInbox::Isearch->new($ibx, $es); } - $self->{-by_eidx_key}->{$ibx->eidx_key} = $ibx; + my $cur = $self->{-by_eidx_key}->{my $ekey = $ibx->eidx_key} //= $ibx; + $cur == $ibx or warn + "W: `$ekey' used by both `$cur->{name}' and `$ibx->{name}'\n"; + $ibx; } sub _fill_ei ($$) { my ($self, $name) = @_; eval { require PublicInbox::ExtSearch } or return; my $pfx = "extindex.$name"; - my $d = $self->{"$pfx.topdir"} // return; + my $d = valid_dir($self, "$pfx.topdir") // return; -d $d or return; - if (index($d, "\n") >= 0) { - warn "E: `$d' must not contain `\\n'\n"; - return; - } my $es = PublicInbox::ExtSearch->new($d); for my $k (qw(indexlevel indexsequentialshard)) { my $v = get_1($self, "$pfx.$k") // next; @@ -525,23 +571,76 @@ sub _fill_ei ($$) { $es; } +sub _fill_csrch ($$) { + my ($self, $name) = @_; # "" is a valid name for cindex + return if $name ne '' && !valid_foo_name($name, 'cindex'); + eval { require PublicInbox::CodeSearch } or return; + my $pfx = "cindex.$name"; + my $d = valid_dir($self, "$pfx.topdir") // return; + -d $d or return; + my $csrch = PublicInbox::CodeSearch->new($d, $self); + for my $k (qw(localprefix)) { + my $v = $self->{"$pfx.$k"} // next; + $csrch->{$k} = _array($v); + } + $csrch->{name} = $name; + $csrch; +} + +sub lookup_cindex ($$) { + my ($self, $name) = @_; + $self->{-csrch_by_name}->{$name} //= _fill_csrch($self, $name); +} + +sub each_cindex { + my ($self, $cb, @arg) = @_; + my @csrch = map { + lookup_cindex($self, substr($_, length('cindex.'))) // () + } grep(m!\Acindex\.[^\./]*\z!, @{$self->{-section_order}}); + if (ref($cb) eq 'CODE') { + $cb->($_, @arg) for @csrch; + } else { # string function + $_->$cb(@arg) for @csrch; + } +} + +sub config_cmd { + my ($self, $env, $opt) = @_; + my $f = $self->{-f} // default_file(); + my @opt_c = @{$self->{-opt_c} // []}; + my @cmd = ('git', @opt_c, 'config'); + @opt_c ? tmp_cmd_opt($env, $opt) : push(@cmd, '-f', $f); + \@cmd; +} + sub urlmatch { - my ($self, $key, $url) = @_; + my $self = shift; + my @bool = $_[0] eq '--bool' ? (shift) : (); + my ($key, $url, $try_git) = @_; state $urlmatch_broken; # requires git 1.8.5 return if $urlmatch_broken; - my $file = $self->{'-f'} // default_file(); - my $cmd = [qw/git config -z --includes --get-urlmatch/, - "--file=$file", $key, $url ]; - my $fh = popen_rd($cmd); - local $/ = "\0"; - my $val = <$fh>; - if (close($fh)) { - chomp($val); - $val; - } else { - $urlmatch_broken = 1 if (($? >> 8) != 1); - undef; + my (%env, %opt); + my $cmd = $self->config_cmd(\%env, \%opt); + push @$cmd, @bool, qw(--includes -z --get-urlmatch), $key, $url; + my $val = run_qx($cmd, \%env, \%opt); + if ($?) { + undef $val; + if (@bool && ($? >> 8) == 128) { # not boolean + } elsif (($? >> 8) != 1) { + $urlmatch_broken = 1; + } elsif ($try_git) { # n.b. this takes cwd into account + $val = run_qx([qw(git config), @bool, + qw(-z --get-urlmatch), $key, $url]); + undef $val if $?; + } + } + $? = 0; # don't influence lei exit status + if (defined($val)) { + local $/ = "\0"; + chomp $val; + $val = git_bool($val) if @bool; } + $val; } sub json { @@ -565,4 +664,48 @@ sub squote_maybe ($) { $val; } +my %re_map = ( '*' => '[^/]*?', '?' => '[^/]', + '/**' => '/.*', '**/' => '.*/', '/**/' => '(?:/.*?/|/)', + '[' => '[', ']' => ']', ',' => ',' ); + +sub glob2re ($) { + my ($re) = @_; + my $p = ''; + my $in_bracket = 0; + my $qm = 0; + my $schema_host_port = ''; + + # don't glob URL-looking things that look like IPv6 + if ($re =~ s!\A([a-z0-9\+]+://\[[a-f0-9\:]+\](?::[0-9]+)?/)!!i) { + $schema_host_port = quotemeta $1; # "http://[::1]:1234" + } + my $changes = ($re =~ s!(/\*\*/|\A\*\*/|/\*\*\z|.)! + $re_map{$p eq '\\' ? '' : do { + if ($1 eq '[') { ++$in_bracket } + elsif ($1 eq ']') { --$in_bracket } + elsif ($1 eq ',') { ++$qm } # no change + $p = $1; + }} // do { + $p = $1; + ($p eq '-' && $in_bracket) ? $p : (++$qm, "\Q$p") + }!sge); + # bashism (also supported by curl): {a,b,c} => (a|b|c) + $changes += ($re =~ s/([^\\]*)\\\{([^,]*,[^\\]*)\\\}/ + (my $in_braces = $2) =~ tr!,!|!; + $1."($in_braces)"; + /sge); + ($changes - $qm) ? $schema_host_port.$re : undef; +} + +sub get_coderepo { + my ($self, $nick) = @_; + $self->{-coderepos}->{$nick} // do { + defined($self->{-cgit_scan_path}) ? do { + apply_cgit_scan_path($self); + my $cr = fill_coderepo($self, $nick); + $cr ? ($self->{-coderepos}->{$nick} = $cr) : undef; + } : undef; + }; +} + 1; diff --git a/lib/PublicInbox/ConfigIter.pm b/lib/PublicInbox/ConfigIter.pm index 14fcef83..f9e3451a 100644 --- a/lib/PublicInbox/ConfigIter.pm +++ b/lib/PublicInbox/ConfigIter.pm @@ -1,12 +1,11 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Intended for PublicInbox::DS::event_loop in read-only daemons # to avoid each_inbox() monopolizing the event loop when hundreds/thousands # of inboxes are in play. package PublicInbox::ConfigIter; -use strict; -use v5.10.1; +use v5.12; sub new { my ($class, $pi_cfg, $cb, @args) = @_; @@ -25,7 +24,7 @@ sub event_step { PublicInbox::DS::requeue($self) if defined($section); } -# for generic PSGI servers +# for generic PSGI servers, but also ManifestJsGz w/ ALL extindex sub each_section { my $self = shift; my ($pi_cfg, $i, $cb, @arg) = @$self; diff --git a/lib/PublicInbox/ContentDigestDbg.pm b/lib/PublicInbox/ContentDigestDbg.pm new file mode 100644 index 00000000..853624f1 --- /dev/null +++ b/lib/PublicInbox/ContentDigestDbg.pm @@ -0,0 +1,22 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +package PublicInbox::ContentDigestDbg; # cf. PublicInbox::ContentDigest +use v5.12; +use Data::Dumper; +use PublicInbox::SHA; +$Data::Dumper::Useqq = $Data::Dumper::Terse = 1; + +sub new { bless [ PublicInbox::SHA->new(256), $_[1] ], __PACKAGE__ } + +sub add { + $_[0]->[0]->add($_[1]); + my @dbg = split(/^/sm, $_[1]); + if (@dbg && $dbg[0] =~ /\A(To|Cc)\0/) { # fold excessively long lines + @dbg = map { split(/,/s, $_) } @dbg; + } + print { $_[0]->[1] } Dumper(\@dbg) or die "print $!"; +} + +sub hexdigest { $_[0]->[0]->hexdigest } + +1; diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm index bacc9cdd..95ca2929 100644 --- a/lib/PublicInbox/ContentHash.pm +++ b/lib/PublicInbox/ContentHash.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Unstable internal API. @@ -15,7 +15,8 @@ use PublicInbox::MID qw(mids references); use PublicInbox::MsgIter; # not sure if less-widely supported hash families are worth bothering with -use Digest::SHA; +use PublicInbox::SHA; # faster, but no ->clone +use Digest::SHA; # we still need this for ->clone sub digest_addr ($$$) { my ($dig, $h, $v) = @_; @@ -44,7 +45,7 @@ sub content_dig_i { my $ct = $part->content_type || 'text/plain'; my ($s, undef) = msg_part_text($part, $ct); if (defined $s) { - $s =~ s/\r\n/\n/gs; + $s =~ s/\r\n/\n/gs; # TODO: consider \r+\n to match View $s =~ s/\s*\z//s; utf8::encode($s); } else { @@ -53,18 +54,26 @@ sub content_dig_i { $dig->add($s); } -sub content_digest ($;$) { - my ($eml, $dig) = @_; +sub content_digest ($;$$) { + my ($eml, $dig, $hash_mids) = @_; $dig //= Digest::SHA->new(256); # References: and In-Reply-To: get used interchangeably # in some "duplicates" in LKML. We treat them the same # in SearchIdx, so treat them the same for this: # do NOT consider the Message-ID as part of the content_hash - # if we got here, we've already got Message-ID reuse - my %seen = map { $_ => 1 } @{mids($eml)}; - foreach my $mid (@{references($eml)}) { - $dig->add("ref\0$mid\0") unless $seen{$mid}++; + # if we got here, we've already got Message-ID reuse for v2. + # + # However, `lei q --dedupe=content' does use $hash_mids since + # it doesn't have any other dedupe + my $mids = mids($eml); + if ($hash_mids) { + $dig->add("mid\0$_\0") for @$mids; + } + my %seen = map { $_ => 1 } @$mids; + for (grep { !$seen{$_}++ } @{references($eml)}) { + utf8::encode($_); + $dig->add("ref\0$_\0"); } # Only use Sender: if From is not present @@ -74,8 +83,7 @@ sub content_digest ($;$) { last; } foreach my $h (qw(Subject Date)) { - my @v = $eml->header($h); - foreach my $v (@v) { + for my $v ($eml->header($h)) { utf8::encode($v); $dig->add("$h\0$v\0"); } @@ -84,23 +92,22 @@ sub content_digest ($;$) { # not in the original message. For the purposes of deduplication, # do not take it into account: foreach my $h (qw(To Cc)) { - my @v = $eml->header($h); - digest_addr($dig, $h, $_) foreach @v; + digest_addr($dig, $h, $_) for ($eml->header($h)); } msg_iter($eml, \&content_dig_i, $dig); $dig; } sub content_hash ($) { - content_digest($_[0])->digest; + content_digest($_[0], PublicInbox::SHA->new(256))->digest; } +# don't clone the result of this sub git_sha ($$) { my ($n, $eml) = @_; - my $dig = Digest::SHA->new($n); + my $dig = PublicInbox::SHA->new($n); my $bref = ref($eml) eq 'SCALAR' ? $eml : \($eml->as_string); - $dig->add('blob '.length($$bref)."\0"); - $dig->add($$bref); + $dig->add('blob '.length($$bref)."\0", $$bref); $dig; } diff --git a/lib/PublicInbox/DS.pm b/lib/PublicInbox/DS.pm index 5e8a6a66..a6fec954 100644 --- a/lib/PublicInbox/DS.pm +++ b/lib/PublicInbox/DS.pm @@ -24,29 +24,30 @@ use strict; use v5.10.1; use parent qw(Exporter); use bytes qw(length substr); # FIXME(?): needed for PublicInbox::NNTP -use POSIX qw(WNOHANG sigprocmask SIG_SETMASK); +use POSIX qw(WNOHANG sigprocmask SIG_SETMASK SIG_UNBLOCK); use Fcntl qw(SEEK_SET :DEFAULT O_APPEND); use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); use Scalar::Util qw(blessed); -use PublicInbox::Syscall qw(:epoll); +use PublicInbox::Syscall qw(%SIGNUM + EPOLLIN EPOLLOUT EPOLLONESHOT EPOLLEXCLUSIVE); use PublicInbox::Tmpfile; -use Errno qw(EAGAIN EINVAL); +use PublicInbox::Select; +use PublicInbox::OnDestroy; +use Errno qw(EAGAIN EINVAL ECHILD); use Carp qw(carp croak); -our @EXPORT_OK = qw(now msg_more dwaitpid add_timer add_uniq_timer); +our @EXPORT_OK = qw(now msg_more awaitpid add_timer add_uniq_timer); -my %Stack; my $nextq; # queue for next_tick -my $wait_pids; # list of [ pid, callback, callback_arg ] my $reap_armed; -my $ToClose; # sockets to close when event loop is done -our ( - %DescriptorMap, # fd (num) -> PublicInbox::DS object - $Epoll, # Global epoll fd (or DSKQXS ref) - $ep_io, # IO::Handle for Epoll +my @active; # FDs (or objs) returned by epoll/kqueue +our (%AWAIT_PIDS, # pid => [ $callback, @args ] + $cur_runq, # only set inside next_tick + @FD_MAP, # fd (num) -> PublicInbox::DS object + $Poller, # global Select, Epoll, DSPoll, or DSKQXS ref - $PostLoopCallback, # subref to call at the end of each loop, if defined (global) + @post_loop_do, # subref + args to call at the end of each loop - $LoopTimeout, # timeout of event loop in milliseconds + $loop_timeout, # timeout of event loop in milliseconds @Timers, # timers %UniqTimer, $in_loop, @@ -54,6 +55,9 @@ our ( Reset(); +# clobber everything explicitly to avoid DESTROY ordering problems w/ DBI +END { Reset() } + ##################################################################### ### C L A S S M E T H O D S ##################################################################### @@ -64,37 +68,32 @@ Reset all state =cut sub Reset { + $Poller = bless [], 'PublicInbox::DummyPoller'; do { $in_loop = undef; # first in case DESTROY callbacks use this - %DescriptorMap = (); + # clobbering $Poller may call DSKQXS::DESTROY, + # we must always have this set to something to avoid + # needing branches before ep_del/ep_mod calls (via ->close). + @FD_MAP = (); @Timers = (); %UniqTimer = (); - $PostLoopCallback = undef; - - # we may be iterating inside one of these on our stack - my @q = delete @Stack{keys %Stack}; - for my $q (@q) { @$q = () } - $wait_pids = $nextq = $ToClose = undef; - $ep_io = undef; # closes real $Epoll FD - $Epoll = undef; # may call DSKQXS::DESTROY - } while (@Timers || keys(%Stack) || $nextq || $wait_pids || - $ToClose || keys(%DescriptorMap) || - $PostLoopCallback || keys(%UniqTimer)); + @post_loop_do = (); + + # we may be called from an *atfork_child inside next_tick: + @$cur_runq = () if $cur_runq; + @active = (); + $nextq = undef; # may call ep_del + %AWAIT_PIDS = (); + } while (@Timers || $nextq || keys(%AWAIT_PIDS) || + @active || @FD_MAP || + @post_loop_do || keys(%UniqTimer) || + scalar(@{$cur_runq // []})); # do not vivify cur_runq $reap_armed = undef; - $LoopTimeout = -1; # no timeout by default + $loop_timeout = -1; # no timeout by default + $Poller = PublicInbox::Select->new; } -=head2 C<< CLASS->SetLoopTimeout( $timeout ) >> - -Set the loop timeout for the event loop to some value in milliseconds. - -A timeout of 0 (zero) means poll forever. A timeout of -1 means poll and return -immediately. - -=cut -sub SetLoopTimeout { $LoopTimeout = $_[1] + 0 } - sub _add_named_timer { my ($name, $secs, $coderef, @args) = @_; my $fire_time = now() + $secs; @@ -124,49 +123,35 @@ sub add_uniq_timer { # ($name, $secs, $coderef, @args) = @_; $UniqTimer{$_[0]} //= _add_named_timer(@_); } -# caller sets return value to $Epoll +# caller sets return value to $Poller sub _InitPoller () { - if (PublicInbox::Syscall::epoll_defined()) { - my $fd = epoll_create(); - die "epoll_create: $!" if $fd < 0; - open($ep_io, '+<&=', $fd) or return; - my $fl = fcntl($ep_io, F_GETFD, 0); - fcntl($ep_io, F_SETFD, $fl | FD_CLOEXEC); - $fd; - } else { - my $cls; - for (qw(DSKQXS DSPoll)) { - $cls = "PublicInbox::$_"; - last if eval "require $cls"; - } - $cls->import(qw(epoll_ctl epoll_wait)); - $cls->new; + my @try = ($^O eq 'linux' ? 'Epoll' : 'DSKQXS'); + my $cls; + for (@try, 'DSPoll') { + $cls = "PublicInbox::$_"; + last if eval "require $cls"; } + $cls->new; } sub now () { clock_gettime(CLOCK_MONOTONIC) } sub next_tick () { - my $q = $nextq or return; + $cur_runq = $nextq or return; $nextq = undef; - $Stack{cur_runq} = $q; - for my $obj (@$q) { + while (my $obj = shift @$cur_runq) { # avoid "ref" on blessed refs to workaround a Perl 5.16.3 leak: # https://rt.perl.org/Public/Bug/Display.html?id=114340 - if (blessed($obj)) { - $obj->event_step; - } else { - $obj->(); - } + blessed($obj) ? $obj->event_step : $obj->(); } - delete $Stack{cur_runq}; + 1; } # runs timers and returns milliseconds for next one, or next event loop sub RunTimers { - next_tick(); + my $ran = next_tick(); - return (($nextq || $ToClose) ? 0 : $LoopTimeout) unless @Timers; + return ($nextq || $ran ? 0 : $loop_timeout) unless @Timers; my $now = now(); @@ -175,60 +160,64 @@ sub RunTimers { my $to_run = shift(@Timers); delete $UniqTimer{$to_run->[1] // ''}; $to_run->[2]->(@$to_run[3..$#$to_run]); + $ran = 1; } # timers may enqueue into nextq: - return 0 if ($nextq || $ToClose); + return 0 if $nextq || $ran; - return $LoopTimeout unless @Timers; + return $loop_timeout unless @Timers; # convert time to an even number of milliseconds, adding 1 # extra, otherwise floating point fun can occur and we'll # call RunTimers like 20-30 times, each returning a timeout # of 0.0000212 seconds - my $timeout = int(($Timers[0][0] - $now) * 1000) + 1; + my $t = int(($Timers[0][0] - $now) * 1000) + 1; # -1 is an infinite timeout, so prefer a real timeout - ($LoopTimeout < 0 || $LoopTimeout >= $timeout) ? $timeout : $LoopTimeout + ($loop_timeout < 0 || $loop_timeout >= $t) ? $t : $loop_timeout } sub sig_setmask { sigprocmask(SIG_SETMASK, @_) or die "sigprocmask: $!" } -sub block_signals () { - my $oldset = POSIX::SigSet->new; +# ensure we detect bugs, HW problems and user rlimits +our @UNBLOCKABLE = (POSIX::SIGABRT, POSIX::SIGBUS, POSIX::SIGFPE, + POSIX::SIGILL, POSIX::SIGSEGV, POSIX::SIGXCPU, POSIX::SIGXFSZ); + +sub block_signals { # anything in @_ stays unblocked my $newset = POSIX::SigSet->new; $newset->fillset or die "fillset: $!"; + for (@_, @UNBLOCKABLE) { $newset->delset($_) or die "delset($_): $!" } + my $oldset = POSIX::SigSet->new; sig_setmask($newset, $oldset); $oldset; } -# We can't use waitpid(-1) safely here since it can hit ``, system(), -# and other things. So we scan the $wait_pids list, which is hopefully -# not too big. We keep $wait_pids small by not calling dwaitpid() -# until we've hit EOF when reading the stdout of the child. +sub await_cb ($;@) { + my ($pid, @cb_args) = @_; + my $cb = shift @cb_args or return; + eval { $cb->($pid, @cb_args) }; + warn "E: awaitpid($pid): $@" if $@; +} +# This relies on our Perl process being single-threaded, or at least +# no threads spawning and waiting on processes (``, system(), etc...) +# Threads are officially discouraged by the Perl5 team, and I expect +# that to remain the case. sub reap_pids { $reap_armed = undef; - my $tmp = $wait_pids or return; - $wait_pids = undef; - $Stack{reap_runq} = $tmp; - my $oldset = block_signals(); - foreach my $ary (@$tmp) { - my ($pid, $cb, $arg) = @$ary; - my $ret = waitpid($pid, WNOHANG); - if ($ret == 0) { - push @$wait_pids, $ary; # autovivifies @$wait_pids - } elsif ($ret == $pid) { - if ($cb) { - eval { $cb->($arg, $pid) }; - warn "E: dwaitpid($pid) in_loop: $@" if $@; - } - } else { - warn "waitpid($pid, WNOHANG) = $ret, \$!=$!, \$?=$?"; + while (1) { + my $pid = waitpid(-1, WNOHANG) or return; + if (defined(my $cb_args = delete $AWAIT_PIDS{$pid})) { + await_cb($pid, @$cb_args) if $cb_args; + } elsif ($pid == -1 && $! == ECHILD) { + return requeue(\&dflush); # force @post_loop_do to run + } elsif ($pid > 0) { + warn "W: reaped unknown PID=$pid: \$?=$?\n"; + } else { # does this happen? + return warn("W: waitpid(-1, WNOHANG) => $pid ($!)"); } } - sig_setmask($oldset); - delete $Stack{reap_runq}; } # reentrant SIGCHLD handler (since reap_pids is not reentrant) @@ -236,81 +225,82 @@ sub enqueue_reap () { $reap_armed //= requeue(\&reap_pids) } sub in_loop () { $in_loop } +# use inside @post_loop_do, returns number of busy clients +sub close_non_busy () { + my $n = 0; + for my $s (grep defined, @FD_MAP) { + # close as much as possible, early as possible + ($s->busy ? ++$n : $s->close) if $s->can('busy'); + } + $n; +} + # Internal function: run the post-event callback, send read events # for pushed-back data, and close pending connections. returns 1 # if event loop should continue, or 0 to shut it all down. sub PostEventLoop () { - # now we can close sockets that wanted to close during our event - # processing. (we didn't want to close them during the loop, as we - # didn't want fd numbers being reused and confused during the event - # loop) - if (my $close_now = $ToClose) { - $ToClose = undef; # will be autovivified on push - @$close_now = map { fileno($_) } @$close_now; - - # ->DESTROY methods may populate ToClose - delete @DescriptorMap{@$close_now}; - } - # by default we keep running, unless a postloop callback cancels it - $PostLoopCallback ? $PostLoopCallback->(\%DescriptorMap) : 1; + @post_loop_do ? $post_loop_do[0]->(@post_loop_do[1..$#post_loop_do]) : 1 +} + +sub sigset_prep ($$$) { + my ($sig, $init, $each) = @_; # $sig: { signame => whatever } + my $ret = POSIX::SigSet->new; + $ret->$init or die "$init: $!"; + for my $s (keys %$sig) { + my $num = $SIGNUM{$s} // POSIX->can("SIG$s")->(); + $ret->$each($num) or die "$each ($s => $num): $!"; + } + for (@UNBLOCKABLE) { $ret->$each($_) or die "$each ($_): $!" } + $ret; } +sub allowset ($) { sigset_prep $_[0], 'fillset', 'delset' } +sub unblockset ($) { sigset_prep $_[0], 'emptyset', 'addset' } + # Start processing IO events. In most daemon programs this never exits. See -# C<PostLoopCallback> for how to exit the loop. +# C<post_loop_do> for how to exit the loop. sub event_loop (;$$) { my ($sig, $oldset) = @_; - $Epoll //= _InitPoller(); + $Poller //= _InitPoller(); require PublicInbox::Sigfd if $sig; - my $sigfd = PublicInbox::Sigfd->new($sig, 1) if $sig; + my $sigfd = $sig ? PublicInbox::Sigfd->new($sig) : undef; + if ($sigfd && $sigfd->{is_kq}) { + my $tmp = allowset($sig); + local @SIG{keys %$sig} = values(%$sig); + sig_setmask($tmp, my $old = POSIX::SigSet->new); + # Unlike Linux signalfd, EVFILT_SIGNAL can't handle + # signals received before the filter is created, + # so we peek at signals here. + sig_setmask($old); + } local @SIG{keys %$sig} = values(%$sig) if $sig && !$sigfd; local $SIG{PIPE} = 'IGNORE'; if (!$sigfd && $sig) { # wake up every second to accept signals if we don't # have signalfd or IO::KQueue: - sig_setmask($oldset); - PublicInbox::DS->SetLoopTimeout(1000); + sig_setmask($oldset) if $oldset; + sigprocmask(SIG_UNBLOCK, unblockset($sig)) or + die "SIG_UNBLOCK: $!"; + $loop_timeout = 1000; } $_[0] = $sigfd = $sig = undef; # $_[0] == sig local $in_loop = 1; - my @events; do { my $timeout = RunTimers(); - # get up to 1000 events - epoll_wait($Epoll, 1000, $timeout, \@events); - for my $fd (@events) { - # it's possible epoll_wait returned many events, - # including some at the end that ones in the front - # triggered unregister-interest actions. if we can't - # find the %sock entry, it's because we're no longer - # interested in that event. - - # guard stack-not-refcounted w/ Carp + @DB::args - my $obj = $DescriptorMap{$fd}; + # grab whatever FDs are ready + $Poller->ep_wait($timeout, \@active); + + # map all FDs to their associated Perl object + @active = @FD_MAP[@active]; + + while (my $obj = shift @active) { $obj->event_step; } } while (PostEventLoop()); } -=head2 C<< CLASS->SetPostLoopCallback( CODEREF ) >> - -Sets post loop callback function. Pass a subref and it will be -called every time the event loop finishes. - -Return 1 (or any true value) from the sub to make the loop continue, 0 or false -and it will exit. - -The callback function will be passed two parameters: \%DescriptorMap - -=cut -sub SetPostLoopCallback { - my ($class, $ref) = @_; - - # global callback - $PostLoopCallback = (defined $ref && ref $ref eq 'CODE') ? $ref : undef; -} - ##################################################################### ### PublicInbox::DS-the-object code ##################################################################### @@ -332,19 +322,19 @@ sub new { $self->{sock} = $sock; my $fd = fileno($sock); - $Epoll //= _InitPoller(); + $Poller //= _InitPoller(); retry: - if (epoll_ctl($Epoll, EPOLL_CTL_ADD, $fd, $ev)) { + if ($Poller->ep_add($sock, $ev)) { if ($! == EINVAL && ($ev & EPOLLEXCLUSIVE)) { $ev &= ~EPOLLEXCLUSIVE; goto retry; } die "EPOLL_CTL_ADD $self/$sock/$fd: $!"; } - croak("FD:$fd in use by $DescriptorMap{$fd} (for $self/$sock)") - if defined($DescriptorMap{$fd}); + defined($FD_MAP[$fd]) and + croak("BUG: FD:$fd in use by $FD_MAP[$fd] (for $self/$sock)"); - $DescriptorMap{$fd} = $self; + $FD_MAP[$fd] = $self; } # for IMAP, NNTP, and POP3 which greet clients upon connect @@ -353,8 +343,7 @@ sub greet { my $ev = EPOLLIN; my $wbuf; if ($sock->can('accept_SSL') && !$sock->accept_SSL) { - return CORE::close($sock) if $! != EAGAIN; - $ev = PublicInbox::TLS::epollbit() or return CORE::close($sock); + return if $! != EAGAIN || !($ev = PublicInbox::TLS::epollbit()); $wbuf = [ \&accept_tls_step, $self->can('do_greet')]; } new($self, $sock, $ev | EPOLLONESHOT); @@ -366,46 +355,21 @@ sub greet { $self; } -##################################################################### -### I N S T A N C E M E T H O D S -##################################################################### - sub requeue ($) { push @$nextq, $_[0] } # autovivifies -=head2 C<< $obj->close >> - -Close the socket. - -=cut +# drop the IO::Handle ref, true if successful, false if not (or already dropped) +# (this is closer to CORE::close than Danga::Socket::close) sub close { - my ($self) = @_; - my $sock = delete $self->{sock} or return; - - # we need to flush our write buffer, as there may - # be self-referential closures (sub { $client->close }) - # preventing the object from being destroyed - delete $self->{wbuf}; - - # if we're using epoll, we have to remove this from our epoll fd so we stop getting - # notifications about it - my $fd = fileno($sock); - epoll_ctl($Epoll, EPOLL_CTL_DEL, $fd, 0) and - croak("EPOLL_CTL_DEL($self/$sock): $!"); - - # we explicitly don't delete from DescriptorMap here until we - # actually close the socket, as we might be in the middle of - # processing an epoll_wait/etc that returned hundreds of fds, one - # of which is not yet processed and is what we're closing. if we - # keep it in DescriptorMap, then the event harnesses can just - # looked at $pob->{sock} == undef and ignore it. but if it's an - # un-accounted for fd, then it (understandably) freak out a bit - # and emit warnings, thinking their state got off. + my ($self) = @_; + my $sock = delete $self->{sock} or return; - # defer closing the actual socket until the event loop is done - # processing this round of events. (otherwise we might reuse fds) - push @$ToClose, $sock; # autovivifies $ToClose + # we need to clear our write buffer, as there may + # be self-referential closures (sub { $client->close }) + # preventing the object from being destroyed + delete $self->{wbuf}; + $FD_MAP[fileno($sock)] = undef; - return 0; + !$Poller->ep_del($sock); # stop getting notifications } # portable, non-thread-safe sendfile emulation (no pread, yet) @@ -451,8 +415,7 @@ next_buf: shift @$wbuf; goto next_buf; } - } elsif ($! == EAGAIN) { - my $ev = epbit($sock, EPOLLOUT) or return $self->close; + } elsif ($! == EAGAIN && (my $ev = epbit($sock, EPOLLOUT))) { epwait($sock, $ev | EPOLLONESHOT); return 0; } else { @@ -482,28 +445,28 @@ sub rbuf_idle ($$) { } } +# returns true if bytes are read, false otherwise sub do_read ($$$;$) { - my ($self, $rbuf, $len, $off) = @_; - my $r = sysread(my $sock = $self->{sock}, $$rbuf, $len, $off // 0); - return ($r == 0 ? $self->close : $r) if defined $r; - # common for clients to break connections without warning, - # would be too noisy to log here: - if ($! == EAGAIN) { - my $ev = epbit($sock, EPOLLIN) or return $self->close; - epwait($sock, $ev | EPOLLONESHOT); - rbuf_idle($self, $rbuf); - 0; - } else { - $self->close; - } + my ($self, $rbuf, $len, $off) = @_; + my ($ev, $r, $s); + $r = sysread($s = $self->{sock}, $$rbuf, $len, $off // 0) and return $r; + + if (!defined($r) && $! == EAGAIN && ($ev = epbit($s, EPOLLIN))) { + epwait($s, $ev | EPOLLONESHOT); + rbuf_idle($self, $rbuf); + } else { + $self->close; + } + undef; } # drop the socket if we hit unrecoverable errors on our system which # require BOFH attention: ENOSPC, EFBIG, EIO, EMFILE, ENFILE... sub drop { - my $self = shift; - carp(@_); - $self->close; + my $self = shift; + carp(@_); + $self->close; + undef; } sub tmpio ($$$) { @@ -545,7 +508,8 @@ sub write { push @$wbuf, $bref; } else { my $tmpio = $wbuf->[-1]; - if ($tmpio && !defined($tmpio->[2])) { # append to tmp file buffer + if (ref($tmpio) eq 'ARRAY' && !defined($tmpio->[2])) { + # append to tmp file buffer $tmpio->[0]->print($$bref) or return drop($self, "print: $!"); } else { my $tmpio = tmpio($self, $bref, 0) or return 0; @@ -607,9 +571,8 @@ sub msg_more ($$) { } sub epwait ($$) { - my ($sock, $ev) = @_; - epoll_ctl($Epoll, EPOLL_CTL_MOD, fileno($sock), $ev) and - croak("EPOLL_CTL_MOD($sock): $!"); + my ($io, $ev) = @_; + $Poller->ep_mod($io, $ev) and croak("EPOLL_CTL_MOD($io): $!"); } # return true if complete, false if incomplete (or failure) @@ -624,7 +587,7 @@ sub accept_tls_step ($) { 0; } -# return true if complete, false if incomplete (or failure) +# return value irrelevant sub shutdn_tls_step ($) { my ($self) = @_; my $sock = $self->{sock} or return; @@ -633,19 +596,14 @@ sub shutdn_tls_step ($) { my $ev = PublicInbox::TLS::epollbit() or return $self->close; epwait($sock, $ev | EPOLLONESHOT); unshift(@{$self->{wbuf}}, \&shutdn_tls_step); # autovivifies - 0; } # don't bother with shutdown($sock, 2), we don't fork+exec w/o CLOEXEC # or fork w/o exec, so no inadvertent socket sharing sub shutdn ($) { - my ($self) = @_; - my $sock = $self->{sock} or return; - if ($sock->can('stop_SSL')) { - shutdn_tls_step($self); - } else { - $self->close; - } + my ($self) = @_; + my $sock = $self->{sock} or return; + $sock->can('stop_SSL') ? shutdn_tls_step($self) : $self->close; } sub dflush {} # overridden by DSdeflate @@ -660,8 +618,8 @@ sub long_step { if ($@ || !$self->{sock}) { # something bad happened... delete $self->{long_cb}; my $elapsed = now() - $t0; - $@ and $self->err("%s during long response[$fd] - %0.6f", - $@, $elapsed); + $@ and warn("$@ during long response[$fd] - ", + sprintf('%0.6f', $elapsed),"\n"); $self->out(" deferred[$fd] aborted - %0.6f", $elapsed); $self->close; } elsif ($more) { # $self->{wbuf}: @@ -671,7 +629,6 @@ sub long_step { delete $self->{long_cb}; $self->long_response_done; my $elapsed = now() - $t0; - my $fd = fileno($self->{sock}); $self->out(" deferred[$fd] done - %0.6f", $elapsed); my $wbuf = $self->{wbuf}; # do NOT autovivify requeue($self) unless $wbuf && @$wbuf; @@ -702,25 +659,46 @@ sub long_response ($$;@) { undef; } -sub dwaitpid ($;$$) { - my ($pid, $cb, $arg) = @_; - if ($in_loop) { - push @$wait_pids, [ $pid, $cb, $arg ]; - # We could've just missed our SIGCHLD, cover it, here: - enqueue_reap(); - } else { +sub awaitpid { + my ($pid, @cb_args) = @_; # @cb_args = ($cb, @args), $cb may be undef + $AWAIT_PIDS{$pid} = \@cb_args if @cb_args; + # provide synchronous API + if (defined(wantarray) || (!$in_loop && !@cb_args)) { my $ret = waitpid($pid, 0); if ($ret == $pid) { - if ($cb) { - eval { $cb->($arg, $pid) }; - carp "E: dwaitpid($pid) !in_loop: $@" if $@; - } + my $cb_args = delete $AWAIT_PIDS{$pid}; + @cb_args = @$cb_args if !@cb_args && $cb_args; + await_cb($pid, @cb_args); } else { - carp "waitpid($pid, 0) = $ret, \$!=$!, \$?=$?"; + carp "waitpid($pid) => $ret ($!)"; + delete $AWAIT_PIDS{$pid}; } + return $ret; + } elsif ($in_loop) { # We could've just missed our SIGCHLD, cover it, here: + enqueue_reap(); } } +# for persistent child process +sub fork_persist () { + my $seed = rand(0xffffffff); + my $pid = PublicInbox::OnDestroy::fork_tmp; + if ($pid == 0) { + srand($seed); + eval { Net::SSLeay::randomize() }; # may not be loaded + Reset(); + } + $pid; +} + +package PublicInbox::DummyPoller; # only used during Reset +use v5.12; + +sub ep_del {} +no warnings 'once'; +*ep_add = \&ep_del; +*ep_mod = \&ep_del; + 1; =head1 AUTHORS (Danga::Socket) diff --git a/lib/PublicInbox/DSKQXS.pm b/lib/PublicInbox/DSKQXS.pm index eccfa56d..dc6621e4 100644 --- a/lib/PublicInbox/DSKQXS.pm +++ b/lib/PublicInbox/DSKQXS.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # Licensed the same as Danga::Socket (and Perl5) # License: GPL-1.0+ or Artistic-1.0-Perl # <https://www.gnu.org/licenses/gpl-1.0.txt> @@ -11,15 +11,12 @@ # # It also implements signalfd(2) emulation via "tie". package PublicInbox::DSKQXS; -use strict; -use warnings; -use parent qw(Exporter); +use v5.12; use Symbol qw(gensym); use IO::KQueue; use Errno qw(EAGAIN); -use PublicInbox::Syscall qw(EPOLLONESHOT EPOLLIN EPOLLOUT EPOLLET - EPOLL_CTL_ADD EPOLL_CTL_MOD EPOLL_CTL_DEL); -our @EXPORT_OK = qw(epoll_ctl epoll_wait); +use PublicInbox::OnDestroy; +use PublicInbox::Syscall qw(EPOLLONESHOT EPOLLIN EPOLLOUT EPOLLET); sub EV_DISPATCH () { 0x0080 } @@ -41,23 +38,23 @@ sub kq_flag ($$) { sub new { my ($class) = @_; - bless { kq => IO::KQueue->new, owner_pid => $$ }, $class; + my $fgen = $PublicInbox::OnDestroy::fork_gen; + bless { kq => IO::KQueue->new, fgen => $fgen }, $class; } # returns a new instance which behaves like signalfd on Linux. # It's wasteful in that it uses another FD, but it simplifies # our epoll-oriented code. sub signalfd { - my ($class, $signo, $nonblock) = @_; + my ($class, $signo) = @_; my $sym = gensym; - tie *$sym, $class, $signo, $nonblock; # calls TIEHANDLE + tie *$sym, $class, $signo; # calls TIEHANDLE $sym } sub TIEHANDLE { # similar to signalfd() - my ($class, $signo, $nonblock) = @_; + my ($class, $signo) = @_; my $self = $class->new; - $self->{timeout} = $nonblock ? 0 : -1; my $kq = $self->{kq}; $kq->EV_SET($_, EVFILT_SIGNAL, EV_ADD) for @$signo; $self; @@ -66,12 +63,11 @@ sub TIEHANDLE { # similar to signalfd() sub READ { # called by sysread() for signalfd compatibility my ($self, undef, $len, $off) = @_; # $_[1] = buf die "bad args for signalfd read" if ($len % 128) // defined($off); - my $timeout = $self->{timeout}; my $sigbuf = $self->{sigbuf} //= []; my $nr = $len / 128; my $r = 0; $_[1] = ''; - do { + while (1) { while ($nr--) { my $signo = shift(@$sigbuf) or last; # caller only cares about signalfd_siginfo.ssi_signo: @@ -79,13 +75,13 @@ sub READ { # called by sysread() for signalfd compatibility $r += 128; } return $r if $r; - my @events = eval { $self->{kq}->kevent($timeout) }; + my @events = eval { $self->{kq}->kevent(0) }; # workaround https://rt.cpan.org/Ticket/Display.html?id=116615 if ($@) { next if $@ =~ /Interrupted system call/; die; } - if (!scalar(@events) && $timeout == 0) { + if (!scalar(@events)) { $! = EAGAIN; return; } @@ -94,36 +90,37 @@ sub READ { # called by sysread() for signalfd compatibility # field shows coalesced signals, and maybe we'll use it # in the future... @$sigbuf = map { $_->[0] } @events; - } while (1); + } } # for fileno() calls in PublicInbox::DS sub FILENO { ${$_[0]->{kq}} } -sub epoll_ctl { - my ($self, $op, $fd, $ev) = @_; - my $kq = $self->{kq}; - if ($op == EPOLL_CTL_MOD) { - $kq->EV_SET($fd, EVFILT_READ, kq_flag(EPOLLIN, $ev)); - eval { $kq->EV_SET($fd, EVFILT_WRITE, kq_flag(EPOLLOUT, $ev)) }; - } elsif ($op == EPOLL_CTL_DEL) { - $kq->EV_SET($fd, EVFILT_READ, EV_DISABLE); - eval { $kq->EV_SET($fd, EVFILT_WRITE, EV_DISABLE) }; - } else { # EPOLL_CTL_ADD - $kq->EV_SET($fd, EVFILT_READ, EV_ADD|kq_flag(EPOLLIN, $ev)); - - # we call this blindly for read-only FDs such as tied - # DSKQXS (signalfd emulation) and Listeners - eval { - $kq->EV_SET($fd, EVFILT_WRITE, EV_ADD | - kq_flag(EPOLLOUT, $ev)); - }; - } +sub _ep_mod_add ($$$$) { + my ($kq, $fd, $ev, $add) = @_; + $kq->EV_SET($fd, EVFILT_READ, $add|kq_flag(EPOLLIN, $ev)); + + # we call this blindly for read-only FDs such as tied + # DSKQXS (signalfd emulation) and Listeners + eval { $kq->EV_SET($fd, EVFILT_WRITE, $add|kq_flag(EPOLLOUT, $ev)) }; 0; } -sub epoll_wait { - my ($self, $maxevents, $timeout_msec, $events) = @_; +sub ep_add { _ep_mod_add($_[0]->{kq}, fileno($_[1]), $_[2], EV_ADD) }; +sub ep_mod { _ep_mod_add($_[0]->{kq}, fileno($_[1]), $_[2], 0) }; + +sub ep_del { + my ($self, $io, $ev) = @_; + my $kq = $_[0]->{kq} // return; # called in cleanup + my $fd = fileno($io); + $kq->EV_SET($fd, EVFILT_READ, EV_DISABLE); + eval { $kq->EV_SET($fd, EVFILT_WRITE, EV_DISABLE) }; + 0; +} + +sub ep_wait { + my ($self, $timeout_msec, $events) = @_; + # n.b.: IO::KQueue is hard-coded to return up to 1000 events @$events = eval { $self->{kq}->kevent($timeout_msec) }; if (my $err = $@) { # workaround https://rt.cpan.org/Ticket/Display.html?id=116615 @@ -142,9 +139,8 @@ sub epoll_wait { sub DESTROY { my ($self) = @_; my $kq = delete $self->{kq} or return; - if (delete($self->{owner_pid}) == $$) { + delete($self->{fgen}) == $PublicInbox::OnDestroy::fork_gen and POSIX::close($$kq); - } } 1; diff --git a/lib/PublicInbox/DSPoll.pm b/lib/PublicInbox/DSPoll.pm index 56a400c2..a7055ec9 100644 --- a/lib/PublicInbox/DSPoll.pm +++ b/lib/PublicInbox/DSPoll.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # Licensed the same as Danga::Socket (and Perl5) # License: GPL-1.0+ or Artistic-1.0-Perl # <https://www.gnu.org/licenses/gpl-1.0.txt> @@ -9,49 +9,47 @@ # an all encompassing emulation of epoll via IO::Poll, but just to # support cases public-inbox-nntpd/httpd care about. package PublicInbox::DSPoll; -use strict; -use warnings; -use parent qw(Exporter); +use v5.12; use IO::Poll; -use PublicInbox::Syscall qw(EPOLLONESHOT EPOLLIN EPOLLOUT EPOLL_CTL_DEL); -our @EXPORT_OK = qw(epoll_ctl epoll_wait); +use PublicInbox::Syscall qw(EPOLLONESHOT EPOLLIN EPOLLOUT); +use Carp qw(carp); +use Errno (); -sub new { bless {}, $_[0] } # fd => events +sub new { bless {}, __PACKAGE__ } # fd => events -sub epoll_ctl { - my ($self, $op, $fd, $ev) = @_; - - # not wasting time on error checking - if ($op != EPOLL_CTL_DEL) { - $self->{$fd} = $ev; - } else { - delete $self->{$fd}; - } - 0; -} - -sub epoll_wait { - my ($self, $maxevents, $timeout_msec, $events) = @_; - my @pset; +sub ep_wait { + my ($self, $timeout_msec, $events) = @_; + my (@pset, $n, $fd, $revents, $nval); while (my ($fd, $events) = each %$self) { my $pevents = $events & EPOLLIN ? POLLIN : 0; $pevents |= $events & EPOLLOUT ? POLLOUT : 0; push(@pset, $fd, $pevents); } @$events = (); - my $n = IO::Poll::_poll($timeout_msec, @pset); - if ($n >= 0) { - for (my $i = 0; $i < @pset; ) { - my $fd = $pset[$i++]; - my $revents = $pset[$i++] or next; - delete($self->{$fd}) if $self->{$fd} & EPOLLONESHOT; + $n = IO::Poll::_poll($timeout_msec, @pset) or return; # timeout expired + return if $n < 0 && $! == Errno::EINTR; # caller recalculates timeout + die "poll: $!" if $n < 0; + while (defined($fd = shift @pset)) { + $revents = shift @pset or next; # no event + if ($revents & POLLNVAL) { + carp "E: FD=$fd invalid in poll"; + delete $self->{$fd}; + $nval = 1; + } else { + delete $self->{$fd} if $self->{$fd} & EPOLLONESHOT; push @$events, $fd; } - my $nevents = scalar @$events; - if ($n != $nevents) { - warn "BUG? poll() returned $n, but got $nevents"; - } + } + if ($nval && !@$events) { + $! = Errno::EBADF; + die "poll: $!"; } } +sub ep_del { delete($_[0]->{fileno($_[1])}); 0 } +sub ep_add { $_[0]->{fileno($_[1])} = $_[2]; 0 } + +no warnings 'once'; +*ep_mod = \&ep_add; + 1; diff --git a/lib/PublicInbox/DSdeflate.pm b/lib/PublicInbox/DSdeflate.pm index 639690e2..539adf0f 100644 --- a/lib/PublicInbox/DSdeflate.pm +++ b/lib/PublicInbox/DSdeflate.pm @@ -46,7 +46,7 @@ sub enable { my ($class, $self) = @_; my ($in, $err) = Compress::Raw::Zlib::Inflate->new(%IN_OPT); if ($err != Z_OK) { - $self->err("Inflate->new failed: $err"); + warn("Inflate->new failed: $err\n"); return; } bless $self, $class; diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm index 16bae231..28458b19 100644 --- a/lib/PublicInbox/Daemon.pm +++ b/lib/PublicInbox/Daemon.pm @@ -5,31 +5,30 @@ # and designed for handling thousands of untrusted clients over slow # and/or lossy connections. package PublicInbox::Daemon; -use strict; -use v5.10.1; +use v5.12; use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); use IO::Handle; # ->autoflush use IO::Socket; use File::Spec; -use POSIX qw(WNOHANG :signal_h); +use POSIX qw(WNOHANG :signal_h F_SETFD); use Socket qw(IPPROTO_TCP SOL_SOCKET); STDOUT->autoflush(1); STDERR->autoflush(1); -use PublicInbox::DS qw(now); +use PublicInbox::DS qw(now awaitpid); use PublicInbox::Listener; use PublicInbox::EOFpipe; -use PublicInbox::Sigfd; use PublicInbox::Git; use PublicInbox::GitAsyncCat; use PublicInbox::Eml; use PublicInbox::Config; +use PublicInbox::OnDestroy; +use PublicInbox::Search; +use PublicInbox::XapClient; our $SO_ACCEPTFILTER = 0x1000; my @CMD; -my ($set_user, $oldset); +my ($set_user, $oldset, $xh_workers); my (@cfg_listen, $stdout, $stderr, $group, $user, $pid_file, $daemonize); -my $worker_processes = 1; -my @listeners; -my (%pids, %logs); +my ($nworker, @listeners, %WORKERS, %logs); my %tls_opt; # scheme://sockname => args for IO::Socket::SSL::SSL_Context->new my $reexec_pid; my ($uid, $gid); @@ -40,6 +39,19 @@ my %SCHEME2PORT = map { $KNOWN_TLS{$_} => $_ + 0 } keys %KNOWN_TLS; for (keys %KNOWN_STARTTLS) { $SCHEME2PORT{$KNOWN_STARTTLS{$_}} = $_ + 0 } $SCHEME2PORT{http} = 80; +our ($parent_pipe, %POST_ACCEPT, %XNETD); +our %WORKER_SIG = ( + INT => \&worker_quit, + QUIT => \&worker_quit, + TERM => \&worker_quit, + TTIN => 'IGNORE', + TTOU => 'IGNORE', + USR1 => \&reopen_logs, + USR2 => 'IGNORE', + WINCH => 'IGNORE', + CHLD => \&PublicInbox::DS::enqueue_reap, +); + sub listener_opt ($) { my ($str) = @_; # opt1=val1,opt2=val2 (opt may repeat for multi-value) my $o = {}; @@ -134,13 +146,18 @@ sub load_mod ($;$$) { $tlsd->{$f} = $logs{$p} //= open_log_path(my $fh, $p); warn "# $scheme://$addr $f=$p\n"; } + # for per-listener $SIG{__WARN__}: my $err = $tlsd->{err}; - $tlsd->{warn_cb} = sub { print $err @_ }; # for local $SIG{__WARN__} + $tlsd->{warn_cb} = sub { + print $err @_ unless PublicInbox::Eml::warn_ignore(@_) + }; + $opt->{'multi-accept'} and + $xn{'multi-accept'} = $opt->{'multi-accept'}->[-1]; \%xn; } -sub daemon_prepare ($$) { - my ($default_listen, $xnetd) = @_; +sub daemon_prepare ($) { + my ($default_listen) = @_; my $listener_names = {}; # sockname => IO::Handle $oldset = PublicInbox::DS::block_signals(); @CMD = ($0, @ARGV); @@ -153,8 +170,9 @@ options: -l ADDRESS address to listen on$dh --cert=FILE default SSL/TLS certificate - --key=FILE default SSL/TLS certificate + --key=FILE default SSL/TLS certificate key -W WORKERS number of worker processes to spawn (default: 1) + -X XWORKERS number of Xapian helper processes (default: undefined) See public-inbox-daemon(8) and $prog(1) man pages for more. EOF @@ -162,13 +180,15 @@ EOF 'l|listen=s' => \@cfg_listen, '1|stdout=s' => \$stdout, '2|stderr=s' => \$stderr, - 'W|worker-processes=i' => \$worker_processes, + 'W|worker-processes=i' => \$nworker, 'P|pid-file=s' => \$pid_file, 'u|user=s' => \$user, 'g|group=s' => \$group, 'D|daemonize' => \$daemonize, + 'multi-accept=i' => \$PublicInbox::Listener::MULTI_ACCEPT, 'cert=s' => \$default_cert, 'key=s' => \$default_key, + 'X|xapian-helpers=i' => \$xh_workers, 'help|h' => \(my $show_help), ); GetOptions(%opt) or die $help; @@ -215,7 +235,7 @@ EOF die "$orig specified w/o cert=\n"; } if ($listener_names->{$l}) { # already inherited - $xnetd->{$l} = load_mod($scheme, $opt, $l); + $XNETD{$l} = load_mod($scheme, $opt, $l); next; } my (%o, $sock_pkg); @@ -251,7 +271,7 @@ EOF $s->blocking(0); my $sockname = sockname($s); warn "# bound $scheme://$sockname\n"; - $xnetd->{$sockname} //= load_mod($scheme); + $XNETD{$sockname} //= load_mod($scheme, $opt); $listener_names->{$sockname} = $s; push @listeners, $s; } @@ -265,10 +285,10 @@ EOF for my $x (@inherited_names) { $x =~ /:([0-9]+)\z/ or next; # no TLS for AF_UNIX if (my $scheme = $KNOWN_TLS{$1}) { - $xnetd->{$x} //= load_mod($scheme); + $XNETD{$x} //= load_mod($scheme); $tls_opt{"$scheme://$x"} ||= accept_tls_opt(''); } elsif (($scheme = $KNOWN_STARTTLS{$1})) { - $xnetd->{$x} //= load_mod($scheme); + $XNETD{$x} //= load_mod($scheme); $tls_opt{"$scheme://$x"} ||= accept_tls_opt(''); } elsif (defined $stls) { $tls_opt{"$stls://$x"} ||= accept_tls_opt(''); @@ -277,7 +297,7 @@ EOF } if (defined $default_scheme) { for my $x (@inherited_names) { - $xnetd->{$x} //= load_mod($default_scheme); + $XNETD{$x} //= load_mod($default_scheme); } } die "No listeners bound\n" unless @listeners; @@ -323,22 +343,38 @@ EOF }; if ($daemonize) { - my $pid = fork // die "fork: $!"; + my $pid = PublicInbox::OnDestroy::fork_tmp; exit if $pid; - open(STDIN, '+<', '/dev/null') or die "redirect stdin failed: $!\n"; open STDOUT, '>&STDIN' or die "redirect stdout failed: $!\n"; open STDERR, '>&STDIN' or die "redirect stderr failed: $!\n"; POSIX::setsid(); - $pid = fork // die "fork: $!"; + $pid = PublicInbox::OnDestroy::fork_tmp; exit if $pid; } return unless defined $pid_file; write_pid($pid_file); - # for ->DESTROY: - bless { pid => $$, pid_file => \$pid_file }, __PACKAGE__; + on_destroy \&unlink_pid_file_safe_ish, \$pid_file; +} + +sub has_busy_clients { # post_loop_do CB + my ($state) = @_; + my $now = now(); + my $n = PublicInbox::DS::close_non_busy(); + if ($n) { + if ($state->{-w} < now()) { + warn "$$ quitting, $n client(s) left\n"; + $state->{-w} = now() + 5; + } + unless (defined $state->{0}) { + $state->{0} = (split(/\s+/, $0))[0]; + $state->{0} =~ s!\A.*?([^/]+)\z!$1!; + } + $0 = "$state->{0} quitting, $n client(s) left"; + } + $n; # true: loop continues, false: loop breaks } sub worker_quit { # $_[0] = signal name or number (unused) @@ -347,40 +383,35 @@ sub worker_quit { # $_[0] = signal name or number (unused) $_->close foreach @listeners; # call PublicInbox::DS::close @listeners = (); - my $proc_name; - my $warn = 0; + # drop idle connections and try to quit gracefully - PublicInbox::DS->SetPostLoopCallback(sub { - my ($dmap, undef) = @_; - my $n = 0; - my $now = now(); - for my $s (values %$dmap) { - $s->can('busy') or next; - if ($s->busy) { - ++$n; - } else { # close as much as possible, early as possible - $s->close; - } - } - if ($n) { - if (($warn + 5) < now()) { - warn "$$ quitting, $n client(s) left\n"; - $warn = now(); - } - unless (defined $proc_name) { - $proc_name = (split(/\s+/, $0))[0]; - $proc_name =~ s!\A.*?([^/]+)\z!$1!; - } - $0 = "$proc_name quitting, $n client(s) left"; - } - $n; # true: loop continues, false: loop breaks - }); + @PublicInbox::DS::post_loop_do = (\&has_busy_clients, { -w => 0 }) +} + +sub spawn_xh () { + $xh_workers // return; + require PublicInbox::XhcMset; + local $) = $gid if defined $gid; + local $( = $gid if defined $gid; + local $> = $uid if defined $uid; + local $< = $uid if defined $uid; + $PublicInbox::Search::XHC = eval { + local $ENV{STDERR_PATH} = $stderr; + local $ENV{STDOUT_PATH} = $stdout; + PublicInbox::XapClient::start_helper('-j', $xh_workers) + }; + warn "E: $@" if $@; + awaitpid($PublicInbox::Search::XHC->{io}->attached_pid, \&respawn_xh) + if $PublicInbox::Search::XHC; } sub reopen_logs { + my ($sig) = @_; $logs{$stdout} //= \*STDOUT if defined $stdout; $logs{$stderr} //= \*STDERR if defined $stderr; while (my ($p, $fh) = each %logs) { open_log_path($fh, $p) } + ($sig && defined($xh_workers) && $PublicInbox::Search::XHC) and + kill('USR1', $PublicInbox::Search::XHC->{io}->attached_pid); } sub sockname ($) { @@ -468,168 +499,139 @@ sub upgrade { # $_[0] = signal name or number (unused) warn "BUG: .oldbin suffix exists: $pid_file\n"; return; } - unlink_pid_file_safe_ish($$, $pid_file); + unlink_pid_file_safe_ish(\$pid_file); $pid_file .= '.oldbin'; write_pid($pid_file); } - my $pid = fork; - unless (defined $pid) { - warn "fork failed: $!\n"; - return; - } - if ($pid == 0) { - use Fcntl qw(FD_CLOEXEC F_SETFD F_GETFD); + my $pid = eval { PublicInbox::OnDestroy::fork_tmp }; + if (!defined($pid)) { + warn "fork failed: $! $@\n"; + } elsif ($pid == 0) { $ENV{LISTEN_FDS} = scalar @listeners; $ENV{LISTEN_PID} = $$; foreach my $s (@listeners) { # @listeners are globs with workers, PI::L w/o workers $s = $s->{sock} if ref($s) eq 'PublicInbox::Listener'; - - my $fl = fcntl($s, F_GETFD, 0); - fcntl($s, F_SETFD, $fl &= ~FD_CLOEXEC); + fcntl($s, F_SETFD, 0) // die "F_SETFD: $!"; } exec @CMD; die "Failed to exec: $!\n"; + } else { + awaitpid($pid, \&upgrade_aborted); + $reexec_pid = $pid; } - $reexec_pid = $pid; } -sub kill_workers ($) { - my ($sig) = @_; - kill $sig, keys(%pids); -} +sub kill_workers ($) { kill $_[0], values(%WORKERS) } -sub upgrade_aborted ($) { - my ($p) = @_; - warn "reexec PID($p) died with: $?\n"; +sub upgrade_aborted { + my ($pid) = @_; + warn "reexec PID($pid) died with: $?\n"; $reexec_pid = undef; return unless $pid_file; my $file = $pid_file; $file =~ s/\.oldbin\z// or die "BUG: no '.oldbin' suffix in $file"; - unlink_pid_file_safe_ish($$, $pid_file); + unlink_pid_file_safe_ish(\$pid_file); $pid_file = $file; eval { write_pid($pid_file) }; warn $@, "\n" if $@; } -sub reap_children { # $_[0] = 'CHLD' or POSIX::SIGCHLD() - while (1) { - my $p = waitpid(-1, WNOHANG) or return; - if (defined $reexec_pid && $p == $reexec_pid) { - upgrade_aborted($p); - } elsif (defined(my $id = delete $pids{$p})) { - warn "worker[$id] PID($p) died with: $?\n"; - } elsif ($p > 0) { - warn "unknown PID($p) reaped: $?\n"; - } else { - return; - } - } -} - -sub unlink_pid_file_safe_ish ($$) { - my ($unlink_pid, $file) = @_; - return unless defined $unlink_pid && $unlink_pid == $$; +sub unlink_pid_file_safe_ish ($) { + my ($fref) = @_; - open my $fh, '<', $file or return; + open my $fh, '<', $$fref or return; local $/ = "\n"; defined(my $read_pid = <$fh>) or return; chomp $read_pid; - if ($read_pid == $unlink_pid) { - Net::Server::Daemonize::unlink_pid_file($file); - } + Net::Server::Daemonize::unlink_pid_file($$fref) if $read_pid == $$; } sub master_quit ($) { exit unless @listeners; @listeners = (); - kill_workers($_[0]); + exit unless kill_workers($_[0]); +} + +sub reap_worker { # awaitpid CB + my ($pid, $nr) = @_; + warn "worker[$nr] died \$?=$?\n" if $?; + delete $WORKERS{$nr}; + exit if !@listeners && !keys(%WORKERS); + PublicInbox::DS::requeue(\&start_workers); +} + +sub start_worker ($) { + my ($nr) = @_; + return unless @listeners; + my $pid = PublicInbox::DS::fork_persist; + if ($pid == 0) { + undef %WORKERS; + undef $xh_workers; + local $PublicInbox::DS::Poller; # allow epoll/kqueue + $set_user->() if $set_user; + PublicInbox::EOFpipe->new($parent_pipe, \&worker_quit); + worker_loop(); + exit 0; + } else { + $WORKERS{$nr} = $pid; + awaitpid($pid, \&reap_worker, $nr); + } +} + +sub start_workers { + my @idx = grep { !defined($WORKERS{$_}) } (0..($nworker - 1)) or return; + eval { start_worker($_) for @idx }; + warn "E: $@\n" if $@; +} + +sub trim_workers { + my @nr = grep { $_ >= $nworker } keys %WORKERS; + kill('TERM', @WORKERS{@nr}); } sub master_loop { - pipe(my ($p0, $p1)) or die "failed to create parent-pipe: $!"; - my $set_workers = $worker_processes; + local $parent_pipe; + pipe($parent_pipe, my $p1) or die "failed to create parent-pipe: $!"; + my $set_workers = $nworker; # for SIGWINCH reopen_logs(); - my $ignore_winch; - my $sig = { - USR1 => sub { reopen_logs(); kill_workers($_[0]); }, + spawn_xh; + my $msig = { + USR1 => sub { reopen_logs($_[0]); kill_workers($_[0]); }, USR2 => \&upgrade, QUIT => \&master_quit, INT => \&master_quit, TERM => \&master_quit, WINCH => sub { - return if $ignore_winch || !@listeners; - if (-t STDIN || -t STDOUT || -t STDERR) { - $ignore_winch = 1; - warn <<EOF; -ignoring SIGWINCH since we are not daemonized -EOF - } else { - $worker_processes = 0; - } + $nworker = 0; + trim_workers(); }, HUP => sub { - return unless @listeners; - $worker_processes = $set_workers; + $nworker = $set_workers; # undo WINCH kill_workers($_[0]); + PublicInbox::DS::requeue(\&start_workers) }, TTIN => sub { - return unless @listeners; - if ($set_workers > $worker_processes) { - ++$worker_processes; + if ($set_workers > $nworker) { + ++$nworker; } else { - $worker_processes = ++$set_workers; + $nworker = ++$set_workers; } + PublicInbox::DS::requeue(\&start_workers); }, TTOU => sub { - $worker_processes = --$set_workers if $set_workers > 0; + return if $nworker <= 0; + --$nworker; + trim_workers(); }, - CHLD => \&reap_children, + CHLD => \&PublicInbox::DS::enqueue_reap, }; - my $sigfd = PublicInbox::Sigfd->new($sig); - local @SIG{keys %$sig} = values(%$sig) unless $sigfd; - PublicInbox::DS::sig_setmask($oldset) if !$sigfd; - while (1) { # main loop - my $n = scalar keys %pids; - unless (@listeners) { - exit if $n == 0; - $set_workers = $worker_processes = $n = 0; - } - - if ($n > $worker_processes) { - while (my ($k, $v) = each %pids) { - kill('TERM', $k) if $v >= $worker_processes; - } - $n = $worker_processes; - } - my $want = $worker_processes - 1; - if ($n <= $want) { - PublicInbox::DS::block_signals() if !$sigfd; - for my $i ($n..$want) { - my $seed = rand(0xffffffff); - my $pid = fork; - if (!defined $pid) { - warn "failed to fork worker[$i]: $!\n"; - } elsif ($pid == 0) { - srand($seed); - eval { Net::SSLeay::randomize() }; - $set_user->() if $set_user; - return $p0; # run normal work code - } else { - warn "PID=$pid is worker[$i]\n"; - $pids{$pid} = $i; - } - } - PublicInbox::DS::sig_setmask($oldset) if !$sigfd; - } - - if ($sigfd) { # Linux and IO::KQueue users: - $sigfd->wait_once; - } else { # wake up every second - sleep(1); - } - } + $msig->{WINCH} = sub { + warn "ignoring SIGWINCH since we are not daemonized\n"; + } if -t STDIN || -t STDOUT || -t STDERR; + start_workers(); + PublicInbox::DS::event_loop($msig, $oldset); exit # never gets here, just for documentation } @@ -651,88 +653,91 @@ sub defer_accept ($$) { my $sec = unpack('i', $x); return if $sec > 0; # systemd users may set a higher value setsockopt($s, IPPROTO_TCP, $TCP_DEFER_ACCEPT, 1); - } elsif ($^O eq 'freebsd') { + } elsif ($^O =~ /\A(?:freebsd|netbsd|dragonfly)\z/) { my $x = getsockopt($s, SOL_SOCKET, $SO_ACCEPTFILTER); - return if defined $x; # don't change if set + return if ($x // "\0") =~ /[^\0]/s; # don't change if set my $accf_arg = pack('a16a240', $af_name, ''); setsockopt($s, SOL_SOCKET, $SO_ACCEPTFILTER, $accf_arg); } } -sub daemon_loop ($) { - my ($xnetd) = @_; +sub daemon_loop () { local $PublicInbox::Config::DEDUPE = {}; # enable dedupe cache - my $refresh = sub { + my $refresh = $WORKER_SIG{HUP} = sub { my ($sig) = @_; %$PublicInbox::Config::DEDUPE = (); # clear cache - for my $xn (values %$xnetd) { + for my $xn (values %XNETD) { delete $xn->{tlsd}->{ssl_ctx}; # PublicInbox::TLS::start eval { $xn->{refresh}->($sig) }; warn "refresh $@\n" if $@; } }; - my %post_accept; while (my ($k, $ctx_opt) = each %tls_opt) { $ctx_opt // next; my ($scheme, $l) = split(m!://!, $k, 2); - my $xn = $xnetd->{$l} // die "BUG: no xnetd for $k"; + my $xn = $XNETD{$l} // die "BUG: no xnetd for $k"; $xn->{tlsd}->{ssl_ctx_opt} //= $ctx_opt; $scheme =~ m!\A(?:https|imaps|nntps|pop3s)! and - $post_accept{$l} = tls_cb(@$xn{qw(post_accept tlsd)}); + $POST_ACCEPT{$l} = tls_cb(@$xn{qw(post_accept tlsd)}); } undef %tls_opt; - my $sig = { - HUP => $refresh, - INT => \&worker_quit, - QUIT => \&worker_quit, - TERM => \&worker_quit, - TTIN => 'IGNORE', - TTOU => 'IGNORE', - USR1 => \&reopen_logs, - USR2 => 'IGNORE', - WINCH => 'IGNORE', - CHLD => \&PublicInbox::DS::enqueue_reap, - }; - if ($worker_processes > 0) { + if ($nworker > 0) { $refresh->(); # preload by default - my $fh = master_loop(); # returns if in child process - PublicInbox::EOFpipe->new($fh, \&worker_quit, undef); + return master_loop(); } else { reopen_logs(); $set_user->() if $set_user; - $sig->{USR2} = sub { worker_quit() if upgrade() }; + $WORKER_SIG{USR2} = sub { worker_quit() if upgrade() }; $refresh->(); } + local $PublicInbox::DS::Poller; # allow epoll/kqueue + worker_loop(); +} + +sub worker_loop { $uid = $gid = undef; reopen_logs(); + spawn_xh; # only for -W0 @listeners = map {; my $l = sockname($_); - my $tls_cb = $post_accept{$l}; - my $xn = $xnetd->{$l} // die "BUG: no xnetd for $l"; + my $tls_cb = $POST_ACCEPT{$l}; + my $xn = $XNETD{$l} // die "BUG: no xnetd for $l"; # NNTPS, HTTPS, HTTP, IMAPS and POP3S are client-first traffic # IMAP, NNTP and POP3 are server-first defer_accept($_, $tls_cb ? 'dataready' : $xn->{af_default}); # this calls epoll_create: - PublicInbox::Listener->new($_, $tls_cb || $xn->{post_accept}) + PublicInbox::Listener->new($_, $tls_cb || $xn->{post_accept}, + $xn->{'multi-accept'}) } @listeners; - PublicInbox::DS::event_loop($sig, $oldset); + PublicInbox::DS::event_loop(\%WORKER_SIG, $oldset); +} + +sub respawn_xh { # awaitpid cb + my ($pid) = @_; + return unless @listeners; + warn "W: xap_helper PID:$pid died: \$?=$?, respawning...\n"; + spawn_xh; } sub run { my ($default_listen) = @_; - daemon_prepare($default_listen, my $xnetd = {}); - my $for_destroy = daemonize(); + $nworker = 1; + local (%XNETD, %POST_ACCEPT); + daemon_prepare($default_listen); + my $unlink_on_leave = daemonize(); # localize GCF2C for tests: local $PublicInbox::GitAsyncCat::GCF2C; local $PublicInbox::Git::async_warn = 1; local $SIG{__WARN__} = PublicInbox::Eml::warn_ignore_cb(); + local %WORKER_SIG = %WORKER_SIG; + local $PublicInbox::XapClient::tries = 0; + local $PublicInbox::Search::XHC if defined($xh_workers); - daemon_loop($xnetd); - PublicInbox::DS->Reset; - # ->DESTROY runs when $for_destroy goes out-of-scope + daemon_loop(); + # $unlink_on_leave runs } sub write_pid ($) { @@ -741,8 +746,4 @@ sub write_pid ($) { do_chown($path); } -sub DESTROY { - unlink_pid_file_safe_ish($_[0]->{pid}, ${$_[0]->{pid_file}}); -} - 1; diff --git a/lib/PublicInbox/DirIdle.pm b/lib/PublicInbox/DirIdle.pm index 9206da9c..230df166 100644 --- a/lib/PublicInbox/DirIdle.pm +++ b/lib/PublicInbox/DirIdle.pm @@ -1,22 +1,22 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Used by public-inbox-watch for Maildir (and possibly MH in the future) package PublicInbox::DirIdle; -use strict; +use v5.12; use parent 'PublicInbox::DS'; use PublicInbox::Syscall qw(EPOLLIN); use PublicInbox::In2Tie; my ($MAIL_IN, $MAIL_GONE, $ino_cls); -if ($^O eq 'linux' && eval { require Linux::Inotify2; 1 }) { - $MAIL_IN = Linux::Inotify2::IN_MOVED_TO() | - Linux::Inotify2::IN_CREATE(); - $MAIL_GONE = Linux::Inotify2::IN_DELETE() | - Linux::Inotify2::IN_DELETE_SELF() | - Linux::Inotify2::IN_MOVE_SELF() | - Linux::Inotify2::IN_MOVED_FROM(); - $ino_cls = 'Linux::Inotify2'; +if ($^O eq 'linux' && eval { require PublicInbox::Inotify; 1 }) { + $MAIL_IN = PublicInbox::Inotify::IN_MOVED_TO() | + PublicInbox::Inotify::IN_CREATE(); + $MAIL_GONE = PublicInbox::Inotify::IN_DELETE() | + PublicInbox::Inotify::IN_DELETE_SELF() | + PublicInbox::Inotify::IN_MOVE_SELF() | + PublicInbox::Inotify::IN_MOVED_FROM(); + $ino_cls = 'PublicInbox::Inotify'; # Perl 5.22+ is needed for fileno(DIRHANDLE) support: } elsif ($^V ge v5.22 && eval { require PublicInbox::KQNotify }) { $MAIL_IN = PublicInbox::KQNotify::MOVED_TO_OR_CREATE(); @@ -68,12 +68,18 @@ sub rm_watches { } } +sub close { + my ($self) = @_; + delete $self->{cb}; + $self->SUPER::close; # if using real kevent/inotify +} + sub event_step { my ($self) = @_; - my $cb = $self->{cb}; - local $PublicInbox::DS::in_loop = 0; # waitpid() synchronously + my $cb = $self->{cb} or return; + local $PublicInbox::DS::in_loop = 0; # waitpid() synchronously (FIXME) eval { - my @events = $self->{inot}->read; # Linux::Inotify2->read + my @events = $self->{inot}->read; # Inotify3->read $cb->($_) for @events; }; warn "$self->{inot}->read err: $@\n" if $@; @@ -82,8 +88,8 @@ sub event_step { sub force_close { my ($self) = @_; my $inot = delete $self->{inot} // return; - if ($inot->can('fh')) { # Linux::Inotify2 2.3+ - close($inot->fh) or warn "CLOSE ERROR: $!"; + if ($inot->can('fh')) { # Inotify3 or Linux::Inotify2 2.3+ + $inot->fh->close or warn "CLOSE ERROR: $!"; } elsif ($inot->isa('Linux::Inotify2')) { require PublicInbox::LI2Wrap; PublicInbox::LI2Wrap::wrapclose($inot); diff --git a/lib/PublicInbox/EOFpipe.pm b/lib/PublicInbox/EOFpipe.pm index e537e2aa..77b699a2 100644 --- a/lib/PublicInbox/EOFpipe.pm +++ b/lib/PublicInbox/EOFpipe.pm @@ -1,23 +1,24 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::EOFpipe; -use strict; +use v5.12; use parent qw(PublicInbox::DS); -use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); +use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT $F_SETPIPE_SZ); sub new { - my (undef, $rd, $cb, $arg) = @_; - my $self = bless { cb => $cb, arg => $arg }, __PACKAGE__; - # 1031: F_SETPIPE_SZ, 4096: page size - fcntl($rd, 1031, 4096) if $^O eq 'linux'; + my (undef, $rd, @cb_args) = @_; + my $self = bless { cb_args => \@cb_args }, __PACKAGE__; + # 4096: page size + fcntl($rd, $F_SETPIPE_SZ, 4096) if $F_SETPIPE_SZ; $self->SUPER::new($rd, EPOLLIN|EPOLLONESHOT); } sub event_step { my ($self) = @_; if ($self->do_read(my $buf, 1) == 0) { # auto-closed - $self->{cb}->($self->{arg}); + my ($cb, @args) = @{delete $self->{cb_args}}; + $cb->(@args); } } diff --git a/lib/PublicInbox/Emergency.pm b/lib/PublicInbox/Emergency.pm index 5a1ed1d7..968d7d6f 100644 --- a/lib/PublicInbox/Emergency.pm +++ b/lib/PublicInbox/Emergency.pm @@ -1,32 +1,24 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Emergency Maildir delivery for MDA package PublicInbox::Emergency; -use strict; -use v5.10.1; +use v5.12; use Fcntl qw(:DEFAULT SEEK_SET); use Sys::Hostname qw(hostname); use IO::Handle; # ->flush use Errno qw(EEXIST); +use File::Path (); sub new { my ($class, $dir) = @_; - - foreach (qw(new tmp cur)) { - my $d = "$dir/$_"; - next if -d $d; - require File::Path; - if (!File::Path::mkpath($d) && !-d $d) { - die "failed to mkpath($d): $!\n"; - } - } + File::Path::make_path(map { $dir.$_ } qw(/tmp /new /cur)); bless { dir => $dir, t => 0 }, $class; } sub _fn_in { my ($self, $pid, $dir) = @_; - my $host = $self->{short_host} //= (split(/\./, hostname))[0]; + my $host = $self->{-host} //= (split(/\./, hostname))[0] // 'localhost'; my $now = time; my $n; if ($self->{t} != $now) { @@ -42,14 +34,14 @@ sub prepare { my ($self, $strref) = @_; my $pid = $$; my $tmp_key = "tmp.$pid"; - die "already in transaction: $self->{$tmp_key}" if $self->{$tmp_key}; + die "BUG: in transaction: $self->{$tmp_key}" if $self->{$tmp_key}; my ($tmp, $fh); do { $tmp = _fn_in($self, $pid, 'tmp'); $! = undef; } while (!sysopen($fh, $tmp, O_CREAT|O_EXCL|O_RDWR) and $! == EEXIST); - print $fh $$strref or die "write failed: $!"; - $fh->flush or die "flush failed: $!"; + print $fh $$strref or die "print: $!"; + $fh->flush or die "flush: $!"; $self->{fh} = $fh; $self->{$tmp_key} = $tmp; } @@ -58,15 +50,15 @@ sub abort { my ($self) = @_; delete $self->{fh}; my $tmp = delete $self->{"tmp.$$"} or return; - unlink($tmp) or warn "Failed to unlink $tmp: $!"; + unlink($tmp) or warn "W: unlink($tmp): $!"; undef; } sub fh { my ($self) = @_; - my $fh = $self->{fh} or die "{fh} not open!\n"; - seek($fh, 0, SEEK_SET) or die "seek(fh) failed: $!"; - sysseek($fh, 0, SEEK_SET) or die "sysseek(fh) failed: $!"; + my $fh = $self->{fh} or die "BUG: {fh} not open"; + seek($fh, 0, SEEK_SET) or die "seek: $!"; + sysseek($fh, 0, SEEK_SET) or die "sysseek: $!"; $fh; } @@ -80,7 +72,7 @@ sub commit { $new = _fn_in($self, $pid, 'new'); } while (!($ok = link($tmp, $new)) && $! == EEXIST); die "link($tmp, $new): $!" unless $ok; - unlink($tmp) or warn "Failed to unlink $tmp: $!"; + unlink($tmp) or warn "W: unlink($tmp): $!"; } sub DESTROY { commit($_[0]) } diff --git a/lib/PublicInbox/Eml.pm b/lib/PublicInbox/Eml.pm index 485f637a..d59d7c3f 100644 --- a/lib/PublicInbox/Eml.pm +++ b/lib/PublicInbox/Eml.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Lazy MIME parser, it still slurps the full message but keeps short @@ -144,6 +144,7 @@ sub header_raw { my $re = re_memo($_[1]); my @v = (${ $_[0]->{hdr} } =~ /$re/g); for (@v) { + utf8::decode($_); # SMTPUTF8 # for compatibility w/ Email::Simple::Header, s/\s+\z//s; s/\A\s+//s; @@ -333,6 +334,11 @@ sub body_set { undef; } +# workaround https://rt.cpan.org/Public/Bug/Display.html?id=139622 +# Encode 2.87..3.12 leaks on croak, so we defer and croak ourselves +our @enc_warn; +my $enc_warn = sub { push @enc_warn, @_ }; + sub body_str_set { my ($self, $str) = @_; my $cs = ct($self)->{attributes}->{charset} // @@ -340,10 +346,10 @@ sub body_str_set { my $enc = find_encoding($cs) // croak "unknown encoding `$cs'"; my $tmp; { - my @w; - local $SIG{__WARN__} = sub { push @w, @_ }; + local @enc_warn; + local $SIG{__WARN__} = $enc_warn; $tmp = $enc->encode($str, Encode::FB_WARN); - croak(@w) if @w; + croak(@enc_warn) if @enc_warn; }; body_set($self, \$tmp); } @@ -359,14 +365,15 @@ sub header_set { $pfx .= ': '; my $len = 78 - length($pfx); @vals = map {; + utf8::encode(my $v = $_); # to bytes, support SMTPUTF8 # folding differs from Email::Simple::Header, # we favor tabs for visibility (and space savings :P) if (length($_) >= $len && (/\n[^ \t]/s || !/\n/s)) { local $Text::Wrap::columns = $len; local $Text::Wrap::huge = 'overflow'; - $pfx . wrap('', "\t", $_) . $self->{crlf}; + $pfx . wrap('', "\t", $v) . $self->{crlf}; } else { - $pfx . $_ . $self->{crlf}; + $pfx . $v . $self->{crlf}; } } @vals; $$hdr =~ s!$re!shift(@vals) // ''!ge; # replace current headers, first @@ -468,12 +475,11 @@ sub body_str { join("\n\t", header_raw($self, 'Content-Type'))); }; my $enc = find_encoding($cs) or croak "unknown encoding `$cs'"; - my $tmp = body($self); - # workaround https://rt.cpan.org/Public/Bug/Display.html?id=139622 - my @w; - local $SIG{__WARN__} = sub { push @w, @_ }; - my $ret = $enc->decode($tmp, Encode::FB_WARN); - croak(@w) if @w; + my $ret = body($self); + local @enc_warn; + local $SIG{__WARN__} = $enc_warn; + $ret = $enc->decode($ret, Encode::FB_WARN); + croak(@enc_warn) if @enc_warn; $ret; } @@ -526,4 +532,10 @@ sub willneed { re_memo($_) for @_ } willneed(qw(From To Cc Date Subject Content-Type In-Reply-To References Message-ID X-Alt-Message-ID)); +# This fixes an old bug from import (pre-a0c07cba0e5d8b6a) +# mutt also pipes single RFC822 messages with a "From " line, +# but no Content-Length or "From " escaping. +# "git format-patch" also generates such files by default. +sub strip_from { $_[0] =~ s/\A[\r\n]*From [^\n]*\n//s } + 1; diff --git a/lib/PublicInbox/Epoll.pm b/lib/PublicInbox/Epoll.pm new file mode 100644 index 00000000..7e0aa6e7 --- /dev/null +++ b/lib/PublicInbox/Epoll.pm @@ -0,0 +1,26 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# OO API for epoll +package PublicInbox::Epoll; +use v5.12; +use PublicInbox::Syscall qw(epoll_create epoll_ctl epoll_wait + EPOLL_CTL_ADD EPOLL_CTL_MOD EPOLL_CTL_DEL); +use Fcntl qw(F_SETFD FD_CLOEXEC); +use autodie qw(open fcntl); + +sub new { + open(my $fh, '+<&=', epoll_create()); + fcntl($fh, F_SETFD, FD_CLOEXEC); + bless \$fh, __PACKAGE__; +} + +sub ep_add { epoll_ctl(fileno(${$_[0]}), EPOLL_CTL_ADD, fileno($_[1]), $_[2]) } +sub ep_mod { epoll_ctl(fileno(${$_[0]}), EPOLL_CTL_MOD, fileno($_[1]), $_[2]) } +sub ep_del { epoll_ctl(fileno(${$_[0]}), EPOLL_CTL_DEL, fileno($_[1]), 0) } + +# n.b. maxevents=1000 is the historical default. maxevents=1 (yes, one) +# is more fair under load with multiple worker processes sharing one listener +sub ep_wait { epoll_wait(fileno(${$_[0]}), 1000, @_[1, 2]) } + +1; diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm index 72cae005..95feb885 100644 --- a/lib/PublicInbox/ExtMsg.pm +++ b/lib/PublicInbox/ExtMsg.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Used by the web interface to link to messages outside of the our @@ -11,7 +11,7 @@ use warnings; use PublicInbox::Hval qw(ascii_html prurl mid_href); use PublicInbox::WwwStream qw(html_oneshot); use PublicInbox::Smsg; -our $MIN_PARTIAL_LEN = 16; +our $MIN_PARTIAL_LEN = 14; # for 'XXXXXXXXXX.fsf' msgids gnus generates # TODO: user-configurable our @EXT_URL = map { ascii_html($_) } ( diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm index 2460d74f..d43c23e6 100644 --- a/lib/PublicInbox/ExtSearch.pm +++ b/lib/PublicInbox/ExtSearch.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Read-only external (detached) index for cross inbox search. @@ -33,9 +33,11 @@ sub misc { # same as per-inbox ->over, for now... sub over { my ($self) = @_; - $self->{over} //= do { + $self->{over} // eval { PublicInbox::Inbox::_cleanup_later($self); - PublicInbox::Over->new("$self->{xpfx}/over.sqlite3"); + my $over = PublicInbox::Over->new("$self->{xpfx}/over.sqlite3"); + $over->dbh; # may die + $self->{over} = $over; }; } @@ -108,7 +110,7 @@ sub altid_map { {} } sub description { my ($self) = @_; ($self->{description} //= - PublicInbox::Inbox::cat_desc("$self->{topdir}/description")) // + PublicInbox::Git::cat_desc("$self->{topdir}/description")) // '$EXTINDEX_DIR/description missing'; } @@ -117,13 +119,14 @@ sub search { $_[0]; } +sub thing_type { 'external index' } + no warnings 'once'; *base_url = \&PublicInbox::Inbox::base_url; *smsg_eml = \&PublicInbox::Inbox::smsg_eml; *smsg_by_mid = \&PublicInbox::Inbox::smsg_by_mid; *msg_by_mid = \&PublicInbox::Inbox::msg_by_mid; *modified = \&PublicInbox::Inbox::modified; -*recent = \&PublicInbox::Inbox::recent; *max_git_epoch = *nntp_usable = *msg_by_path = \&mm; # undef *isrch = \&search; diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 7c44a1a4..774fa47b 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -16,13 +16,13 @@ package PublicInbox::ExtSearchIdx; use strict; use v5.10.1; -use parent qw(PublicInbox::ExtSearch PublicInbox::Lock); +use parent qw(PublicInbox::ExtSearch PublicInbox::Umask PublicInbox::Lock); use Carp qw(croak carp); use Scalar::Util qw(blessed); use Sys::Hostname qw(hostname); -use POSIX qw(strftime); use File::Glob qw(bsd_glob GLOB_NOSORT); use PublicInbox::MultiGit; +use PublicInbox::Spawn (); use PublicInbox::Search; use PublicInbox::SearchIdx qw(prepare_stack is_ancestor is_bad_blob); use PublicInbox::OverIdx; @@ -34,6 +34,7 @@ use PublicInbox::ContentHash qw(content_hash); use PublicInbox::Eml; use PublicInbox::DS qw(now add_timer); use DBI qw(:sql_types); # SQL_BLOB +use PublicInbox::Admin qw(fmt_localtime); sub new { my (undef, $dir, $opt) = @_; @@ -113,11 +114,30 @@ sub check_batch_limit ($) { ${$req->{need_checkpoint}} = 1 if $n >= $self->{batch_bytes}; } +sub bad_ibx_id ($$;$) { + my ($self, $ibx_id, $cb) = @_; + my $msg = "E: bad/stale ibx_id=#$ibx_id encountered"; + my $ekey = $self->{oidx}->dbh->selectrow_array(<<EOM, undef, $ibx_id); +SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1 +EOM + $msg .= " (formerly `$ekey')" if defined $ekey; + $cb //= \&carp; + $cb->($msg, "\nE: running $0 --gc may be required"); +} + +sub check_xr3 ($$$) { + my ($self, $id2pos, $xr3) = @_; + @$xr3 = grep { + defined($id2pos->{$_->[0]}) ? 1 : bad_ibx_id($self, $_->[0]) + } @$xr3; +} + sub apply_boost ($$) { my ($req, $smsg) = @_; my $id2pos = $req->{id2pos}; # index in ibx_sorted my $xr3 = $req->{self}->{oidx}->get_xref3($smsg->{num}, 1); - @$xr3 = sort { + check_xr3($req->{self}, $id2pos, $xr3); + @$xr3 = sort { # sort ascending $id2pos->{$a->[0]} <=> $id2pos->{$b->[0]} || $a->[1] <=> $b->[1] # break ties with {xnum} @@ -406,14 +426,14 @@ EOM while (my ($ibx_id, $eidx_key) = $ibx_ck->fetchrow_array) { next if $self->{ibx_map}->{$eidx_key}; $self->{midx}->remove_eidx_key($eidx_key); - warn "I: deleting messages for $eidx_key...\n"; + warn "# deleting messages for $eidx_key...\n"; $x3_doc->execute($ibx_id); my $ibx = { -ibx_id => $ibx_id, -gc_eidx_key => $eidx_key }; while (my ($docid, $xnum, $oid) = $x3_doc->fetchrow_array) { my $r = _unref_doc($sync, $docid, $ibx, $xnum, $oid); $oid = unpack('H*', $oid); $r = $r ? 'unref' : 'remove'; - warn "I: $r #$docid $eidx_key $oid\n"; + warn "# $r #$docid $eidx_key $oid\n"; if (checkpoint_due($sync)) { $x3_doc = $ibx_ck = undef; reindex_checkpoint($self, $sync); @@ -433,12 +453,12 @@ SELECT key FROM eidx_meta WHERE key LIKE ? ESCAPE ? $lc_i->execute("lc-%:$pat//%", '\\'); while (my ($key) = $lc_i->fetchrow_array) { next if $key !~ m!\Alc-v[1-9]+:\Q$eidx_key\E//!; - warn "I: removing $key\n"; + warn "# removing $key\n"; $self->{oidx}->dbh->do(<<'', undef, $key); DELETE FROM eidx_meta WHERE key = ? } - warn "I: $eidx_key removed\n"; + warn "# $eidx_key removed\n"; } } @@ -447,20 +467,20 @@ sub eidx_gc_scan_shards ($$) { # TODO: use for lei/store my $nr = $self->{oidx}->dbh->do(<<''); DELETE FROM xref3 WHERE docid NOT IN (SELECT num FROM over) - warn "I: eliminated $nr stale xref3 entries\n" if $nr != 0; + warn "# eliminated $nr stale xref3 entries\n" if $nr != 0; reindex_checkpoint($self, $sync) if checkpoint_due($sync); # fixup from old bugs: $nr = $self->{oidx}->dbh->do(<<''); DELETE FROM over WHERE num > 0 AND num NOT IN (SELECT docid FROM xref3) - warn "I: eliminated $nr stale over entries\n" if $nr != 0; + warn "# eliminated $nr stale over entries\n" if $nr != 0; reindex_checkpoint($self, $sync) if checkpoint_due($sync); $nr = $self->{oidx}->dbh->do(<<''); DELETE FROM eidxq WHERE docid NOT IN (SELECT num FROM over) - warn "I: eliminated $nr stale reindex queue entries\n" if $nr != 0; + warn "# eliminated $nr stale reindex queue entries\n" if $nr != 0; reindex_checkpoint($self, $sync) if checkpoint_due($sync); my ($cur) = $self->{oidx}->dbh->selectrow_array(<<EOM); @@ -490,7 +510,7 @@ SELECT num FROM over WHERE num >= ? ORDER BY num ASC LIMIT 10000 reindex_checkpoint($self, $sync); } } - warn "I: eliminated $nr stale Xapian documents\n" if $nr != 0; + warn "# eliminated $nr stale Xapian documents\n" if $nr != 0; } sub eidx_gc { @@ -513,8 +533,9 @@ sub eidx_gc { sub _ibx_for ($$$) { my ($self, $sync, $smsg) = @_; - my $ibx_id = delete($smsg->{ibx_id}) // die '{ibx_id} unset'; - my $pos = $sync->{id2pos}->{$ibx_id} // die "$ibx_id no pos"; + my $ibx_id = delete($smsg->{ibx_id}) // die 'BUG: {ibx_id} unset'; + my $pos = $sync->{id2pos}->{$ibx_id} // + bad_ibx_id($self, $ibx_id, \&croak); $self->{-ibx_ary_known}->[$pos] // die "BUG: ibx for $smsg->{blob} not mapped" } @@ -657,7 +678,8 @@ BUG? #$docid $smsg->{blob} is not referenced by inboxes during reindex # hit the common case in _reindex_finalize without rereading # from git (or holding multiple messages in memory). my $id2pos = $sync->{id2pos}; # index in ibx_sorted - @$xr3 = sort { + check_xr3($self, $id2pos, $xr3); + @$xr3 = sort { # sort descending $id2pos->{$b->[0]} <=> $id2pos->{$a->[0]} || $b->[1] <=> $a->[1] # break ties with {xnum} @@ -728,16 +750,14 @@ sub eidxq_lock_acquire ($) { return $locked if $locked eq $cur; } my ($pid, $time, $euid, $ident) = split(/-/, $cur, 4); - my $t = strftime('%Y-%m-%d %k:%M %z', localtime($time)); + my $t = fmt_localtime($time); local $self->{current_info} = 'eidxq'; if ($euid == $> && $ident eq host_ident) { - if (kill(0, $pid)) { - warn <<EOM; return; -I: PID:$pid (re)indexing since $t, it will continue our work + kill(0, $pid) and warn <<EOM and return; +# PID:$pid (re)indexing since $t, it will continue our work EOM - } if ($!{ESRCH}) { - warn "I: eidxq_lock is stale ($cur), clobbering\n"; + warn "# eidxq_lock is stale ($cur), clobbering\n"; return _eidxq_take($self); } warn "E: kill(0, $pid) failed: $!\n"; # fall-through: @@ -837,7 +857,7 @@ sub reindex_unseen ($$$$) { xnum => $xsmsg->{num}, # {mids} and {chash} will be filled in at _reindex_unseen }; - warn "I: reindex_unseen ${\$ibx->eidx_key}:$req->{xnum}:$req->{oid}\n"; + warn "# reindex_unseen ${\$ibx->eidx_key}:$req->{xnum}:$req->{oid}\n"; $self->git->cat_async($xsmsg->{blob}, \&_reindex_unseen, $req); } @@ -1181,12 +1201,6 @@ sub update_last_commit { # overrides V2Writable $self->{oidx}->eidx_meta($meta_key, $latest_cmt); } -sub _idx_init { # with_umask callback - my ($self, $opt) = @_; - PublicInbox::V2Writable::_idx_init($self, $opt); # acquires ei.lock - $self->{midx} = PublicInbox::MiscIdx->new($self); -} - sub symlink_packs ($$) { my ($ibx, $pd) = @_; my $ret = 0; @@ -1272,15 +1286,17 @@ sub idx_init { # similar to V2Writable } ($has_new || $prune_nr || $new ne '') and $self->{mg}->write_alternates($mode, $alt, $new); - $git_midx and $self->with_umask(sub { + my $restore = $self->with_umask; + if ($git_midx && ($opt->{'multi-pack-index'} // 1)) { my @cmd = ('multi-pack-index'); push @cmd, '--no-progress' if ($opt->{quiet}//0) > 1; my $lk = $self->lock_for_scope; system('git', "--git-dir=$ALL", @cmd, 'write'); # ignore errors, fairly new command, may not exist - }); + } $self->parallel_init($self->{indexlevel}); - $self->with_umask(\&_idx_init, $self, $opt); + PublicInbox::V2Writable::_idx_init($self, $opt); # acquires ei.lock + $self->{midx} = PublicInbox::MiscIdx->new($self); $self->{oidx}->begin_lazy; $self->{oidx}->eidx_prep; $self->{midx}->create_xdb if $new ne ''; @@ -1390,7 +1406,7 @@ sub eidx_watch { # public-inbox-extindex --watch main loop my $quit = PublicInbox::SearchIdx::quit_cb($sync); $sig->{QUIT} = $sig->{INT} = $sig->{TERM} = $quit; local $self->{-watch_sync} = $sync; # for ->on_inbox_unlock - PublicInbox::DS->SetPostLoopCallback(sub { !$sync->{quit} }); + local @PublicInbox::DS::post_loop_do = (sub { !$sync->{quit} }); $pr->("initial scan complete, entering event loop\n") if $pr; # calls InboxIdle->event_step: PublicInbox::DS::event_loop($sig, $oldset); @@ -1399,7 +1415,6 @@ sub eidx_watch { # public-inbox-extindex --watch main loop no warnings 'once'; *done = \&PublicInbox::V2Writable::done; -*with_umask = \&PublicInbox::InboxWritable::with_umask; *parallel_init = \&PublicInbox::V2Writable::parallel_init; *nproc_shards = \&PublicInbox::V2Writable::nproc_shards; *sync_prepare = \&PublicInbox::V2Writable::sync_prepare; @@ -1409,5 +1424,6 @@ no warnings 'once'; *idx_shard = \&PublicInbox::V2Writable::idx_shard; *reindex_checkpoint = \&PublicInbox::V2Writable::reindex_checkpoint; *checkpoint = \&PublicInbox::V2Writable::checkpoint; +*barrier = \&PublicInbox::V2Writable::barrier; 1; diff --git a/lib/PublicInbox/FakeInotify.pm b/lib/PublicInbox/FakeInotify.pm index 6d269601..8be07135 100644 --- a/lib/PublicInbox/FakeInotify.pm +++ b/lib/PublicInbox/FakeInotify.pm @@ -1,14 +1,13 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # for systems lacking Linux::Inotify2 or IO::KQueue, just emulates -# enough of Linux::Inotify2 +# enough of Linux::Inotify2 we use. package PublicInbox::FakeInotify; -use strict; -use v5.10.1; -use parent qw(Exporter); +use v5.12; use Time::HiRes qw(stat); use PublicInbox::DS qw(add_timer); +use Errno qw(ENOTDIR ENOENT); sub IN_MODIFY () { 0x02 } # match Linux inotify # my $IN_MOVED_FROM 0x00000040 /* File was moved from X. */ # my $IN_MOVED_TO = 0x80; @@ -18,98 +17,125 @@ sub IN_DELETE () { 0x200 } sub IN_DELETE_SELF () { 0x400 } sub IN_MOVE_SELF () { 0x800 } -our @EXPORT_OK = qw(fill_dirlist on_dir_change); - my $poll_intvl = 2; # same as Filesys::Notify::Simple -sub new { bless { watch => {}, dirlist => {} }, __PACKAGE__ } +sub new { bless {}, __PACKAGE__ } -sub fill_dirlist ($$$) { - my ($self, $path, $dh) = @_; - my $dirlist = $self->{dirlist}->{$path} = {}; - while (defined(my $n = readdir($dh))) { - $dirlist->{$n} = undef if $n !~ /\A\.\.?\z/; - } -} +sub on_dir_change ($$$$$) { # used by KQNotify subclass + my ($self, $events, $dh, $path, $dir_delete) = @_; + my $old = $self->{dirlist}->{$path}; + my @cur = grep(!/\A\.\.?\z/, readdir($dh)); + $self->{dirlist}->{$path} = \@cur; -# behaves like Linux::Inotify2->watch -sub watch { - my ($self, $path, $mask) = @_; - my @st = stat($path) or return; - my $k = "$path\0$mask"; - $self->{watch}->{$k} = $st[10]; # 10 - ctime - if ($mask & IN_DELETE) { - opendir(my $dh, $path) or return; - fill_dirlist($self, $path, $dh); + # new files: + my %tmp = map { $_ => undef } @cur; + delete @tmp{@$old}; + push(@$events, map { + bless \"$path/$_", 'PublicInbox::FakeInotify::Event' + } keys %tmp); + + if ($dir_delete) { + %tmp = map { $_ => undef } @$old; + delete @tmp{@cur}; + push(@$events, map { + bless \"$path/$_", 'PublicInbox::FakeInotify::GoneEvent' + } keys %tmp); } - bless [ $self->{watch}, $k ], 'PublicInbox::FakeInotify::Watch'; } -# also used by KQNotify since it kevent requires readdir on st_nlink -# count changes. -sub on_dir_change ($$$$$) { - my ($events, $dh, $path, $old_ctime, $dirlist) = @_; - my $oldlist = $dirlist->{$path}; - my $newlist = $oldlist ? {} : undef; - while (defined(my $base = readdir($dh))) { - next if $base =~ /\A\.\.?\z/; - my $full = "$path/$base"; - my @st = stat($full); - if (@st && $st[10] > $old_ctime) { - push @$events, - bless(\$full, 'PublicInbox::FakeInotify::Event') +sub watch_open ($$$) { # used by KQNotify subclass + my ($self, $path, $dir_delete) = @_; + my ($fh, @st, @st0, $tries); + do { +again: + unless (@st0 = stat($path)) { + warn "W: stat($path): $!" if $! != ENOENT; + return; } - if (!@st) { - # ignore ENOENT due to race - warn "unhandled stat($full) error: $!\n" if !$!{ENOENT}; - } elsif ($newlist) { - $newlist->{$base} = undef; + if (!(-d _ ? opendir($fh, $path) : open($fh, '<', $path))) { + goto again if $! == ENOTDIR && ++$tries < 10; + warn "W: open($path): $!" if $! != ENOENT; + return; } + @st = stat($fh) or die "fstat($path): $!"; + } while ("@st[0,1]" ne "@st0[0,1]" && + ((++$tries < 10) || (warn(<<EOM) && return))); +E: $path switching inodes too frequently to watch +EOM + if (-d _) { + $self->{dirlist}->{$path} = []; + on_dir_change($self, [], $fh, $path, $$dir_delete); + } else { + $$dir_delete = 0; } - return if !$newlist; - delete @$oldlist{keys %$newlist}; - $dirlist->{$path} = $newlist; - push(@$events, map { - bless \"$path/$_", 'PublicInbox::FakeInotify::GoneEvent' - } keys %$oldlist); + bless [ @st[0, 1, 10], $path, $fh ], 'PublicInbox::FakeInotify::Watch' +} + +# behaves like Linux::Inotify2->watch +sub watch { + my ($self, $path, $mask) = @_; # mask is ignored + my $dir_delete = $mask & IN_DELETE ? 1 : 0; + my $w = watch_open($self, $path, \$dir_delete) or return; + pop @$w; # no need to keep $fh open for non-kqueue + $self->{watch}->{"$path\0$dir_delete"} = $w; +} + +sub gone ($$$) { # used by KQNotify subclass + my ($self, $ident, $path) = @_; + delete $self->{watch}->{$ident}; + delete $self->{dirlist}->{$path}; + bless(\$path, 'PublicInbox::FakeInotify::SelfGoneEvent'); +} + +# fuzz the time for freshly modified directories for low-res VFS +sub dir_adj ($) { + my ($old_ctime) = @_; + my $now = Time::HiRes::time; + my $diff = $now - $old_ctime; + my $adj = $poll_intvl + 1; + ($diff > -$adj && $diff < $adj) ? 1 : 0; } # behaves like non-blocking Linux::Inotify2->read sub read { my ($self) = @_; - my $watch = $self->{watch} or return (); - my $events = []; - my @watch_gone; - for my $x (keys %$watch) { - my ($path, $mask) = split(/\0/, $x, 2); - my @now = stat($path); - if (!@now && $!{ENOENT} && ($mask & IN_DELETE_SELF)) { - push @$events, bless(\$path, - 'PublicInbox::FakeInotify::SelfGoneEvent'); - push @watch_gone, $x; - delete $self->{dirlist}->{$path}; + my $ret = []; + while (my ($ident, $w) = each(%{$self->{watch}})) { + if (!@$w) { # cancelled + delete($self->{watch}->{$ident}); + next; } - next if !@now; - my $old_ctime = $watch->{$x}; - $watch->{$x} = $now[10]; - next if $old_ctime == $now[10]; - if ($mask & IN_MODIFY) { - push @$events, - bless(\$path, 'PublicInbox::FakeInotify::Event') - } elsif ($mask & (MOVED_TO_OR_CREATE | IN_DELETE)) { - if (opendir(my $dh, $path)) { - on_dir_change($events, $dh, $path, $old_ctime, - $self->{dirlist}); - } elsif ($!{ENOENT}) { - push @watch_gone, $x; - delete $self->{dirlist}->{$path}; - } else { - warn "W: opendir $path: $!\n"; + my $dir_delete = (split(/\0/, $ident, 2))[1]; + my ($old_dev, $old_ino, $old_ctime, $path) = @$w; + my @new_st = stat($path); + warn "W: stat($path): $!\n" if !@new_st && $! != ENOENT; + if (!@new_st || "$old_dev $old_ino" ne "@new_st[0,1]") { + push @$ret, gone($self, $ident, $path); + next; + } + if (-d _ && $new_st[10] > ($old_ctime - dir_adj($old_ctime))) { + opendir(my $fh, $path) or do { + if ($! == ENOENT || $! == ENOTDIR) { + push @$ret, gone($self, $ident, $path); + } else { + warn "W: opendir($path): $!"; + } + next; + }; + @new_st = stat($fh) or die "fstat($path): $!"; + if ("$old_dev $old_ino" ne "@new_st[0,1]") { + push @$ret, gone($self, $ident, $path); + next; } + $w->[2] = $new_st[10]; + on_dir_change($self, $ret, $fh, $path, $dir_delete); + } elsif ($new_st[10] > $old_ctime) { # regular files, etc + $w->[2] = $new_st[10]; + push @$ret, bless(\$path, + 'PublicInbox::FakeInotify::Event'); } } - delete @$watch{@watch_gone}; - @$events; + @$ret; } sub poll_once { @@ -119,20 +145,14 @@ sub poll_once { } package PublicInbox::FakeInotify::Watch; -use strict; +use v5.12; -sub cancel { - my ($self) = @_; - delete $self->[0]->{$self->[1]}; -} +sub cancel { @{$_[0]} = () } -sub name { - my ($self) = @_; - (split(/\0/, $self->[1], 2))[0]; -} +sub name { $_[0]->[3] } package PublicInbox::FakeInotify::Event; -use strict; +use v5.12; sub fullname { ${$_[0]} } @@ -141,14 +161,14 @@ sub IN_MOVED_FROM { 0 } sub IN_DELETE_SELF { 0 } package PublicInbox::FakeInotify::GoneEvent; -use strict; +use v5.12; our @ISA = qw(PublicInbox::FakeInotify::Event); sub IN_DELETE { 1 } sub IN_MOVED_FROM { 0 } package PublicInbox::FakeInotify::SelfGoneEvent; -use strict; +use v5.12; our @ISA = qw(PublicInbox::FakeInotify::GoneEvent); sub IN_DELETE_SELF { 1 } diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index ee579f6d..225565f4 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -7,7 +7,7 @@ use strict; use v5.10.1; use PublicInbox::View; use PublicInbox::WwwAtomStream; -use PublicInbox::Smsg; # this loads w/o Search::Xapian +use PublicInbox::Smsg; # this loads w/o Xapian sub generate_i { my ($ctx) = @_; @@ -19,29 +19,25 @@ sub generate { my ($ctx) = @_; my $msgs = $ctx->{msgs} = recent_msgs($ctx); return _no_thread() unless @$msgs; - PublicInbox::WwwAtomStream->response($ctx, 200, \&generate_i); + PublicInbox::WwwAtomStream->response($ctx, \&generate_i); } sub generate_thread_atom { my ($ctx) = @_; my $msgs = $ctx->{msgs} = $ctx->{ibx}->over->get_thread($ctx->{mid}); return _no_thread() unless @$msgs; - PublicInbox::WwwAtomStream->response($ctx, 200, \&generate_i); + PublicInbox::WwwAtomStream->response($ctx, \&generate_i); } sub generate_html_index { my ($ctx) = @_; # if the 'r' query parameter is given, it is a legacy permalink # which we must continue supporting: - my $qp = $ctx->{qp}; - my $ibx = $ctx->{ibx}; - if ($qp && !$qp->{r} && $ibx->over) { + !$ctx->{qp}->{r} && $ctx->{ibx}->over and return PublicInbox::View::index_topics($ctx); - } - my $env = $ctx->{env}; - my $url = $ibx->base_url($env) . 'new.html'; - my $qs = $env->{QUERY_STRING}; + my $url = $ctx->{ibx}->base_url($ctx->{env}) . 'new.html'; + my $qs = $ctx->{env}->{QUERY_STRING}; $url .= "?$qs" if $qs ne ''; [302, [ 'Location', $url, 'Content-Type', 'text/plain'], [ "Redirecting to $url\n" ] ]; @@ -49,12 +45,15 @@ sub generate_html_index { sub new_html_i { my ($ctx, $eml) = @_; - $ctx->zmore($ctx->html_top) if exists $ctx->{-html_tip}; + print { $ctx->zfh } $ctx->html_top if exists $ctx->{-html_tip}; - $eml and return PublicInbox::View::eml_entry($ctx, $eml); + if ($eml) { + $ctx->{smsg}->populate($eml) if !$ctx->{ibx}->{over}; + return PublicInbox::View::eml_entry($ctx, $eml); + } my $smsg = shift @{$ctx->{msgs}} or - $ctx->zmore(PublicInbox::View::pagination_footer( - $ctx, './new.html')); + print { $ctx->zfh } PublicInbox::View::pagination_footer( + $ctx, './new.html'); $smsg; } @@ -67,8 +66,9 @@ sub new_html { } $ctx->{-html_tip} = '<pre>'; $ctx->{-upfx} = ''; + $ctx->{-spfx} = '' if $ctx->{ibx}->{coderepo}; $ctx->{-hr} = 1; - PublicInbox::WwwStream::aresponse($ctx, 200, \&new_html_i); + PublicInbox::WwwStream::aresponse($ctx, \&new_html_i); } # private subs @@ -84,7 +84,6 @@ sub recent_msgs { return PublicInbox::View::paginate_recent($ctx, $max) if $ibx->over; # only for rare v1 inboxes which aren't indexed at all - my $qp = $ctx->{qp}; my $hex = '[a-f0-9]'; my $addmsg = qr!^:000000 100644 \S+ (\S+) A\t${hex}{2}/${hex}{38}$!; my $delmsg = qr!^:100644 000000 (\S+) \S+ D\t(${hex}{2}/${hex}{38})$!; @@ -92,7 +91,7 @@ sub recent_msgs { # revision ranges may be specified my $range = 'HEAD'; - my $r = $qp->{r} if $qp; + my $r = $ctx->{qp}->{r}; if ($r && ($r =~ /\A(?:$refhex\.\.)?$refhex\z/o)) { $range = $r; } diff --git a/lib/PublicInbox/Fetch.pm b/lib/PublicInbox/Fetch.pm index 5261cad1..b0f1437c 100644 --- a/lib/PublicInbox/Fetch.pm +++ b/lib/PublicInbox/Fetch.pm @@ -2,54 +2,52 @@ # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Wrapper to "git fetch" remote public-inboxes package PublicInbox::Fetch; -use strict; -use v5.10.1; +use v5.12; use parent qw(PublicInbox::IPC); use URI (); -use PublicInbox::Spawn qw(popen_rd run_die spawn); +use PublicInbox::Spawn qw(popen_rd run_qx run_wait); use PublicInbox::Admin; use PublicInbox::LEI; use PublicInbox::LeiCurl; use PublicInbox::LeiMirror; +use PublicInbox::SHA qw(sha_all); use File::Temp (); -use PublicInbox::Config; -use IO::Compress::Gzip qw(gzip $GzipError); sub new { bless {}, __PACKAGE__ } -sub fetch_args ($$) { - my ($lei, $opt) = @_; - my @cmd; # (git --git-dir=...) to be added by caller - $opt->{$_} = $lei->{$_} for (0..2); - # we support "-c $key=$val" for arbitrary git config options - # e.g.: git -c http.proxy=socks5h://127.0.0.1:9050 - push(@cmd, '-c', $_) for @{$lei->{opt}->{c} // []}; - push @cmd, 'fetch'; - push @cmd, '-q' if $lei->{opt}->{quiet}; - push @cmd, '-v' if $lei->{opt}->{verbose}; - @cmd; -} - sub remote_url ($$) { my ($lei, $dir) = @_; my $rn = $lei->{opt}->{'try-remote'} // [ 'origin', '_grokmirror' ]; for my $r (@$rn) { my $cmd = [ qw(git config), "remote.$r.url" ]; - my $fh = popen_rd($cmd, undef, { -C => $dir, 2 => $lei->{2} }); - my $url = <$fh>; - close $fh or next; + my $url = run_qx($cmd, undef, { -C => $dir, 2 => $lei->{2} }); + next if $?; $url =~ s!/*\n!!s; return $url; } undef } +# PSGI mount prefixes and manifest.js.gz prefixes don't always align... +# TODO: remove, handle multi-inbox fetch +sub deduce_epochs ($$) { + my ($m, $path) = @_; + my ($v1_ent, @v2_epochs); + my $path_pfx = ''; + $path =~ s!/+\z!!; + do { + $v1_ent = $m->{$path}; + @v2_epochs = grep(m!\A\Q$path\E/git/[0-9]+\.git\z!, keys %$m); + } while (!defined($v1_ent) && !@v2_epochs && + $path =~ s!\A(/[^/]+)/!/! and $path_pfx .= $1); + ($path_pfx, $v1_ent ? $path : undef, @v2_epochs); +} + sub do_manifest ($$$) { my ($lei, $dir, $ibx_uri) = @_; my $muri = URI->new("$ibx_uri/manifest.js.gz"); my $ft = File::Temp->new(TEMPLATE => 'm-XXXX', UNLINK => 1, DIR => $dir, SUFFIX => '.tmp'); - my $fn = $ft->filename; my $mf = "$dir/manifest.js.gz"; my $m0; # current manifest.js.gz contents if (open my $fh, '<', $mf) { @@ -58,7 +56,7 @@ sub do_manifest ($$$) { }; warn($@) if $@; } - my ($bn) = ($fn =~ m!/([^/]+)\z!); + my ($bn) = ($ft->filename =~ m!/([^/]+)\z!); my $curl_cmd = $lei->{curl}->for_uri($lei, $muri, qw(-R -o), $bn); my $opt = { -C => $dir }; $opt->{$_} = $lei->{$_} for (0..2); @@ -69,7 +67,7 @@ sub do_manifest ($$$) { return; } my $m1 = eval { - PublicInbox::LeiMirror::decode_manifest($ft, $fn, $muri); + PublicInbox::LeiMirror::decode_manifest($ft, $ft, $muri); } or return [ 404, $muri ]; my $mdiff = { %$m1 }; @@ -88,15 +86,14 @@ sub do_manifest ($$$) { return; } my (undef, $v1_path, @v2_epochs) = - PublicInbox::LeiMirror::deduce_epochs($mdiff, $ibx_uri->path); + deduce_epochs($mdiff, $ibx_uri->path); [ 200, $muri, $v1_path, \@v2_epochs, $ft, $mf, $m1 ]; } sub get_fingerprint2 { my ($git_dir) = @_; - require Digest::SHA; my $rd = popen_rd([qw(git show-ref)], undef, { -C => $git_dir }); - Digest::SHA::sha256(do { local $/; <$rd> }); + sha_all(256, $rd)->digest; # ignore show-ref errors } sub writable_dir ($) { @@ -133,10 +130,10 @@ sub do_fetch { # main entry point $epoch = $nr; } else { warn "W: $edir missing remote.*.url\n"; - my $pid = spawn([qw(git config -l)], undef, - { 1 => $lei->{2}, 2 => $lei->{2} }); - waitpid($pid, 0); - $lei->child_error($?) if $?; + my $o = { -C => $edir }; + $o->{1} = $o->{2} = $lei->{2}; + run_wait([qw(git config -l)], undef, $o) and + $lei->child_error($?); } } @epochs = grep { !$skip->{$_} } @epochs if $skip; @@ -192,7 +189,7 @@ EOM if (-d $d) { $fp2->[0] = get_fingerprint2($d) if $fp2; $cmd = [ @$torsocks, 'git', "--git-dir=$d", - fetch_args($lei, $opt) ]; + PublicInbox::LeiMirror::fetch_args($lei, $opt)]; } else { my $e_uri = $ibx_uri->clone; my ($epath) = ($d =~ m!(/git/[0-9]+\.git)\z!); @@ -218,11 +215,7 @@ EOM } for my $i (@new_epoch) { $mg->epoch_cfg_set($i) } if ($ft) { - if ($mculled) { - my $json = PublicInbox::Config->json->encode($m1); - my $fn = $ft->filename; - gzip(\$json => $fn) or die "gzip: $GzipError"; - } + PublicInbox::LeiMirror::dump_manifest($m1 => $ft) if $mculled; PublicInbox::LeiMirror::ft_rename($ft, $mf, 0666); } $lei->child_error($xit << 8) if $fp2 && $xit; diff --git a/lib/PublicInbox/Filter/RubyLang.pm b/lib/PublicInbox/Filter/RubyLang.pm index 09aa6aa8..57ebbe78 100644 --- a/lib/PublicInbox/Filter/RubyLang.pm +++ b/lib/PublicInbox/Filter/RubyLang.pm @@ -1,11 +1,10 @@ -# Copyright (C) 2017-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Filter for lists.ruby-lang.org trailers package PublicInbox::Filter::RubyLang; -use base qw(PublicInbox::Filter::Base); -use strict; -use warnings; +use v5.10.1; +use parent qw(PublicInbox::Filter::Base); use PublicInbox::MID qw(mids); my $l1 = qr/Unsubscribe:\s @@ -56,16 +55,22 @@ sub scrub { my $hdr = $mime->header_obj; my $mids = mids($hdr); return $self->REJECT('Message-ID missing') unless (@$mids); - my @v = $hdr->header_raw('X-Mail-Count'); my $n; - foreach (@v) { - /\A\s*([0-9]+)\s*\z/ or next; - $n = $1; - last; - } - unless (defined $n) { - return $self->REJECT('X-Mail-Count not numeric'); + my @v = $hdr->header_raw('X-Mail-Count'); # old host only + if (@v) { + for (@v) { + /\A\s*([0-9]+)\s*\z/ or next; + $n = $1; + last; + } + } else { # new host: nue.mailmanlists.eu + for ($hdr->header_str('Subject')) { + /\A\[ruby-[^:]+:([0-9]+)\]/ or next; + $n = $1; + last; + } } + $n // return $self->REJECT('could not get count not numeric'); foreach my $mid (@$mids) { my $r = $altid->mm_alt->mid_set($n, $mid); next if $r == 0; diff --git a/lib/PublicInbox/Gcf2.pm b/lib/PublicInbox/Gcf2.pm index 41ee0715..78392990 100644 --- a/lib/PublicInbox/Gcf2.pm +++ b/lib/PublicInbox/Gcf2.pm @@ -1,84 +1,74 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # backend for a git-cat-file-workalike based on libgit2, # other libgit2 stuff may go here, too. package PublicInbox::Gcf2; -use strict; -use v5.10.1; -use PublicInbox::Spawn qw(which popen_rd); # may set PERL_INLINE_DIRECTORY -use Fcntl qw(LOCK_EX SEEK_SET); +use v5.12; +use PublicInbox::Spawn qw(which run_qx); # may set PERL_INLINE_DIRECTORY +use Fcntl qw(SEEK_SET); use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); use IO::Handle; # autoflush +use PublicInbox::Git qw($ck_unlinked_packs); +use PublicInbox::Lock; +use autodie qw(close open seek truncate); + BEGIN { my (%CFG, $c_src); # PublicInbox::Spawn will set PERL_INLINE_DIRECTORY - # to ~/.cache/public-inbox/inline-c if it exists + # to ~/.cache/public-inbox/inline-c if it exists and Inline::C works my $inline_dir = $ENV{PERL_INLINE_DIRECTORY} // die 'PERL_INLINE_DIRECTORY not defined'; - my $f = "$inline_dir/.public-inbox.lock"; - open my $fh, '+>', $f or die "open($f): $!"; # CentOS 7.x ships Inline 0.53, 0.64+ has built-in locking - flock($fh, LOCK_EX) or die "LOCK_EX($f): $!\n"; + my $lk = PublicInbox::Lock->new("$inline_dir/.public-inbox.lock"); + my $fh = $lk->lock_acquire; my $pc = which($ENV{PKG_CONFIG} // 'pkg-config') // die "pkg-config missing for libgit2"; my ($dir) = (__FILE__ =~ m!\A(.+?)/[^/]+\z!); - my $ef = "$inline_dir/.public-inbox.pkg-config.err"; - open my $err, '+>', $ef or die "open($ef): $!"; - for my $x (qw(libgit2)) { - my $rdr = { 2 => $err }; - my ($l, $pid) = popen_rd([$pc, '--libs', $x], undef, $rdr); - $l = do { local $/; <$l> }; - waitpid($pid, 0); - next if $?; - (my $c, $pid) = popen_rd([$pc, '--cflags', $x], undef, $rdr); - $c = do { local $/; <$c> }; - waitpid($pid, 0); - next if $?; - - # note: we name C source files .h to prevent - # ExtUtils::MakeMaker from automatically trying to - # build them. - my $f = "$dir/gcf2_$x.h"; - open(my $src, '<', $f) or die "E: open($f): $!"; - chomp($l, $c); - local $/; - defined($c_src = <$src>) or die "read $f: $!"; - $CFG{LIBS} = $l; - $CFG{CCFLAGSEX} = $c; - last; - } - unless ($c_src) { - seek($err, 0, SEEK_SET); - $err = do { local $/; <$err> }; - die "E: libgit2 not installed: $err\n"; + my $vals = {}; + my $rdr = { 2 => \(my $err) }; + my @switches = qw(modversion cflags libs); + for my $k (@switches) { + chomp(my $val = run_qx([$pc, "--$k", 'libgit2'], undef, $rdr)); + die "E: libgit2 not installed: $err\n" if $?; + $vals->{$k} = $val; } - open my $oldout, '>&', \*STDOUT or die "dup(1): $!"; - open my $olderr, '>&', \*STDERR or die "dup(2): $!"; - open STDOUT, '>&', $fh or die "1>$f: $!"; - open STDERR, '>&', $fh or die "2>$f: $!"; + my $f = "$dir/gcf2_libgit2.h"; + $c_src = PublicInbox::IO::try_cat $f or die "cat $f: $!"; + # append pkg-config results to the source to ensure Inline::C + # can rebuild if there's changes (it doesn't seem to detect + # $CFG{CCFLAGSEX} nor $CFG{CPPFLAGS} changes) + $c_src .= "/* $pc --$_ libgit2 => $vals->{$_} */\n" for @switches; + open my $oldout, '>&', \*STDOUT; + open my $olderr, '>&', \*STDERR; + open STDOUT, '>&', $fh; + open STDERR, '>&', $fh; STDERR->autoflush(1); STDOUT->autoflush(1); + $CFG{CCFLAGSEX} = $vals->{cflags}; + $CFG{LIBS} = $vals->{libs}; # we use Capitalized and ALLCAPS for compatibility with old Inline::C - eval <<'EOM'; + CORE::eval <<'EOM'; use Inline C => Config => %CFG, BOOT => q[git_libgit2_init();]; use Inline C => $c_src, BUILD_NOISY => 1; EOM $err = $@; - open(STDERR, '>&', $olderr) or warn "restore stderr: $!"; - open(STDOUT, '>&', $oldout) or warn "restore stdout: $!"; + open(STDERR, '>&', $olderr); + open(STDOUT, '>&', $oldout); if ($err) { seek($fh, 0, SEEK_SET); my @msg = <$fh>; + truncate($fh, 0); die "Inline::C Gcf2 build failed:\n", $err, "\n", @msg; } } sub add_alt ($$) { - my ($gcf2, $objdir) = @_; + my ($gcf2, $git_dir) = @_; + my $objdir = PublicInbox::Git->new($git_dir)->git_path('objects'); # libgit2 (tested 0.27.7+dfsg.1-0.2 and 0.28.3+dfsg.1-1~bpo10+1 # in Debian) doesn't handle relative epochs properly when nested @@ -89,23 +79,13 @@ sub add_alt ($$) { # to refer to $V2INBOX_DIR/git/$EPOCH.git/objects # # See https://bugs.debian.org/975607 - if (open(my $fh, '<', "$objdir/info/alternates")) { - chomp(my @abs_alt = grep(m!^/!, <$fh>)); - $gcf2->add_alternate($_) for @abs_alt; + if (my $s = PublicInbox::IO::try_cat("$objdir/info/alternates")) { + $gcf2->add_alternate($_) for ($s =~ m!^(/[^\n]+)\n!gms); } $gcf2->add_alternate($objdir); 1; } -sub have_unlinked_files () { - # FIXME: port gcf2-like over to git.git so we won't need to - # deal with libgit2 - return 1 if $^O ne 'linux'; - open my $fh, '<', "/proc/$$/maps" or return; - while (<$fh>) { return 1 if /\.(?:idx|pack) \(deleted\)$/ } - undef; -} - # Usage: $^X -MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop [EXPIRE-TIMEOUT] # (see lib/PublicInbox/Gcf2Client.pm) sub loop (;$) { @@ -114,23 +94,24 @@ sub loop (;$) { my (%seen, $check_at); STDERR->autoflush(1); STDOUT->autoflush(1); + my $pid = $$; while (<STDIN>) { chomp; my ($oid, $git_dir) = split(/ /, $_, 2); - $seen{$git_dir} //= add_alt($gcf2, "$git_dir/objects"); + $seen{$git_dir} //= add_alt($gcf2, $git_dir); if (!$gcf2->cat_oid(1, $oid)) { # retry once if missing. We only get unabbreviated OIDs # from SQLite or Xapian DBs, here, so malicious clients # can't trigger excessive retries: - warn "I: $$ $oid missing, retrying in $git_dir\n"; + warn "# $$ $oid missing, retrying in $git_dir\n"; $gcf2 = new(); - %seen = ($git_dir => add_alt($gcf2,"$git_dir/objects")); + %seen = ($git_dir => add_alt($gcf2, $git_dir)); $check_at = clock_gettime(CLOCK_MONOTONIC) + $exp; if ($gcf2->cat_oid(1, $oid)) { - warn "I: $$ $oid found after retry\n"; + warn "# $$ $oid found after retry\n"; } else { warn "W: $$ $oid missing after retry\n"; print "$oid missing\n"; # mimic git-cat-file @@ -140,7 +121,8 @@ sub loop (;$) { $check_at //= $now + $exp; if ($now > $check_at) { undef $check_at; - if (have_unlinked_files()) { + if (!$ck_unlinked_packs || + $ck_unlinked_packs->($pid)) { $gcf2 = new(); %seen = (); } diff --git a/lib/PublicInbox/Gcf2Client.pm b/lib/PublicInbox/Gcf2Client.pm index 09c3aa06..07ff7dcb 100644 --- a/lib/PublicInbox/Gcf2Client.pm +++ b/lib/PublicInbox/Gcf2Client.pm @@ -1,15 +1,18 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # connects public-inbox processes to PublicInbox::Gcf2::loop() package PublicInbox::Gcf2Client; -use strict; +use v5.12; use parent qw(PublicInbox::DS); use PublicInbox::Git; use PublicInbox::Gcf2; # fails if Inline::C or libgit2-dev isn't available use PublicInbox::Spawn qw(spawn); use Socket qw(AF_UNIX SOCK_STREAM); -use PublicInbox::Syscall qw(EPOLLIN EPOLLET); +use PublicInbox::Syscall qw(EPOLLIN); +use PublicInbox::IO; +use autodie qw(socketpair); + # fields: # sock => socket to Gcf2::loop # The rest of these fields are compatible with what PublicInbox::Git @@ -18,68 +21,39 @@ use PublicInbox::Syscall qw(EPOLLIN EPOLLET); # pid.owner => process which spawned {pid} # in => same as {sock}, for compatibility with PublicInbox::Git # inflight => array (see PublicInbox::Git) -# rbuf => scalarref, may be non-existent or empty sub new { - my ($rdr) = @_; + my ($opt) = @_; my $self = bless {}, __PACKAGE__; # ensure the child process has the same @INC we do: my $env = { PERL5LIB => join(':', @INC) }; - my ($s1, $s2); - socketpair($s1, $s2, AF_UNIX, SOCK_STREAM, 0) or die "socketpair $!"; - $rdr //= {}; - $rdr->{0} = $rdr->{1} = $s2; - my $cmd = [$^X, qw[-MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop]]; - $self->{'pid.owner'} = $$; - $self->{pid} = spawn($cmd, $env, $rdr); + socketpair(my $s1, my $s2, AF_UNIX, SOCK_STREAM, 0); $s1->blocking(0); + $opt->{0} = $opt->{1} = $s2; + my $cmd = [$^X, $^W ? ('-w') : (), + qw[-MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop]]; $self->{inflight} = []; - $self->{in} = $s1; - $self->SUPER::new($s1, EPOLLIN|EPOLLET); -} - -sub fail { - my $self = shift; - $self->close; # PublicInbox::DS::close - PublicInbox::Git::fail($self, @_); + PublicInbox::IO::attach_pid($s1, spawn($cmd, $env, $opt), + \&PublicInbox::Git::gcf_drain, $self->{inflight}); + $self->{epwatch} = \undef; # for Git->cleanup + $self->SUPER::new($s1, EPOLLIN); } sub gcf2_async ($$$;$) { my ($self, $req, $cb, $arg) = @_; - my $inflight = $self->{inflight} or return $self->close; - - # {wbuf} is rare, I hope: - cat_async_step($self, $inflight) if $self->{wbuf}; - - $self->fail("gcf2c write: $!") if !$self->write($req) && !$self->{sock}; - push @$inflight, $req, $cb, $arg; + my $inflight = $self->gcf_inflight or return; + PublicInbox::Git::write_all($self, $req, \&cat_async_step, $inflight); + push @$inflight, \$req, $cb, $arg; # ref prevents Git.pm retries } # ensure PublicInbox::Git::cat_async_step never calls cat_async_retry sub alternates_changed {} -# DS::event_loop will call this -sub event_step { - my ($self) = @_; - $self->flush_write; - $self->close if !$self->{in} || !$self->{sock}; # process died - my $inflight = $self->{inflight}; - if ($inflight && @$inflight) { - cat_async_step($self, $inflight); - return $self->close unless $self->{in}; # process died - - # ok, more to do, requeue for fairness - $self->requeue if @$inflight || exists($self->{rbuf}); - } -} - -sub DESTROY { - my ($self) = @_; - delete $self->{sock}; # if outside event_loop - PublicInbox::Git::DESTROY($self); -} - no warnings 'once'; -*cat_async_step = \&PublicInbox::Git::cat_async_step; +*gcf_inflight = \&PublicInbox::Git::gcf_inflight; # for event_step +*cat_async_step = \&PublicInbox::Git::cat_async_step; # for event_step +*event_step = \&PublicInbox::Git::event_step; +*fail = \&PublicInbox::Git::fail; +*DESTROY = \&PublicInbox::Git::DESTROY; 1; diff --git a/lib/PublicInbox/GetlineBody.pm b/lib/PublicInbox/GetlineBody.pm deleted file mode 100644 index 0e781224..00000000 --- a/lib/PublicInbox/GetlineBody.pm +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> -# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> - -# Wrap a pipe or file for PSGI streaming response bodies and calls the -# end callback when the object goes out-of-scope. -# This depends on rpipe being _blocking_ on getline. -# -# This is only used by generic PSGI servers and not public-inbox-httpd -package PublicInbox::GetlineBody; -use strict; -use warnings; - -sub new { - my ($class, $rpipe, $end, $end_arg, $buf, $filter) = @_; - bless { - rpipe => $rpipe, - end => $end, - end_arg => $end_arg, - initial_buf => $buf, - filter => $filter, - }, $class; -} - -# close should always be called after getline returns undef, -# but a client aborting a connection can ruin our day; so lets -# hope our underlying PSGI server does not leak references, here. -sub DESTROY { $_[0]->close } - -sub getline { - my ($self) = @_; - my $rpipe = $self->{rpipe} or return; # EOF was set on previous call - my $buf = delete($self->{initial_buf}) // $rpipe->getline; - delete($self->{rpipe}) unless defined $buf; # set EOF for next call - if (my $filter = $self->{filter}) { - $buf = $filter->translate($buf); - } - $buf; -} - -sub close { - my ($self) = @_; - my ($end, $end_arg) = delete @$self{qw(end end_arg)}; - $end->($end_arg) if $end; -} - -1; diff --git a/lib/PublicInbox/GetlineResponse.pm b/lib/PublicInbox/GetlineResponse.pm new file mode 100644 index 00000000..290cce74 --- /dev/null +++ b/lib/PublicInbox/GetlineResponse.pm @@ -0,0 +1,40 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# For generic PSGI servers (not public-inbox-httpd/netd) which assumes their +# getline response bodies can be backpressure-aware for slow clients +# This depends on rpipe being _blocking_ on getline. +package PublicInbox::GetlineResponse; +use v5.12; + +sub response { + my ($qsp) = @_; + my ($res, $rbuf); + do { # read header synchronously + sysread($qsp->{rpipe}, $rbuf, 65536); + $res = $qsp->parse_hdr_done($rbuf); # fills $bref + } until defined($res); + my ($wcb, $filter) = $qsp->yield_pass(undef, $res) or return; + my $self = $res->[2] = bless { + qsp => $qsp, + filter => $filter, + }, __PACKAGE__; + my ($bref) = @{delete $qsp->{yield_parse_hdr}}; + $self->{rbuf} = $$bref if $$bref ne ''; + $wcb->($res); +} + +sub getline { + my ($self) = @_; + my $rpipe = $self->{qsp}->{rpipe} // do { + delete($self->{qsp})->finish; + return; # EOF was set on previous call + }; + my $buf = delete($self->{rbuf}) // $rpipe->getline; + $buf // delete($self->{qsp}->{rpipe}); # set EOF for next call + $self->{filter} ? $self->{filter}->translate($buf) : $buf; +} + +sub close {} + +1; diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index b2ae75c8..aea389e8 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -9,27 +9,34 @@ package PublicInbox::Git; use strict; use v5.10.1; -use parent qw(Exporter); +use parent qw(Exporter PublicInbox::DS); +use autodie qw(socketpair read); use POSIX (); -use IO::Handle; # ->autoflush -use Errno qw(EINTR EAGAIN ENOENT); +use Socket qw(AF_UNIX SOCK_STREAM); +use PublicInbox::Syscall qw(EPOLLIN EPOLLET); +use Errno qw(EAGAIN); use File::Glob qw(bsd_glob GLOB_NOSORT); use File::Spec (); -use Time::HiRes qw(stat); -use PublicInbox::Spawn qw(popen_rd spawn); +use PublicInbox::Spawn qw(spawn popen_rd run_qx which); +use PublicInbox::IO qw(read_all try_cat); use PublicInbox::Tmpfile; -use IO::Poll qw(POLLIN); use Carp qw(croak carp); -use Digest::SHA (); -use PublicInbox::DS qw(dwaitpid); -our @EXPORT_OK = qw(git_unquote git_quote); -our $PIPE_BUFSIZ = 65536; # Linux default +use PublicInbox::SHA qw(sha_all); +our %HEXLEN2SHA = (40 => 1, 64 => 256); +our %OFMT2HEXLEN = (sha1 => 40, sha256 => 64); +our @EXPORT_OK = qw(git_unquote git_quote %HEXLEN2SHA %OFMT2HEXLEN + $ck_unlinked_packs); our $in_cleanup; -our $RDTIMEO = 60_000; # milliseconds our $async_warn; # true in read-only daemons -use constant MAX_INFLIGHT => (POSIX::PIPE_BUF * 3) / - 65; # SHA-256 hex size + "\n" in preparation for git using non-SHA1 +# committerdate:unix is git 2.9.4+ (2017-05-05), so using raw instead +my @MODIFIED_DATE = qw[for-each-ref --sort=-committerdate + --format=%(committerdate:raw) --count=1]; + +use constant { + MAX_INFLIGHT => 18, # arbitrary, formerly based on PIPE_BUF + BATCH_CMD_VER => v2.36.0, # git 2.36+ +}; my %GIT_ESC = ( a => "\a", @@ -44,6 +51,28 @@ my %GIT_ESC = ( ); my %ESC_GIT = map { $GIT_ESC{$_} => $_ } keys %GIT_ESC; +my $EXE_ST = ''; # pack('dd', st_dev, st_ino); # no `q' in some 32-bit builds +my ($GIT_EXE, $GIT_VER); + +sub check_git_exe () { + $GIT_EXE = which('git') // die "git not found in $ENV{PATH}"; + my @st = stat(_) or die "stat($GIT_EXE): $!"; # can't do HiRes w/ _ + my $st = pack('dd', $st[0], $st[1]); + if ($st ne $EXE_ST) { + my $v = run_qx([ $GIT_EXE, '--version' ]); + die "$GIT_EXE --version: \$?=$?" if $?; + $v =~ /\b([0-9]+(?:\.[0-9]+){2})/ or die + "$GIT_EXE --version output: $v # unparseable"; + $GIT_VER = eval("v$1") // die "BUG: bad vstring: $1 ($v)"; + $EXE_ST = $st; + } + $GIT_EXE; +} + +sub git_version { + check_git_exe(); + $GIT_VER; +} # unquote pathnames used by git, see quote.c::unquote_c_style.c in git.git sub git_unquote ($) { @@ -63,34 +92,42 @@ sub git_quote ($) { sub new { my ($class, $git_dir) = @_; + $git_dir .= '/'; $git_dir =~ tr!/!/!s; - $git_dir =~ s!/*\z!!s; + chop $git_dir; # may contain {-tmp} field for File::Temp::Dir - bless { git_dir => $git_dir, alt_st => '', -git_path => {} }, $class + my %dedupe = ($git_dir => undef); + bless { git_dir => (keys %dedupe)[0] }, $class } sub git_path ($$) { my ($self, $path) = @_; $self->{-git_path}->{$path} //= do { - local $/ = "\n"; - chomp(my $str = $self->qx(qw(rev-parse --git-path), $path)); - - # git prior to 2.5.0 did not understand --git-path - if ($str eq "--git-path\n$path") { - $str = "$self->{git_dir}/$path"; + my $d = "$self->{git_dir}/$path"; + if (-e $d) { + $d; + } else { + local $/ = "\n"; + my $rdr = { 2 => \my $err }; + my $s = $self->qx([qw(rev-parse --git-path), $path], + undef, $rdr); + chomp $s; + + # git prior to 2.5.0 did not understand --git-path + $s eq "--git-path\n$path" ? $d : $s; } - $str; }; } sub alternates_changed { my ($self) = @_; my $alt = git_path($self, 'objects/info/alternates'); + use Time::HiRes qw(stat); my @st = stat($alt) or return 0; # can't rely on 'q' on some 32-bit builds, but `d' works my $st = pack('dd', $st[10], $st[7]); # 10: ctime, 7: size - return 0 if $self->{alt_st} eq $st; + return 0 if ($self->{alt_st} // '') eq $st; $self->{alt_st} = $st; # always a true value } @@ -103,124 +140,124 @@ sub object_format { sub last_check_err { my ($self) = @_; - my $fh = $self->{err_c} or return; - sysseek($fh, 0, 0) or $self->fail("sysseek failed: $!"); - defined(sysread($fh, my $buf, -s $fh)) or - $self->fail("sysread failed: $!"); + my $fh = $self->{err_c} or return ''; + sysseek($fh, 0, 0) or $self->fail("sysseek: $!"); + my $size = -s $fh or return ''; + sysread($fh, my $buf, $size) // $self->fail("sysread: $!"); + truncate($fh, 0) or $self->fail("truncate: $!"); $buf; } -sub _bidi_pipe { - my ($self, $batch, $in, $out, $pid, $err) = @_; - if ($self->{$pid}) { - if (defined $err) { # "err_c" - my $fh = $self->{$err}; - sysseek($fh, 0, 0) or $self->fail("sysseek failed: $!"); - truncate($fh, 0) or $self->fail("truncate failed: $!"); - } - return; +sub gcf_drain { # awaitpid cb + my ($pid, $inflight, $bc) = @_; + while (@$inflight) { + my ($req, $cb, $arg) = splice(@$inflight, 0, 3); + $req = $$req if ref($req); + $bc and $req =~ s/\A(?:contents|info) //; + $req =~ s/ .*//; # drop git_dir for Gcf2Client + eval { $cb->(undef, $req, undef, undef, $arg) }; + warn "E: (in abort) $req: $@" if $@; } - pipe(my ($out_r, $out_w)) or $self->fail("pipe failed: $!"); - my $rdr = { 0 => $out_r, pgid => 0 }; +} + +sub _sock_cmd { + my ($self, $batch, $err_c) = @_; + $self->{sock} and Carp::confess('BUG: {sock} exists'); + socketpair(my $s1, my $s2, AF_UNIX, SOCK_STREAM, 0); + $s1->blocking(0); + my $opt = { pgid => 0, 0 => $s2, 1 => $s2 }; my $gd = $self->{git_dir}; if ($gd =~ s!/([^/]+/[^/]+)\z!/!) { - $rdr->{-C} = $gd; + $opt->{-C} = $gd; $gd = $1; } - my @cmd = (qw(git), "--git-dir=$gd", - qw(-c core.abbrev=40 cat-file), $batch); - if ($err) { - my $id = "git.$self->{git_dir}$batch.err"; - my $fh = tmpfile($id) or $self->fail("tmpfile($id): $!"); - $self->{$err} = $fh; - $rdr->{2} = $fh; - } - my ($in_r, $p) = popen_rd(\@cmd, undef, $rdr); - $self->{$pid} = $p; - $self->{"$pid.owner"} = $$; - $out_w->autoflush(1); - if ($^O eq 'linux') { # 1031: F_SETPIPE_SZ - fcntl($out_w, 1031, 4096); - fcntl($in_r, 1031, 4096) if $batch eq '--batch-check'; - } - $self->{$out} = $out_w; - $self->{$in} = $in_r; -} - -sub poll_in ($) { IO::Poll::_poll($RDTIMEO, fileno($_[0]), my $ev = POLLIN) } - -sub my_read ($$$) { - my ($fh, $rbuf, $len) = @_; - my $left = $len - length($$rbuf); - my $r; - while ($left > 0) { - $r = sysread($fh, $$rbuf, $PIPE_BUFSIZ, length($$rbuf)); - if ($r) { - $left -= $r; - } elsif (defined($r)) { # EOF - return 0; - } else { - next if ($! == EAGAIN and poll_in($fh)); - next if $! == EINTR; # may be set by sysread or poll_in - return; # unrecoverable error - } - } - my $no_pad = substr($$rbuf, 0, $len, ''); - \$no_pad; -} - -sub my_readline ($$) { - my ($fh, $rbuf) = @_; - while (1) { - if ((my $n = index($$rbuf, "\n")) >= 0) { - return substr($$rbuf, 0, $n + 1, ''); - } - my $r = sysread($fh, $$rbuf, $PIPE_BUFSIZ, length($$rbuf)) - and next; - # return whatever's left on EOF - return substr($$rbuf, 0, length($$rbuf)+1, '') if defined($r); - - next if ($! == EAGAIN and poll_in($fh)); - next if $! == EINTR; # may be set by sysread or poll_in - return; # unrecoverable error + # git 2.31.0+ supports -c core.abbrev=no, don't bother with + # core.abbrev=64 since not many releases had SHA-256 prior to 2.31 + my $abbr = $GIT_VER lt v2.31.0 ? 40 : 'no'; + my @cmd = ($GIT_EXE, "--git-dir=$gd", '-c', "core.abbrev=$abbr", + 'cat-file', "--$batch"); + if ($err_c) { + my $id = "git.$self->{git_dir}.$batch.err"; + $self->{err_c} = $opt->{2} = tmpfile($id, undef, 1) or + $self->fail("tmpfile($id): $!"); } + my $inflight = []; # TODO consider moving this into the IO object + my $pid = spawn(\@cmd, undef, $opt); + $self->{sock} = PublicInbox::IO::attach_pid($s1, $pid, + \&gcf_drain, $inflight, $self->{-bc}); + $self->{inflight} = $inflight; } sub cat_async_retry ($$) { - my ($self, $inflight) = @_; + my ($self, $old_inflight) = @_; # {inflight} may be non-existent, but if it isn't we delete it # here to prevent cleanup() from waiting: - delete $self->{inflight}; - cleanup($self); + my ($sock, $epwatch) = delete @$self{qw(sock epwatch inflight)}; + $self->SUPER::close if $epwatch; + my $new_inflight = batch_prepare($self); + + while (my ($oid, $cb, $arg) = splice(@$old_inflight, 0, 3)) { + write_all($self, $oid."\n", \&cat_async_step, $new_inflight); + $oid = \$oid if !@$new_inflight; # to indicate oid retried + push @$new_inflight, $oid, $cb, $arg; + } + $sock->close if $sock; # only safe once old_inflight is empty + cat_async_step($self, $new_inflight); # take one step +} - $self->{inflight} = $inflight; - batch_prepare($self); - my $buf = ''; - for (my $i = 0; $i < @$inflight; $i += 3) { - $buf .= "$inflight->[$i]\n"; +sub gcf_inflight ($) { + my ($self) = @_; + # FIXME: the first {sock} check can succeed but Perl can complain + # about an undefined value. Not sure why or how this happens but + # t/imapd.t can complain about it, sometimes. + if ($self->{sock}) { + if (eval { $self->{sock}->can_reap }) { + return $self->{inflight}; + } elsif ($@) { + no warnings 'uninitialized'; + warn "E: $self sock=$self->{sock}: can_reap failed: ". + "$@ (continuing...)"; + } + delete @$self{qw(sock inflight)}; + } else { + $self->close; } - print { $self->{out} } $buf or $self->fail("write error: $!"); - my $req = shift @$inflight; - unshift(@$inflight, \$req); # \$ref to indicate retried + undef; +} - cat_async_step($self, $inflight); # take one step +# returns true if prefetch is successful +sub async_prefetch { + my ($self, $oid, $cb, $arg) = @_; + my $inflight = gcf_inflight($self) or return; + return if @$inflight; + substr($oid, 0, 0) = 'contents ' if $self->{-bc}; + write_all($self, "$oid\n", \&cat_async_step, $inflight); + push(@$inflight, $oid, $cb, $arg); } sub cat_async_step ($$) { my ($self, $inflight) = @_; - die 'BUG: inflight empty or odd' if scalar(@$inflight) < 3; + croak 'BUG: inflight empty or odd' if scalar(@$inflight) < 3; my ($req, $cb, $arg) = @$inflight[0, 1, 2]; - my $rbuf = delete($self->{rbuf}) // \(my $new = ''); my ($bref, $oid, $type, $size); - my $head = my_readline($self->{in}, $rbuf); + my $head = $self->{sock}->my_readline; + my $cmd = ref($req) ? $$req : $req; # ->fail may be called via Gcf2Client.pm + my $info = $self->{-bc} && substr($cmd, 0, 5) eq 'info '; if ($head =~ /^([0-9a-f]{40,}) (\S+) ([0-9]+)$/) { ($oid, $type, $size) = ($1, $2, $3 + 0); - $bref = my_read($self->{in}, $rbuf, $size + 1) or - $self->fail(defined($bref) ? 'read EOF' : "read: $!"); - chop($$bref) eq "\n" or $self->fail('LF missing after blob'); + unless ($info) { # --batch-command + $bref = $self->{sock}->my_bufread($size + 1) or + $self->fail(defined($bref) ? + 'read EOF' : "read: $!"); + chop($$bref) eq "\n" or + $self->fail('LF missing after blob'); + } + } elsif ($info && $head =~ / (missing|ambiguous)\n/) { + $type = $1; + $oid = substr($cmd, 5); # remove 'info ' } elsif ($head =~ s/ missing\n//s) { $oid = $head; # ref($req) indicates it's already been retried @@ -229,27 +266,34 @@ sub cat_async_step ($$) { return cat_async_retry($self, $inflight); } $type = 'missing'; - $oid = ref($req) ? $$req : $req if $oid eq ''; + if ($oid eq '') { + $oid = $cmd; + $oid =~ s/\A(?:contents|info) // if $self->{-bc}; + } } else { my $err = $! ? " ($!)" : ''; $self->fail("bad result from async cat-file: $head$err"); } - $self->{rbuf} = $rbuf if $$rbuf ne ''; splice(@$inflight, 0, 3); # don't retry $cb on ->fail eval { $cb->($bref, $oid, $type, $size, $arg) }; - async_err($self, $req, $oid, $@, 'cat') if $@; + async_err($self, $req, $oid, $@, $info ? 'check' : 'cat') if $@; } sub cat_async_wait ($) { my ($self) = @_; - my $inflight = $self->{inflight} or return; - while (scalar(@$inflight)) { - cat_async_step($self, $inflight); - } + my $inflight = gcf_inflight($self) or return; + cat_async_step($self, $inflight) while (scalar(@$inflight)); } sub batch_prepare ($) { - _bidi_pipe($_[0], qw(--batch in out pid)); + my ($self) = @_; + check_git_exe(); + if ($GIT_VER ge BATCH_CMD_VER) { + $self->{-bc} = 1; + _sock_cmd($self, 'batch-command', 1); + } else { + _sock_cmd($self, 'batch'); + } } sub _cat_file_cb { @@ -266,55 +310,86 @@ sub cat_file { } sub check_async_step ($$) { - my ($self, $inflight_c) = @_; - die 'BUG: inflight empty or odd' if scalar(@$inflight_c) < 3; - my ($req, $cb, $arg) = @$inflight_c[0, 1, 2]; - my $rbuf = delete($self->{rbuf_c}) // \(my $new = ''); - chomp(my $line = my_readline($self->{in_c}, $rbuf)); + my ($ck, $inflight) = @_; + croak 'BUG: inflight empty or odd' if scalar(@$inflight) < 3; + my ($req, $cb, $arg) = @$inflight[0, 1, 2]; + chomp(my $line = $ck->{sock}->my_readline); my ($hex, $type, $size) = split(/ /, $line); - # Future versions of git.git may have type=ambiguous, but for now, - # we must handle 'dangling' below (and maybe some other oddball - # stuff): + # git <2.21 would show `dangling' (2.21+ shows `ambiguous') # https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/ - if ($hex eq 'dangling' || $hex eq 'notdir' || $hex eq 'loop') { - my $ret = my_read($self->{in_c}, $rbuf, $type + 1); - $self->fail(defined($ret) ? 'read EOF' : "read: $!") if !$ret; + if ($hex eq 'dangling') { + my $ret = $ck->{sock}->my_bufread($type + 1); + $ck->fail(defined($ret) ? 'read EOF' : "read: $!") if !$ret; } - $self->{rbuf_c} = $rbuf if $$rbuf ne ''; - splice(@$inflight_c, 0, 3); # don't retry $cb on ->fail - eval { $cb->($hex, $type, $size, $arg, $self) }; - async_err($self, $req, $hex, $@, 'check') if $@; + splice(@$inflight, 0, 3); # don't retry $cb on ->fail + eval { $cb->(undef, $hex, $type, $size, $arg) }; + async_err($ck, $req, $hex, $@, 'check') if $@; } sub check_async_wait ($) { my ($self) = @_; - my $inflight_c = $self->{inflight_c} or return; - while (scalar(@$inflight_c)) { - check_async_step($self, $inflight_c); - } + return cat_async_wait($self) if $self->{-bc}; + my $ck = $self->{ck} or return; + my $inflight = gcf_inflight($ck) or return; + check_async_step($ck, $inflight) while (scalar(@$inflight)); +} + +# git <2.36 +sub ck { + $_[0]->{ck} //= bless { git_dir => $_[0]->{git_dir} }, + 'PublicInbox::GitCheck'; } sub check_async_begin ($) { my ($self) = @_; cleanup($self) if alternates_changed($self); - _bidi_pipe($self, qw(--batch-check in_c out_c pid_c err_c)); - die 'BUG: already in async check' if $self->{inflight_c}; - $self->{inflight_c} = []; + check_git_exe(); + if ($GIT_VER ge BATCH_CMD_VER) { + $self->{-bc} = 1; + _sock_cmd($self, 'batch-command', 1); + } else { + _sock_cmd($self = ck($self), 'batch-check', 1); + } +} + +sub write_all { + my ($self, $buf, $read_step, $inflight) = @_; + $self->{sock} // Carp::confess 'BUG: no {sock}'; + Carp::confess('BUG: not an array') if ref($inflight) ne 'ARRAY'; + $read_step->($self, $inflight) while @$inflight >= MAX_INFLIGHT; + do { + my $w = syswrite($self->{sock}, $buf); + if (defined $w) { + return if $w == length($buf); + substr($buf, 0, $w, ''); # sv_chop + } elsif ($! != EAGAIN) { + $self->fail("write: $!"); + } + $read_step->($self, $inflight); + } while (1); } sub check_async ($$$$) { my ($self, $oid, $cb, $arg) = @_; - my $inflight_c = $self->{inflight_c} // check_async_begin($self); - while (scalar(@$inflight_c) >= MAX_INFLIGHT) { - check_async_step($self, $inflight_c); + my $inflight; + if ($self->{-bc}) { # likely as time goes on +batch_command: + $inflight = gcf_inflight($self) // cat_async_begin($self); + substr($oid, 0, 0) = 'info '; + write_all($self, "$oid\n", \&cat_async_step, $inflight); + } else { # accounts for git upgrades while we're running: + my $ck = $self->{ck}; # undef OK, maybe set in check_async_begin + $inflight = ($ck ? gcf_inflight($ck) : undef) + // check_async_begin($self); + goto batch_command if $self->{-bc}; + write_all($self->{ck}, "$oid\n", \&check_async_step, $inflight); } - print { $self->{out_c} } $oid, "\n" or $self->fail("write error: $!"); - push(@$inflight_c, $oid, $cb, $arg); + push(@$inflight, $oid, $cb, $arg); } sub _check_cb { # check_async callback - my ($hex, $type, $size, $result) = @_; + my (undef, $hex, $type, $size, $result) = @_; @$result = ($hex, $type, $size); } @@ -325,48 +400,15 @@ sub check { check_async_wait($self); my ($hex, $type, $size) = @$result; - # Future versions of git.git may show 'ambiguous', but for now, - # we must handle 'dangling' below (and maybe some other oddball - # stuff): + # git <2.21 would show `dangling' (2.21+ shows `ambiguous') # https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/ - return if $type eq 'missing' || $type eq 'ambiguous'; - return if $hex eq 'dangling' || $hex eq 'notdir' || $hex eq 'loop'; + return if $type =~ /\A(?:missing|ambiguous)\z/ || $hex eq 'dangling'; ($hex, $type, $size); } -sub _destroy { - my ($self, $rbuf, $in, $out, $pid, $err) = @_; - delete @$self{($rbuf, $in, $out)}; - delete $self->{$err} if $err; # `err_c' - - # GitAsyncCat::event_step may delete {pid} - my $p = delete $self->{$pid} or return; - dwaitpid($p) if $$ == $self->{"$pid.owner"}; -} - -sub async_abort ($) { - my ($self) = @_; - while (scalar(@{$self->{inflight_c} // []}) || - scalar(@{$self->{inflight} // []})) { - for my $c ('', '_c') { - my $q = $self->{"inflight$c"} or next; - while (@$q) { - my ($req, $cb, $arg) = splice(@$q, 0, 3); - $req = $$req if ref($req); - $req =~ s/ .*//; # drop git_dir for Gcf2Client - eval { $cb->(undef, $req, undef, undef, $arg) }; - warn "E: (in abort) $req: $@" if $@; - } - delete $self->{"inflight$c"}; - delete $self->{"rbuf$c"}; - } - } - cleanup($self); -} - -sub fail { # may be augmented in subclasses +sub fail { my ($self, $msg) = @_; - async_abort($self); + $self->close; croak(ref($self) . ' ' . ($self->{git_dir} // '') . ": $msg"); } @@ -377,6 +419,11 @@ sub async_err ($$$$$) { $async_warn ? carp($msg) : $self->fail($msg); } +sub cmd { + my $self = shift; + [ $GIT_EXE // check_git_exe(), "--git-dir=$self->{git_dir}", @_ ] +} + # $git->popen(qw(show f00)); # or # $git->popen(qw(show f00), { GIT_CONFIG => ... }, { 2 => ... }); sub popen { @@ -391,12 +438,12 @@ sub qx { my $fh = popen(@_); if (wantarray) { my @ret = <$fh>; - close $fh; # caller should check $? + $fh->close; # caller should check $? @ret; } else { local $/; my $ret = <$fh>; - close $fh; # caller should check $? + $fh->close; # caller should check $? $ret; } } @@ -408,12 +455,16 @@ sub date_parse { } $self->qx('rev-parse', map { "--since=$_" } @_); } +sub _active ($) { + scalar(@{gcf_inflight($_[0]) // []}) || + ($_[0]->{ck} && scalar(@{gcf_inflight($_[0]->{ck}) // []})) +} + # check_async and cat_async may trigger the other, so ensure they're # both completely done by using this: sub async_wait_all ($) { my ($self) = @_; - while (scalar(@{$self->{inflight_c} // []}) || - scalar(@{$self->{inflight} // []})) { + while (_active($self)) { check_async_wait($self); cat_async_wait($self); } @@ -422,15 +473,11 @@ sub async_wait_all ($) { # returns true if there are pending "git cat-file" processes sub cleanup { my ($self, $lazy) = @_; - return 1 if $lazy && (scalar(@{$self->{inflight_c} // []}) || - scalar(@{$self->{inflight} // []})); + ($lazy && _active($self)) and + return $self->{epwatch} ? watch_async($self) : 1; local $in_cleanup = 1; - delete $self->{async_cat}; async_wait_all($self); - delete $self->{inflight}; - delete $self->{inflight_c}; - _destroy($self, qw(rbuf in out pid)); - _destroy($self, qw(rbuf_c in_c out_c pid_c err_c)); + $_->close for ($self, (delete($self->{ck}) // ())); undef; } @@ -441,135 +488,194 @@ sub packed_bytes { my ($self) = @_; my $n = 0; my $pack_dir = git_path($self, 'objects/pack'); - foreach my $p (bsd_glob("$pack_dir/*.pack", GLOB_NOSORT)) { - $n += -s $p; - } + $n += (-s $_ // 0) for (bsd_glob("$pack_dir/*.pack", GLOB_NOSORT)); $n } -sub DESTROY { cleanup(@_) } +sub DESTROY { cleanup($_[0]) } sub local_nick ($) { # don't show full FS path, basename should be OK: - $_[0]->{git_dir} =~ m!/([^/]+?)(?:/*\.git/*)?\z! ? "$1.git" : '???'; + $_[0]->{nick} // ($_[0]->{git_dir} =~ m!/([^/]+?)(?:/*\.git/*)?\z! ? + "$1.git" : undef); } sub host_prefix_url ($$) { my ($env, $url) = @_; return $url if index($url, '//') >= 0; - my $scheme = $env->{'psgi.url_scheme'}; my $host_port = $env->{HTTP_HOST} // "$env->{SERVER_NAME}:$env->{SERVER_PORT}"; - "$scheme://$host_port". ($env->{SCRIPT_NAME} || '/') . $url; + my $sn = $env->{SCRIPT_NAME} // ''; + "$env->{'psgi.url_scheme'}://\L$host_port\E$sn/$url"; +} + +sub base_url { # for coderepos, PSGI-only + my ($self, $env) = @_; # env - PSGI env + my $nick = $self->{nick} // return undef; + my $url = host_prefix_url($env, ''); + # for mount in Plack::Builder + $url .= '/' if substr($url, -1, 1) ne '/'; + $url . $nick . '/'; } +sub isrch {} # TODO + sub pub_urls { my ($self, $env) = @_; if (my $urls = $self->{cgit_url}) { - return map { host_prefix_url($env, $_) } @$urls; + map { host_prefix_url($env, $_) } @$urls; + } else { + (base_url($self, $env) // '???'); } - (local_nick($self)); } sub cat_async_begin { my ($self) = @_; cleanup($self) if $self->alternates_changed; - $self->batch_prepare; - die 'BUG: already in async' if $self->{inflight}; - $self->{inflight} = []; + die 'BUG: already in async' if gcf_inflight($self); + batch_prepare($self); } sub cat_async ($$$;$) { my ($self, $oid, $cb, $arg) = @_; - my $inflight = $self->{inflight} // cat_async_begin($self); - while (scalar(@$inflight) >= MAX_INFLIGHT) { - cat_async_step($self, $inflight); - } - print { $self->{out} } $oid, "\n" or $self->fail("write error: $!"); + my $inflight = gcf_inflight($self) // cat_async_begin($self); + substr($oid, 0, 0) = 'contents ' if $self->{-bc}; + write_all($self, $oid."\n", \&cat_async_step, $inflight); push(@$inflight, $oid, $cb, $arg); } # returns the modified time of a git repo, same as the "modified" field # of a grokmirror manifest -sub modified ($) { - # committerdate:unix is git 2.9.4+ (2017-05-05), so using raw instead - my $fh = popen($_[0], qw[for-each-ref --sort=-committerdate - --format=%(committerdate:raw) --count=1]); +sub modified ($;$) { + my $fh = $_[1] // popen($_[0], @MODIFIED_DATE); (split(/ /, <$fh> // time))[0] + 0; # integerize for JSON } +sub cat_desc ($) { + my $desc = try_cat($_[0]); + chomp $desc; + utf8::decode($desc); + $desc =~ s/\s+/ /smg; + $desc eq '' ? undef : $desc; +} + +sub description { + cat_desc("$_[0]->{git_dir}/description") // 'Unnamed repository'; +} + +sub cloneurl { + my ($self, $env) = @_; + $self->{cloneurl} // do { + my @urls = split(/\s+/s, try_cat("$self->{git_dir}/cloneurl")); + scalar(@urls) ? ($self->{cloneurl} = \@urls) : undef; + } // [ substr(base_url($self, $env), 0, -1) ]; +} + # for grokmirror, which doesn't read gitweb.description # templates/hooks--update.sample and git-multimail in git.git # only match "Unnamed repository", not the full contents of # templates/this--description in git.git sub manifest_entry { my ($self, $epoch, $default_desc) = @_; - my $fh = $self->popen('show-ref'); - my $dig = Digest::SHA->new(1); - while (read($fh, my $buf, 65536)) { - $dig->add($buf); - } - close $fh or return; # empty, uninitialized git repo - undef $fh; # for open, below - my $git_dir = $self->{git_dir}; - my $ent = { - fingerprint => $dig->hexdigest, - reference => undef, - modified => modified($self), - }; - chomp(my $owner = $self->qx('config', 'gitweb.owner')); - utf8::decode($owner); - $ent->{owner} = $owner eq '' ? undef : $owner; - my $desc = ''; - if (open($fh, '<', "$git_dir/description")) { - local $/ = "\n"; - chomp($desc = <$fh>); - utf8::decode($desc); - } - $desc = 'Unnamed repository' if $desc eq ''; - if (defined $epoch && $desc =~ /\AUnnamed repository/) { - $desc = "$default_desc [epoch $epoch]"; + check_git_exe(); + my $gd = $self->{git_dir}; + my @git = ($GIT_EXE, "--git-dir=$gd"); + my $sr = popen_rd([@git, 'show-ref']); + my $own = popen_rd([@git, qw(config gitweb.owner)]); + my $mod = popen_rd([@git, @MODIFIED_DATE]); + my $buf = description($self); + if (defined $epoch && index($buf, 'Unnamed repository') == 0) { + $buf = "$default_desc [epoch $epoch]"; } - $ent->{description} = $desc; - if (open($fh, '<', "$git_dir/objects/info/alternates")) { + my $ent = { description => $buf, reference => undef }; + if (open(my $alt, '<', "$gd/objects/info/alternates")) { # n.b.: GitPython doesn't seem to handle comments or C-quoted # strings like native git does; and we don't for now, either. local $/ = "\n"; - chomp(my @alt = <$fh>); + chomp(my @alt = <$alt>); # grokmirror only supports 1 alternate for "reference", if (scalar(@alt) == 1) { - my $objdir = "$git_dir/objects"; - my $ref = File::Spec->rel2abs($alt[0], $objdir); - $ref =~ s!/[^/]+/?\z!!; # basename - $ent->{reference} = $ref; + $buf = File::Spec->rel2abs($alt[0], "$gd/objects"); + $buf =~ s!/[^/]+/?\z!!; # basename + $ent->{reference} = $buf; } } + $ent->{fingerprint} = sha_all(1, $sr)->hexdigest; + $sr->close or return; # empty, uninitialized git repo + $ent->{modified} = modified(undef, $mod); + chomp($buf = <$own> // ''); + utf8::decode($buf); + $ent->{owner} = $buf eq '' ? undef : $buf; $ent; } +our $ck_unlinked_packs = $^O eq 'linux' ? sub { + # FIXME: port gcf2-like over to git.git so we won't need to + # deal with libgit2 + my $s = try_cat "/proc/$_[0]/maps"; + $s =~ /\.(?:idx|pack) \(deleted\)/s ? 1 : undef; +} : undef; + # returns true if there are pending cat-file processes sub cleanup_if_unlinked { my ($self) = @_; - return cleanup($self, 1) if $^O ne 'linux'; + $ck_unlinked_packs or return cleanup($self, 1); # Linux-specific /proc/$PID/maps access # TODO: support this inside git.git - my $ret = 0; - for my $fld (qw(pid pid_c)) { - my $pid = $self->{$fld} // next; - open my $fh, '<', "/proc/$pid/maps" or return cleanup($self, 1); - while (<$fh>) { - # n.b. we do not restart for unlinked multi-pack-index - # since it's not too huge, and the startup cost may - # be higher. - /\.(?:idx|pack) \(deleted\)$/ and - return cleanup($self, 1); - } - ++$ret; + my $nr_live = 0; + for my $obj ($self, ($self->{ck} // ())) { + my $sock = $obj->{sock} // next; + my $pid = $sock->attached_pid // next; + $ck_unlinked_packs->($pid) and return cleanup($self, 1); + ++$nr_live; + } + $nr_live; +} + +sub event_step { + my ($self) = @_; + my $inflight = gcf_inflight($self); + if ($inflight && @$inflight) { + $self->cat_async_step($inflight); + return $self->close unless $self->{sock}; + # don't loop here to keep things fair, but we must requeue + # if there's already-read data in pi_io_rbuf + $self->requeue if $self->{sock}->has_rbuf; } - $ret; } +sub schedule_cleanup { + my ($self) = @_; + PublicInbox::DS::add_uniq_timer($self+0, 30, \&cleanup, $self, 1); +} + +# idempotently registers with DS epoll/kqueue/select/poll +sub watch_async ($) { + my ($self) = @_; + schedule_cleanup($self); + $self->{epwatch} //= do { + $self->SUPER::new($self->{sock}, EPOLLIN); + \undef; + } +} + +sub close { + my ($self) = @_; + my $sock = $self->{sock}; + delete @$self{qw(-bc err_c inflight)}; + delete($self->{epwatch}) ? $self->SUPER::close : delete($self->{sock}); + $sock->close if $sock; # calls gcf_drain via awaitpid +} + +package PublicInbox::GitCheck; # only for git <2.36 +use v5.12; +our @ISA = qw(PublicInbox::Git); +no warnings 'once'; + +# for event_step +*cat_async_step = \&PublicInbox::Git::check_async_step; + 1; __END__ =pod diff --git a/lib/PublicInbox/GitAsyncCat.pm b/lib/PublicInbox/GitAsyncCat.pm index 6b7425f6..f57e0336 100644 --- a/lib/PublicInbox/GitAsyncCat.pm +++ b/lib/PublicInbox/GitAsyncCat.pm @@ -1,73 +1,38 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# -# internal class used by PublicInbox::Git + PublicInbox::DS -# This parses the output pipe of "git cat-file --batch" package PublicInbox::GitAsyncCat; -use strict; -use parent qw(PublicInbox::DS Exporter); -use POSIX qw(WNOHANG); -use PublicInbox::Syscall qw(EPOLLIN EPOLLET); -our @EXPORT = qw(ibx_async_cat ibx_async_prefetch); -use PublicInbox::Git (); +use v5.12; +use parent qw(Exporter); +our @EXPORT = qw(ibx_async_cat ibx_async_prefetch async_check); our $GCF2C; # singleton PublicInbox::Gcf2Client -sub close { - my ($self) = @_; - if (my $git = delete $self->{git}) { - $git->async_abort; - } - $self->SUPER::close; # PublicInbox::DS::close -} - -sub event_step { - my ($self) = @_; - my $git = $self->{git} or return; - return $self->close if ($git->{in} // 0) != ($self->{sock} // 1); - my $inflight = $git->{inflight}; - if ($inflight && @$inflight) { - $git->cat_async_step($inflight); - - # child death? - if (($git->{in} // 0) != ($self->{sock} // 1)) { - $self->close; - } elsif (@$inflight || exists $git->{rbuf}) { - # ok, more to do, requeue for fairness - $self->requeue; - } - } elsif ((my $pid = waitpid($git->{pid}, WNOHANG)) > 0) { - # May happen if the child process is killed by a BOFH - # (or segfaults) - delete $git->{pid}; - warn "E: git $pid exited with \$?=$?\n"; - $self->close; - } -} - sub ibx_async_cat ($$$$) { my ($ibx, $oid, $cb, $arg) = @_; - my $git = $ibx->git; + my $isrch = $ibx->{isrch}; + my $git = $isrch ? $isrch->{es}->git : ($ibx->{git} // $ibx->git); # {topdir} means ExtSearch (likely [extindex "all"]) with potentially - # 100K alternates. git(1) has a proposed patch for 100K alternates: - # <https://lore.kernel.org/git/20210624005806.12079-1-e@80x24.org/> - if (!defined($ibx->{topdir}) && ($GCF2C //= eval { + # 100K alternates. git v2.33+ can handle 100k alternates fairly well. + if (!$isrch && !defined($ibx->{topdir}) && !defined($git->{-tmp}) && + ($GCF2C //= eval { require PublicInbox::Gcf2Client; PublicInbox::Gcf2Client::new(); } // 0)) { # 0: do not retry if libgit2 or Inline::C are missing - $GCF2C->gcf2_async(\"$oid $git->{git_dir}\n", $cb, $arg); + $GCF2C->gcf2_async("$oid $git->{git_dir}\n", $cb, $arg); \undef; } else { # read-only end of git-cat-file pipe $git->cat_async($oid, $cb, $arg); - $git->{async_cat} //= do { - my $self = bless { git => $git }, __PACKAGE__; - $git->{in}->blocking(0); - $self->SUPER::new($git->{in}, EPOLLIN|EPOLLET); - \undef; # this is a true ref() - }; + $git->watch_async; } } +sub async_check ($$$$) { + my ($ibx, $oidish, $cb, $arg) = @_; # $ibx may be $ctx + my $git = $ibx->{git} // $ibx->git; + $git->check_async($oidish, $cb, $arg); + ($git->{ck} // $git)->watch_async; +} + # this is safe to call inside $cb, but not guaranteed to enqueue # returns true if successful, undef if not. For fairness, we only # prefetch if there's no in-flight requests. @@ -75,16 +40,12 @@ sub ibx_async_prefetch { my ($ibx, $oid, $cb, $arg) = @_; my $git = $ibx->git; if (!defined($ibx->{topdir}) && $GCF2C) { - if (!@{$GCF2C->{inflight} // []}) { + if (!@{$GCF2C->gcf_inflight // []}) { $oid .= " $git->{git_dir}\n"; - return $GCF2C->gcf2_async(\$oid, $cb, $arg); # true - } - } elsif ($git->{async_cat} && (my $inflight = $git->{inflight})) { - if (!@$inflight) { - print { $git->{out} } $oid, "\n" or - $git->fail("write error: $!"); - return push(@$inflight, $oid, $cb, $arg); + return $GCF2C->gcf2_async($oid, $cb, $arg); # true } + } elsif ($git->{epwatch}) { + return $git->async_prefetch($oid, $cb, $arg); } undef; } diff --git a/lib/PublicInbox/GitCredential.pm b/lib/PublicInbox/GitCredential.pm index b18bba1e..bb225ff3 100644 --- a/lib/PublicInbox/GitCredential.pm +++ b/lib/PublicInbox/GitCredential.pm @@ -1,34 +1,36 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# git-credential wrapper with built-in .netrc fallback package PublicInbox::GitCredential; -use strict; +use v5.12; use PublicInbox::Spawn qw(popen_rd); +use autodie qw(close pipe); sub run ($$;$) { my ($self, $op, $lei) = @_; my ($in_r, $in_w, $out_r); my $cmd = [ qw(git credential), $op ]; - pipe($in_r, $in_w) or die "pipe: $!"; + pipe($in_r, $in_w); if ($lei) { # we'll die if disconnected: - pipe($out_r, my $out_w) or die "pipe: $!"; + pipe($out_r, my $out_w); $lei->send_exec_cmd([ $in_r, $out_w ], $cmd, {}); } else { $out_r = popen_rd($cmd, undef, { 0 => $in_r }); } - close $in_r or die "close in_r: $!"; + close $in_r; my $out = ''; for my $k (qw(url protocol host username password)) { - defined(my $v = $self->{$k}) or next; + my $v = $self->{$k} // next; die "`$k' contains `\\n' or `\\0'\n" if $v =~ /[\n\0]/; $out .= "$k=$v\n"; } - $out .= "\n"; - print $in_w $out or die "print (git credential $op): $!"; - close $in_w or die "close (git credential $op): $!"; + say $in_w $out; + close $in_w; return $out_r if $op eq 'fill'; <$out_r> and die "unexpected output from `git credential $op'\n"; - close $out_r or die "`git credential $op' failed: \$!=$! \$?=$?\n"; + $out_r->close or die "`git credential $op' failed: \$!=$! \$?=$?\n"; } sub check_netrc { @@ -59,7 +61,7 @@ sub fill { /\A([^=]+)=(.*)\z/ or die "bad line: $_\n"; $self->{$1} = $2; } - close $out_r or die "git credential fill failed: \$!=$! \$?=$?\n"; + $out_r->close or die "git credential fill failed: \$!=$! \$?=$?\n"; $self->{filled} = 1; } diff --git a/lib/PublicInbox/GitHTTPBackend.pm b/lib/PublicInbox/GitHTTPBackend.pm index ba3a8f20..396aa783 100644 --- a/lib/PublicInbox/GitHTTPBackend.pm +++ b/lib/PublicInbox/GitHTTPBackend.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # when no endpoints match, fallback to this and serve a static file @@ -9,13 +9,14 @@ use v5.10.1; use Fcntl qw(:seek); use IO::Handle; # ->flush use HTTP::Date qw(time2str); +use PublicInbox::Limiter; use PublicInbox::Qspawn; use PublicInbox::Tmpfile; use PublicInbox::WwwStatic qw(r @NO_CACHE); use Carp (); # 32 is same as the git-daemon connection limit -my $default_limiter = PublicInbox::Qspawn::Limiter->new(32); +my $default_limiter = PublicInbox::Limiter->new(32); # n.b. serving "description" and "cloneurl" should be innocuous enough to # not cause problems. serving "config" might... @@ -23,13 +24,10 @@ my @text = qw[HEAD info/refs info/attributes objects/info/(?:http-alternates|alternates|packs) cloneurl description]; -my @binary = qw! - objects/[a-f0-9]{2}/[a-f0-9]{38} - objects/pack/pack-[a-f0-9]{40}\.(?:pack|idx) - !; +my @binary = ('objects/[a-f0-9]{2}/[a-f0-9]{38,62}', + 'objects/pack/pack-[a-f0-9]{40,64}\.(?:pack|idx)'); our $ANY = join('|', @binary, @text, 'git-upload-pack'); -my $BIN = join('|', @binary); my $TEXT = join('|', @text); sub serve { @@ -62,13 +60,13 @@ sub serve_dumb { my $h = []; my $type; - if ($path =~ m!\Aobjects/[a-f0-9]{2}/[a-f0-9]{38}\z!) { + if ($path =~ m!\Aobjects/[a-f0-9]{2}/[a-f0-9]{38,62}\z!) { $type = 'application/x-git-loose-object'; cache_one_year($h); - } elsif ($path =~ m!\Aobjects/pack/pack-[a-f0-9]{40}\.pack\z!) { + } elsif ($path =~ m!\Aobjects/pack/pack-[a-f0-9]{40,64}\.pack\z!) { $type = 'application/x-git-packed-objects'; cache_one_year($h); - } elsif ($path =~ m!\Aobjects/pack/pack-[a-f0-9]{40}\.idx\z!) { + } elsif ($path =~ m!\Aobjects/pack/pack-[a-f0-9]{40,64}\.idx\z!) { $type = 'application/x-git-packed-objects-toc'; cache_one_year($h); } elsif ($path =~ /\A(?:$TEXT)\z/o) { @@ -81,10 +79,10 @@ sub serve_dumb { PublicInbox::WwwStatic::response($env, $h, $path, $type); } -sub git_parse_hdr { # {parse_hdr} for Qspawn - my ($r, $bref, $dumb_args) = @_; +sub ghb_parse_hdr { # header parser for Qspawn + my ($r, $bref, @dumb_args) = @_; my $res = parse_cgi_headers($r, $bref) or return; # incomplete - $res->[0] == 403 ? serve_dumb(@$dumb_args) : $res; + $res->[0] == 403 ? serve_dumb(@dumb_args) : $res; } # returns undef if 403 so it falls back to dumb HTTP @@ -107,8 +105,9 @@ sub serve_smart { $env{GIT_HTTP_EXPORT_ALL} = '1'; $env{PATH_TRANSLATED} = "$git->{git_dir}/$path"; my $rdr = input_prepare($env) or return r(500); + $rdr->{quiet} = 1; my $qsp = PublicInbox::Qspawn->new([qw(git http-backend)], \%env, $rdr); - $qsp->psgi_return($env, $limiter, \&git_parse_hdr, [$env, $git, $path]); + $qsp->psgi_yield($env, $limiter, \&ghb_parse_hdr, $env, $git, $path); } sub input_prepare { @@ -131,8 +130,8 @@ sub input_prepare { { 0 => $in }; } -sub parse_cgi_headers { - my ($r, $bref) = @_; +sub parse_cgi_headers { # {parse_hdr} for Qspawn + my ($r, $bref, $ctx) = @_; return r(500) unless defined $r && $r >= 0; $$bref =~ s/\A(.*?)\r?\n\r?\n//s or return $r == 0 ? r(500) : undef; my $h = $1; @@ -146,7 +145,18 @@ sub parse_cgi_headers { push @h, $k, $v; } } - [ $code, \@h ] + + # fallback to WwwCoderepo if cgit 404s + if ($code == 404 && $ctx->{www} && !$ctx->{_coderepo_tried}++) { + my $wcb = delete $ctx->{env}->{'qspawn.wcb'}; + $ctx->{env}->{'plack.skip-deflater'} = 1; # prevent 2x gzip + $ctx->{env}->{'qspawn.fallback'} = $code; + my $res = $ctx->{www}->coderepo->srv($ctx); + $ctx->{env}->{'qspawn.wcb'} = $wcb; + $res; # CODE or ARRAY ref + } else { + [ $code, \@h ] + } } 1; diff --git a/lib/PublicInbox/GzipFilter.pm b/lib/PublicInbox/GzipFilter.pm index bdd313f5..8b630f25 100644 --- a/lib/PublicInbox/GzipFilter.pm +++ b/lib/PublicInbox/GzipFilter.pm @@ -18,6 +18,7 @@ use Compress::Raw::Zlib qw(Z_OK); use PublicInbox::CompressNoop; use PublicInbox::Eml; use PublicInbox::GitAsyncCat; +use Carp qw(carp); our @EXPORT_OK = qw(gzf_maybe); my %OPT = (-WindowBits => 15 + 16, -AppendOutput => 1); @@ -92,27 +93,20 @@ sub gone { # what: search/over/mm undef; } -# for GetlineBody (via Qspawn) when NOT using $env->{'pi-httpd.async'} +# for GetlineResponse (via Qspawn) when NOT using $env->{'pi-httpd.async'} # Also used for ->getline callbacks -sub translate ($$) { - my $self = $_[0]; # $_[1] => input +sub translate { + my $self = shift; # $_[1] => input # allocate the zlib context lazily here, instead of in ->new. # Deflate contexts are memory-intensive and this object may # be sitting in the Qspawn limiter queue for a while. - my $gz = $self->{gz} //= gzip_or_die(); - my $zbuf = delete($self->{zbuf}); - if (defined $_[1]) { # my $buf = $_[1]; - my $err = $gz->deflate($_[1], $zbuf); - die "gzip->deflate: $err" if $err != Z_OK; - return $zbuf if length($zbuf) >= 8192; - - $self->{zbuf} = $zbuf; - ''; + $self->{gz} //= gzip_or_die(); + if (defined $_[0]) { # my $buf = $_[1]; + zmore($self, @_); + length($self->{zbuf}) >= 8192 ? delete($self->{zbuf}) : ''; } else { # undef == EOF - my $err = $gz->flush($zbuf); - die "gzip->flush: $err" if $err != Z_OK; - $zbuf; + $self->zflush; } } @@ -129,56 +123,65 @@ sub http_out ($) { }; } +# returns undef if HTTP client disconnected, may return 0 +# because ->translate can return '' sub write { - # my $ret = bytes::length($_[1]); # XXX does anybody care? - http_out($_[0])->write(translate($_[0], $_[1])); + my $self = shift; + http_out($self)->write($self->translate(@_)); +} + +sub zfh { + $_[0]->{zfh} // do { + open($_[0]->{zfh}, '>>', \($_[0]->{pbuf} //= '')) or + die "open: $!"; + $_[0]->{zfh} + }; } # similar to ->translate; use this when we're sure we know we have # more data to buffer after this sub zmore { - my $self = $_[0]; # $_[1] => input + my $self = shift; + my $zfh = delete $self->{zfh}; + if (@_ > 1 || $zfh) { + print { $zfh // zfh($self) } @_; + @_ = (delete $self->{pbuf}); + delete $self->{zfh}; + }; http_out($self); - my $err = $self->{gz}->deflate($_[1], $self->{zbuf}); - die "gzip->deflate: $err" if $err != Z_OK; - undef; + my $err; + ($err = $self->{gz}->deflate($_[0], $self->{zbuf})) == Z_OK or + die "gzip->deflate: $err"; } # flushes and returns the final bit of gzipped data -sub zflush ($;$) { - my $self = $_[0]; # $_[1] => final input (optional) - my $zbuf = delete $self->{zbuf}; - my $gz = delete $self->{gz}; +sub zflush ($;@) { + my $self = shift; # $_[1..Inf] => final input (optional) + zmore($self, @_) if scalar(@_) || $self->{zfh}; + # not a bug, recursing on DS->write failure + my $gz = delete $self->{gz} // return ''; my $err; - if (defined $_[1]) { # it's a bug iff $gz is undef w/ $_[1] - $err = $gz->deflate($_[1], $zbuf); - die "gzip->deflate: $err" if $err != Z_OK; - } - $gz // return ''; # not a bug, recursing on DS->write failure - $err = $gz->flush($zbuf); - die "gzip->flush: $err" if $err != Z_OK; + my $zbuf = delete $self->{zbuf}; + ($err = $gz->flush($zbuf)) == Z_OK or die "gzip->flush: $err"; $zbuf; } sub close { my ($self) = @_; my $http_out = http_out($self) // return; - $http_out->write(zflush($self)); + $http_out->write($self->zflush); (delete($self->{http_out}) // return)->close; } -sub bail { +sub bail { my $self = shift; - if (my $env = $self->{env}) { - warn @_, "\n"; - my $http = $env->{'psgix.io'} or return; # client abort - eval { $http->close }; # should hit our close - warn "E: error in http->close: $@" if $@; - eval { $self->close }; # just in case... - warn "E: error in self->close: $@" if $@; - } else { - warn @_, "\n"; - } + carp @_; + my $env = $self->{env} or return; + my $http = $env->{'psgix.io'} or return; # client abort + eval { $http->close }; # should hit our close + carp "E: error in http->close: $@" if $@; + eval { $self->close }; # just in case... + carp "E: error in self->close: $@" if $@; } # this is public-inbox-httpd-specific @@ -186,16 +189,19 @@ sub async_blob_cb { # git->cat_async callback my ($bref, $oid, $type, $size, $self) = @_; my $http = $self->{env}->{'psgix.io'}; # PublicInbox::HTTP $http->{forward} or return; # client aborted - my $smsg = $self->{smsg} or bail($self, 'BUG: no smsg'); - if (!defined($oid)) { + my $smsg = $self->{smsg} or return bail($self, 'BUG: no smsg'); + $type // return + bail($self, "abort: $smsg->{blob} $self->{ibx}->{inboxdir}"); + if ($type ne 'blob') { # it's possible to have TOCTOU if an admin runs # public-inbox-(edit|purge), just move onto the next message - warn "E: $smsg->{blob} missing in $self->{ibx}->{inboxdir}\n"; + warn "E: $smsg->{blob} $type in $self->{ibx}->{inboxdir}\n"; return $http->next_step($self->can('async_next')); } - $smsg->{blob} eq $oid or bail($self, "BUG: $smsg->{blob} != $oid"); + $smsg->{blob} eq $oid or return + bail($self, "BUG: $smsg->{blob} != $oid"); eval { $self->async_eml(PublicInbox::Eml->new($bref)) }; - bail($self, "E: async_eml: $@") if $@; + return bail($self, "E: async_eml: $@") if $@; if ($self->{-low_prio}) { # run via PublicInbox::WWW::event_step push(@{$self->{www}->{-low_prio_q}}, $self) == 1 and PublicInbox::DS::requeue($self->{www}); diff --git a/lib/PublicInbox/HTTP.pm b/lib/PublicInbox/HTTP.pm index 0dba425d..7162732e 100644 --- a/lib/PublicInbox/HTTP.pm +++ b/lib/PublicInbox/HTTP.pm @@ -43,7 +43,13 @@ use Errno qw(EAGAIN); our $MAX_REQUEST_BUFFER = $ENV{GIT_HTTP_MAX_REQUEST_BUFFER} || (10 * 1024 * 1024); -open(my $null_io, '<', '/dev/null') or die "failed to open /dev/null: $!"; +open(my $null_io, '<', '/dev/null') or die "open /dev/null: $!"; +{ + my @n = stat($null_io) or die "stat(/dev/null): $!"; + my @i = stat(STDIN) or die "stat(STDIN): $!"; + $null_io = *STDIN{IO} if "@n[0, 1]" eq "@i[0, 1]"; +} + my $http_date; my $prev = 0; sub http_date () { @@ -57,8 +63,8 @@ sub new ($$$) { my $ev = EPOLLIN; my $wbuf; if ($sock->can('accept_SSL') && !$sock->accept_SSL) { - return CORE::close($sock) if $! != EAGAIN; - $ev = PublicInbox::TLS::epollbit() or return CORE::close($sock); + return $sock->close if $! != EAGAIN; + $ev = PublicInbox::TLS::epollbit() or return $sock->close; $wbuf = [ \&PublicInbox::DS::accept_tls_step ]; } $self->{wbuf} = $wbuf if $wbuf; @@ -70,7 +76,7 @@ sub new ($$$) { sub event_step { # called by PublicInbox::DS my ($self) = @_; local $SIG{__WARN__} = $self->{srv_env}->{'pi-httpd.warn_cb'}; - return unless $self->flush_write && $self->{sock}; + return unless $self->flush_write && $self->{sock} && !$self->{forward}; # only read more requests if we've drained the write buffer, # otherwise we can be buffering infinitely w/o backpressure @@ -224,6 +230,13 @@ sub identity_write ($$) { sub response_done { my ($self, $alive) = @_; + if (my $forward = delete $self->{forward}) { # avoid recursion + eval { $forward->close }; + if ($@) { + warn "response forward->close error: $@"; + return $self->close; # idempotent + } + } delete $self->{env}; # we're no longer busy # HEAD requests set $alive = 3 so we don't send "0\r\n\r\n"; $self->write(\"0\r\n\r\n") if $alive == 2; @@ -262,14 +275,6 @@ sub getline_pull { warn "response ->getline error: $@"; $self->close; } - # avoid recursion - if (delete $self->{forward}) { - eval { $forward->close }; - if ($@) { - warn "response ->close error: $@"; - $self->close; # idempotent - } - } response_done($self, delete $self->{alive}); } @@ -449,11 +454,12 @@ sub next_step { # They may be exposed to the PSGI application when the PSGI app # returns a CODE ref for "push"-based responses package PublicInbox::HTTP::Chunked; -use strict; +use v5.12; sub write { # ([$http], $buf) = @_; - PublicInbox::HTTP::chunked_write($_[0]->[0], $_[1]) + PublicInbox::HTTP::chunked_write($_[0]->[0], $_[1]); + $_[0]->[0]->{sock} ? length($_[1]) : undef; } sub close { @@ -462,12 +468,13 @@ sub close { } package PublicInbox::HTTP::Identity; -use strict; +use v5.12; our @ISA = qw(PublicInbox::HTTP::Chunked); sub write { # ([$http], $buf) = @_; PublicInbox::HTTP::identity_write($_[0]->[0], $_[1]); + $_[0]->[0]->{sock} ? length($_[1]) : undef; } 1; diff --git a/lib/PublicInbox/HTTPD.pm b/lib/PublicInbox/HTTPD.pm index bae7281b..6a6347d8 100644 --- a/lib/PublicInbox/HTTPD.pm +++ b/lib/PublicInbox/HTTPD.pm @@ -9,9 +9,6 @@ use strict; use Plack::Util (); use Plack::Builder; use PublicInbox::HTTP; -use PublicInbox::HTTPD::Async; - -sub pi_httpd_async { PublicInbox::HTTPD::Async->new(@_) } # we have a different env for ever listener socket for # SERVER_NAME, SERVER_PORT and psgi.url_scheme @@ -45,7 +42,7 @@ sub env_for ($$$) { # this to limit git-http-backend(1) parallelism. # We also check for the truthiness of this to # detect when to use async paths for slow blobs - 'pi-httpd.async' => \&pi_httpd_async, + 'pi-httpd.async' => 1, 'pi-httpd.app' => $self->{app}, 'pi-httpd.warn_cb' => $self->{warn_cb}, } diff --git a/lib/PublicInbox/HTTPD/Async.pm b/lib/PublicInbox/HTTPD/Async.pm deleted file mode 100644 index 1651da88..00000000 --- a/lib/PublicInbox/HTTPD/Async.pm +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> -# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# -# XXX This is a totally unstable API for public-inbox internal use only -# This is exposed via the 'pi-httpd.async' key in the PSGI env hash. -# The name of this key is not even stable! -# Currently intended for use with read-only pipes with expensive -# processes such as git-http-backend(1), cgit(1) -# -# fields: -# http: PublicInbox::HTTP ref -# fh: PublicInbox::HTTP::{Identity,Chunked} ref (can ->write + ->close) -# cb: initial read callback -# arg: arg for {cb} -# end_obj: CODE or object which responds to ->event_step when ->close is called -package PublicInbox::HTTPD::Async; -use strict; -use parent qw(PublicInbox::DS); -use Errno qw(EAGAIN); -use PublicInbox::Syscall qw(EPOLLIN); - -# This is called via: $env->{'pi-httpd.async'}->() -# $io is a read-only pipe ($rpipe) for now, but may be a -# bidirectional socket in the future. -sub new { - my ($class, $io, $cb, $arg, $end_obj) = @_; - - # no $io? call $cb at the top of the next event loop to - # avoid recursion: - unless (defined($io)) { - PublicInbox::DS::requeue($cb ? $cb : $arg); - die '$end_obj unsupported w/o $io' if $end_obj; - return; - } - my $self = bless { - cb => $cb, # initial read callback - arg => $arg, # arg for $cb - end_obj => $end_obj, # like END{}, can ->event_step - }, $class; - my $pp = tied *$io; - $pp->{fh}->blocking(0) // die "$io->blocking(0): $!"; - $self->SUPER::new($io, EPOLLIN); -} - -sub event_step { - my ($self) = @_; - if (my $cb = delete $self->{cb}) { - # this may call async_pass when headers are done - $cb->(my $refcnt_guard = delete $self->{arg}); - } elsif (my $sock = $self->{sock}) { - my $http = $self->{http}; - # $self->{sock} is a read pipe for git-http-backend or cgit - # and 65536 is the default Linux pipe size - my $r = sysread($sock, my $buf, 65536); - if ($r) { - $self->{fh}->write($buf); # may call $http->close - # let other clients get some work done, too - return if $http->{sock}; # !closed - - # else: fall through to close below... - } elsif (!defined $r && $! == EAGAIN) { - return; # EPOLLIN means we'll be notified - } - - # Done! Error handling will happen in $self->{fh}->close - # called by end_obj->event_step handler - delete $http->{forward}; - $self->close; # queues end_obj->event_step to be called - } # else { # we may've been requeued but closed by $http -} - -# once this is called, all data we read is passed to the -# to the PublicInbox::HTTP instance ($http) via $fh->write -sub async_pass { - my ($self, $http, $fh, $bref) = @_; - # In case the client HTTP connection ($http) dies, it - # will automatically close this ($self) object. - $http->{forward} = $self; - - # write anything we overread when we were reading headers - $fh->write($$bref); # PublicInbox:HTTP::{chunked,identity}_wcb - - # we're done with this, free this memory up ASAP since the - # calls after this may use much memory: - $$bref = undef; - - $self->{http} = $http; - $self->{fh} = $fh; -} - -# may be called as $forward->close in PublicInbox::HTTP or EOF (event_step) -sub close { - my $self = $_[0]; - $self->SUPER::close; # DS::close - - # we defer this to the next timer loop since close is deferred - if (my $end_obj = delete $self->{end_obj}) { - # this calls $end_obj->event_step - # (likely PublicInbox::Qspawn::event_step, - # NOT PublicInbox::HTTPD::Async::event_step) - PublicInbox::DS::requeue($end_obj); - } -} - -1; diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm index 00b3c8b4..963dbb71 100644 --- a/lib/PublicInbox/Hval.pm +++ b/lib/PublicInbox/Hval.pm @@ -4,15 +4,16 @@ # represents a header value in various forms. Used for HTML generation # in our web interface(s) package PublicInbox::Hval; +use v5.10.1; # be careful about unicode_strings in v5.12; use strict; -use warnings; use Encode qw(find_encoding); use PublicInbox::MID qw/mid_clean mid_escape/; use base qw/Exporter/; our @EXPORT_OK = qw/ascii_html obfuscate_addrs to_filename src_escape - to_attr prurl mid_href fmt_ts ts2str/; + to_attr prurl mid_href fmt_ts ts2str utf8_maybe/; use POSIX qw(strftime); my $enc_ascii = find_encoding('us-ascii'); +use File::Spec; # safe-ish acceptable filename pattern for portability our $FN = '[a-zA-Z0-9][a-zA-Z0-9_\-\.]+[a-zA-Z0-9]'; # needs \z anchor @@ -69,7 +70,16 @@ sub prurl ($$) { $u = $host_match[0] // $u->[0]; # fall through to below: } - index($u, '//') == 0 ? "$env->{'psgi.url_scheme'}:$u" : $u; + my $dslash = index($u, '//'); + if ($dslash == 0) { + "$env->{'psgi.url_scheme'}:$u" + } elsif ($dslash < 0 && substr($u, 0, 1) ne '/' && + substr(my $path = $env->{PATH_INFO}, 0, 1) eq '/') { + # this won't touch the FS at all: + File::Spec->abs2rel("/$u", $path); + } else { + $u; + } } # for misguided people who believe in this stuff, give them a @@ -118,7 +128,7 @@ $ESCAPES{'/'} = ':'; # common sub to_attr ($) { my ($str) = @_; - # git would never do this to us: + # git would never do this to us, mail diff uses // to prevent anchors: return if index($str, '//') >= 0; my $first = ''; @@ -135,6 +145,15 @@ sub to_attr ($) { sub ts2str ($) { strftime('%Y%m%d%H%M%S', gmtime($_[0])) }; # human-friendly format -sub fmt_ts ($) { strftime('%Y-%m-%d %k:%M', gmtime($_[0])) } +sub fmt_ts ($) { + # strftime %k is not portable and leading zeros in %H slow me down + my (undef, $M, $H, $d, $m, $Y) = gmtime $_[0]; + sprintf '%u-%02u-%02u % 2u:%02u', $Y + 1900, $m + 1, $d, $H, $M; +} + +sub utf8_maybe ($) { + utf8::decode($_[0]); + utf8::valid($_[0]) or utf8::encode($_[0]); # non-UTF-8 data exists +} 1; diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm index 2be1b763..b12533cb 100644 --- a/lib/PublicInbox/IMAP.pm +++ b/lib/PublicInbox/IMAP.pm @@ -39,14 +39,7 @@ use PublicInbox::DS qw(now); use PublicInbox::GitAsyncCat; use Text::ParseWords qw(parse_line); use Errno qw(EAGAIN); -use PublicInbox::IMAPsearchqp; - -my $Address; -for my $mod (qw(Email::Address::XS Mail::Address)) { - eval "require $mod" or next; - $Address = $mod and last; -} -die "neither Email::Address::XS nor Mail::Address loaded: $@" if !$Address; +use PublicInbox::Address; sub LINE_MAX () { 8000 } # RFC 2683 3.2.1.5 @@ -426,8 +419,10 @@ sub _esc ($) { if (!defined($v)) { 'NIL'; } elsif ($v =~ /[{"\r\n%*\\\[]/) { # literal string + utf8::encode($v); '{' . length($v) . "}\r\n" . $v; } else { # quoted string + utf8::encode($v); qq{"$v"} } } @@ -437,7 +432,7 @@ sub addr_envelope ($$;$) { my $v = $eml->header_raw($x) // ($y ? $eml->header_raw($y) : undef) // return 'NIL'; - my @x = $Address->parse($v) or return 'NIL'; + my @x = PublicInbox::Address::objects($v) or return 'NIL'; '(' . join('', map { '(' . join(' ', _esc($_->name), 'NIL', @@ -590,14 +585,16 @@ sub fetch_blob_cb { # called by git->cat_async via ibx_async_cat my ($self, undef, $msgs, $range_info, $ops, $partial) = @$fetch_arg; my $ibx = $self->{ibx} or return $self->close; # client disconnected my $smsg = shift @$msgs or die 'BUG: no smsg'; - if (!defined($oid)) { + if (!defined($type)) { + warn "E: git aborted on $oid / $smsg->{blob} $ibx->{inboxdir}"; + return $self->close; + } elsif ($type ne 'blob') { # it's possible to have TOCTOU if an admin runs # public-inbox-(edit|purge), just move onto the next message - warn "E: $smsg->{blob} missing in $ibx->{inboxdir}\n"; + warn "E: $smsg->{blob} $type in $ibx->{inboxdir}\n"; return $self->requeue_once; - } else { - $smsg->{blob} eq $oid or die "BUG: $smsg->{blob} != $oid"; } + $smsg->{blob} eq $oid or die "BUG: $smsg->{blob} != $oid"; my $pre; ($self->{anon} && !$self->{wbuf} && $msgs->[0]) and $pre = ibx_async_prefetch($ibx, $msgs->[0]->{blob}, @@ -661,7 +658,7 @@ sub op_eml_new { $_[4] = PublicInbox::Eml->new($_[3]) } # s/From / fixes old bug from import (pre-a0c07cba0e5d8b6a) sub to_crlf_full { ${$_[0]} =~ s/(?<!\r)\n/\r\n/sg; - ${$_[0]} =~ s/\A[\r\n]*From [^\r\n]*\r\n//s; + PublicInbox::Eml::strip_from(${$_[0]}); } sub op_crlf_bref { to_crlf_full($_[3]) } @@ -1005,7 +1002,7 @@ sub fetch_compile ($) { # stabilize partial order for consistency and ease-of-debugging: if (scalar keys %partial) { $need |= NEED_BLOB; - $r[2] = [ map { [ $_, @{$partial{$_}} ] } sort keys %partial ]; + @{$r[2]} = map { [ $_, @{$partial{$_}} ] } sort keys %partial; } push @op, $OP_EML_NEW if ($need & (EML_HDR|EML_BDY)); @@ -1028,7 +1025,7 @@ sub fetch_compile ($) { # r[1] = [ $key1, $cb1, $key2, $cb2, ... ] use sort 'stable'; # makes output more consistent - $r[1] = [ map { ($_->[2], $_->[1]) } sort { $a->[0] <=> $b->[0] } @op ]; + @{$r[1]} = map { ($_->[2], $_->[1]) } sort { $a->[0] <=> $b->[0] } @op; @r; } @@ -1084,6 +1081,7 @@ sub search_uid_range { # long_response sub parse_imap_query ($$) { my ($self, $query) = @_; + # IMAPsearchqp gets loaded in IMAPD->refresh_groups my $q = PublicInbox::IMAPsearchqp::parse($self, $query); if (ref($q)) { my $max = $self->{ibx}->over(1)->max; @@ -1165,17 +1163,11 @@ sub process_line ($$) { my $err = $@; if ($err && $self->{sock}) { $l =~ s/\r?\n//s; - err($self, 'error from: %s (%s)', $l, $err); + warn("error from: $l ($err)\n"); $tag //= '*'; - $res = "$tag BAD program fault - command not performed\r\n"; + $res = \"$tag BAD program fault - command not performed\r\n"; } - return 0 unless defined $res; - $self->write($res); -} - -sub err ($$;@) { - my ($self, $fmt, @args) = @_; - printf { $self->{imapd}->{err} } $fmt."\n", @args; + defined($res) ? $self->write($res) : 0; } sub out ($$;@) { diff --git a/lib/PublicInbox/IMAPD.pm b/lib/PublicInbox/IMAPD.pm index 78323e57..42dc2a9f 100644 --- a/lib/PublicInbox/IMAPD.pm +++ b/lib/PublicInbox/IMAPD.pm @@ -27,13 +27,8 @@ sub _refresh_ibx { # pi_cfg->each_inbox cb my ($ibx, $imapd, $cache, $dummies) = @_; my $ngname = $ibx->{newsgroup} // return; - # We require lower-case since IMAP mailbox names are - # case-insensitive (but -nntpd matches INN in being - # case-sensitive) - if ($ngname =~ m![^a-z0-9/_\.\-\~\@\+\=:]! || - # don't confuse with 50K slices - $ngname =~ /\.[0-9]+\z/) { - warn "mailbox name invalid: newsgroup=`$ngname'\n"; + if ($ngname =~ /\.[0-9]+\z/) { # don't confuse with 50K slices + warn "E: mailbox name invalid: newsgroup=`$ngname' (ignored)\n"; return; } my $ce = $cache->{$ngname}; @@ -55,6 +50,7 @@ sub _refresh_ibx { # pi_cfg->each_inbox cb sub refresh_groups { my ($self, $sig) = @_; my $pi_cfg = PublicInbox::Config->new; + require PublicInbox::IMAPsearchqp; $self->{mailboxes} = $pi_cfg->{-imap_mailboxes} // do { my $mailboxes = $self->{mailboxes} = {}; my $cache = eval { $pi_cfg->ALL->misc->nntpd_cache_load } // {}; diff --git a/lib/PublicInbox/IMAPsearchqp.pm b/lib/PublicInbox/IMAPsearchqp.pm index 9f0c1205..0c37220c 100644 --- a/lib/PublicInbox/IMAPsearchqp.pm +++ b/lib/PublicInbox/IMAPsearchqp.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # IMAP search query parser. cf RFC 3501 @@ -279,6 +279,8 @@ sub parse { my $sql = ''; %$q = (sql => \$sql, imap => $imap); # imap = PublicInbox::IMAP obj # $::RD_TRACE = 1; + local $::RD_ERRORS = undef; + local $::RD_WARN = undef; my $res = eval { $prd->search_key(uc($query)) }; return $@ if $@ && $@ =~ /\A(?:BAD|NO) /; return 'BAD unexpected result' if !$res || $res != $q; diff --git a/lib/PublicInbox/IO.pm b/lib/PublicInbox/IO.pm new file mode 100644 index 00000000..8640f112 --- /dev/null +++ b/lib/PublicInbox/IO.pm @@ -0,0 +1,152 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# supports reaping of children tied to a pipe or socket +package PublicInbox::IO; +use v5.12; +use parent qw(IO::Handle Exporter); +use PublicInbox::DS qw(awaitpid); +our @EXPORT_OK = qw(poll_in read_all try_cat write_file); +use Carp qw(croak); +use IO::Poll qw(POLLIN); +use Errno qw(EINTR EAGAIN); +use PublicInbox::OnDestroy; +# don't autodie in top-level for Perl 5.16.3 (and maybe newer versions) +# we have our own ->close, so we scope autodie into each sub + +sub waitcb { # awaitpid callback + my ($pid, $errref, $cb, @args) = @_; + $$errref = $?; # sets .cerr for _close + $cb->($pid, @args) if $cb; # may clobber $? +} + +sub attach_pid { + my ($io, $pid, @cb_arg) = @_; + bless $io, __PACKAGE__; + # we share $err (and not $self) with awaitpid to avoid a ref cycle + my $e = \(my $err); + ${*$io}{pi_io_reap} = [ $PublicInbox::OnDestroy::fork_gen, $pid, $e ]; + awaitpid($pid, \&waitcb, $e, @cb_arg); + $io; +} + +sub attached_pid { + my ($io) = @_; + ${${*$io}{pi_io_reap} // []}[1]; +} + +sub can_reap { + my ($io) = @_; + ${${*$io}{pi_io_reap} // [-1]}[0] == $PublicInbox::OnDestroy::fork_gen; +} + +# caller cares about error result if they call close explicitly +# reap->[2] may be set before this is called via waitcb +sub close { + my ($io) = @_; + my $ret = $io->SUPER::close; + my $reap = delete ${*$io}{pi_io_reap}; + return $ret if ($reap->[0] // -1) != $PublicInbox::OnDestroy::fork_gen; + if (defined ${$reap->[2]}) { # reap_pids already reaped asynchronously + $? = ${$reap->[2]}; + } else { # wait synchronously + my $w = awaitpid($reap->[1]); + } + $? ? '' : $ret; +} + +sub DESTROY { + my ($io) = @_; + my $reap = delete ${*$io}{pi_io_reap}; + if (($reap->[0] // -1) == $PublicInbox::OnDestroy::fork_gen) { + $io->SUPER::close; + ${$reap->[2]} // awaitpid($reap->[1]); + } + $io->SUPER::DESTROY; +} + +sub write_file ($$@) { # mode, filename, LIST (for print) + use autodie qw(open close); + open(my $fh, shift, shift); + print $fh @_; + defined(wantarray) && !wantarray ? $fh : close $fh; +} + +sub poll_in ($;$) { + IO::Poll::_poll($_[1] // -1, fileno($_[0]), my $ev = POLLIN); +} + +sub read_all ($;$$$) { # pass $len=0 to read until EOF for :utf8 handles + use autodie qw(read); + my ($io, $len, $bref, $off) = @_; + $bref //= \(my $buf); + $off //= 0; + my $r = 0; + if (my $left = $len //= -s $io) { # known size (binmode :raw/:unix) + do { # retry for binmode :unix + $r = read($io, $$bref, $left, $off += $r) or croak( + "read($io) premature EOF ($left/$len remain)"); + } while ($left -= $r); + } else { # read until EOF + while (($r = read($io, $$bref, 65536, $off += $r))) {} + } + wantarray ? split(/^/sm, $$bref) : $$bref +} + +sub try_cat ($) { + my ($path) = @_; + open(my $fh, '<', $path) or return ''; + read_all $fh; +} + +# TODO: move existing HTTP/IMAP/NNTP/POP3 uses of rbuf here +sub my_bufread { + my ($io, $len) = @_; + my $rbuf = ${*$io}{pi_io_rbuf} //= \(my $new = ''); + my $left = $len - length($$rbuf); + my $r; + while ($left > 0) { + $r = sysread($io, $$rbuf, $left, length($$rbuf)); + if ($r) { + $left -= $r; + } elsif (defined($r)) { # EOF + return 0; + } else { + next if ($! == EAGAIN and poll_in($io)); + next if $! == EINTR; # may be set by sysread or poll_in + return; # unrecoverable error + } + } + my $no_pad = substr($$rbuf, 0, $len, ''); + delete(${*$io}{pi_io_rbuf}) if $$rbuf eq ''; + \$no_pad; +} + +# always uses "\n" +sub my_readline { + my ($io) = @_; + my $rbuf = ${*$io}{pi_io_rbuf} //= \(my $new = ''); + while (1) { + if ((my $n = index($$rbuf, "\n")) >= 0) { + my $ret = substr($$rbuf, 0, $n + 1, ''); + delete(${*$io}{pi_io_rbuf}) if $$rbuf eq ''; + return $ret; + } + my $r = sysread($io, $$rbuf, 65536, length($$rbuf)); + if (!defined($r)) { + next if ($! == EAGAIN and poll_in($io)); + next if $! == EINTR; # may be set by sysread or poll_in + return; # unrecoverable error + } elsif ($r == 0) { # return whatever's left on EOF + delete(${*$io}{pi_io_rbuf}); + return $$rbuf; + } # else { continue + } +} + +sub has_rbuf { + my ($io) = @_; + defined(${*$io}{pi_io_rbuf}); +} + +1; diff --git a/lib/PublicInbox/IPC.pm b/lib/PublicInbox/IPC.pm index 74862673..ed6d27fd 100644 --- a/lib/PublicInbox/IPC.pm +++ b/lib/PublicInbox/IPC.pm @@ -8,18 +8,17 @@ # use ipc_do when you need work done on a certain process # use wq_io_do when your work can be done on any idle worker package PublicInbox::IPC; -use strict; -use v5.10.1; +use v5.12; use parent qw(Exporter); +use autodie qw(close pipe read socketpair sysread); use Carp qw(croak); -use PublicInbox::DS qw(dwaitpid); +use PublicInbox::DS qw(awaitpid); use PublicInbox::Spawn; use PublicInbox::OnDestroy; use PublicInbox::WQWorker; -use Socket qw(AF_UNIX MSG_EOR SOCK_STREAM); +use Socket qw(AF_UNIX SOCK_STREAM SOCK_SEQPACKET); my $MY_MAX_ARG_STRLEN = 4096 * 33; # extra 4K for serialization -my $SEQPACKET = eval { Socket::SOCK_SEQPACKET() }; # portable enough? -our @EXPORT_OK = qw(ipc_freeze ipc_thaw); +our @EXPORT_OK = qw(ipc_freeze ipc_thaw nproc_shards); my ($enc, $dec); # ->imports at BEGIN turns sereal_*_with_object into custom ops on 5.14+ # and eliminate method call overhead @@ -42,7 +41,7 @@ if ($enc && $dec) { # should be custom ops *ipc_thaw = \&Storable::thaw; } -my $recv_cmd = PublicInbox::Spawn->can('recv_cmd4'); +our $recv_cmd = PublicInbox::Spawn->can('recv_cmd4'); our $send_cmd = PublicInbox::Spawn->can('send_cmd4') // do { require PublicInbox::CmdIPC4; $recv_cmd //= PublicInbox::CmdIPC4->can('recv_cmd4'); @@ -55,9 +54,9 @@ our $send_cmd = PublicInbox::Spawn->can('send_cmd4') // do { sub _get_rec ($) { my ($r) = @_; - defined(my $len = <$r>) or return; + my $len = <$r> // return; chop($len) eq "\n" or croak "no LF byte in $len"; - defined(my $n = read($r, my $buf, $len)) or croak "read error: $!"; + my $n = read($r, my $buf, $len); $n == $len or croak "short read: $n != $len"; ipc_thaw($buf); } @@ -94,28 +93,26 @@ sub ipc_worker_loop ($$$) { } } +sub exit_exception { exit(!!$@) } + # starts a worker if Sereal or Storable is installed sub ipc_worker_spawn { - my ($self, $ident, $oldset, $fields) = @_; + my ($self, $ident, $oldset, $fields, @cb_args) = @_; return if ($self->{-ipc_ppid} // -1) == $$; # idempotent delete(@$self{qw(-ipc_req -ipc_res -ipc_ppid -ipc_pid)}); - pipe(my ($r_req, $w_req)) or die "pipe: $!"; - pipe(my ($r_res, $w_res)) or die "pipe: $!"; + pipe(my $r_req, my $w_req); + pipe(my $r_res, my $w_res); my $sigset = $oldset // PublicInbox::DS::block_signals(); $self->ipc_atfork_prepare; - my $seed = rand(0xffffffff); - my $pid = fork // die "fork: $!"; + my $pid = PublicInbox::DS::fork_persist; if ($pid == 0) { - srand($seed); - eval { Net::SSLeay::randomize() }; - eval { PublicInbox::DS->Reset }; delete @$self{qw(-wq_s1 -wq_s2 -wq_workers -wq_ppid)}; $w_req = $r_res = undef; $w_res->autoflush(1); $SIG{$_} = 'IGNORE' for (qw(TERM INT QUIT)); local $0 = $ident; # ensure we properly exit even if warn() dies: - my $end = PublicInbox::OnDestroy->new($$, sub { exit(!!$@) }); + my $end = on_destroy \&exit_exception; eval { $fields //= {}; local @$self{keys %$fields} = values(%$fields); @@ -133,29 +130,20 @@ sub ipc_worker_spawn { $self->{-ipc_req} = $w_req; $self->{-ipc_res} = $r_res; $self->{-ipc_ppid} = $$; + awaitpid($pid, \&ipc_worker_reap, $self, @cb_args); $self->{-ipc_pid} = $pid; } -sub ipc_worker_reap { # dwaitpid callback - my ($args, $pid) = @_; - my ($self, @uargs) = @$args; +sub ipc_worker_reap { # awaitpid callback + my ($pid, $self, $cb, @args) = @_; delete $self->{-wq_workers}->{$pid}; - return $self->{-reap_do}->($args, $pid) if $self->{-reap_do}; + return $cb->($pid, $self, @args) if $cb; return if !$?; my $s = $? & 127; # TERM(15) is our default exit signal, PIPE(13) is likely w/ pager warn "$self->{-wq_ident} PID:$pid died \$?=$?\n" if $s != 15 && $s != 13 } -sub wq_wait_async { - my ($self, $cb, @uargs) = @_; - local $PublicInbox::DS::in_loop = 1; - $self->{-reap_async} = 1; - $self->{-reap_do} = $cb; - my @pids = keys %{$self->{-wq_workers}}; - dwaitpid($_, \&ipc_worker_reap, [ $self, @uargs ]) for @pids; -} - # for base class, override in sub classes sub ipc_atfork_prepare {} @@ -170,7 +158,7 @@ sub ipc_atfork_child { # idempotent, can be called regardless of whether worker is active or not sub ipc_worker_stop { - my ($self, $args) = @_; + my ($self) = @_; my ($pid, $ppid) = delete(@$self{qw(-ipc_pid -ipc_ppid)}); my ($w_req, $r_res) = delete(@$self{qw(-ipc_req -ipc_res)}); if (!$w_req && !$r_res) { @@ -179,18 +167,7 @@ sub ipc_worker_stop { } die 'no PID with IPC pipes' unless $pid; $w_req = $r_res = undef; - - return if $$ != $ppid; - dwaitpid($pid, \&ipc_worker_reap, [$self, $args]); -} - -# use this if we have multiple readers reading curl or "pigz -dc" -# and writing to the same store -sub ipc_lock_init { - my ($self, $f) = @_; - $f // die 'BUG: no filename given'; - require PublicInbox::Lock; - $self->{-ipc_lock} //= bless { lock_path => $f }, 'PublicInbox::Lock' + awaitpid($pid) if $$ == $ppid; # for non-event loop } sub _wait_return ($$) { @@ -204,8 +181,6 @@ sub _wait_return ($$) { sub ipc_do { my ($self, $sub, @args) = @_; if (my $w_req = $self->{-ipc_req}) { # run in worker - my $ipc_lock = $self->{-ipc_lock}; - my $lock = $ipc_lock ? $ipc_lock->lock_for_scope : undef; if (defined(wantarray)) { my $r_res = $self->{-ipc_res} or die 'no ipc_res'; _send_rec($w_req, [ wantarray, $sub, @args ]); @@ -234,15 +209,12 @@ sub recv_and_run { my $n = length($buf) or return 0; my $nfd = 0; for my $fd (@fds) { - if (open(my $cmdfh, '+<&=', $fd)) { - $self->{$nfd++} = $cmdfh; - $cmdfh->autoflush(1); - } else { - die "$$ open(+<&=$fd) (FD:$nfd): $!"; - } + open(my $cmdfh, '+<&=', $fd); + $self->{$nfd++} = $cmdfh; + $cmdfh->autoflush(1); } while ($full_stream && $n < $len) { - my $r = sysread($s2, $buf, $len - $n, $n) // croak "read: $!"; + my $r = sysread($s2, $buf, $len - $n, $n); croak "read EOF after $n/$len bytes" if $r == 0; $n = length($buf); } @@ -256,12 +228,19 @@ sub recv_and_run { $n; } -sub wq_worker_loop ($$) { - my ($self, $bcast2) = @_; +sub sock_defined { # PublicInbox::DS::post_loop_do CB + my ($wqw) = @_; + defined($wqw->{sock}); +} + +sub wq_worker_loop ($$$) { + my ($self, $bcast2, $oldset) = @_; my $wqw = PublicInbox::WQWorker->new($self, $self->{-wq_s2}); PublicInbox::WQWorker->new($self, $bcast2) if $bcast2; - PublicInbox::DS->SetPostLoopCallback(sub { $wqw->{sock} }); - PublicInbox::DS::event_loop(); + local @PublicInbox::DS::post_loop_do = (\&sock_defined, $wqw); + my $sig = delete($self->{wq_sig}); + $sig->{CHLD} //= \&PublicInbox::DS::enqueue_reap; + PublicInbox::DS::event_loop($sig, $oldset); PublicInbox::DS->Reset; } @@ -272,56 +251,40 @@ sub do_sock_stream { # via wq_io_do, for big requests sub wq_broadcast { my ($self, $sub, @args) = @_; - if (my $wkr = $self->{-wq_workers}) { - my $buf = ipc_freeze([$sub, @args]); - for my $bcast1 (values %$wkr) { - my $sock = $bcast1 // $self->{-wq_s1} // next; - send($sock, $buf, MSG_EOR) // croak "send: $!"; - # XXX shouldn't have to deal with EMSGSIZE here... - } - } else { - eval { $self->$sub(@args) }; - warn "wq_broadcast: $@" if $@; + my $wkr = $self->{-wq_workers} or Carp::confess('no -wq_workers'); + my $buf = ipc_freeze([$sub, @args]); + for my $bcast1 (values %$wkr) { + my $sock = $bcast1 // $self->{-wq_s1} // next; + send($sock, $buf, 0) // croak "send: $!"; + # XXX shouldn't have to deal with EMSGSIZE here... } } sub stream_in_full ($$$) { my ($s1, $fds, $buf) = @_; - socketpair(my $r, my $w, AF_UNIX, SOCK_STREAM, 0) or - croak "socketpair: $!"; + socketpair(my $r, my $w, AF_UNIX, SOCK_STREAM, 0); my $n = $send_cmd->($s1, [ fileno($r) ], ipc_freeze(['do_sock_stream', length($buf)]), - MSG_EOR) // croak "sendmsg: $!"; + 0) // croak "sendmsg: $!"; undef $r; $n = $send_cmd->($w, $fds, $buf, 0) // croak "sendmsg: $!"; - while ($n < length($buf)) { - my $x = syswrite($w, $buf, length($buf) - $n, $n) // - croak "syswrite: $!"; - croak "syswrite wrote 0 bytes" if $x == 0; - $n += $x; - } + print $w substr($buf, $n) if $n < length($buf); # need > 2G on Linux + close $w; # autodies } sub wq_io_do { # always async my ($self, $sub, $ios, @args) = @_; - if (my $s1 = $self->{-wq_s1}) { # run in worker - my $fds = [ map { fileno($_) } @$ios ]; - my $buf = ipc_freeze([$sub, @args]); - if (length($buf) > $MY_MAX_ARG_STRLEN) { - stream_in_full($s1, $fds, $buf); - } else { - my $n = $send_cmd->($s1, $fds, $buf, MSG_EOR); - return if defined($n); # likely - $!{ETOOMANYREFS} and - croak "sendmsg: $! (check RLIMIT_NOFILE)"; - $!{EMSGSIZE} ? stream_in_full($s1, $fds, $buf) : - croak("sendmsg: $!"); - } + my $s1 = $self->{-wq_s1} or Carp::confess('no -wq_s1'); + my $fds = [ map { fileno($_) } @$ios ]; + my $buf = ipc_freeze([$sub, @args]); + if (length($buf) > $MY_MAX_ARG_STRLEN) { + stream_in_full($s1, $fds, $buf); } else { - @$self{0..$#$ios} = @$ios; - eval { $self->$sub(@args) }; - warn "wq_io_do: $@" if $@; - delete @$self{0..$#$ios}; # don't close + my $n = $send_cmd->($s1, $fds, $buf, 0); + return if defined($n); # likely + $!{ETOOMANYREFS} and croak "sendmsg: $! (check RLIMIT_NOFILE)"; + $!{EMSGSIZE} ? stream_in_full($s1, $fds, $buf) : + croak("sendmsg: $!"); } } @@ -339,7 +302,7 @@ sub wq_sync_run { sub wq_do { my ($self, $sub, @args) = @_; if (defined(wantarray)) { - pipe(my ($r, $w)) or die "pipe: $!"; + pipe(my $r, my $w); wq_io_do($self, 'wq_sync_run', [ $w ], wantarray, $sub, @args); undef $w; _wait_return($r, $sub); @@ -350,7 +313,6 @@ sub wq_do { sub prepare_nonblock { ($_[0]->{-wq_s1} // die 'BUG: no {-wq_s1}')->blocking(0); - $_[0]->{-reap_async} or die 'BUG: {-reap_async} needed for nonblock'; require PublicInbox::WQBlocked; } @@ -360,24 +322,19 @@ sub wq_nonblock_do { # always async if ($self->{wqb}) { # saturated once, assume saturated forever $self->{wqb}->flush_send($buf); } else { - $send_cmd->($self->{-wq_s1}, [], $buf, MSG_EOR) // + $send_cmd->($self->{-wq_s1}, [], $buf, 0) // ($!{EAGAIN} ? PublicInbox::WQBlocked->new($self, $buf) : croak("sendmsg: $!")); } } -sub _wq_worker_start ($$$$) { - my ($self, $oldset, $fields, $one) = @_; +sub _wq_worker_start { + my ($self, $oldset, $fields, $one, @cb_args) = @_; my ($bcast1, $bcast2); - $one or socketpair($bcast1, $bcast2, AF_UNIX, $SEQPACKET, 0) or - die "socketpair: $!"; - my $seed = rand(0xffffffff); - my $pid = fork // die "fork: $!"; + $one or socketpair($bcast1, $bcast2, AF_UNIX, SOCK_SEQPACKET, 0); + my $pid = PublicInbox::DS::fork_persist; if ($pid == 0) { - srand($seed); - eval { Net::SSLeay::randomize() }; undef $bcast1; - eval { PublicInbox::DS->Reset }; delete @$self{qw(-wq_s1 -wq_ppid)}; $self->{-wq_worker_nr} = keys %{delete($self->{-wq_workers}) // {}}; @@ -385,30 +342,28 @@ sub _wq_worker_start ($$$$) { local $0 = $one ? $self->{-wq_ident} : "$self->{-wq_ident} $self->{-wq_worker_nr}"; # ensure we properly exit even if warn() dies: - my $end = PublicInbox::OnDestroy->new($$, sub { exit(!!$@) }); + my $end = on_destroy \&exit_exception; eval { $fields //= {}; local @$self{keys %$fields} = values(%$fields); my $on_destroy = $self->ipc_atfork_child; local @SIG{keys %SIG} = values %SIG; - PublicInbox::DS::sig_setmask($oldset); - wq_worker_loop($self, $bcast2); + wq_worker_loop($self, $bcast2, $oldset); }; warn "worker $self->{-wq_ident} PID:$$ died: $@" if $@; undef $end; # trigger exit } else { $self->{-wq_workers}->{$pid} = $bcast1; + awaitpid($pid, \&ipc_worker_reap, $self, @cb_args); } } # starts workqueue workers if Sereal or Storable is installed sub wq_workers_start { - my ($self, $ident, $nr_workers, $oldset, $fields) = @_; - ($send_cmd && $recv_cmd && defined($SEQPACKET)) or return; + my ($self, $ident, $nr_workers, $oldset, $fields, @cb_args) = @_; + ($send_cmd && $recv_cmd) or return; return if $self->{-wq_s1}; # idempotent - $self->{-wq_s1} = $self->{-wq_s2} = undef; - socketpair($self->{-wq_s1}, $self->{-wq_s2}, AF_UNIX, $SEQPACKET, 0) or - die "socketpair: $!"; + socketpair($self->{-wq_s1}, $self->{-wq_s2},AF_UNIX, SOCK_SEQPACKET, 0); $self->ipc_atfork_prepare; $nr_workers //= $self->{-wq_nr_workers}; # was set earlier my $sigset = $oldset // PublicInbox::DS::block_signals(); @@ -416,7 +371,9 @@ sub wq_workers_start { $self->{-wq_ident} = $ident; my $one = $nr_workers == 1; $self->{-wq_nr_workers} = $nr_workers; - _wq_worker_start($self, $sigset, $fields, $one) for (1..$nr_workers); + for (1..$nr_workers) { + _wq_worker_start($self, $sigset, $fields, $one, @cb_args); + } PublicInbox::DS::sig_setmask($sigset) unless $oldset; $self->{-wq_ppid} = $$; } @@ -424,13 +381,11 @@ sub wq_workers_start { sub wq_close { my ($self) = @_; if (my $wqb = delete $self->{wqb}) { - $self->{-reap_async} or die 'BUG: {-reap_async} unset'; $wqb->enq_close; } delete @$self{qw(-wq_s1 -wq_s2)} or return; - return if $self->{-reap_async}; - my @pids = keys %{$self->{-wq_workers}}; - dwaitpid($_, \&ipc_worker_reap, [ $self ]) for @pids; + return if ($self->{-wq_ppid} // -1) != $$; + awaitpid($_) for keys %{$self->{-wq_workers}}; } sub wq_kill { @@ -446,23 +401,52 @@ sub DESTROY { ipc_worker_stop($self); } +# _SC_NPROCESSORS_ONLN = 84 on both Linux glibc and musl, +# emitted using: $^X devel/sysdefs-list +my %NPROCESSORS_ONLN = ( + linux => 84, + freebsd => 58, + dragonfly => 58, + openbsd => 503, + netbsd => 1002 +); + sub detect_nproc () { - # _SC_NPROCESSORS_ONLN = 84 on both Linux glibc and musl - return POSIX::sysconf(84) if $^O eq 'linux'; - return POSIX::sysconf(58) if $^O eq 'freebsd'; - # TODO: more OSes + my $n = $NPROCESSORS_ONLN{$^O}; + return POSIX::sysconf($n) if defined $n; - # getconf(1) is POSIX, but *NPROCESSORS* vars are not + # getconf(1) is POSIX, but *NPROCESSORS* vars are not even if + # glibc, {Free,Net,Open}BSD all support them. for (qw(_NPROCESSORS_ONLN NPROCESSORS_ONLN)) { `getconf $_ 2>/dev/null` =~ /^(\d+)$/ and return $1; } - for my $nproc (qw(nproc gnproc)) { # GNU coreutils nproc - `$nproc 2>/dev/null` =~ /^(\d+)$/ and return $1; + # note: GNU nproc(1) checks CPU affinity, which is nice but + # isn't remotely portable + undef +} + +# SATA storage lags behind what CPUs are capable of, so relying on +# nproc(1) can be misleading and having extra Xapian shards is a +# waste of FDs and space. It can also lead to excessive IO latency +# and slow things down. Users on NVME or other fast storage can +# use the NPROC env or switches in our script/public-inbox-* programs +# to increase Xapian shards +our $NPROC_MAX_DEFAULT = 4; + +sub nproc_shards ($) { + my ($creat_opt) = @_; + my $n = $creat_opt->{nproc} if ref($creat_opt) eq 'HASH'; + $n //= $ENV{NPROC}; + if (!$n) { + # assume 2 cores if not detectable or zero + state $NPROC_DETECTED = PublicInbox::IPC::detect_nproc() || 2; + $n = $NPROC_DETECTED; + $n = $NPROC_MAX_DEFAULT if $n > $NPROC_MAX_DEFAULT; } - # should we bother with `sysctl hw.ncpu`? Those only give - # us total processor count, not online processor count. - undef + # subtract for the main process and git-fast-import + $n -= 1; + $n < 1 ? 1 : $n; } 1; diff --git a/lib/PublicInbox/IdxStack.pm b/lib/PublicInbox/IdxStack.pm index 54d480bd..7681ee6f 100644 --- a/lib/PublicInbox/IdxStack.pm +++ b/lib/PublicInbox/IdxStack.pm @@ -1,16 +1,18 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # temporary stack for public-inbox-index +# FIXME: needs to support multi-hash in the same repo once git itself can package PublicInbox::IdxStack; -use v5.10.1; -use strict; +use v5.12; use Fcntl qw(:seek); use constant PACK_FMT => eval { pack('Q', 1) } ? 'A1QQH*H*' : 'A1IIH*H*'; +use autodie qw(open seek); +use PublicInbox::IO qw(read_all); # start off in write-only mode sub new { - open(my $io, '+>', undef) or die "open: $!"; + open(my $io, '+>', undef); # latest_cmt is still useful when the newest revision is a `d'(elete), # otherwise we favor $sync->{latest_cmt} for checkpoints and {quit} bless { wr => $io, latest_cmt => $_[1] }, __PACKAGE__ @@ -27,7 +29,7 @@ sub push_rec { $self->{rec_size} = length($rec); $self->{unpack_fmt} = $fmt; }; - print { $self->{wr} } $rec or die "print: $!"; + print { $self->{wr} } $rec; $self->{tot_size} += length($rec); } @@ -49,12 +51,8 @@ sub pop_rec { my $sz = $self->{rec_size} or return; my $rec_pos = $self->{tot_size} -= $sz; return if $rec_pos < 0; - my $io = $self->{rd}; - seek($io, $rec_pos, SEEK_SET) or die "seek: $!"; - my $r = read($io, my $buf, $sz); - defined($r) or die "read: $!"; - $r == $sz or die "read($r != $sz)"; - unpack($self->{unpack_fmt}, $buf); + seek($self->{rd}, $rec_pos, SEEK_SET); + unpack($self->{unpack_fmt}, read_all($self->{rd}, $sz)); } 1; diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 60ce7b66..ed34d548 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # git fast-import-based ssoma-mda MDA replacement @@ -6,10 +6,9 @@ # and public-inbox-watch. Not the WWW or NNTP code which only # requires read-only access. package PublicInbox::Import; -use strict; +use v5.12; use parent qw(PublicInbox::Lock); -use v5.10.1; -use PublicInbox::Spawn qw(run_die popen_rd); +use PublicInbox::Spawn qw(run_die run_qx spawn); use PublicInbox::MID qw(mids mid2path); use PublicInbox::Address; use PublicInbox::Smsg; @@ -18,13 +17,16 @@ use PublicInbox::ContentHash qw(content_digest); use PublicInbox::MDA; use PublicInbox::Eml; use POSIX qw(strftime); +use autodie qw(socketpair); +use Carp qw(croak); +use Socket qw(AF_UNIX SOCK_STREAM); +use PublicInbox::IO qw(read_all); sub default_branch () { state $default_branch = do { - my $r = popen_rd([qw(git config --global init.defaultBranch)], + my $h = run_qx([qw(git config --global init.defaultBranch)], { GIT_CONFIG => undef }); - chomp(my $h = <$r> // ''); - close $r; + chomp $h; $h eq '' ? 'refs/heads/master' : "refs/heads/$h"; } } @@ -55,11 +57,10 @@ sub new { # idempotent start function sub gfi_start { my ($self) = @_; - - return ($self->{in}, $self->{out}) if $self->{in}; - - my ($in_r, $out_r, $out_w); - pipe($out_r, $out_w) or die "pipe failed: $!"; + my $io = $self->{io}; + return $io if $io; + socketpair($io, my $s2, AF_UNIX, SOCK_STREAM, 0); + $io->autoflush(1); $self->lock_acquire; eval { @@ -72,21 +73,20 @@ sub gfi_start { die "fatal: ls-tree -r -z --name-only $ref: \$?=$?" if $?; $self->{-tree} = { map { $_ => 1 } split(/\0/, $t) }; } - $in_r = $self->{in} = $git->popen(qw(fast-import - --quiet --done --date-format=raw), - undef, { 0 => $out_r }); - $out_w->autoflush(1); - $self->{out} = $out_w; + my $gfi = [ 'git', "--git-dir=$git->{git_dir}", qw(fast-import + --quiet --done --date-format=raw) ]; + my $pid = spawn($gfi, undef, { 0 => $s2, 1 => $s2 }); $self->{nchg} = 0; + $self->{io} = PublicInbox::IO::attach_pid($io, $pid); }; if ($@) { $self->lock_release; die $@; } - ($in_r, $out_w); + $self->{io}; } -sub wfail () { die "write to fast-import failed: $!" } +sub wfail () { croak "write to fast-import failed: $!" } sub now_raw () { time . ' +0000' } @@ -98,60 +98,43 @@ sub norm_body ($) { } # only used for v1 (ssoma) inboxes -sub _check_path ($$$$) { - my ($r, $w, $tip, $path) = @_; +sub _check_path ($$$) { + my ($io, $tip, $path) = @_; return if $tip eq ''; - print $w "ls $tip $path\n" or wfail; + print $io "ls $tip $path\n" or wfail; local $/ = "\n"; - defined(my $info = <$r>) or die "EOF from fast-import: $!"; + my $info = <$io> // die "EOF from fast-import: $!"; $info =~ /\Amissing / ? undef : $info; } -sub _cat_blob ($$$) { - my ($r, $w, $oid) = @_; - print $w "cat-blob $oid\n" or wfail; +sub _cat_blob ($$) { + my ($io, $oid) = @_; + print $io "cat-blob $oid\n" or wfail; local $/ = "\n"; - my $info = <$r>; - defined $info or die "EOF from fast-import / cat-blob: $!"; + my $info = <$io> // die "EOF from fast-import / cat-blob: $!"; $info =~ /\A[a-f0-9]{40,} blob ([0-9]+)\n\z/ or return; - my $left = $1; - my $offset = 0; - my $buf = ''; - my $n; - while ($left > 0) { - $n = read($r, $buf, $left, $offset); - defined($n) or die "read cat-blob failed: $!"; - $n == 0 and die 'fast-export (cat-blob) died'; - $left -= $n; - $offset += $n; - } - $n = read($r, my $lf, 1); - defined($n) or die "read final byte of cat-blob failed: $!"; - die "bad read on final byte: <$lf>" if $lf ne "\n"; - - # fixup some bugginess in old versions: - $buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + my $buf = read_all($io, my $len = $1 + 1); + my $lf = chop $buf; + croak "bad read on final byte: <$lf>" if $lf ne "\n"; \$buf; } sub cat_blob { my ($self, $oid) = @_; - my ($r, $w) = $self->gfi_start; - _cat_blob($r, $w, $oid); + _cat_blob($self->{io} // return, $oid); } sub check_remove_v1 { - my ($r, $w, $tip, $path, $mime) = @_; + my ($io, $tip, $path, $mime) = @_; - my $info = _check_path($r, $w, $tip, $path) or return ('MISSING',undef); + my $info = _check_path($io, $tip, $path) or return ('MISSING',undef); $info =~ m!\A100644 blob ([a-f0-9]{40,})\t!s or die "not blob: $info"; my $oid = $1; - my $msg = _cat_blob($r, $w, $oid) or die "BUG: cat-blob $1 failed"; - my $cur = PublicInbox::Eml->new($msg); - my $cur_s = $cur->header('Subject'); - $cur_s = '' unless defined $cur_s; - my $cur_m = $mime->header('Subject'); - $cur_m = '' unless defined $cur_m; + my $bref = _cat_blob($io, $oid) or die "BUG: cat-blob $1 failed"; + PublicInbox::Eml::strip_from($$bref); + my $cur = PublicInbox::Eml->new($bref); + my $cur_s = $cur->header('Subject') // ''; + my $cur_m = $mime->header('Subject') // ''; if ($cur_s ne $cur_m || norm_body($cur) ne norm_body($mime)) { return ('MISMATCH', $cur); } @@ -160,16 +143,15 @@ sub check_remove_v1 { sub checkpoint { my ($self) = @_; - return unless $self->{in}; - print { $self->{out} } "checkpoint\n" or wfail; + print { $self->{io} // return } "checkpoint\n" or wfail; undef; } sub progress { my ($self, $msg) = @_; - return unless $self->{in}; - print { $self->{out} } "progress $msg\n" or wfail; - readline($self->{in}) eq "progress $msg\n" or die + my $io = $self->{io} or return; + print $io "progress $msg\n" or wfail; + readline($io) eq "progress $msg\n" or die "progress $msg not received\n"; undef; } @@ -185,8 +167,8 @@ sub _update_git_info ($$) { my $env = { GIT_INDEX_FILE => $index }; run_die([@cmd, qw(read-tree -m -v -i), $self->{ref}], $env); } - eval { run_die([@cmd, 'update-server-info']) }; my $ibx = $self->{ibx}; + eval { run_die([@cmd, 'update-server-info']) } if $ibx; if ($ibx && $ibx->version == 1 && -d "$ibx->{inboxdir}/public-inbox" && eval { require PublicInbox::SearchIdx }) { eval { @@ -195,7 +177,10 @@ sub _update_git_info ($$) { }; warn "$ibx->{inboxdir} index failed: $@\n" if $@; } - eval { run_die([@cmd, qw(gc --auto)]) } if $do_gc; + if ($do_gc) { + my @quiet = (-t STDERR ? () : '-q'); + eval { run_die([@cmd, qw(gc --auto), @quiet]) } + } } sub barrier { @@ -216,10 +201,9 @@ sub barrier { # used for v2 sub get_mark { my ($self, $mark) = @_; - die "not active\n" unless $self->{in}; - my ($r, $w) = $self->gfi_start; - print $w "get-mark $mark\n" or wfail; - defined(my $oid = <$r>) or die "get-mark failed, need git 2.6.0+\n"; + my $io = $self->{io} or croak "not active\n"; + print $io "get-mark $mark\n" or wfail; + my $oid = <$io> // die "get-mark failed, need git 2.6.0+\n"; chomp($oid); $oid; } @@ -236,11 +220,11 @@ sub remove { my $path_type = $self->{path_type}; my ($path, $err, $cur, $blob); - my ($r, $w) = $self->gfi_start; + my $io = gfi_start($self); my $tip = $self->{tip}; if ($path_type eq '2/38') { $path = mid2path(v1_mid0($mime)); - ($err, $cur) = check_remove_v1($r, $w, $tip, $path, $mime); + ($err, $cur) = check_remove_v1($io, $tip, $path, $mime); return ($err, $cur) if $err; } else { my $sref; @@ -252,7 +236,7 @@ sub remove { } my $len = length($$sref); $blob = $self->{mark}++; - print $w "blob\nmark :$blob\ndata $len\n", + print $io "blob\nmark :$blob\ndata $len\n", $$sref, "\n" or wfail; } @@ -260,22 +244,22 @@ sub remove { my $commit = $self->{mark}++; my $parent = $tip =~ /\A:/ ? $tip : undef; unless ($parent) { - print $w "reset $ref\n" or wfail; + print $io "reset $ref\n" or wfail; } my $ident = $self->{ident}; my $now = now_raw(); $msg //= 'rm'; my $len = length($msg) + 1; - print $w "commit $ref\nmark :$commit\n", + print $io "commit $ref\nmark :$commit\n", "author $ident $now\n", "committer $ident $now\n", "data $len\n$msg\n\n", 'from ', ($parent ? $parent : $tip), "\n" or wfail; if (defined $path) { - print $w "D $path\n\n" or wfail; + print $io "D $path\n\n" or wfail; } else { - clean_tree_v2($self, $w, 'd'); - print $w "M 100644 :$blob d\n\n" or wfail; + clean_tree_v2($self, $io, 'd'); + print $io "M 100644 :$blob d\n\n" or wfail; } $self->{nchg}++; (($self->{tip} = ":$commit"), $cur); @@ -337,11 +321,38 @@ sub extract_cmt_info ($;$) { # kill potentially confusing/misleading headers our @UNWANTED_HEADERS = (qw(Bytes Lines Content-Length), qw(Status X-Status)); +our $DROP_UNIQUE_UNSUB; sub drop_unwanted_headers ($) { my ($eml) = @_; for (@UNWANTED_HEADERS, @PublicInbox::MDA::BAD_HEADERS) { $eml->header_set($_); } + + # We don't want public-inbox readers to be able to unsubcribe the + # address which does archiving. WARNING: this breaks DKIM if the + # mailing list sender follows RFC 8058, section 4; but breaking DKIM + # (or have senders ignore RFC 8058 sec. 4) is preferable to having + # saboteurs unsubscribing independent archivists: + if ($DROP_UNIQUE_UNSUB && grep(/\AList-Unsubscribe=One-Click\z/, + $eml->header_raw('List-Unsubscribe-Post'))) { + for (qw(List-Unsubscribe-Post List-Unsubscribe)) { + $eml->header_set($_) + } + } +} + +sub load_config ($;$) { + my ($cfg, $do_exit) = @_; + my $v = $cfg->{lc 'publicinboxImport.dropUniqueUnsubscribe'}; + if (defined $v) { + $DROP_UNIQUE_UNSUB = $cfg->git_bool($v) // do { + warn <<EOM; +E: publicinboxImport.dropUniqueUnsubscribe=$v in $cfg->{-f} is not boolean +EOM + $do_exit //= \&CORE::exit; + $do_exit->(78); # EX_CONFIG + }; + } } # used by V2Writable, too @@ -365,11 +376,11 @@ sub v1_mid0 ($) { $mids->[0]; } sub clean_tree_v2 ($$$) { - my ($self, $w, $keep) = @_; + my ($self, $io, $keep) = @_; my $tree = $self->{-tree} or return; #v2 only delete $tree->{$keep}; foreach (keys %$tree) { - print $w "D $_\n" or wfail; + print $io "D $_\n" or wfail; } %$tree = ($keep => 1); } @@ -388,10 +399,10 @@ sub add { $path = 'm'; } - my ($r, $w) = $self->gfi_start; + my $io = gfi_start($self); my $tip = $self->{tip}; if ($path_type eq '2/38') { - _check_path($r, $w, $tip, $path) and return; + _check_path($io, $tip, $path) and return; } drop_unwanted_headers($mime); @@ -405,8 +416,7 @@ sub add { my $raw_email = $mime->{-public_inbox_raw} // $mime->as_string; my $n = length($raw_email); $self->{bytes_added} += $n; - print $w "blob\nmark :$blob\ndata ", $n, "\n" or wfail; - print $w $raw_email, "\n" or wfail; + print $io "blob\nmark :$blob\ndata $n\n", $raw_email, "\n" or wfail; # v2: we need this for Xapian if ($smsg) { @@ -433,19 +443,19 @@ sub add { my $parent = $tip =~ /\A:/ ? $tip : undef; unless ($parent) { - print $w "reset $ref\n" or wfail; + print $io "reset $ref\n" or wfail; } - print $w "commit $ref\nmark :$commit\n", + print $io "commit $ref\nmark :$commit\n", "author $author $at\n", - "committer $self->{ident} $ct\n" or wfail; - print $w "data ", (length($subject) + 1), "\n", + "committer $self->{ident} $ct\n", + "data ", (length($subject) + 1), "\n", $subject, "\n\n" or wfail; if ($tip ne '') { - print $w 'from ', ($parent ? $parent : $tip), "\n" or wfail; + print $io 'from ', ($parent ? $parent : $tip), "\n" or wfail; } - clean_tree_v2($self, $w, $path); - print $w "M 100644 :$blob $path\n\n" or wfail; + clean_tree_v2($self, $io, $path); + print $io "M 100644 :$blob $path\n\n" or wfail; $self->{nchg}++; $self->{tip} = ":$commit"; } @@ -461,32 +471,37 @@ my @INIT_FILES = ('HEAD' => undef, # filled in at runtime EOC sub init_bare { - my ($dir, $head) = @_; # or self + my ($dir, $head, $fmt) = @_; # or self $dir = $dir->{git}->{git_dir} if ref($dir); require File::Path; - File::Path::mkpath([ map { "$dir/$_" } qw(objects/info refs/heads) ]); + File::Path::make_path(map { $dir.$_ } qw(/objects/info /refs/heads)); $INIT_FILES[1] //= 'ref: '.default_branch."\n"; my @fn_contents = @INIT_FILES; $fn_contents[1] = "ref: refs/heads/$head\n" if defined $head; + $fn_contents[3] = <<EOM if defined($fmt) && $fmt ne 'sha1'; +[core] + repositoryFormatVersion = 1 + filemode = true + bare = true +[extensions] + objectFormat = $fmt +EOM while (my ($fn, $contents) = splice(@fn_contents, 0, 2)) { my $f = $dir.'/'.$fn; next if -f $f; - open my $fh, '>', $f or die "open $f: $!"; - print $fh $contents or die "print $f: $!"; - close $fh or die "close $f: $!"; + PublicInbox::IO::write_file '>', $f, $contents; } } # true if locked and active -sub active { !!$_[0]->{out} } +sub active { !!$_[0]->{io} } sub done { my ($self) = @_; - my $w = delete $self->{out} or return; + my $io = delete $self->{io} or return; eval { - my $r = delete $self->{in} or die 'BUG: missing {in} when done'; - print $w "done\n" or wfail; - close $r or die "fast-import failed: $?"; # ProcessPipe::CLOSE + print $io "done\n" or wfail; + $io->close or croak "close fast-import \$?=$?"; # reaps }; my $wait_err = $@; my $nchg = delete $self->{nchg}; @@ -499,13 +514,7 @@ sub done { die $wait_err if $wait_err; } -sub atfork_child { - my ($self) = @_; - foreach my $f (qw(in out)) { - next unless defined($self->{$f}); - close $self->{$f} or die "failed to close import[$f]: $!\n"; - } -} +sub atfork_child { (delete($_[0]->{io}) // return)->close } sub digest2mid ($$;$) { my ($dig, $hdr, $fallback_time) = @_; @@ -558,7 +567,7 @@ sub replace_oids { my $git = $self->{git}; my @export = (qw(fast-export --no-data --use-done-feature), $old); my $rd = $git->popen(@export); - my ($r, $w) = $self->gfi_start; + my $io = gfi_start($self); my @buf; my $nreplace = 0; my @oids; @@ -569,17 +578,14 @@ sub replace_oids { push @buf, "reset $tmp\n"; } elsif (/^commit (?:.+)/) { if (@buf) { - print $w @buf or wfail; + print $io @buf or wfail; @buf = (); } push @buf, "commit $tmp\n"; } elsif (/^data ([0-9]+)/) { # only commit message, so $len is small: - my $len = $1; # + 1 for trailing "\n" push @buf, $_; - my $n = read($rd, my $buf, $len) or die "read: $!"; - $len == $n or die "short read ($n < $len)"; - push @buf, $buf; + push @buf, read_all($rd, my $len = $1); } elsif (/^M 100644 ([a-f0-9]+) (\w+)/) { my ($oid, $path) = ($1, $2); $tree->{$path} = 1; @@ -606,7 +612,7 @@ sub replace_oids { rewrite_commit($self, \@oids, \@buf, $mime); $nreplace++; } - print $w @buf, "\n" or wfail; + print $io @buf, "\n" or wfail; @buf = (); } elsif ($_ eq "done\n") { $done = 1; @@ -617,9 +623,9 @@ sub replace_oids { push @buf, $_; } } - close $rd or die "close fast-export failed: $?"; + $rd->close or die "E: git @export (\$?=$?)"; if (@buf) { - print $w @buf or wfail; + print $io @buf or wfail; } die 'done\n not seen from fast-export' unless $done; chomp(my $cmt = $self->get_mark(":$mark")) if $nreplace; diff --git a/lib/PublicInbox/In2Tie.pm b/lib/PublicInbox/In2Tie.pm index ffe26a44..3689432b 100644 --- a/lib/PublicInbox/In2Tie.pm +++ b/lib/PublicInbox/In2Tie.pm @@ -1,10 +1,10 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # used to ensure PublicInbox::DS can call fileno() as a function # on Linux::Inotify2 objects package PublicInbox::In2Tie; -use strict; +use v5.12; use Symbol qw(gensym); sub io { diff --git a/lib/PublicInbox/In3Event.pm b/lib/PublicInbox/In3Event.pm new file mode 100644 index 00000000..f93dc0da --- /dev/null +++ b/lib/PublicInbox/In3Event.pm @@ -0,0 +1,24 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# duck-type compatible with Linux::Inotify2::Event for pure Perl +# PublicInbox::Inotify3 w/o callback support +package PublicInbox::In3Event; +use v5.12; + +sub w { $_[0]->[2] } # PublicInbox::In3Watch +sub mask { $_[0]->[0] } +sub name { $_[0]->[1] } + +sub fullname { + my ($name, $wname) = ($_[0]->[1], $_[0]->[2]->name); + length($name) ? "$wname/$name" : $wname; +} + +my $buf = ''; +while (my ($sym, $mask) = each %PublicInbox::Inotify3::events) { + $buf .= "sub $sym { \$_[0]->[0] & $mask }\n"; +} +eval $buf; + +1; diff --git a/lib/PublicInbox/In3Watch.pm b/lib/PublicInbox/In3Watch.pm new file mode 100644 index 00000000..bdb91869 --- /dev/null +++ b/lib/PublicInbox/In3Watch.pm @@ -0,0 +1,20 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# duck-type compatible with Linux::Inotify2::Watch for pure Perl +# PublicInbox::Inotify3 for our needs, only +package PublicInbox::In3Watch; +use v5.12; + +sub mask { $_[0]->[1] } +sub name { $_[0]->[2] } + +sub cancel { + my ($self) = @_; + my ($wd, $in3) = @$self[0, 3]; + $in3 or return 1; # already canceled + pop @$self; + $in3->rm_watch($wd); +} + +1; diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 3f70e69d..dd689221 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -10,32 +10,22 @@ use PublicInbox::MID qw(mid2path); use PublicInbox::Eml; use List::Util qw(max); use Carp qw(croak); +use PublicInbox::Compat qw(uniqstr); -# returns true if further checking is required +# in case DBs get replaced (Xapcmd does it for v1) sub check_inodes ($) { for (qw(over mm)) { $_[0]->{$_}->check_inodes if $_[0]->{$_} } } +# search/over/mm hold onto FDs and description+cloneurl may get updated. +# creating long-lived allocations in the same phase as short-lived +# allocations also leads to fragmentation, so we don't want some stuff +# living too long. sub do_cleanup { my ($ibx) = @_; - my $live; - if (defined $ibx->{git}) { - $live = $ibx->isa(__PACKAGE__) ? $ibx->{git}->cleanup(1) - : $ibx->{git}->cleanup_if_unlinked; - delete($ibx->{git}) unless $live; - } - if ($live) { - check_inodes($ibx); - } else { - delete(@$ibx{qw(over mm description cloneurl - -imap_url -nntp_url -pop3_url)}); - } - my $srch = $ibx->{search} // $ibx; + my ($srch) = delete @$ibx{qw(search over mm description cloneurl)}; + $srch //= $ibx; # extsearch delete @$srch{qw(xdb qp)}; - for my $git (@{$ibx->{-repo_objs} // []}) { - $live = 1 if $git->cleanup(1); - } - PublicInbox::DS::add_uniq_timer($ibx+0, 5, \&do_cleanup, $ibx) if $live; } sub _cleanup_later ($) { @@ -54,8 +44,8 @@ sub _set_limiter ($$$) { my $val = $self->{$mkey} or return; my $lim; if ($val =~ /\A[0-9]+\z/) { - require PublicInbox::Qspawn; - $lim = PublicInbox::Qspawn::Limiter->new($val); + require PublicInbox::Limiter; + $lim = PublicInbox::Limiter->new($val); } elsif ($val =~ /\A[a-z][a-z0-9]*\z/) { $lim = $pi_cfg->limiter($val); warn "$mkey limiter=$val not found\n" if !$lim; @@ -80,12 +70,8 @@ sub new { delete $opts->{feedmax}; } # allow any combination of multi-line or comma-delimited hide entries - my $hide = {}; - if (defined(my $h = $opts->{hide})) { - foreach my $v (@$h) { - $hide->{$_} = 1 foreach (split(/\s*,\s*/, $v)); - } - $opts->{-hide} = $hide; + for $v (@{delete($opts->{hide}) // []}) { + $opts->{-'hide_'.$_} = 1 for split(/\s*,\s*/, $v); } bless $opts, $class; } @@ -115,7 +101,6 @@ sub git { my $g = PublicInbox::Git->new($git_dir); my $lim = $self->{-httpbackend_limiter}; $g->{-httpbackend_limiter} = $lim if $lim; - _cleanup_later($self); $g; }; } @@ -181,33 +166,18 @@ sub over { } // ($req ? croak("E: $@") : undef); } -sub try_cat { - my ($path) = @_; - open(my $fh, '<', $path) or return ''; - local $/; - <$fh> // ''; -} - -sub cat_desc ($) { - my $desc = try_cat($_[0]); - local $/ = "\n"; - chomp $desc; - utf8::decode($desc); - $desc =~ s/\s+/ /smg; - $desc eq '' ? undef : $desc; -} - sub description { my ($self) = @_; - ($self->{description} //= cat_desc("$self->{inboxdir}/description")) // + ($self->{description} //= + PublicInbox::Git::cat_desc("$self->{inboxdir}/description")) // '($INBOX_DIR/description missing)'; } sub cloneurl { my ($self) = @_; $self->{cloneurl} // do { - my $s = try_cat("$self->{inboxdir}/cloneurl"); - my @urls = split(/\s+/s, $s); + my @urls = split(/\s+/s, + PublicInbox::IO::try_cat "$self->{inboxdir}/cloneurl"); scalar(@urls) ? ($self->{cloneurl} = \@urls) : undef; } // []; } @@ -220,7 +190,8 @@ sub base_url { $url .= '/' if $url !~ m!/\z!; return $url .= $self->{name} . '/'; } - # called from a non-PSGI environment (e.g. NNTP/POP3): + # called from a non-PSGI environment or cross-inbox environment + # where multiple inboxes can have different domains my $url = $self->{url} // return undef; $url = $url->[0] // return undef; # expand protocol-relative URLs to HTTPS if we're @@ -264,11 +235,7 @@ EOM # nntp://news.example.com/alt.example push @m, $u; } - - # List::Util::uniq requires Perl 5.26+, maybe we - # can use it by 2030 or so - my %seen; - @urls = grep { !$seen{$_}++ } (@urls, @m); + @urls = uniqstr @urls, @m; } \@urls; } @@ -285,11 +252,10 @@ sub pop3_url { my $group = $self->{newsgroup}; my @urls; ($ps && $group) and - @urls = map { m!\Apops?://! ? $_ : "pop://$_" } @$ps; + @urls = map { m!\Apop3?s?://! ? $_ : "pop3://$_" } @$ps; if (my $mi = $self->{'pop3mirror'}) { - my @m = map { m!\Apops?://! ? $_ : "pop://$_" } @$mi; - my %seen; # List::Util::uniq requires Perl 5.26+ - @urls = grep { !$seen{$_}++ } (@urls, @m); + my @m = map { m!\Apop3?s?://! ? $_ : "pop3://$_" } @$mi; + @urls = uniqstr @urls, @m; } my $n = 0; for (@urls) { $n += s!/+\z!! } @@ -351,11 +317,6 @@ sub msg_by_mid ($$) { $smsg ? msg_by_smsg($self, $smsg) : msg_by_path($self, mid2path($mid)); } -sub recent { - my ($self, $opts, $after, $before) = @_; - $self->over->recent($opts, $after, $before); -} - sub modified { my ($self) = @_; if (my $over = $self->over) { @@ -397,7 +358,7 @@ sub unsubscribe_unlock { # called by inotify sub on_unlock { my ($self) = @_; - check_inodes($self); + check_inodes($self); # DB files may be replaced while holding lock my $subs = $self->{unlock_subs} or return; for my $obj (values %$subs) { eval { $obj->on_inbox_unlock($self) }; @@ -431,4 +392,6 @@ sub mailboxid { # rfc 8474, 8620, 8621 sprintf('-%x', uidvalidity($self) // 0) } +sub thing_type { 'public inbox' } + 1; diff --git a/lib/PublicInbox/InboxIdle.pm b/lib/PublicInbox/InboxIdle.pm index ffbbfea7..3c4d4a68 100644 --- a/lib/PublicInbox/InboxIdle.pm +++ b/lib/PublicInbox/InboxIdle.pm @@ -1,18 +1,18 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # fields: # inot: Linux::Inotify2-like object # pathmap => { inboxdir => [ ibx, watch1, watch2, watch3... ] } mapping package PublicInbox::InboxIdle; -use strict; +use v5.12; use parent qw(PublicInbox::DS); use PublicInbox::Syscall qw(EPOLLIN); my $IN_MODIFY = 0x02; # match Linux inotify my $ino_cls; -if ($^O eq 'linux' && eval { require Linux::Inotify2; 1 }) { - $IN_MODIFY = Linux::Inotify2::IN_MODIFY(); - $ino_cls = 'Linux::Inotify2'; +if ($^O eq 'linux' && eval { require PublicInbox::Inotify }) { + $IN_MODIFY = PublicInbox::Inotify::IN_MODIFY(); + $ino_cls = 'PublicInbox::Inotify'; } elsif (eval { require PublicInbox::KQNotify }) { $IN_MODIFY = PublicInbox::KQNotify::NOTE_WRITE(); $ino_cls = 'PublicInbox::KQNotify'; @@ -34,7 +34,7 @@ sub in2_arm ($$) { # PublicInbox::Config::each_inbox callback $ibx->{unlock_subs} = $old_ibx->{unlock_subs}; %{$ibx->{unlock_subs}} = (%$u, %{$ibx->{unlock_subs}}) if $u; - # Linux::Inotify2::Watch::name matches if watches are the + # *::Inotify*::Watch::name matches if watches are the # same, no point in replacing a watch of the same name if ($cur->[1]->name eq $lock) { $self->{on_unlock}->{$lock} = $ibx; @@ -48,11 +48,9 @@ sub in2_arm ($$) { # PublicInbox::Config::each_inbox callback $self->{on_unlock}->{$w->name} = $ibx; } else { warn "E: ".ref($inot)."->watch($lock, IN_MODIFY) failed: $!\n"; - if ($!{ENOSPC} && $^O eq 'linux') { - warn <<""; -I: consider increasing /proc/sys/fs/inotify/max_user_watches + warn <<"" if $!{ENOSPC} && $^O eq 'linux'; +# consider increasing /proc/sys/fs/inotify/max_user_watches - } } # TODO: detect deleted packs (and possibly other files) @@ -89,7 +87,7 @@ sub new { sub event_step { my ($self) = @_; eval { - my @events = $self->{inot}->read; # Linux::Inotify2::read + my @events = $self->{inot}->read; # PublicInbox::Inotify3::read my $on_unlock = $self->{on_unlock}; for my $ev (@events) { my $fn = $ev->fullname // next; # cancelled diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index 17dfbe18..8e95cb28 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -1,25 +1,18 @@ -# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Extends read-only Inbox for writing package PublicInbox::InboxWritable; use strict; use v5.10.1; -use parent qw(PublicInbox::Inbox Exporter); +use parent qw(PublicInbox::Inbox PublicInbox::Umask Exporter); use PublicInbox::Import; +use PublicInbox::IO qw(read_all); use PublicInbox::Filter::Base qw(REJECT); use Errno qw(ENOENT); our @EXPORT_OK = qw(eml_from_path); use Fcntl qw(O_RDONLY O_NONBLOCK); -use constant { - PERM_UMASK => 0, - OLD_PERM_GROUP => 1, - OLD_PERM_EVERYBODY => 2, - PERM_GROUP => 0660, - PERM_EVERYBODY => 0664, -}; - sub new { my ($class, $ibx, $creat_opt) = @_; return $ibx if ref($ibx) eq $class; @@ -122,9 +115,8 @@ sub filter { sub eml_from_path ($) { my ($path) = @_; if (sysopen(my $fh, $path, O_RDONLY|O_NONBLOCK)) { - return unless -f $fh; # no FIFOs or directories - my $str = do { local $/; <$fh> } or return; - PublicInbox::Eml->new(\$str); + return unless -f $fh && -s _; # no FIFOs or directories + PublicInbox::Eml->new(\(my $str = read_all($fh, -s _))); } else { # ENOENT is common with Maildir warn "failed to open $path: $!\n" if $! != ENOENT; undef; @@ -176,64 +168,6 @@ sub import_mbox { $im->done; } -sub _read_git_config_perm { - my ($self) = @_; - chomp(my $perm = $self->git->qx('config', 'core.sharedRepository')); - $perm; -} - -sub _git_config_perm { - my $self = shift; - my $perm = scalar @_ ? $_[0] : _read_git_config_perm($self); - return PERM_UMASK if (!defined($perm) || $perm eq ''); - return PERM_UMASK if ($perm eq 'umask'); - return PERM_GROUP if ($perm eq 'group'); - if ($perm =~ /\A(?:all|world|everybody)\z/) { - return PERM_EVERYBODY; - } - return PERM_GROUP if ($perm =~ /\A(?:true|yes|on|1)\z/); - return PERM_UMASK if ($perm =~ /\A(?:false|no|off|0)\z/); - - my $i = oct($perm); - return PERM_UMASK if ($i == PERM_UMASK); - return PERM_GROUP if ($i == OLD_PERM_GROUP); - return PERM_EVERYBODY if ($i == OLD_PERM_EVERYBODY); - - if (($i & 0600) != 0600) { - die "core.sharedRepository mode invalid: ". - sprintf('%.3o', $i) . "\nOwner must have permissions\n"; - } - ($i & 0666); -} - -sub _umask_for { - my ($perm) = @_; # _git_config_perm return value - my $rv = $perm; - return umask if $rv == 0; - - # set +x bit if +r or +w were set - $rv |= 0100 if ($rv & 0600); - $rv |= 0010 if ($rv & 0060); - $rv |= 0001 if ($rv & 0006); - (~$rv & 0777); -} - -sub with_umask { - my ($self, $cb, @arg) = @_; - my $old = umask($self->{umask} //= umask_prepare($self)); - my $rv = eval { $cb->(@arg) }; - my $err = $@; - umask $old; - die $err if $err; - $rv; -} - -sub umask_prepare { - my ($self) = @_; - my $perm = _git_config_perm($self); - _umask_for($perm); -} - sub cleanup ($) { delete @{$_[0]}{qw(over mm git search)}; } @@ -245,4 +179,31 @@ sub git_dir_latest { "$self->{inboxdir}/git/$$max.git" : undef; } +# for unconfigured inboxes +sub detect_indexlevel ($) { + my ($ibx) = @_; + + my $over = $ibx->over; + my $srch = $ibx->search; + delete @$ibx{qw(over search)}; # don't leave open FDs lying around + + # brand new or never before indexed inboxes default to full + return 'full' unless $over; + my $l = 'basic'; + return $l unless $srch; + if (my $xdb = $srch->xdb) { + $l = 'full'; + my $m = $xdb->get_metadata('indexlevel'); + if ($m eq 'medium') { + $l = $m; + } elsif ($m ne '') { + warn <<""; +$ibx->{inboxdir} has unexpected indexlevel in Xapian: $m + + } + $ibx->{-skip_docdata} = 1 if $xdb->get_metadata('skip_docdata'); + } + $l; +} + 1; diff --git a/lib/PublicInbox/Inotify.pm b/lib/PublicInbox/Inotify.pm new file mode 100644 index 00000000..c4f1ae84 --- /dev/null +++ b/lib/PublicInbox/Inotify.pm @@ -0,0 +1,47 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# wrap Linux::Inotify2 XS module, support pure Perl via `syscall' someday +package PublicInbox::Inotify; +use v5.12; +our @ISA; +BEGIN { # prefer pure Perl since it works out-of-the-box + my $isa; + for my $m (qw(PublicInbox::Inotify3 Linux::Inotify2)) { + eval "require $m"; + next if $@; + $isa = $m; + } + if ($isa) { + push @ISA, $isa; + my $buf = ''; + for (qw(IN_MOVED_TO IN_CREATE IN_DELETE IN_DELETE_SELF + IN_MOVE_SELF IN_MOVED_FROM IN_MODIFY)) { + $buf .= "*$_ = \\&PublicInbox::Inotify3::$_;\n"; + } + eval $buf; + die $@ if $@; + } else { + die <<EOM; +W: inotify syscall numbers unknown on your platform and +W: Linux::Inotify2 missing: $@ +W: public-inbox hackers welcome the plain-text output of ./devel/sysdefs-list +W: at meta\@public-inbox.org +EOM + } +}; + +sub new { + $_[0]->SUPER::new // do { + my $msg = $!{EMFILE} ? <<EOM : "$_[0]->new: $!\n"; +inotify_init/inotify_init1: $! +You may need to raise the `fs.inotify.max_user_instances' sysctl limit. +Consult your OS documentation and/or sysctl(8) + sysctl.conf(5) manpages. +EOM + $msg =~ s/^/E: /smg; + require Carp; + Carp::croak($msg); + } +} + +1; diff --git a/lib/PublicInbox/Inotify3.pm b/lib/PublicInbox/Inotify3.pm new file mode 100644 index 00000000..4f337a7a --- /dev/null +++ b/lib/PublicInbox/Inotify3.pm @@ -0,0 +1,115 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Implements most Linux::Inotify2 functionality we need in pure Perl +# Anonymous sub support isn't supported since it's expensive in the +# best case and likely leaky in older Perls (e.g. 5.16.3) +package PublicInbox::Inotify3; +use v5.12; +use autodie qw(open); +use PublicInbox::Syscall (); +use Carp; +use Scalar::Util (); + +# this fails if undefined no unsupported platforms +use constant $PublicInbox::Syscall::INOTIFY; +our %events; + +# extracted from devel/sysdefs-list output, these should be arch-independent +BEGIN { +%events = ( + IN_ACCESS => 0x1, + IN_ALL_EVENTS => 0xfff, + IN_ATTRIB => 0x4, + IN_CLOSE => 0x18, + IN_CLOSE_NOWRITE => 0x10, + IN_CLOSE_WRITE => 0x8, + IN_CREATE => 0x100, + IN_DELETE => 0x200, + IN_DELETE_SELF => 0x400, + IN_DONT_FOLLOW => 0x2000000, + IN_EXCL_UNLINK => 0x4000000, + IN_IGNORED => 0x8000, + IN_ISDIR => 0x40000000, + IN_MASK_ADD => 0x20000000, + IN_MODIFY => 0x2, + IN_MOVE => 0xc0, + IN_MOVED_FROM => 0x40, + IN_MOVED_TO => 0x80, + IN_MOVE_SELF => 0x800, + IN_ONESHOT => 0x80000000, + IN_ONLYDIR => 0x1000000, + IN_OPEN => 0x20, + IN_Q_OVERFLOW => 0x4000, + IN_UNMOUNT => 0x2000, +); +} # /BEGIN +use constant \%events; +require PublicInbox::In3Event; # uses %events +require PublicInbox::In3Watch; # uses SYS_inotify_rm_watch + +use constant autocancel => + (IN_IGNORED|IN_UNMOUNT|IN_ONESHOT|IN_DELETE_SELF); + +sub new { + open my $fh, '+<&=', syscall(SYS_inotify_init1, IN_CLOEXEC); + bless { fh => $fh }, __PACKAGE__; +} + +sub read { + my ($self) = @_; + my (@ret, $wd, $mask, $len, $name, $size, $buf); + my $r = sysread($self->{fh}, my $rbuf, 8192); + if ($r) { + while ($r) { + ($wd, $mask, undef, $len) = unpack('lLLL', $rbuf); + $size = 16 + $len; # 16: sizeof(struct inotify_event) + substr($rbuf, 0, 16, ''); + $name = $len ? unpack('Z*', substr($rbuf, 0, $len, '')) + : undef; + $r -= $size; + next if $self->{ignore}->{$wd}; + my $ev = bless [$mask, $name], 'PublicInbox::In3Event'; + push @ret, $ev; + if (my $w = $self->{w}->{$wd}) { + $ev->[2] = $w; + $w->cancel if $ev->mask & autocancel; + } elsif ($mask & IN_Q_OVERFLOW) { + carp 'E: IN_Q_OVERFLOW, too busy? (non-fatal)' + } else { + carp "BUG? wd:$wd unknown (non-fatal)"; + } + } + } elsif (defined($r) || ($!{EAGAIN} || $!{EINTR})) { + } else { + croak "inotify read: $!"; + } + delete $self->{ignore}; + @ret; +} + +sub fileno { CORE::fileno($_[0]->{fh}) } + +sub fh { $_[0]->{fh} } + +sub blocking { shift->{fh}->blocking(@_) } + +sub watch { + my ($self, $name, $mask, $cb) = @_; + croak "E: $cb not supported" if $cb; # too much memory + my $wd = syscall(SYS_inotify_add_watch, $self->fileno, $name, $mask); + return if $wd < 0; + my $w = bless [ $wd, $mask, $name, $self ], 'PublicInbox::In3Watch'; + $self->{w}->{$wd} = $w; + Scalar::Util::weaken($w->[3]); # ugh + $w; +} + +sub rm_watch { + my ($self, $wd) = @_; + delete $self->{w}->{$wd}; + $self->{ignore}->{$wd} = 1; # is this needed? + syscall(SYS_inotify_rm_watch, $self->fileno, $wd) < 0 ? undef : 1; +} + +1; diff --git a/lib/PublicInbox/InputPipe.pm b/lib/PublicInbox/InputPipe.pm index e1e26e20..ee5bda59 100644 --- a/lib/PublicInbox/InputPipe.pm +++ b/lib/PublicInbox/InputPipe.pm @@ -1,36 +1,52 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# for reading pipes and sockets off the DS event loop +# for reading pipes, sockets, and TTYs off the DS event loop package PublicInbox::InputPipe; -use strict; -use v5.10.1; +use v5.12; use parent qw(PublicInbox::DS); -use PublicInbox::Syscall qw(EPOLLIN EPOLLET); +use PublicInbox::Syscall qw(EPOLLIN); sub consume { my ($in, $cb, @args) = @_; my $self = bless { cb => $cb, args => \@args }, __PACKAGE__; - eval { $self->SUPER::new($in, EPOLLIN|EPOLLET) }; - return $self->requeue if $@; # regular file - $in->blocking(0); # pipe or socket + eval { $self->SUPER::new($in, EPOLLIN) }; + if ($@) { # regular file (but not w/ select|IO::Poll backends) + $self->{-need_rq} = 1; + $self->requeue; + } elsif (-p _ || -S _) { # O_NONBLOCK for sockets and pipes + $in->blocking(0); + } + $self; +} + +sub close { # idempotent + my ($self) = @_; + $self->{-need_rq} ? delete($self->{sock}) : $self->SUPER::close } sub event_step { my ($self) = @_; my $r = sysread($self->{sock} // return, my $rbuf, 65536); - if ($r) { - $self->{cb}->(@{$self->{args} // []}, $rbuf); - return $self->requeue; # may be regular file or pipe - } - if (defined($r)) { # EOF - $self->{cb}->(@{$self->{args} // []}, ''); - } elsif ($!{EAGAIN}) { - return; - } else { # another error - $self->{cb}->(@{$self->{args} // []}, undef) + eval { + if ($r) { + $self->{cb}->($self, @{$self->{args}}, $rbuf); + $self->requeue if $self->{-need_rq}; + } elsif (defined($r)) { # EOF + $self->{cb}->($self, @{$self->{args}}, ''); + $self->close + } elsif ($!{EAGAIN}) { # rely on EPOLLIN + } elsif ($!{EINTR}) { # rely on EPOLLIN for sockets/pipes + $self->requeue if $self->{-need_rq}; + } else { # another error + $self->{cb}->($self, @{$self->{args}}, undef); + $self->close; + } + }; + if ($@) { + warn "E: $@"; + $self->close; } - $self->{sock}->blocking ? delete($self->{sock}) : $self->close } 1; diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm index 2b45e08e..20808d6d 100644 --- a/lib/PublicInbox/Isearch.pm +++ b/lib/PublicInbox/Isearch.pm @@ -1,12 +1,11 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Provides everything the PublicInbox::Search object does; # but uses global ExtSearch (->ALL) with an eidx_key query to # emulate per-Inbox search using ->ALL. package PublicInbox::Isearch; -use strict; -use v5.10.1; +use v5.12; use PublicInbox::ExtSearch; use PublicInbox::Search; @@ -27,34 +26,44 @@ SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1 sub query_approxidate { $_[0]->{es}->query_approxidate($_[1], $_[2]) } -sub mset { - my ($self, $str, $opt) = @_; +sub eidx_mset_prep ($$) { + my ($self, $opt) = @_; my %opt = $opt ? %$opt : (); $opt{eidx_key} = $self->{eidx_key}; - if (my $uid_range = $opt{uid_range}) { - my ($beg, $end) = @$uid_range; - my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); - my $dbh = $self->{es}->over->dbh; - my $sth = $dbh->prepare_cached(<<'', undef, 1); + my $uid_range = $opt{uid_range} or return \%opt; + my ($beg, $end) = @$uid_range; + my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); + my $dbh = $self->{es}->over->dbh; + my $sth = $dbh->prepare_cached(<<'', undef, 1); SELECT MIN(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ? - $sth->execute($ibx_id, $beg, $end); - my @r = ($sth->fetchrow_array); + $sth->execute($ibx_id, $beg, $end); + my @r = ($sth->fetchrow_array); - $sth = $dbh->prepare_cached(<<'', undef, 1); + $sth = $dbh->prepare_cached(<<'', undef, 1); SELECT MAX(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ? - $sth->execute($ibx_id, $beg, $end); - $r[1] = $sth->fetchrow_array; - if (defined($r[1]) && defined($r[0])) { - $opt{limit} = $r[1] - $r[0] + 1; - } else { - $r[1] //= 0xffffffff; - $r[0] //= 0; - } - $opt{uid_range} = \@r; + $sth->execute($ibx_id, $beg, $end); + $r[1] = $sth->fetchrow_array; + if (defined($r[1]) && defined($r[0])) { + $opt{limit} = $r[1] - $r[0] + 1; + } else { + $r[1] //= $self->{es}->xdb->get_lastdocid; + $r[0] //= 0; } - $self->{es}->mset($str, \%opt); + $opt{uid_range} = \@r; # these are fed to Xapian and SQLite + \%opt; +} + +sub mset { + my ($self, $str, $opt) = @_; + $self->{es}->mset($str, eidx_mset_prep $self, $opt); +} + +sub async_mset { + my ($self, $str, $opt, $cb, @args) = @_; + $opt = eidx_mset_prep $self, $opt; + $self->{es}->async_mset($str, $opt, $cb, @args); } sub mset_to_artnums { @@ -124,4 +133,9 @@ sub has_threadid { 1 } sub help { $_[0]->{es}->help } +sub xh_args { # prep getopt args to feed to xap_helper.h socket + my ($self, $opt) = @_; # TODO uid_range + ($self->{es}->xh_args, '-O', $self->{eidx_key}); +} + 1; diff --git a/lib/PublicInbox/KQNotify.pm b/lib/PublicInbox/KQNotify.pm index 7efb8b60..2efa887d 100644 --- a/lib/PublicInbox/KQNotify.pm +++ b/lib/PublicInbox/KQNotify.pm @@ -1,52 +1,35 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # implements the small subset of Linux::Inotify2 functionality we use # using IO::KQueue on *BSD systems. package PublicInbox::KQNotify; -use strict; -use v5.10.1; +use v5.12; +use parent qw(PublicInbox::FakeInotify); use IO::KQueue; use PublicInbox::DSKQXS; # wraps IO::KQueue for fork-safe DESTROY -use PublicInbox::FakeInotify qw(fill_dirlist on_dir_change); -use Time::HiRes qw(stat); +use Errno qw(ENOENT); # NOTE_EXTEND detects rename(2), NOTE_WRITE detects link(2) sub MOVED_TO_OR_CREATE () { NOTE_EXTEND|NOTE_WRITE } sub new { my ($class) = @_; - bless { dskq => PublicInbox::DSKQXS->new, watch => {} }, $class; + bless { dskq => PublicInbox::DSKQXS->new }, $class; } sub watch { my ($self, $path, $mask) = @_; - my ($fh, $watch); - if (-d $path) { - opendir($fh, $path) or return; - my @st = stat($fh); - $watch = bless [ $fh, $path, $st[10] ], - 'PublicInbox::KQNotify::Watchdir'; - } else { - open($fh, '<', $path) or return; - $watch = bless [ $fh, $path ], - 'PublicInbox::KQNotify::Watch'; - } - my $ident = fileno($fh); + my $dir_delete = $mask & NOTE_DELETE ? 1 : 0; + my $w = $self->watch_open($path, \$dir_delete) or return; + $w->[2] = pop @$w; # ctime is unused by this subclass + my $ident = fileno($w->[2]) // die "BUG: bad fileno $w->[2]: $!"; $self->{dskq}->{kq}->EV_SET($ident, # ident (fd) EVFILT_VNODE, # filter EV_ADD | EV_CLEAR, # flags $mask, # fflags - 0, 0); # data, udata - if ($mask & (MOVED_TO_OR_CREATE|NOTE_DELETE|NOTE_LINK|NOTE_REVOKE)) { - $self->{watch}->{$ident} = $watch; - if ($mask & (NOTE_DELETE|NOTE_LINK|NOTE_REVOKE)) { - fill_dirlist($self, $path, $fh) - } - } else { - die "TODO Not implemented: $mask"; - } - $watch; + 0, $dir_delete); # data, udata + $self->{watch}->{$ident} = $w; } # emulate Linux::Inotify::fileno @@ -63,54 +46,31 @@ sub blocking {} # behave like Linux::Inotify2->read sub read { my ($self) = @_; - my @kevents = $self->{dskq}->{kq}->kevent(0); my $events = []; - my @gone; - my $watch = $self->{watch}; - for my $kev (@kevents) { + for my $kev ($self->{dskq}->{kq}->kevent(0)) { my $ident = $kev->[KQ_IDENT]; - my $mask = $kev->[KQ_FFLAGS]; - my ($dh, $path, $old_ctime) = @{$watch->{$ident}}; - if (!defined($old_ctime)) { - push @$events, - bless(\$path, 'PublicInbox::FakeInotify::Event') - } elsif ($mask & (MOVED_TO_OR_CREATE|NOTE_DELETE|NOTE_LINK| - NOTE_REVOKE|NOTE_RENAME)) { - my @new_st = stat($path); - if (!@new_st && $!{ENOENT}) { - push @$events, bless(\$path, - 'PublicInbox::FakeInotify::'. - 'SelfGoneEvent'); - push @gone, $ident; - delete $self->{dirlist}->{$path}; - next; - } - if (!@new_st) { - warn "unhandled stat($path) error: $!\n"; - next; - } - $watch->{$ident}->[3] = $new_st[10]; # ctime - rewinddir($dh); - on_dir_change($events, $dh, $path, $old_ctime, - $self->{dirlist}); + my $w = $self->{watch}->{$ident} or next; + if (!@$w) { # cancelled + delete($self->{watch}->{$ident}); + next; + } + my $dir_delete = $kev->[KQ_UDATA]; + my ($old_dev, $old_ino, $fh, $path) = @$w; + my @new_st = stat($path); + warn "W: stat($path): $!\n" if !@new_st && $! != ENOENT; + if (!@new_st || "$old_dev $old_ino" ne "@new_st[0,1]") { + push(@$events, $self->gone($ident, $path)); + next; + } + if (-d _) { + rewinddir($fh); + $self->on_dir_change($events, $fh, $path, $dir_delete); + } else { + push @$events, bless(\$path, + 'PublicInbox::FakeInotify::Event'); } } - delete @$watch{@gone}; @$events; } -package PublicInbox::KQNotify::Watch; -use strict; - -sub name { $_[0]->[1] } - -sub cancel { close $_[0]->[0] or die "close: $!" } - -package PublicInbox::KQNotify::Watchdir; -use strict; - -sub name { $_[0]->[1] } - -sub cancel { closedir $_[0]->[0] or die "closedir: $!" } - 1; diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm index d81ca296..e9a0de6c 100644 --- a/lib/PublicInbox/LEI.pm +++ b/lib/PublicInbox/LEI.pm @@ -9,8 +9,9 @@ package PublicInbox::LEI; use v5.12; use parent qw(PublicInbox::DS PublicInbox::LeiExternal PublicInbox::LeiQuery); +use autodie qw(bind chdir open pipe socket socketpair syswrite unlink); use Getopt::Long (); -use Socket qw(AF_UNIX SOCK_SEQPACKET MSG_EOR pack_sockaddr_un); +use Socket qw(AF_UNIX SOCK_SEQPACKET pack_sockaddr_un); use Errno qw(EPIPE EAGAIN ECONNREFUSED ENOENT ECONNRESET); use Cwd qw(getcwd); use POSIX qw(strftime); @@ -18,33 +19,36 @@ use IO::Handle (); use Fcntl qw(SEEK_SET); use PublicInbox::Config; use PublicInbox::Syscall qw(EPOLLIN); -use PublicInbox::DS qw(dwaitpid); -use PublicInbox::Spawn qw(spawn popen_rd); +use PublicInbox::Spawn qw(run_wait popen_rd run_qx); use PublicInbox::Lock; use PublicInbox::Eml; use PublicInbox::Import; use PublicInbox::ContentHash qw(git_sha); +use PublicInbox::OnDestroy; +use PublicInbox::IPC; use Time::HiRes qw(stat); # ctime comparisons for config cache -use File::Path qw(mkpath); +use File::Path (); use File::Spec; +use Carp qw(carp); use Sys::Syslog qw(openlog syslog closelog); our $quit = \&CORE::exit; -our ($current_lei, $errors_log, $listener, $oldset, $dir_idle, - $recv_cmd, $send_cmd); +our ($current_lei, $errors_log, $listener, $oldset, $dir_idle); my $GLP = Getopt::Long::Parser->new; $GLP->configure(qw(gnu_getopt no_ignore_case auto_abbrev)); my $GLP_PASS = Getopt::Long::Parser->new; $GLP_PASS->configure(qw(gnu_getopt no_ignore_case auto_abbrev pass_through)); -our %PATH2CFG; # persistent for socket daemon -our $MDIR2CFGPATH; # /path/to/maildir => { /path/to/config => [ ino watches ] } +our (%PATH2CFG, # persistent for socket daemon +$MDIR2CFGPATH, # location => { /path/to/config => [ ino watches ] } +$OPT, # shared between optparse and opt_dash callback (for Getopt::Long) +$daemon_pid +); # TBD: this is a documentation mechanism to show a subcommand # (may) pass options through to another command: sub pass_through { $GLP_PASS } -my $OPT; -sub opt_dash ($$) { +sub opt_dash ($$) { # callback runs inside optparse my ($spec, $re_str) = @_; # 'limit|n=i', '([0-9]+)' my ($key) = ($spec =~ m/\A([a-z]+)/g); my $cb = sub { # Getopt::Long "<>" catch-all handler @@ -158,7 +162,7 @@ our @diff_opt = qw(unified|U=i output-indicator-new=s output-indicator-old=s rename-empty! check ws-error-highlight=s full-index binary abbrev:i break-rewrites|B:s find-renames|M:s find-copies:s find-copies-harder irreversible-delete|D l=i diff-filter=s - S=s G=s find-object=s pickaxe-all pickaxe-regex O=s R + S=s G=s find-object=s pickaxe-all pickaxe-regex R relative:s text|a ignore-cr-at-eol ignore-space-at-eol ignore-space-change|b ignore-all-space|w ignore-blank-lines inter-hunk-context=i function-context|W exit-code ext-diff @@ -173,6 +177,7 @@ our %CMD = ( # sorted in order of importance/use: 'stdin|', # /|\z/ must be first for lone dash @lxs_opt, @net_opt, qw(save! output|mfolder|o=s format|f=s dedupe|d=s threads|t+ + thread-id|T=s sort|s=s reverse|r offset=i pretty jobs|j=s globoff|g augment|a import-before! lock=s@ rsyncable alert=s@ mua=s verbose|v+ shared color! mail-sync!), @c_opt, opt_dash('limit|n=i', '[0-9]+') ], @@ -197,8 +202,8 @@ our %CMD = ( # sorted in order of importance/use: 'rediff' => [ '--stdin|LOCATION...', 'regenerate a diff with different options', 'stdin|', # /|\z/ must be first for lone dash - qw(git-dir=s@ cwd! verbose|v+ color:s no-color drq:1 dequote-only:1), - @diff_opt, @lxs_opt, @net_opt, @c_opt ], + qw(git-dir=s@ cwd! verbose|v+ color:s no-color drq:1 dequote-only:1 + order-file=s), @diff_opt, @lxs_opt, @net_opt, @c_opt ], 'mail-diff' => [ '--stdin|LOCATION...', 'diff the contents of emails', 'stdin|', # /|\z/ must be first for lone dash @@ -229,20 +234,21 @@ our %CMD = ( # sorted in order of importance/use: 'rm' => [ '--stdin|LOCATION...', 'remove a message from the index and prevent reindexing', 'stdin|', # /|\z/ must be first for lone dash - qw(in-format|F=s lock=s@), @net_opt, @c_opt ], + qw(in-format|F=s lock=s@ commit-delay=i), @net_opt, @c_opt ], 'plonk' => [ '--threads|--from=IDENT', 'exclude mail matching From: or threads from non-Message-ID searches', qw(stdin| threads|t from|f=s mid=s oid=s), @c_opt ], -'tag' => [ 'KEYWORDS...', +tag => [ 'KEYWORDS... LOCATION...|--stdin', 'set/unset keywords and/or labels on message(s)', - qw(stdin| in-format|F=s input|i=s@ oid=s@ mid=s@), + qw(stdin| in-format|F=s input|i=s@ oid=s@ mid=s@ commit-delay=i), @net_opt, @c_opt, pass_through('-kw:foo for delete') ], 'purge-mailsource' => [ 'LOCATION|--all', 'remove imported messages from IMAP, Maildirs, and MH', qw(exact! all jobs:i indexed), @c_opt ], -'add-watch' => [ 'LOCATION...', 'watch for new messages and flag changes', +'add-watch' => [ 'LOCATION... [LABELS...]', + 'watch for new messages and flag changes', qw(poll-interval=s state=s recursive|r), @c_opt ], 'rm-watch' => [ 'LOCATION...', 'remove specified watch(es)', qw(recursive|r), @c_opt ], @@ -253,14 +259,17 @@ our %CMD = ( # sorted in order of importance/use: 'forget-watch' => [ '{WATCH_NUMBER|--prune}', 'stop and forget a watch', qw(prune), @c_opt ], -'index' => [ 'LOCATION...', 'one-time index from URL or filesystem', +'reindex' => [ '', 'reindex all locally-indexed messages', @c_opt ], + +'index' => [ 'LOCATION... [LABELS...]', 'one-time index from URL or filesystem', qw(in-format|F=s kw! offset=i recursive|r exclude=s include|I=s verbose|v+ incremental!), @net_opt, # mainly for --proxy= @c_opt ], -'import' => [ 'LOCATION...|--stdin', +import => [ 'LOCATION...|--stdin [LABELS...]', 'one-time import/update from URL or filesystem', qw(stdin| offset=i recursive|r exclude=s include|I=s new-only - lock=s@ in-format|F=s kw! verbose|v+ incremental! mail-sync!), + lock=s@ in-format|F=s kw! verbose|v+ incremental! mail-sync! + commit-delay=i sort|s:s@), @net_opt, @c_opt ], 'forget-mail-sync' => [ 'LOCATION...', 'forget sync information for a mail folder', @c_opt ], @@ -272,13 +281,15 @@ our %CMD = ( # sorted in order of importance/use: qw(all:s mode=s), @net_opt, @c_opt ], 'convert' => [ 'LOCATION...|--stdin', 'one-time conversion from URL or filesystem to another format', - qw(stdin| in-format|F=s out-format|f=s output|mfolder|o=s lock=s@ kw!), + qw(stdin| in-format|F=s out-format|f=s output|mfolder|o=s lock=s@ kw! + rsyncable sort|s:s@), @net_opt, @c_opt ], 'p2q' => [ 'LOCATION_OR_COMMIT...|--stdin', "use a patch to generate a query for `lei q --stdin'", qw(stdin| in-format|F=s want|w=s@ uri debug), @net_opt, @c_opt ], 'config' => [ '[...]', sub { - 'git-config(1) wrapper for '._config_path($_[0]); + 'git-config(1) wrapper for '._config_path($_[0]). "\n" . + '-l/--list and other common git-config uses are supported' }, qw(config-file|system|global|file|f=s), # for conflict detection qw(edit|e c=s@ C=s@), pass_through('git config') ], 'inspect' => [ 'ITEMS...|--stdin', 'inspect lei/store and/or local external', @@ -312,6 +323,9 @@ our %CMD = ( # sorted in order of importance/use: my $stdin_formats = [ 'MAIL_FORMAT|eml|mboxrd|mboxcl2|mboxcl|mboxo', 'specify message input format' ]; my $ls_format = [ 'OUT|plain|json|null', 'listing output format' ]; +my $sort_out = [ 'VAL|received|relevance|docid', + "order of results is `--output'-dependent"]; +my $sort_in = [ 'sequence|mtime|size', 'sort input (format-dependent)' ]; # we use \x{a0} (non-breaking SP) to avoid wrapping in PublicInbox::LeiHelp my %OPTDESC = ( @@ -344,6 +358,7 @@ my %OPTDESC = ( 'no-torsocks' => 'alias for --torsocks=no', 'save!' => "do not save a search for `lei up'", 'import-remote!' => 'do not memoize remote messages into local store', +'import-before!' => 'do not import before writing to output (DANGEROUS)', 'type=s' => [ 'any|mid|git', 'disambiguate type' ], @@ -397,8 +412,10 @@ my %OPTDESC = ( 'include specified external(s) in search' ], 'only|O=s@ q' => [ 'LOCATION', 'only use specified external(s) for search' ], -'jobs=s q' => [ '[SEARCH_JOBS][,WRITER_JOBS]', - 'control number of search and writer jobs' ], +'jobs|j=s' => [ 'JOBSPEC', + 'control number of query and writer jobs' . + "integers delimited by `,', either of which may be omitted" + ], 'jobs|j=i add-external' => 'set parallelism when indexing after --mirror', 'in-format|F=s' => $stdin_formats, @@ -416,8 +433,10 @@ my %OPTDESC = ( 'limit|n=i@' => ['NUM', 'limit on number of matches (default: 10000)' ], 'offset=i' => ['OFF', 'search result offset (default: 0)'], -'sort|s=s' => [ 'VAL|received|relevance|docid', - "order of results is `--output'-dependent"], +'sort|s=s q' => $sort_out, +'sort|s=s lcat' => $sort_out, +'sort|s:s@ convert' => $sort_in, +'sort|s:s@ import' => $sort_in, 'reverse|r' => 'reverse search results', # like sort(1) 'boost=i' => 'increase/decrease priority of results (default: 0)', @@ -451,6 +470,7 @@ my %OPTDESC = ( 'z|0' => 'use NUL \\0 instead of newline (CR) to delimit lines', 'signal|s=s' => [ 'SIG', 'signal to send lei-daemon (default: TERM)' ], +'edit|e config' => 'open an editor to modify the lei config file', ); # %OPTDESC my %CONFIG_KEYS = ( @@ -462,7 +482,7 @@ my @WQ_KEYS = qw(lxs l2m ikw pmd wq1 lne v2w); # internal workers sub _drop_wq { my ($self) = @_; for my $wq (grep(defined, delete(@$self{@WQ_KEYS}))) { - $wq->wq_kill('-TERM'); + $wq->wq_kill(-POSIX::SIGTERM()); $wq->DESTROY; } } @@ -470,17 +490,18 @@ sub _drop_wq { # pronounced "exit": x_it(1 << 8) => exit(1); x_it(13) => SIGPIPE sub x_it ($$) { my ($self, $code) = @_; - local $current_lei = $self; # make sure client sees stdout before exit $self->{1}->autoflush(1) if $self->{1}; stop_pager($self); if ($self->{pkt_op_p}) { # worker => lei-daemon $self->{pkt_op_p}->pkt_do('x_it', $code); + exit($code >> 8) if $$ != $daemon_pid; } elsif ($self->{sock}) { # lei->daemon => lei(1) client - send($self->{sock}, "x_it $code", MSG_EOR); + send($self->{sock}, "x_it $code", 0); } elsif ($quit == \&CORE::exit) { # an admin (one-shot) command exit($code >> 8); } # else ignore if client disconnected + $self->dclose if $$ == $daemon_pid; } sub err ($;@) { @@ -489,7 +510,7 @@ sub err ($;@) { my @eor = (substr($_[-1]//'', -1, 1) eq "\n" ? () : ("\n")); print $err @_, @eor and return; my $old_err = delete $self->{2}; - close($old_err) if $! == EPIPE && $old_err; + $old_err->close if $! == EPIPE && $old_err; $err = $self->{2} = ($self->{pgr} // [])->[2] // *STDERR{GLOB}; print $err @_, @eor or print STDERR @_, @eor; } @@ -504,8 +525,7 @@ sub qfin { # show message on finalization (LeiFinmsg) sub fail_handler ($;$$) { my ($lei, $code, $io) = @_; - local $current_lei = $lei; - close($io) if $io; # needed to avoid warnings on SIGPIPE + $io->close if $io; # needed to avoid warnings on SIGPIPE _drop_wq($lei); x_it($lei, $code // (1 << 8)); } @@ -514,13 +534,17 @@ sub sigpipe_handler { # handles SIGPIPE from @WQ_KEYS workers fail_handler($_[0], 13, delete $_[0]->{1}); } -sub fail ($$;$) { - my ($self, $msg, $exit_code) = @_; - local $current_lei = $self; - $self->{failed}++; - warn(substr($msg, -1, 1) eq "\n" ? $msg : "$msg\n") if defined $msg; - $self->{pkt_op_p}->pkt_do('fail_handler') if $self->{pkt_op_p}; - x_it($self, ($exit_code // 1) << 8); +sub fail ($;@) { + my ($lei, @msg) = @_; + my $exit_code = ($msg[0]//'') =~ /\A-?[0-9]+\z/ ? shift(@msg) : undef; + local $current_lei = $lei; + $lei->{failed}++; + if (@msg) { + push @msg, "\n" if substr($msg[-1], -1, 1); + warn @msg; + } + $lei->{pkt_op_p}->pkt_do('fail_handler') if $lei->{pkt_op_p}; + x_it($lei, $exit_code // (1 << 8)); undef; } @@ -540,18 +564,17 @@ sub child_error { # passes non-fatal curl exit codes to user local $current_lei = $self; $child_error ||= 1 << 8; warn(substr($msg, -1, 1) eq "\n" ? $msg : "$msg\n") if defined $msg; + $self->{child_error} ||= $child_error; if ($self->{pkt_op_p}) { # to top lei-daemon $self->{pkt_op_p}->pkt_do('child_error', $child_error); } elsif ($self->{sock}) { # to lei(1) client - send($self->{sock}, "child_error $child_error", MSG_EOR); - } else { # non-lei admin command - $self->{child_error} ||= $child_error; + send($self->{sock}, "child_error $child_error", 0); } # else noop if client disconnected } sub note_sigpipe { # triggers sigpipe_handler my ($self, $fd) = @_; - close(delete($self->{$fd})); # explicit close silences Perl warning + delete($self->{$fd})->close; # explicit close silences Perl warning $self->{pkt_op_p}->pkt_do('sigpipe_handler') if $self->{pkt_op_p}; x_it($self, 13); } @@ -559,20 +582,21 @@ sub note_sigpipe { # triggers sigpipe_handler sub _lei_atfork_child { my ($self, $persist) = @_; # we need to explicitly close things which are on stack + my $cfg = $self->{cfg}; + delete @$cfg{qw(-watches -lei_note_event)}; if ($persist) { - open $self->{3}, '<', '/' or die "open(/) $!"; + open $self->{3}, '<', '/'; fchdir($self); close($_) for (grep(defined, delete @$self{qw(0 1 2 sock)})); - if (my $cfg = $self->{cfg}) { - delete @$cfg{qw(-lei_store -watches -lei_note_event)}; - } + delete $cfg->{-lei_store}; } else { # worker, Net::NNTP (Net::Cmd) uses STDERR directly - open STDERR, '+>&='.fileno($self->{2}) or warn "open $!"; + open STDERR, '+>&='.fileno($self->{2}); # idempotent w/ fileno STDERR->autoflush(1); + $self->{2} = \*STDERR; POSIX::setpgid(0, $$) // die "setpgid(0, $$): $!"; } close($_) for (grep(defined, delete @$self{qw(old_1 au_done)})); - delete $self->{-socks}; + close($_) for (@{delete($self->{-socks}) // []}); if (my $op_c = delete $self->{pkt_op_c}) { close(delete $op_c->{sock}); } @@ -584,7 +608,7 @@ sub _lei_atfork_child { $dir_idle->force_close if $dir_idle; undef $dir_idle; %PATH2CFG = (); - $MDIR2CFGPATH = {}; + $MDIR2CFGPATH = undef; eval 'no warnings; undef $PublicInbox::LeiNoteEvent::to_flush'; undef $errors_log; $quit = \&CORE::exit; @@ -609,16 +633,16 @@ sub _delete_pkt_op { # OnDestroy callback to prevent leaks on die sub pkt_op_pair { my ($self) = @_; - require PublicInbox::OnDestroy; require PublicInbox::PktOp; - my $end = PublicInbox::OnDestroy->new($$, \&_delete_pkt_op, $self); + my $end = on_destroy \&_delete_pkt_op, $self; @$self{qw(pkt_op_c pkt_op_p)} = PublicInbox::PktOp->pair; $end; } sub incr { - my ($self, $field, $nr) = @_; - $self->{counters}->{$field} += $nr; + my $lei = shift; + $lei->{incr_pid} = $$ if @_; + while (my ($f, $n) = splice(@_, 0, 2)) { $lei->{$f} += $n } } sub pkt_ops { @@ -641,12 +665,12 @@ sub workers_start { my $end = $lei->pkt_op_pair; my $ident = $wq->{-wq_ident} // "lei-$lei->{cmd} worker"; $flds->{lei} = $lei; - $wq->wq_workers_start($ident, $jobs, $lei->oldset, $flds); + $wq->wq_workers_start($ident, $jobs, $lei->oldset, $flds, + $wq->can('_wq_done_wait') // \&wq_done_wait, $lei); delete $lei->{pkt_op_p}; my $op_c = delete $lei->{pkt_op_c}; @$end = (); $lei->event_step_init; - $wq->wq_wait_async($wq->can('_wq_done_wait') // \&wq_done_wait, $lei); ($op_c, $ops); } @@ -680,7 +704,7 @@ sub optparse ($$$) { # allow _complete --help to complete, not show help return 1 if substr($cmd, 0, 1) eq '_'; $self->{cmd} = $cmd; - $OPT = $self->{opt} //= {}; + local $OPT = $self->{opt} //= {}; my $info = $CMD{$cmd} // [ '[...]' ]; my ($proto, undef, @spec) = @$info; my $glp = ref($spec[-1]) eq ref($GLP) ? pop(@spec) : $GLP; @@ -700,6 +724,12 @@ sub optparse ($$$) { # "-" aliases "stdin" or "clear" $OPT->{$lone_dash} = ${$OPT->{$lone_dash}} if defined $lone_dash; + if ($proto =~ s/\s*\[?(?:KEYWORDS|LABELS)\.\.\.\]?\s*//g) { + require PublicInbox::LeiInput; + my @err = PublicInbox::LeiInput::vmd_mod_extract($self, $argv); + return $self->fail(join("\n", @err)) if @err; + } + my $i = 0; my $POS_ARG = '[A-Z][A-Z0-9_]+'; my ($err, $inf); @@ -727,11 +757,10 @@ sub optparse ($$$) { # w/o args means stdin if ($sw eq 'stdin' && !@$argv && (-p $self->{0} || - -f _) && -r _) { + -f _)) { $OPT->{stdin} //= 1; } - $ok = defined($OPT->{$sw}); - last if $ok; + $ok = defined($OPT->{$sw}) and last; } elsif (defined($argv->[$i])) { $ok = 1; $i++; @@ -752,38 +781,7 @@ sub optparse ($$$) { $err ? fail($self, "usage: lei $cmd $proto\nE: $err") : 1; } -sub _tmp_cfg { # for lei -c <name>=<value> ... - my ($self) = @_; - my $cfg = _lei_cfg($self, 1); - require File::Temp; - my $ft = File::Temp->new(TEMPLATE => 'lei_cfg-XXXX', TMPDIR => 1); - my $tmp = { '-f' => $ft->filename, -tmp => $ft }; - $ft->autoflush(1); - print $ft <<EOM or return fail($self, "$tmp->{-f}: $!"); -[include] - path = $cfg->{-f} -EOM - $tmp = $self->{cfg} = bless { %$cfg, %$tmp }, ref($cfg); - for (@{$self->{opt}->{c}}) { - /\A([^=\.]+\.[^=]+)(?:=(.*))?\z/ or return fail($self, <<EOM); -`-c $_' is not of the form -c <name>=<value>' -EOM - my $name = $1; - my $value = $2 // 1; - _config($self, '--add', $name, $value); - if (defined(my $v = $tmp->{$name})) { - if (ref($v) eq 'ARRAY') { - push @$v, $value; - } else { - $tmp->{$name} = [ $v, $value ]; - } - } else { - $tmp->{$name} = $value; - } - } -} - -sub lazy_cb ($$$) { +sub lazy_cb ($$$) { # $pfx is _complete_ or lei_ my ($self, $cmd, $pfx) = @_; my $ucmd = $cmd; $ucmd =~ tr/-/_/; @@ -796,11 +794,19 @@ sub lazy_cb ($$$) { $pkg->can($pfx.$ucmd) : undef; } +sub do_env { + my $lei = shift; + fchdir($lei); + my $cb = shift // return ($lei, %{$lei->{env}}) ; + local ($current_lei, %ENV) = ($lei, %{$lei->{env}}); + $cb = $lei->can($cb) if !ref($cb); # $cb may be a scalar sub name + eval { $cb->($lei, @_) }; + $lei->fail($@) if $@; +} + sub dispatch { my ($self, $cmd, @argv) = @_; - fchdir($self); - local %ENV = %{$self->{env}}; - local $current_lei = $self; # for __WARN__ + local ($current_lei, %ENV) = do_env($self); $self->{2}->autoflush(1); # keep stdout buffered until x_it|DESTROY return _help($self, 'no command given') unless defined($cmd); # do not support Getopt bundling for this @@ -812,14 +818,12 @@ sub dispatch { } if (my $cb = lazy_cb(__PACKAGE__, $cmd, 'lei_')) { optparse($self, $cmd, \@argv) or return; - $self->{opt}->{c} and (_tmp_cfg($self) // return); if (my $chdir = $self->{opt}->{C}) { for my $d (@$chdir) { next if $d eq ''; # same as git(1) - chdir $d or return fail($self, "cd $d: $!"); + chdir $d; } - open $self->{3}, '<', '.' or - return fail($self, "open . $!"); + open($self->{3}, '<', '.'); } $cb->($self, @argv); } elsif (grep(/\A-/, $cmd, @argv)) { # --help or -h only @@ -837,28 +841,30 @@ sub _lei_cfg ($;$) { my $f = _config_path($self); my @st = stat($f); my $cur_st = @st ? pack('dd', $st[10], $st[7]) : ''; # 10:ctime, 7:size - my ($sto, $sto_dir, $watches, $lne); - if (my $cfg = $PATH2CFG{$f}) { # reuse existing object in common case - return ($self->{cfg} = $cfg) if $cur_st eq $cfg->{-st}; + my ($sto, $sto_dir, $watches, $lne, $cfg); + if ($cfg = $PATH2CFG{$f}) { # reuse existing object in common case + ($cur_st eq $cfg->{-st} && !$self->{opt}->{c}) and + return ($self->{cfg} = $cfg); + # reuse some fields below if they match: ($sto, $sto_dir, $watches, $lne) = @$cfg{qw(-lei_store leistore.dir -watches -lei_note_event)}; } if (!@st) { - unless ($creat) { - delete $self->{cfg}; - return bless {}, 'PublicInbox::Config'; + unless ($creat) { # any commands which write to cfg must creat + $cfg = PublicInbox::Config->git_config_dump( + '/dev/null', $self); + return ($self->{cfg} = $cfg); } my ($cfg_dir) = ($f =~ m!(.*?/)[^/]+\z!); - -d $cfg_dir or mkpath($cfg_dir) or die "mkpath($cfg_dir): $!\n"; - open my $fh, '>>', $f or die "open($f): $!\n"; + File::Path::mkpath($cfg_dir); + open my $fh, '>>', $f; @st = stat($fh) or die "fstat($f): $!\n"; $cur_st = pack('dd', $st[10], $st[7]); qerr($self, "# $f created") if $self->{cmd} ne 'config'; } - my $cfg = PublicInbox::Config->git_config_dump($f, $self->{2}); + $cfg = PublicInbox::Config->git_config_dump($f, $self); $cfg->{-st} = $cur_st; - $cfg->{'-f'} = $f; if ($sto && canonpath_harder($sto_dir // store_path($self)) eq canonpath_harder($cfg->{'leistore.dir'} // store_path($self))) { @@ -870,7 +876,7 @@ sub _lei_cfg ($;$) { # FIXME: use inotify/EVFILT_VNODE to detect unlinked configs delete(@PATH2CFG{grep(!-f, keys %PATH2CFG)}); } - $self->{cfg} = $PATH2CFG{$f} = $cfg; + $self->{cfg} = $self->{opt}->{c} ? $cfg : ($PATH2CFG{$f} = $cfg); refresh_watches($self); $cfg; } @@ -886,16 +892,49 @@ sub _lei_store ($;$) { }; } +# returns true on success, undef +# argv[0] eq `+e' means errors do not ->fail # (like `sh +e') sub _config { my ($self, @argv) = @_; - my %env = (%{$self->{env}}, GIT_CONFIG => undef); + my $err_ok = ($argv[0] // '') eq '+e' ? shift(@argv) : undef; + my %env; + my %opt = map { $_ => $self->{$_} } (0..2); my $cfg = _lei_cfg($self, 1); - my $cmd = [ qw(git config -f), $cfg->{'-f'}, @argv ]; - my %rdr = map { $_ => $self->{$_} } (0..2); - waitpid(spawn($cmd, \%env, \%rdr), 0); + my $opt_c = delete local $cfg->{-opt_c}; + my @file_arg; + if ($opt_c) { + my ($set, $get, $nondash); + for (@argv) { # order matters for git-config + if (!$nondash) { + if (/\A--(?:add|rename-section|remove-section| + replace-all| + unset-all|unset)\z/x) { + ++$set; + } elsif ($_ eq '-l' || $_ eq '--list' || + /\A--get/) { + ++$get; + } elsif (/\A-/) { # -z and such + } else { + ++$nondash; + } + } else { + ++$nondash; + } + } + if ($set || ($nondash//0) > 1 && !$get) { + @file_arg = ('-f', $cfg->{-f}); + $env{GIT_CONFIG} = $file_arg[1]; + } else { # OK, we can use `-c n=v' for read-only + $cfg->{-opt_c} = $opt_c; + $env{GIT_CONFIG} = undef; + } + } + my $cmd = $cfg->config_cmd(\%env, \%opt); + push @$cmd, @file_arg, @argv; + run_wait($cmd, \%env, \%opt) ? ($err_ok ? undef : fail($self, $?)) : 1; } -sub lei_daemon_pid { puts shift, $$ } +sub lei_daemon_pid { puts shift, $daemon_pid } sub lei_daemon_kill { my ($self) = @_; @@ -1000,9 +1039,10 @@ sub start_mua { $io->[0] = $self->{1} if $self->{opt}->{stdin} && -t $self->{1}; send_exec_cmd($self, $io, \@cmd, {}); } - if ($self->{lxs} && $self->{au_done}) { # kick wait_startq - syswrite($self->{au_done}, 'q' x ($self->{lxs}->{jobs} // 0)); - } + + # kick wait_startq: + syswrite($self->{au_done}, 'q') if $self->{lxs} && $self->{au_done}; + return unless -t $self->{2}; # XXX how to determine non-TUI MUAs? $self->{opt}->{quiet} = 1; delete $self->{-progress}; @@ -1011,9 +1051,11 @@ sub start_mua { sub send_exec_cmd { # tell script/lei to execute a command my ($self, $io, $cmd, $env) = @_; - my $sock = $self->{sock} // die 'lei client gone'; - my $fds = [ map { fileno($_) } @$io ]; - $send_cmd->($sock, $fds, exec_buf($cmd, $env), MSG_EOR); + $PublicInbox::IPC::send_cmd->( + $self->{sock} // die('lei client gone'), + [ map { fileno($_) } @$io ], + exec_buf($cmd, $env), 0) // + Carp::croak("sendmsg: $!"); } sub poke_mua { # forces terminal MUAs to wake up and hopefully notice new mail @@ -1023,7 +1065,7 @@ sub poke_mua { # forces terminal MUAs to wake up and hopefully notice new mail while (my $op = shift(@$alerts)) { if ($op eq ':WINCH') { # hit the process group that started the MUA - send($sock, '-WINCH', MSG_EOR) if $sock; + send($sock, '-WINCH', 0) if $sock; } elsif ($op eq ':bell') { out($self, "\a"); } elsif ($op =~ /(?<!\\),/) { # bare ',' (not ',,') @@ -1032,7 +1074,7 @@ sub poke_mua { # forces terminal MUAs to wake up and hopefully notice new mail my $cmd = $1; # run an arbitrary command require Text::ParseWords; $cmd = [ Text::ParseWords::shellwords($cmd) ]; - send($sock, exec_buf($cmd, {}), MSG_EOR) if $sock; + send($sock, exec_buf($cmd, {}), 0) if $sock; } else { warn("W: unsupported --alert=$op\n"); # non-fatal } @@ -1056,16 +1098,15 @@ sub path_to_fd { # caller needs to "-t $self->{1}" to check if tty sub start_pager { my ($self, $new_env) = @_; - my $fh = popen_rd([qw(git var GIT_PAGER)]); - chomp(my $pager = <$fh> // ''); - close($fh) or warn "`git var PAGER' error: \$?=$?"; + chomp(my $pager = run_qx([qw(git var GIT_PAGER)])); + warn "`git var PAGER' error: \$?=$?" if $?; return if $pager eq 'cat' || $pager eq ''; $new_env //= {}; $new_env->{LESS} //= 'FRX'; $new_env->{LV} //= '-c'; $new_env->{MORE} = $new_env->{LESS} if $^O eq 'freebsd'; - pipe(my ($r, $wpager)) or return warn "pipe: $!"; - my $rdr = { 0 => $r, 1 => $self->{1}, 2 => $self->{2} }; + my $rdr = { 1 => $self->{1}, 2 => $self->{2} }; + CORE::pipe($rdr->{0}, my $wpager) or return warn "pipe: $!"; my $pgr = [ undef, @$rdr{1, 2} ]; my $env = $self->{env}; if ($self->{sock}) { # lei(1) process runs it @@ -1085,17 +1126,17 @@ sub pgr_err { my ($self, @msg) = @_; return warn(@msg) unless $self->{sock} && -t $self->{2}; start_pager($self, { LESS => 'RX' }); # no 'F' so we prompt - print { $self->{2} } @msg; + say { $self->{2} } @msg, '# -quit pager to continue-'; $self->{2}->autoflush(1); stop_pager($self); - send($self->{sock}, 'wait', MSG_EOR); # wait for user to quit pager + send($self->{sock}, 'wait', 0); # wait for user to quit pager } sub stop_pager { my ($self) = @_; my $pgr = delete($self->{pgr}) or return; $self->{2} = $pgr->[2]; - close(delete($self->{1})) if $self->{1}; + delete($self->{1})->close if $self->{1}; $self->{1} = $pgr->[1]; } @@ -1105,24 +1146,22 @@ sub accept_dispatch { # Listener {post_accept} callback my $self = bless { sock => $sock }, __PACKAGE__; vec(my $rvec = '', fileno($sock), 1) = 1; select($rvec, undef, undef, 60) or - return send($sock, 'timed out waiting to recv FDs', MSG_EOR); + return send($sock, 'timed out waiting to recv FDs', 0); # (4096 * 33) >MAX_ARG_STRLEN - my @fds = $recv_cmd->($sock, my $buf, 4096 * 33) or return; # EOF + my @fds = $PublicInbox::IPC::recv_cmd->($sock, my $buf, 4096 * 33) or + return; # EOF if (!defined($fds[0])) { warn(my $msg = "recv_cmd failed: $!"); - return send($sock, $msg, MSG_EOR); + return send($sock, $msg, 0); } else { my $i = 0; - for my $fd (@fds) { - open($self->{$i++}, '+<&=', $fd) and next; - send($sock, "open(+<&=$fd) (FD=$i): $!", MSG_EOR); - } - $i == 4 or return send($sock, 'not enough FDs='.($i-1), MSG_EOR) + open($self->{$i++}, '+<&=', $_) for @fds; + $i == 4 or return send($sock, 'not enough FDs='.($i-1), 0) } # $ENV_STR = join('', map { "\0$_=$ENV{$_}" } keys %ENV); # $buf = "$argc\0".join("\0", @ARGV).$ENV_STR."\0\0"; substr($buf, -2, 2, '') eq "\0\0" or # s/\0\0\z// - return send($sock, 'request command truncated', MSG_EOR); + return send($sock, 'request command truncated', 0); my ($argc, @argv) = split(/\0/, $buf, -1); undef $buf; my %env = map { split(/=/, $_, 2) } splice(@argv, $argc); @@ -1145,11 +1184,11 @@ sub event_step { local %ENV = %{$self->{env}}; local $current_lei = $self; eval { - my @fds = $recv_cmd->($self->{sock} // return, my $buf, 4096); + my @fds = $PublicInbox::IPC::recv_cmd->( + $self->{sock} // return, my $buf, 4096); if (scalar(@fds) == 1 && !defined($fds[0])) { return if $! == EAGAIN; die "recvmsg: $!" if $! != ECONNRESET; - $buf = ''; @fds = (); # for open loop below: } for (@fds) { open my $rfh, '+<&=', $_ } @@ -1167,7 +1206,7 @@ sub event_step { die "unrecognized client signal: $buf"; } my $s = $self->{-socks} // []; # lei up --all - @$s = grep { send($_, $buf, MSG_EOR) } @$s; + @$s = grep { send($_, $buf, 0) } @$s; }; if (my $err = $@) { eval { $self->fail($err) }; @@ -1185,8 +1224,6 @@ sub event_step_init { }; } -sub noop {} - sub oldset { $oldset } sub dump_and_clear_log { @@ -1203,48 +1240,87 @@ sub dump_and_clear_log { sub cfg2lei ($) { my ($cfg) = @_; my $lei = bless { env => { %{$cfg->{-env}} } }, __PACKAGE__; - open($lei->{0}, '<&', \*STDIN) or die "dup 0: $!"; - open($lei->{1}, '>>&', \*STDOUT) or die "dup 1: $!"; - open($lei->{2}, '>>&', \*STDERR) or die "dup 2: $!"; - open($lei->{3}, '<', '/') or die "open /: $!"; - my ($x, $y); - socketpair($x, $y, AF_UNIX, SOCK_SEQPACKET, 0) or die "socketpair: $!"; + open($lei->{0}, '<&', \*STDIN); + open($lei->{1}, '>>&', \*STDOUT); + open($lei->{2}, '>>&', \*STDERR); + open($lei->{3}, '<', '/'); + socketpair(my $x, my $y, AF_UNIX, SOCK_SEQPACKET, 0); $lei->{sock} = $x; require PublicInbox::LeiSelfSocket; PublicInbox::LeiSelfSocket->new($y); # adds to event loop $lei; } +sub note_event ($@) { # runs lei_note_event for a given config file + my ($cfg_f, @args) = @_; + my $cfg = $PATH2CFG{$cfg_f} // return; + eval { cfg2lei($cfg)->dispatch('note-event', @args) }; + carp "E: note-event $cfg_f: $@\n" if $@; +} + sub dir_idle_handler ($) { # PublicInbox::DirIdle callback my ($ev) = @_; # Linux::Inotify2::Event or duck type my $fn = $ev->fullname; if ($fn =~ m!\A(.+)/(new|cur)/([^/]+)\z!) { # Maildir file - my ($mdir, $nc, $bn) = ($1, $2, $3); - $nc = '' if $ev->IN_DELETE || $ev->IN_MOVED_FROM; - for my $f (keys %{$MDIR2CFGPATH->{$mdir} // {}}) { - my $cfg = $PATH2CFG{$f} // next; - eval { - my $lei = cfg2lei($cfg); - $lei->dispatch('note-event', - "maildir:$mdir", $nc, $bn, $fn); - }; - warn "E: note-event $f: $@\n" if $@; + my ($loc, $new_cur, $bn) = ("maildir:$1", $2, $3); + $new_cur = '' if $ev->IN_DELETE || $ev->IN_MOVED_FROM; + for my $cfg_f (keys %{$MDIR2CFGPATH->{$loc} // {}}) { + note_event($cfg_f, $loc, $new_cur, $bn, $fn); } - } + } elsif ($fn =~ m!\A(.+)/([0-9]+)\z!) { # MH mail message file + my ($loc, $n, $new_cur) = ("mh:$1", $2, '+'); + $new_cur = '' if $ev->IN_DELETE || $ev->IN_MOVED_FROM; + for my $cfg_f (keys %{$MDIR2CFGPATH->{$loc} // {}}) { + note_event($cfg_f, $loc, $new_cur, $n, $fn); + } + } elsif ($fn =~ m!\A(.+)/\.mh_sequences\z!) { # reread flags + my $loc = "mh:$1"; + for my $cfg_f (keys %{$MDIR2CFGPATH->{$loc} // {}}) { + note_event($cfg_f, $loc, '.mh_sequences') + } + } # else we don't care if ($ev->can('cancel') && ($ev->IN_IGNORE || $ev->IN_UNMOUNT)) { $ev->cancel; } if ($fn =~ m!\A(.+)/(?:new|cur)\z! && !-e $fn) { - delete $MDIR2CFGPATH->{$1}; + delete $MDIR2CFGPATH->{"maildir:$1"}; } - if (!-e $fn) { # config file or Maildir gone - for my $cfgpaths (values %$MDIR2CFGPATH) { - delete $cfgpaths->{$fn}; - } + if (!-e $fn) { # config file, Maildir, or MH dir gone + delete $_->{$fn} for values %$MDIR2CFGPATH; # config file + delete @$MDIR2CFGPATH{"maildir:$fn", "mh:$fn"}; delete $PATH2CFG{$fn}; } } +sub can_stay_alive { # PublicInbox::DS::post_loop_do cb + my ($path, $dev_ino_expect) = @_; + if (my @st = defined($$path) ? stat($$path) : ()) { + if ($dev_ino_expect ne pack('dd', $st[0], $st[1])) { + warn "$$path dev/ino changed, quitting\n"; + $$path = undef; + } + } elsif (defined($$path)) { # ENOENT is common + warn "stat($$path): $!, quitting ...\n" if $! != ENOENT; + undef $$path; + $quit->(); + } + return 1 if defined($$path); + my $n = PublicInbox::DS::close_non_busy() or do { + eval 'PublicInbox::LeiNoteEvent::flush_task()'; + # drop stores only if no clients + for my $cfg (values %PATH2CFG) { + my $lne = delete($cfg->{-lei_note_event}); + $lne->wq_close if $lne; + my $sto = delete($cfg->{-lei_store}) // next; + eval { $sto->wq_do('done') if $sto->{-wq_s1} }; + warn "E: $@ (dropping store for $cfg->{-f})" if $@; + $sto->wq_close; + } + }; + # returns true: continue, false: stop + $n + scalar(keys(%PublicInbox::DS::AWAIT_PIDS)); +} + # lei(1) calls this when it can't connect sub lazy_start { my ($path, $errno, $narg) = @_; @@ -1252,106 +1328,66 @@ sub lazy_start { my ($sock_dir) = ($path =~ m!\A(.+?)/[^/]+\z!); $errors_log = "$sock_dir/errors.log"; my $addr = pack_sockaddr_un($path); - my $lk = bless { lock_path => $errors_log }, 'PublicInbox::Lock'; + my $lk = PublicInbox::Lock->new($errors_log); umask(077) // die("umask(077): $!"); $lk->lock_acquire; - socket($listener, AF_UNIX, SOCK_SEQPACKET, 0) or die "socket: $!"; + socket($listener, AF_UNIX, SOCK_SEQPACKET, 0); if ($errno == ECONNREFUSED || $errno == ENOENT) { return if connect($listener, $addr); # another process won - if ($errno == ECONNREFUSED && -S $path) { - unlink($path) or die "unlink($path): $!"; - } + unlink($path) if $errno == ECONNREFUSED && -S $path; } else { $! = $errno; # allow interpolation to stringify in die die "connect($path): $!"; } - bind($listener, $addr) or die "bind($path): $!"; + bind($listener, $addr); $lk->lock_release; undef $lk; my @st = stat($path) or die "stat($path): $!"; my $dev_ino_expect = pack('dd', $st[0], $st[1]); # dev+ino - local $oldset = PublicInbox::DS::block_signals(); - if ($narg == 5) { - $send_cmd = PublicInbox::Spawn->can('send_cmd4'); - $recv_cmd = PublicInbox::Spawn->can('recv_cmd4') // do { - require PublicInbox::CmdIPC4; - $send_cmd = PublicInbox::CmdIPC4->can('send_cmd4'); - PublicInbox::CmdIPC4->can('recv_cmd4'); - } // do { - $send_cmd = PublicInbox::Syscall->can('send_cmd4'); - PublicInbox::Syscall->can('recv_cmd4'); - }; - } - $recv_cmd or die <<""; + local $oldset = PublicInbox::DS::block_signals(POSIX::SIGALRM); + die "incompatible narg=$narg" if $narg != 5; + $PublicInbox::IPC::send_cmd or die <<""; (Socket::MsgHdr || Inline::C) missing/unconfigured (narg=$narg); require PublicInbox::Listener; require PublicInbox::PktOp; (-p STDOUT) or die "E: stdout must be a pipe\n"; - open(STDIN, '+>>', $errors_log) or die "open($errors_log): $!"; + open(STDIN, '+>>', $errors_log); STDIN->autoflush(1); dump_and_clear_log(); POSIX::setsid() > 0 or die "setsid: $!"; - my $pid = fork // die "fork: $!"; + my $pid = PublicInbox::OnDestroy::fork_tmp; return if $pid; $0 = "lei-daemon $path"; - local %PATH2CFG; - local $MDIR2CFGPATH; + local (%PATH2CFG, $MDIR2CFGPATH); + local $daemon_pid = $$; $listener->blocking(0); my $exit_code; my $pil = PublicInbox::Listener->new($listener, \&accept_dispatch); local $quit = do { my (undef, $eof_p) = PublicInbox::PktOp->pair; sub { - $exit_code //= shift; + $exit_code //= eval("POSIX::SIG$_[0] + 128") if @_; + $dir_idle->close if $dir_idle; # EPOLL_CTL_DEL + $dir_idle = undef; # let RC take care of it eval 'PublicInbox::LeiNoteEvent::flush_task()'; - my $lis = $pil or exit($exit_code); + my $lis = $pil or exit($exit_code // 0); # closing eof_p triggers \&noop wakeup $listener = $eof_p = $pil = $path = undef; $lis->close; # DS::close - PublicInbox::DS->SetLoopTimeout(1000); }; }; - my $sig = { - CHLD => \&PublicInbox::DS::enqueue_reap, - QUIT => $quit, - INT => $quit, - TERM => $quit, - HUP => \&noop, - USR1 => \&noop, - USR2 => \&noop, - }; + my $sig = { CHLD => \&PublicInbox::DS::enqueue_reap }; + $sig->{$_} = $quit for qw(QUIT INT TERM); + $sig->{$_} = \&PublicInbox::Config::noop for qw(HUP USR1 USR2); require PublicInbox::DirIdle; local $dir_idle = PublicInbox::DirIdle->new(sub { - # just rely on wakeup to hit PostLoopCallback set below + # just rely on wakeup to hit post_loop_do dir_idle_handler($_[0]) if $_[0]->fullname ne $path; }); $dir_idle->add_watches([$sock_dir]); - PublicInbox::DS->SetPostLoopCallback(sub { - my ($dmap, undef) = @_; - if (@st = defined($path) ? stat($path) : ()) { - if ($dev_ino_expect ne pack('dd', $st[0], $st[1])) { - warn "$path dev/ino changed, quitting\n"; - $path = undef; - } - } elsif (defined($path)) { # ENOENT is common - warn "stat($path): $!, quitting ...\n" if $! != ENOENT; - undef $path; - $quit->(); - } - return 1 if defined($path); - my $n = 0; - for my $s (values %$dmap) { - $s->can('busy') or next; - if ($s->busy) { - ++$n; - } else { - $s->close; - } - } - $n; # true: continue, false: stop - }); - + local @PublicInbox::DS::post_loop_do = (\&can_stay_alive, + \$path, $dev_ino_expect); # STDIN was redirected to /dev/null above, closing STDERR and # STDOUT will cause the calling `lei' client process to finish # reading the <$daemon> pipe. @@ -1359,13 +1395,13 @@ sub lazy_start { $current_lei ? err($current_lei, @_) : warn( strftime('%Y-%m-%dT%H:%M:%SZ', gmtime(time))," $$ ", @_); }; - open STDERR, '>&STDIN' or die "redirect stderr failed: $!"; - open STDOUT, '>&STDIN' or die "redirect stdout failed: $!"; + local $SIG{PIPE} = 'IGNORE'; + local $SIG{ALRM} = 'IGNORE'; + open STDERR, '>&STDIN'; + open STDOUT, '>&STDIN'; # $daemon pipe to `lei' closed, main loop begins: eval { PublicInbox::DS::event_loop($sig, $oldset) }; warn "event loop error: $@\n" if $@; - # exit() may trigger waitpid via various DESTROY, ensure interruptible - PublicInbox::DS::sig_setmask($oldset); dump_and_clear_log(); exit($exit_code // 0); } @@ -1376,9 +1412,10 @@ sub busy { 1 } # prevent daemon-shutdown if client is connected # can immediately reread it sub DESTROY { my ($self) = @_; - if (my $counters = delete $self->{counters}) { - for my $k (sort keys %$counters) { - my $nr = $counters->{$k}; + if (defined($self->{incr_pid}) && $self->{incr_pid} == $$) { + for my $k (sort(grep(/\A-nr_/, keys %$self))) { + my $nr = $self->{$k}; + substr($k, 0, length('-nr_'), ''); $self->child_error(0, "$nr $k messages"); } } @@ -1388,9 +1425,8 @@ sub DESTROY { # preserve $? for ->fail or ->x_it code } -sub wq_done_wait { # dwaitpid callback - my ($arg, $pid) = @_; - my ($wq, $lei) = @$arg; +sub wq_done_wait { # awaitpid cb (via wq_eof) + my ($pid, $wq, $lei) = @_; local $current_lei = $lei; my $err_type = $lei->{-err_type}; $? and $lei->child_error($?, @@ -1400,15 +1436,14 @@ sub wq_done_wait { # dwaitpid callback sub fchdir { my ($lei) = @_; - my $dh = $lei->{3} // die 'BUG: lei->{3} (CWD) gone'; - chdir($dh) || die "fchdir: $!"; + chdir($lei->{3} // die 'BUG: lei->{3} (CWD) gone'); } sub wq_eof { # EOF callback for main daemon my ($lei, $wq_fld) = @_; local $current_lei = $lei; my $wq = delete $lei->{$wq_fld // 'wq1'}; - $lei->sto_done_request($wq); + $lei->sto_barrier_request($wq); $wq // $lei->fail; # already failed } @@ -1417,19 +1452,22 @@ sub watch_state_ok ($) { $state =~ /\Apause|(?:import|index|tag)-(?:ro|rw)\z/; } -sub cancel_maildir_watch ($$) { - my ($d, $cfg_f) = @_; - my $w = delete $MDIR2CFGPATH->{$d}->{$cfg_f}; - scalar(keys %{$MDIR2CFGPATH->{$d}}) or - delete $MDIR2CFGPATH->{$d}; - for my $x (@{$w // []}) { $x->cancel } +sub cancel_dir_watch ($$$) { + my ($type, $d, $cfg_f) = @_; + my $loc = "$type:".canonpath_harder($d); + my $w = delete $MDIR2CFGPATH->{$loc}->{$cfg_f}; + delete $MDIR2CFGPATH->{$loc} if !(keys %{$MDIR2CFGPATH->{$loc}}); + $_->cancel for @$w; } -sub add_maildir_watch ($$) { - my ($d, $cfg_f) = @_; - if (!exists($MDIR2CFGPATH->{$d}->{$cfg_f})) { - my @w = $dir_idle->add_watches(["$d/cur", "$d/new"], 1); - push @{$MDIR2CFGPATH->{$d}->{$cfg_f}}, @w if @w; +sub add_dir_watch ($$$) { + my ($type, $d, $cfg_f) = @_; + $d = canonpath_harder($d); + my $loc = "$type:$d"; + my @dirs = $type eq 'mh' ? ($d) : ("$d/cur", "$d/new"); + if (!exists($MDIR2CFGPATH->{$loc}->{$cfg_f})) { + my @w = $dir_idle->add_watches(\@dirs, 1); + push @{$MDIR2CFGPATH->{$loc}->{$cfg_f}}, @w if @w; } } @@ -1442,24 +1480,20 @@ sub refresh_watches { my %seen; my $cfg_f = $cfg->{'-f'}; for my $w (grep(/\Awatch\..+\.state\z/, keys %$cfg)) { - my $url = substr($w, length('watch.'), -length('.state')); + my $loc = substr($w, length('watch.'), -length('.state')); require PublicInbox::LeiWatch; - $watches->{$url} //= PublicInbox::LeiWatch->new($url); - $seen{$url} = undef; - my $state = $cfg->get_1("watch.$url.state"); + $watches->{$loc} //= PublicInbox::LeiWatch->new($loc); + $seen{$loc} = undef; + my $state = $cfg->get_1("watch.$loc.state"); if (!watch_state_ok($state)) { - warn("watch.$url.state=$state not supported\n"); - next; - } - if ($url =~ /\Amaildir:(.+)/i) { - my $d = canonpath_harder($1); - if ($state eq 'pause') { - cancel_maildir_watch($d, $cfg_f); - } else { - add_maildir_watch($d, $cfg_f); - } + warn("watch.$loc.state=$state not supported\n"); + } elsif ($loc =~ /\A(maildir|mh):(.+)\z/i) { + my ($type, $d) = ($1, $2); + $state eq 'pause' ? + cancel_dir_watch($type, $d, $cfg_f) : + add_dir_watch($type, $d, $cfg_f); } else { # TODO: imap/nntp/jmap - $lei->child_error(0, "E: watch $url not supported, yet") + $lei->child_error(0, "E: watch $loc not supported, yet") } } @@ -1467,29 +1501,28 @@ sub refresh_watches { my $lms = $lei->lms; if ($lms) { $lms->lms_write_prepare; - for my $d ($lms->folders('maildir:')) { - substr($d, 0, length('maildir:')) = ''; - + for my $loc ($lms->folders(qr/\A(?:maildir|mh):/)) { + my $old = $loc; + my ($type, $d) = split /:/, $loc, 2; # fixup old bugs while we're iterating: - my $cd = canonpath_harder($d); - my $f = "maildir:$cd"; - $lms->rename_folder("maildir:$d", $f) if $d ne $cd; - next if $watches->{$f}; # may be set to pause + $d = canonpath_harder($d); + $loc = "$type:$d"; + $lms->rename_folder($old, $loc) if $old ne $loc; + next if $watches->{$loc}; # may be set to pause require PublicInbox::LeiWatch; - $watches->{$f} = PublicInbox::LeiWatch->new($f); - $seen{$f} = undef; - add_maildir_watch($cd, $cfg_f); + $watches->{$loc} = PublicInbox::LeiWatch->new($loc); + $seen{$loc} = undef; + add_dir_watch($type, $d, $cfg_f); } } if ($old) { # cull old non-existent entries - for my $url (keys %$old) { - next if exists $seen{$url}; - delete $old->{$url}; - if ($url =~ /\Amaildir:(.+)/i) { - my $d = canonpath_harder($1); - cancel_maildir_watch($d, $cfg_f); + for my $loc (keys %$old) { + next if exists $seen{$loc}; + delete $old->{$loc}; + if ($loc =~ /\A(maildir|mh):(.+)\z/i) { + cancel_dir_watch($1, $2, $cfg_f); } else { # TODO: imap/nntp/jmap - $lei->child_error(0, "E: watch $url TODO"); + $lei->child_error(0, "E: watch $loc TODO"); } } } @@ -1515,24 +1548,24 @@ sub lms { (-f $f || $creat) ? PublicInbox::LeiMailSync->new($f) : undef; } -sub sto_done_request { +sub sto_barrier_request { my ($lei, $wq) = @_; - return unless $lei->{sto}; + return unless $lei->{sto} && $lei->{sto}->{-wq_s1}; local $current_lei = $lei; - my $sock = $wq ? $wq->{lei_sock} : undef; - eval { - if ($sock //= $lei->{sock}) { # issue, async wait - $lei->{sto}->wq_io_do('done', [ $sock ]); - } else { # forcibly wait - my $wait = $lei->{sto}->wq_do('done'); - } - }; + if (my $n = $lei->{opt}->{'commit-delay'}) { + eval { $lei->{sto}->wq_do('schedule_commit', $n) }; + } else { + my $s = ($wq ? $wq->{lei_sock} : undef) // $lei->{sock}; + my $errfh = $lei->{2} // *STDERR{GLOB}; + my @io = $s ? ($errfh, $s) : ($errfh); + eval { $lei->{sto}->wq_io_do('barrier', \@io, 1) }; + } warn($@) if $@; } sub cfg_dump ($$) { my ($lei, $f) = @_; - my $ret = eval { PublicInbox::Config->git_config_dump($f, $lei->{2}) }; + my $ret = eval { PublicInbox::Config->git_config_dump($f, $lei) }; return $ret if !$@; warn($@); undef; @@ -1541,7 +1574,7 @@ sub cfg_dump ($$) { sub request_umask { my ($lei) = @_; my $s = $lei->{sock} // return; - send($s, 'umask', MSG_EOR) // die "send: $!"; + send($s, 'umask', 0) // die "send: $!"; vec(my $rvec = '', fileno($s), 1) = 1; select($rvec, undef, undef, 2) or die 'timeout waiting for umask'; recv($s, my $v, 5, 0) // die "recv: $!"; @@ -1549,4 +1582,24 @@ sub request_umask { $u eq 'u' or warn "E: recv $v has no umask"; } +sub _stdin_cb { # PublicInbox::InputPipe::consume callback for --stdin + my (undef, $lei, $cb) = @_; # $_[-1] = $rbuf + $_[1] // return $lei->fail("error reading stdin: $!"); + $lei->{stdin_buf} .= $_[-1]; + do_env($lei, $cb) if $_[-1] eq ''; +} + +sub slurp_stdin { + my ($lei, $cb) = @_; + require PublicInbox::InputPipe; + my $in = $lei->{0}; + if (-t $in) { # run cat via script/lei and read from it + $in = undef; + pipe($in, my $wr); + say { $lei->{2} } '# enter query, Ctrl-D when done'; + send_exec_cmd($lei, [ $lei->{0}, $wr ], ['cat'], {}); + } + PublicInbox::InputPipe::consume($in, \&_stdin_cb, $lei, $cb); +} + 1; diff --git a/lib/PublicInbox/LI2Wrap.pm b/lib/PublicInbox/LI2Wrap.pm index 204850a6..d4792b25 100644 --- a/lib/PublicInbox/LI2Wrap.pm +++ b/lib/PublicInbox/LI2Wrap.pm @@ -5,7 +5,7 @@ # Remove this when supported LTS/enterprise distros are all # Linux::Inotify2 >= 2.3 package PublicInbox::LI2Wrap; -use v5.10.1; +use v5.12; our @ISA = qw(Linux::Inotify2); sub wrapclose { diff --git a/lib/PublicInbox/LeiALE.pm b/lib/PublicInbox/LeiALE.pm index cc9a2095..ce03f5b4 100644 --- a/lib/PublicInbox/LeiALE.pm +++ b/lib/PublicInbox/LeiALE.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # All Locals Ever: track lei/store + externals ever used as @@ -6,11 +6,12 @@ # and --only targets that haven't been through "lei add-external". # Typically: ~/.cache/lei/all_locals_ever.git package PublicInbox::LeiALE; -use strict; -use v5.10.1; +use v5.12; use parent qw(PublicInbox::LeiSearch PublicInbox::Lock); use PublicInbox::Git; +use autodie qw(close open rename seek truncate); use PublicInbox::Import; +use PublicInbox::OnDestroy; use PublicInbox::LeiXSearch; use Fcntl qw(SEEK_SET); @@ -41,11 +42,11 @@ sub over {} # undef for xoids_for sub overs_all { # for xoids_for (called only in lei workers?) my ($self) = @_; - my $pid = $$; - if (($self->{owner_pid} // $pid) != $pid) { + my $fgen = $PublicInbox::OnDestroy::fork_gen ; + if (($self->{fgen} // $fgen) != $fgen) { delete($_->{over}) for @{$self->{ibxish}}; } - $self->{owner_pid} = $pid; + $self->{fgen} = $fgen; grep(defined, map { $_->over } @{$self->{ibxish}}); } @@ -54,11 +55,7 @@ sub refresh_externals { $self->git->cleanup; my $lk = $self->lock_for_scope; my $cur_lxs = ref($lxs)->new; - my $orig = do { - local $/; - readline($self->{lockfh}) // - die "readline($self->{lock_path}): $!"; - }; + my $orig = PublicInbox::IO::read_all $self->{lockfh}; my $new = ''; my $old = ''; my $gone = 0; @@ -81,19 +78,16 @@ sub refresh_externals { if ($new ne '' || $gone) { $self->{lockfh}->autoflush(1); if ($gone) { - seek($self->{lockfh}, 0, SEEK_SET) or die "seek: $!"; - truncate($self->{lockfh}, 0) or die "truncate: $!"; + seek($self->{lockfh}, 0, SEEK_SET); + truncate($self->{lockfh}, 0); } else { $old = ''; } print { $self->{lockfh} } $old, $new or die "print: $!"; } - $new = $old = ''; + $new = ''; my $f = $self->git->{git_dir}.'/objects/info/alternates'; - if (open my $fh, '<', $f) { - local $/; - $old = <$fh> // die "readline($f): $!"; - } + $old = PublicInbox::IO::try_cat $f; for my $x (@ibxish) { $new .= $lei->canonpath_harder($x->git->{git_dir})."/objects\n"; } @@ -103,10 +97,10 @@ sub refresh_externals { # this needs to be atomic since child processes may start # git-cat-file at any time my $tmp = "$f.$$.tmp"; - open my $fh, '>', $tmp or die "open($tmp): $!"; - print $fh $new or die "print($tmp): $!"; - close $fh or die "close($tmp): $!"; - rename($tmp, $f) or die "rename($tmp, $f): $!"; + open my $fh, '>', $tmp; + print $fh $new; + close $fh; + rename($tmp, $f) } 1; diff --git a/lib/PublicInbox/LeiAddWatch.pm b/lib/PublicInbox/LeiAddWatch.pm index 97e7a342..e2be5cee 100644 --- a/lib/PublicInbox/LeiAddWatch.pm +++ b/lib/PublicInbox/LeiAddWatch.pm @@ -15,24 +15,23 @@ sub lei_add_watch { my $state = $lei->{opt}->{'state'} // 'import-rw'; $lei->watch_state_ok($state) or return $lei->fail("invalid state: $state"); - my $vmd_mod = $self->vmd_mod_extract(\@argv); - return $lei->fail(join("\n", @{$vmd_mod->{err}})) if $vmd_mod->{err}; $self->prepare_inputs($lei, \@argv) or return; my @vmd; - while (my ($type, $vals) = each %$vmd_mod) { + while (my ($type, $vals) = each %{$lei->{vmd_mod}}) { push @vmd, "$type:$_" for @$vals; } my $vmd0 = shift @vmd; for my $w (@{$self->{inputs}}) { # clobber existing, allow multiple if (defined($vmd0)) { - $lei->_config("watch.$w.vmd", '--replace-all', $vmd0); + $lei->_config("watch.$w.vmd", '--replace-all', $vmd0) + or return; for my $v (@vmd) { - $lei->_config("watch.$w.vmd", $v); + $lei->_config("watch.$w.vmd", $v) or return; } } next if defined $cfg->{"watch.$w.state"}; - $lei->_config("watch.$w.state", $state); + $lei->_config("watch.$w.state", $state) or return; } $lei->_lei_store(1); # create $lei->lms(1)->lms_write_prepare->add_folders(@{$self->{inputs}}); diff --git a/lib/PublicInbox/LeiAuth.pm b/lib/PublicInbox/LeiAuth.pm index 9b09cecf..020dd125 100644 --- a/lib/PublicInbox/LeiAuth.pm +++ b/lib/PublicInbox/LeiAuth.pm @@ -1,8 +1,8 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Authentication worker for anything that needs auth for read/write IMAP -# (eventually for read-only NNTP access) +# and read-only NNTP access # # timelines # lei-daemon | LeiAuth worker #0 | other WQ workers @@ -22,8 +22,7 @@ # | # call net_merge_all_done ->-> do per-WQ-class defined actions package PublicInbox::LeiAuth; -use strict; -use v5.10.1; +use v5.12; sub do_auth_atfork { # used by IPC WQ workers my ($self, $wq) = @_; @@ -57,7 +56,7 @@ sub net_merge_all { # called in wq worker via wq_broadcast # called by top-level lei-daemon when first worker is done with auth # passes updated net auth info to current workers sub net_merge_continue { - my ($wq, $lei, $net_new) = @_; + my ($lei, $wq, $net_new) = @_; $wq->{-net_new} = $net_new; # for "lei up" $wq->wq_broadcast('PublicInbox::LeiAuth::net_merge_all', $net_new); $wq->net_merge_all_done($lei); # defined per-WQ @@ -65,7 +64,7 @@ sub net_merge_continue { sub op_merge { # prepares PktOp->pair ops my ($self, $ops, $wq, $lei) = @_; - $ops->{net_merge_continue} = [ \&net_merge_continue, $wq, $lei ]; + $ops->{net_merge_continue} = [ \&net_merge_continue, $lei, $wq ]; } sub new { bless \(my $x), __PACKAGE__ } diff --git a/lib/PublicInbox/LeiBlob.pm b/lib/PublicInbox/LeiBlob.pm index 004b156c..00697097 100644 --- a/lib/PublicInbox/LeiBlob.pm +++ b/lib/PublicInbox/LeiBlob.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # "lei blob $OID" command @@ -7,8 +7,11 @@ package PublicInbox::LeiBlob; use strict; use v5.10.1; use parent qw(PublicInbox::IPC); -use PublicInbox::Spawn qw(spawn popen_rd which); +use PublicInbox::Spawn qw(run_wait run_qx which); use PublicInbox::DS; +use PublicInbox::Eml; +use PublicInbox::Git; +use PublicInbox::IO qw(read_all); sub get_git_dir ($$) { my ($lei, $d) = @_; @@ -21,10 +24,8 @@ sub get_git_dir ($$) { } else { # implicit --cwd, quiet errors open $opt->{2}, '>', '/dev/null' or die "open /dev/null: $!"; } - my ($r, $pid) = popen_rd($cmd, {GIT_DIR => undef}, $opt); - chomp(my $gd = do { local $/; <$r> }); - waitpid($pid, 0) == $pid or die "BUG: waitpid @$cmd ($!)"; - $? == 0 ? $gd : undef; + chomp(my $git_dir = run_qx($cmd, {GIT_DIR => undef}, $opt)); + $? ? undef : $git_dir; } sub solver_user_cb { # called by solver when done @@ -44,8 +45,7 @@ sub solver_user_cb { # called by solver when done my $cmd = [ 'git', "--git-dir=$gd", 'show', $oid ]; my $rdr = { 1 => $lei->{1}, 2 => $lei->{2} }; - waitpid(spawn($cmd, $lei->{env}, $rdr), 0); - $lei->child_error($?) if $?; + run_wait($cmd, $lei->{env}, $rdr) and $lei->child_error($?); } sub do_solve_blob { # via wq_do @@ -70,7 +70,7 @@ sub do_solve_blob { # via wq_do } @$git_dirs ], user_cb => \&solver_user_cb, uarg => $self, - # -cur_di, -qsp, -msg => temporary fields for Qspawn callbacks + # -cur_di, -msg => temporary fields for Qspawn callbacks inboxes => [ $self->{lxs}->locals, @rmt ], }, 'PublicInbox::SolverGit'; local $PublicInbox::DS::in_loop = 0; # waitpid synchronously @@ -119,28 +119,29 @@ sub lei_blob { } else { open $rdr->{2}, '>', '/dev/null' or die "open: $!"; } - my $cmd = [ 'git', '--git-dir='.$lei->ale->git->{git_dir}, - 'cat-file', 'blob', $blob ]; + my $cmd = $lei->ale->git->cmd('cat-file', 'blob', $blob); + my $cerr; if (defined $lei->{-attach_idx}) { - my $fh = popen_rd($cmd, $lei->{env}, $rdr); - require PublicInbox::Eml; - my $buf = do { local $/; <$fh> }; - return extract_attach($lei, $blob, \$buf) if close($fh); + my $buf = run_qx($cmd, $lei->{env}, $rdr); + return extract_attach($lei, $blob, \$buf) unless $?; + $cerr = $?; } else { - $rdr->{1} = $lei->{1}; - waitpid(spawn($cmd, $lei->{env}, $rdr), 0); + $rdr->{1} = $lei->{1}; # write directly to client + $cerr = run_wait($cmd, $lei->{env}, $rdr) or return; } - my $ce = $?; - return if $ce == 0; + # fall back to unimported ('lei index') and inflight blobs my $lms = $lei->lms; - if (my $bref = $lms ? $lms->local_blob($blob, 1) : undef) { - defined($lei->{-attach_idx}) and - return extract_attach($lei, $blob, $bref); - return $lei->out($$bref); - } elsif ($opt->{mail}) { - my $eh = $rdr->{2}; - seek($eh, 0, 0); - return $lei->child_error($ce, do { local $/; <$eh> }); + my $bref = ($lms ? $lms->local_blob($blob, 1) : undef) // do { + my $sto = $lei->{sto} // $lei->_lei_store; + $sto && $sto->{-wq_s1} ? $sto->wq_do('cat_blob', $blob) + : undef; + }; + $bref and return $lei->{-attach_idx} ? + extract_attach($lei, $blob, $bref) : + $lei->out($$bref); + if ($opt->{mail}) { + seek($rdr->{2}, 0, 0); + return $lei->child_error($cerr, read_all($rdr->{2})); } # else: fall through to solver below } @@ -158,7 +159,7 @@ sub lei_blob { if ($lxs->remotes) { require PublicInbox::LeiRemote; $lei->{curl} //= which('curl') or return - $lei->fail('curl needed for', $lxs->remotes); + $lei->fail('curl needed for '.join(', ',$lxs->remotes)); $lei->_lei_store(1)->write_prepare($lei); } require PublicInbox::SolverGit; diff --git a/lib/PublicInbox/LeiConfig.pm b/lib/PublicInbox/LeiConfig.pm index 23be9aaf..a50ff2b6 100644 --- a/lib/PublicInbox/LeiConfig.pm +++ b/lib/PublicInbox/LeiConfig.pm @@ -1,9 +1,11 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -package PublicInbox::LeiConfig; -use strict; -use v5.10.1; +package PublicInbox::LeiConfig; # subclassed by LeiEditSearch +use v5.12; use PublicInbox::PktOp; +use Fcntl qw(SEEK_SET); +use autodie qw(open seek); +use PublicInbox::IO qw(read_all); sub cfg_do_edit ($;$) { my ($self, $reason) = @_; @@ -15,28 +17,39 @@ sub cfg_do_edit ($;$) { # run in script/lei foreground my ($op_c, $op_p) = PublicInbox::PktOp->pair; # $op_p will EOF when $EDITOR is done - $op_c->{ops} = { '' => [\&cfg_edit_done, $self] }; + $op_c->{ops} = { '' => [\&cfg_edit_done, $lei, $self] }; $lei->send_exec_cmd([ @$lei{qw(0 1 2)}, $op_p->{op_p} ], $cmd, $env); } -sub cfg_edit_done { # PktOp - my ($self) = @_; - eval { - my $cfg = $self->{lei}->cfg_dump($self->{-f}, $self->{lei}->{2}) - // return cfg_do_edit($self, "\n"); - $self->cfg_verify($cfg) if $self->can('cfg_verify'); +sub cfg_edit_done { # PktOp lei->do_env cb + my ($lei, $self) = @_; + open my $fh, '+>', undef; + my $cfg = do { + local $lei->{2} = $fh; + $lei->cfg_dump($self->{-f}); + } or do { + seek($fh, 0, SEEK_SET); + return cfg_do_edit($self, read_all($fh)); }; - $self->{lei}->fail($@) if $@; + $self->cfg_verify($cfg) if $self->can('cfg_verify'); } sub lei_config { my ($lei, @argv) = @_; $lei->{opt}->{'config-file'} and return $lei->fail( "config file switches not supported by `lei config'"); - return $lei->_config(@argv) unless $lei->{opt}->{edit}; - my $f = $lei->_lei_cfg(1)->{-f}; - my $self = bless { lei => $lei, -f => $f }, __PACKAGE__; - cfg_do_edit($self); + if ($lei->{opt}->{edit}) { + @argv and return $lei->fail( +'--edit must be used without other arguments'); + $lei->{opt}->{c} and return $lei->fail( +"`-c $lei->{opt}->{c}->[0]' not allowed with --edit"); + my $f = $lei->_lei_cfg(1)->{-f}; + cfg_do_edit(bless { lei => $lei, -f => $f }, __PACKAGE__); + } elsif (@argv) { # let git-config do error-checking + $lei->_config(@argv); + } else { + $lei->_help('no options given'); + } } 1; diff --git a/lib/PublicInbox/LeiConvert.pm b/lib/PublicInbox/LeiConvert.pm index 59af40de..4d4fceb2 100644 --- a/lib/PublicInbox/LeiConvert.pm +++ b/lib/PublicInbox/LeiConvert.pm @@ -28,15 +28,22 @@ sub input_maildir_cb { $self->{wcb}->(undef, { kw => $kw }, $eml); } +sub input_mh_cb { + my ($dn, $bn, $kw, $eml, $self) = @_; + $self->{wcb}->(undef, { kw => $kw }, $eml); +} + sub process_inputs { # via wq_do my ($self) = @_; - local $PublicInbox::DS::in_loop = 0; # force synchronous dwaitpid + local $PublicInbox::DS::in_loop = 0; # force synchronous awaitpid $self->SUPER::process_inputs; my $lei = $self->{lei}; - delete $lei->{1}; - delete $self->{wcb}; # commit - my $nr_w = delete($lei->{-nr_write}) // 0; - my $d = (delete($lei->{-nr_seen}) // 0) - $nr_w; + my $l2m = delete $lei->{l2m}; + delete $self->{wcb}; # may close connections + $l2m->finish_output($lei) if $l2m; + if (my $v2w = delete $lei->{v2w}) { $v2w->done } # may die + my $nr_w = delete($l2m->{-nr_write}) // 0; + my $d = (delete($l2m->{-nr_seen}) // 0) - $nr_w; $d = $d ? " ($d duplicates)" : ''; $lei->qerr("# converted $nr_w messages$d"); } @@ -45,6 +52,7 @@ sub lei_convert { # the main "lei convert" method my ($lei, @inputs) = @_; $lei->{opt}->{kw} //= 1; $lei->{opt}->{dedupe} //= 'none'; + $lei->{input_opt}->{sort} = 1; # for LeiToMail conflict check my $self = bless {}, __PACKAGE__; my $ovv = PublicInbox::LeiOverview->new($lei, 'out-format'); $lei->{l2m} or return @@ -64,7 +72,7 @@ sub ipc_atfork_child { my ($self) = @_; my $lei = $self->{lei}; $lei->_lei_atfork_child; - my $l2m = delete $lei->{l2m}; + my $l2m = $lei->{l2m}; if (my $net = $lei->{net}) { # may prompt user once $net->{mics_cached} = $net->imap_common_init($lei); $net->{nn_cached} = $net->nntp_common_init($lei); diff --git a/lib/PublicInbox/LeiCurl.pm b/lib/PublicInbox/LeiCurl.pm index 5ffade99..48c66ee9 100644 --- a/lib/PublicInbox/LeiCurl.pm +++ b/lib/PublicInbox/LeiCurl.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # common option and torsocks(1) wrapping for curl(1) @@ -7,8 +7,7 @@ # n.b. curl may support a daemon/client model like lei someday: # https://github.com/curl/curl/wiki/curl-tool-master-client package PublicInbox::LeiCurl; -use strict; -use v5.10.1; +use v5.12; use PublicInbox::Spawn qw(which); use PublicInbox::Config; @@ -27,7 +26,7 @@ sub new { my ($cls, $lei, $curl) = @_; $curl //= which('curl') // return $lei->fail('curl not found'); my $opt = $lei->{opt}; - my @cmd = ($curl, qw(-Sf)); + my @cmd = ($curl, qw(-gSf)); $cmd[-1] .= 's' if $opt->{quiet}; # already the default for "lei q" $cmd[-1] .= 'v' if $opt->{verbose}; # we use ourselves, too for my $o ($lei->curl_opt) { @@ -77,8 +76,8 @@ sub for_uri { my $pfx = torsocks($self, $lei, $uri) or return; # error if ($uri->scheme =~ /\Ahttps?\z/i) { my $cfg = $lei->_lei_cfg; - my $p = $cfg ? $cfg->urlmatch('http.Proxy', $$uri) : undef; - push(@opt, "--proxy=$p") if defined($p); + my $p = $cfg ? $cfg->urlmatch('http.Proxy', $$uri, 1) : undef; + push(@opt, '--proxy', $p) if defined($p); } bless [ @$pfx, @$self, @opt, $uri->as_string ], ref($self); } diff --git a/lib/PublicInbox/LeiDedupe.pm b/lib/PublicInbox/LeiDedupe.pm index 32f99cd0..eda54d79 100644 --- a/lib/PublicInbox/LeiDedupe.pm +++ b/lib/PublicInbox/LeiDedupe.pm @@ -1,10 +1,9 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::LeiDedupe; -use strict; -use v5.10.1; -use PublicInbox::ContentHash qw(content_hash git_sha); -use Digest::SHA (); +use v5.12; +use PublicInbox::ContentHash qw(content_hash content_digest git_sha); +use PublicInbox::SHA qw(sha256); # n.b. mutt sets most of these headers not sure about Bytes our @OID_IGNORE = qw(Status X-Status Content-Length Lines Bytes); @@ -30,11 +29,9 @@ sub _oidbin ($) { defined($_[0]) ? pack('H*', $_[0]) : undef } sub smsg_hash ($) { my ($smsg) = @_; - my $dig = Digest::SHA->new(256); my $x = join("\0", @$smsg{qw(from to cc ds subject references mid)}); utf8::encode($x); - $dig->add($x); - $dig->digest; + sha256($x); } # the paranoid option @@ -72,7 +69,12 @@ sub dedupe_content ($) { my ($skv) = @_; (sub { # may be called in a child process my ($eml) = @_; # $oidhex = $_[1], ignored - $skv->set_maybe(content_hash($eml), ''); + + # we must account for Message-ID via hash_mids, since + # (unlike v2 dedupe) Message-ID is not accounted for elsewhere: + $skv->set_maybe(content_digest($eml, PublicInbox::SHA->new(256), + 1 # hash_mids + )->digest, ''); }, sub { my ($smsg) = @_; $skv->set_maybe(smsg_hash($smsg), ''); diff --git a/lib/PublicInbox/LeiExportKw.pm b/lib/PublicInbox/LeiExportKw.pm index d2396fa7..16f069da 100644 --- a/lib/PublicInbox/LeiExportKw.pm +++ b/lib/PublicInbox/LeiExportKw.pm @@ -38,7 +38,7 @@ sub export_kw_md { # LeiMailSync->each_src callback } elsif ($! == EEXIST) { # lost race with lei/store? return; } elsif ($! != ENOENT) { - $lei->child_error(1, + $lei->child_error(0, "E: rename_noreplace($src -> $dst): $!"); } # else loop @try } @@ -46,7 +46,7 @@ sub export_kw_md { # LeiMailSync->each_src callback # both tries failed my $oidhex = unpack('H*', $oidbin); my $src = "$mdir/{".join(',', @try)."}/$$id"; - $lei->child_error(1, "rename_noreplace($src -> $dst) ($oidhex): $e"); + $lei->child_error(0, "rename_noreplace($src -> $dst) ($oidhex): $e"); for (@try) { return if -e "$mdir/$_/$$id" } $self->{lms}->clear_src("maildir:$mdir", $id); } diff --git a/lib/PublicInbox/LeiExternal.pm b/lib/PublicInbox/LeiExternal.pm index 30bb1a45..31b9bd1e 100644 --- a/lib/PublicInbox/LeiExternal.pm +++ b/lib/PublicInbox/LeiExternal.pm @@ -1,11 +1,11 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # *-external commands of lei package PublicInbox::LeiExternal; use strict; use v5.10.1; -use PublicInbox::Config; +use PublicInbox::Config qw(glob2re); sub externals_each { my ($self, $cb, @arg) = @_; @@ -44,40 +44,6 @@ sub ext_canonicalize { } } -# TODO: we will probably extract glob2re into a separate module for -# PublicInbox::Filter::Base and maybe other places -my %re_map = ( '*' => '[^/]*?', '?' => '[^/]', - '[' => '[', ']' => ']', ',' => ',' ); - -sub glob2re { - my $re = $_[-1]; # $_[0] may be $lei - my $p = ''; - my $in_bracket = 0; - my $qm = 0; - my $schema_host_port = ''; - - # don't glob URL-looking things that look like IPv6 - if ($re =~ s!\A([a-z0-9\+]+://\[[a-f0-9\:]+\](?::[0-9]+)?/)!!i) { - $schema_host_port = quotemeta $1; # "http://[::1]:1234" - } - my $changes = ($re =~ s!(.)! - $re_map{$p eq '\\' ? '' : do { - if ($1 eq '[') { ++$in_bracket } - elsif ($1 eq ']') { --$in_bracket } - elsif ($1 eq ',') { ++$qm } # no change - $p = $1; - }} // do { - $p = $1; - ($p eq '-' && $in_bracket) ? $p : (++$qm, "\Q$p") - }!sge); - # bashism (also supported by curl): {a,b,c} => (a|b|c) - $changes += ($re =~ s/([^\\]*)\\\{([^,]*,[^\\]*)\\\}/ - (my $in_braces = $2) =~ tr!,!|!; - $1."($in_braces)"; - /sge); - ($changes - $qm) ? $schema_host_port.$re : undef; -} - # get canonicalized externals list matching $loc # $is_exclude denotes it's for --exclude # otherwise it's for --only/--include is assumed @@ -88,7 +54,7 @@ sub get_externals { my @cur = externals_each($self); my $do_glob = !$self->{opt}->{globoff}; # glob by default if ($do_glob && (my $re = glob2re($loc))) { - @m = grep(m!$re!, @cur); + @m = grep(m!$re/?\z!, @cur); return @m if scalar(@m); } elsif (index($loc, '/') < 0) { # exact basename match: @m = grep(m!/\Q$loc\E/?\z!, @cur); @@ -120,39 +86,34 @@ sub canonicalize_excludes { # returns an anonymous sub which returns an array of potential results sub complete_url_prepare { my $argv = $_[-1]; # $_[0] may be $lei - # Workaround bash word-splitting URLs to ['https', ':', '//' ...] - # Maybe there's a better way to go about this in - # contrib/completion/lei-completion.bash - my $re = ''; - my $cur = pop(@$argv) // ''; + # Workaround bash default COMP_WORDBREAKS splitting URLs to + # ['https', ':', '//', ...]. COMP_WORDBREAKS is global for all + # completions loaded, not just ours, so we can't change it. + # cf. contrib/completion/lei-completion.bash + my ($pfx, $cur) = ('', pop(@$argv) // ''); if (@$argv) { my @x = @$argv; - if ($cur eq ':' && @x) { + if ($cur =~ /\A[:;=]\z/) { # COMP_WORDBREAKS + URL union push @x, $cur; $cur = ''; } - while (@x > 2 && $x[0] !~ /\A(?:http|nntp|imap)s?\z/i && - $x[1] ne ':') { - shift @x; - } - if (@x >= 2) { # qw(https : hostname : 443) or qw(http :) - $re = join('', @x); - } else { # just filter out the flags and hope for the best - $re = join('', grep(!/^-/, @$argv)); + while (@x && $pfx !~ m!\A(?: (?:[\+\-]?(?:L|kw):) | + (?:(?:imap|nntp|http)s?:) | + (?:--\w?\z)|(?:-\w?\z) )!x) { + $pfx = pop(@x).$pfx; } - $re = quotemeta($re); } + my $re = qr!\A\Q$pfx\E(\Q$cur\E.*)!; my $match_cb = sub { # the "//;" here (for AUTH=ANONYMOUS) interacts badly with # bash tab completion, strip it out for now since our commands # work w/o it. Not sure if there's a better solution... $_[0] =~ s!//;AUTH=ANONYMOUS\@!//!i; - $_[0] =~ s!;!\\;!g; # only return the part specified on the CLI # don't duplicate if already 100% completed - $_[0] =~ /\A$re(\Q$cur\E.*)/ ? ($cur eq $1 ? () : $1) : () + $_[0] =~ $re ? ($cur eq $1 ? () : $1) : () }; - wantarray ? ($re, $cur, $match_cb) : $match_cb; + wantarray ? ($pfx, $cur, $match_cb) : $match_cb; } 1; diff --git a/lib/PublicInbox/LeiForgetExternal.pm b/lib/PublicInbox/LeiForgetExternal.pm index 07f0ac80..c8d1df38 100644 --- a/lib/PublicInbox/LeiForgetExternal.pm +++ b/lib/PublicInbox/LeiForgetExternal.pm @@ -16,8 +16,7 @@ sub lei_forget_external { next if $seen{$l}++; my $key = "external.$l.boost"; delete($cfg->{$key}); - $lei->_config('--unset', $key); - if ($? == 0) { + if ($lei->_config('+e', '--unset', $key)) { $lei->qerr("# $l forgotten "); } elsif (($? >> 8) == 5) { warn("# $l not found\n"); @@ -32,14 +31,10 @@ sub lei_forget_external { sub _complete_forget_external { my ($lei, @argv) = @_; my $cfg = $lei->_lei_cfg or return (); - my ($cur, $re, $match_cb) = $lei->complete_url_prepare(\@argv); - # FIXME: bash completion off "http:" or "https:" when the last - # character is a colon doesn't work properly even if we're - # returning "//$HTTP_HOST/$PATH_INFO/", not sure why, could - # be a bash issue. + my ($pfx, $cur, $match_cb) = $lei->complete_url_prepare(\@argv); map { $match_cb->(substr($_, length('external.'))); - } grep(/\Aexternal\.$re\Q$cur/, @{$cfg->{-section_order}}); + } grep(/\Aexternal\.\Q$pfx$cur/, @{$cfg->{-section_order}}); } 1; diff --git a/lib/PublicInbox/LeiImport.pm b/lib/PublicInbox/LeiImport.pm index 2d91e4c4..5521188c 100644 --- a/lib/PublicInbox/LeiImport.pm +++ b/lib/PublicInbox/LeiImport.pm @@ -7,6 +7,7 @@ use strict; use v5.10.1; use parent qw(PublicInbox::IPC PublicInbox::LeiInput); use PublicInbox::InboxWritable qw(eml_from_path); +use PublicInbox::Compat qw(uniqstr); # /^input_/ subs are used by (or override) PublicInbox::LeiInput superclass @@ -40,8 +41,7 @@ sub pmdir_cb { # called via wq_io_do from LeiPmdir->each_mdir_fn my @oidbin = $lms ? $lms->name_oidbin($folder, $bn) : (); @oidbin > 1 and warn("W: $folder/*/$$bn not unique:\n", map { "\t".unpack('H*', $_)."\n" } @oidbin); - my %seen; - my @docids = sort { $a <=> $b } grep { !$seen{$_}++ } + my @docids = sort { $a <=> $b } uniqstr map { $lse->over->oidbin_exists($_) } @oidbin; my $vmd = $self->{-import_kw} ? { kw => $kw } : undef; if (scalar @docids) { @@ -53,6 +53,29 @@ sub pmdir_cb { # called via wq_io_do from LeiPmdir->each_mdir_fn } } +sub input_mh_cb { + my ($mhdir, $n, $kw, $eml, $self) = @_; + substr($mhdir, 0, 0) = 'mh:'; # add prefix + my $lse = $self->{lse} //= $self->{lei}->{sto}->search; + my $lms = $self->{-lms_rw} //= $self->{lei}->lms; # may be 0 or undef + my @oidbin = $lms ? $lms->num_oidbin($mhdir, $n) : (); + @oidbin > 1 and warn("W: $mhdir/$n not unique:\n", + map { "\t".unpack('H*', $_)."\n" } @oidbin); + my @docids = sort { $a <=> $b } uniqstr + map { $lse->over->oidbin_exists($_) } @oidbin; + if (scalar @docids) { + $lse->kw_changed(undef, $kw, \@docids) or return; + } + if (defined $eml) { + my $vmd = $self->{-import_kw} ? { kw => $kw } : undef; + $vmd->{sync_info} = [ $mhdir, $n + 0 ] if $self->{-mail_sync}; + $self->input_eml_cb($eml, $vmd); + } + # TODO: + # elsif (my $ikw = $self->{lei}->{ikw}) { # old message, kw only + # $ikw->wq_io_do('ck_update_kw', [], "mh:$dir", $uid, $kw); +} + sub input_net_cb { # imap_each / nntp_each my ($uri, $uid, $kw, $eml, $self) = @_; if (defined $eml) { @@ -71,9 +94,7 @@ sub do_import_index ($$@) { my $sto = $lei->_lei_store(1); $sto->write_prepare($lei); $self->{-import_kw} = $lei->{opt}->{kw} // 1; - my $vmd_mod = $self->vmd_mod_extract(\@inputs); - return $lei->fail(join("\n", @{$vmd_mod->{err}})) if $vmd_mod->{err}; - $self->{all_vmd} = $vmd_mod if scalar keys %$vmd_mod; + $self->{all_vmd} = $lei->{vmd_mod} if keys %{$lei->{vmd_mod}}; $lei->ale; # initialize for workers to read (before LeiPmdir->new) $self->{-mail_sync} = $lei->{opt}->{'mail-sync'} // 1; $self->prepare_inputs($lei, \@inputs) or return; @@ -115,18 +136,24 @@ sub lei_import { # the main "lei import" method sub _complete_import { my ($lei, @argv) = @_; - my ($re, $cur, $match_cb) = $lei->complete_url_prepare(\@argv); - my @k = $lei->url_folder_cache->keys($argv[-1] // undef, 1); + my $has_arg = @argv; + my ($pfx, $cur, $match_cb) = $lei->complete_url_prepare(\@argv); + my @try = $has_arg ? ($pfx.$cur, $argv[-1]) : ($argv[-1]); + push(@try, undef) if defined $try[-1]; + my (@f, @k); + for (@try) { + @k = $lei->url_folder_cache->keys($_, 1) and last; + } my @L = eval { $lei->_lei_store->search->all_terms('L') }; push(@k, map { "+L:$_" } @L); - my @m = map { $match_cb->($_) } @k; - my %f = map { $_ => 1 } (@m ? @m : @k); if (my $lms = $lei->lms) { - @k = $lms->folders($argv[-1] // undef, 1); - @m = map { $match_cb->($_) } @k; - if (@m) { @f{@m} = @m } else { @f{@k} = @k } + for (@try) { + @f = $lms->folders($_, 1) and last; + } + push @k, @f; } - keys %f; + my @m = map { $match_cb->($_) } @k; + @m ? @m : @k; } no warnings 'once'; diff --git a/lib/PublicInbox/LeiImportKw.pm b/lib/PublicInbox/LeiImportKw.pm index 4dd938f5..765e23cd 100644 --- a/lib/PublicInbox/LeiImportKw.pm +++ b/lib/PublicInbox/LeiImportKw.pm @@ -7,6 +7,7 @@ package PublicInbox::LeiImportKw; use strict; use v5.10.1; use parent qw(PublicInbox::IPC); +use PublicInbox::Compat qw(uniqstr); sub new { my ($cls, $lei) = @_; @@ -35,11 +36,10 @@ sub ipc_atfork_child { sub ck_update_kw { # via wq_io_do my ($self, $url, $uid, $kw) = @_; my @oidbin = $self->{-lms_rw}->num_oidbin($url, $uid); - my $uid_url = "$url/;UID=$uid"; + my $uid_url = index($url, 'mh:') == 0 ? $url.$uid : "$url/;UID=$uid"; @oidbin > 1 and warn("W: $uid_url not unique:\n", map { "\t".unpack('H*', $_)."\n" } @oidbin); - my %seen; - my @docids = sort { $a <=> $b } grep { !$seen{$_}++ } + my @docids = sort { $a <=> $b } uniqstr map { $self->{over}->oidbin_exists($_) } @oidbin; $self->{lse}->kw_changed(undef, $kw, \@docids) or return; $self->{verbose} and $self->{lei}->qerr("# $uid_url => @$kw\n"); diff --git a/lib/PublicInbox/LeiIndex.pm b/lib/PublicInbox/LeiIndex.pm index b3f3e1a0..0e329e58 100644 --- a/lib/PublicInbox/LeiIndex.pm +++ b/lib/PublicInbox/LeiIndex.pm @@ -35,7 +35,7 @@ sub lei_index { no warnings 'once'; no strict 'refs'; -for my $m (qw(pmdir_cb input_net_cb)) { +for my $m (qw(pmdir_cb input_net_cb input_mh_cb)) { *$m = PublicInbox::LeiImport->can($m); } diff --git a/lib/PublicInbox/LeiInit.pm b/lib/PublicInbox/LeiInit.pm index 27ce8169..94897e61 100644 --- a/lib/PublicInbox/LeiInit.pm +++ b/lib/PublicInbox/LeiInit.pm @@ -23,7 +23,7 @@ sub lei_init { # some folks like symlinks and bind mounts :P if (@dir && "@cur[1,0]" eq "@dir[1,0]") { - $self->_config('leistore.dir', $dir); + $self->_config('leistore.dir', $dir) or return; $self->_lei_store(1)->done; return $self->qerr("$exists (as $cur)"); } @@ -31,7 +31,7 @@ sub lei_init { E: leistore.dir=$cur already initialized and it is not $dir } - $self->_config('leistore.dir', $dir); + $self->_config('leistore.dir', $dir) or return; $self->_lei_store(1)->done; $exists //= "# leistore.dir=$dir newly initialized"; $self->qerr($exists); diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm index a1dcc907..c388f7dc 100644 --- a/lib/PublicInbox/LeiInput.pm +++ b/lib/PublicInbox/LeiInput.pm @@ -1,14 +1,12 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # parent class for LeiImport, LeiConvert, LeiIndex package PublicInbox::LeiInput; -use strict; -use v5.10.1; +use v5.12; use PublicInbox::DS; use PublicInbox::Spawn qw(which popen_rd); use PublicInbox::InboxWritable qw(eml_from_path); -use PublicInbox::AutoReap; # JMAP RFC 8621 4.1.1 # https://www.iana.org/assignments/imap-jmap-keywords/imap-jmap-keywords.xhtml @@ -30,6 +28,8 @@ my %ERR = ( my ($label) = @_; length($label) >= $L_MAX and return "`$label' too long (must be <= $L_MAX)"; + $label =~ /[A-Z]/ and + return "`$label' must be lowercase"; $label =~ m{\A[a-z0-9_](?:[a-z0-9_\-\./\@,]*[a-z0-9])?\z} ? undef : "`$label' is invalid"; }, @@ -69,6 +69,11 @@ sub input_maildir_cb { $self->input_eml_cb($eml); } +sub input_mh_cb { + my ($dn, $n, $kw, $eml, $self) = @_; + $self->input_eml_cb($eml); +} + sub input_net_cb { # imap_each, nntp_each cb my ($url, $uid, $kw, $eml, $self) = @_; $self->input_eml_cb($eml); @@ -79,14 +84,12 @@ sub input_net_cb { # imap_each, nntp_each cb sub input_fh { my ($self, $ifmt, $fh, $name, @args) = @_; if ($ifmt eq 'eml') { - my $buf = do { local $/; <$fh> } // - return $self->{lei}->child_error(0, <<""); -error reading $name: $! + my $buf = eval { PublicInbox::IO::read_all $fh, 0 }; + my $e = $@; + return $self->{lei}->child_error($?, <<"") if !$fh->close || $e; +error reading $name: $! (\$?=$?) (\$@=$e) - # mutt pipes single RFC822 messages with a "From " line, - # but no Content-Length or "From " escaping. - # "git format-patch" also generates such files by default. - $buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + PublicInbox::Eml::strip_from($buf); # a user may feed just a body: git diff | lei rediff -U9 if ($self->{-force_eml}) { @@ -113,15 +116,58 @@ sub handle_http_input ($$@) { push @$curl, '-s', @$curl_opt; my $cmd = $curl->for_uri($lei, $uri); $lei->qerr("# $cmd"); - my ($fh, $pid) = popen_rd($cmd, undef, { 2 => $lei->{2} }); - my $ar = PublicInbox::AutoReap->new($pid); + my $fh = popen_rd($cmd, undef, { 2 => $lei->{2} }); grep(/\A--compressed\z/, @$curl) or - $fh = IO::Uncompress::Gunzip->new($fh, MultiStream => 1); + $fh = IO::Uncompress::Gunzip->new($fh, + MultiStream => 1, AutoClose => 1); eval { $self->input_fh('mboxrd', $fh, $url, @args) }; - my @err = ($@ ? $@ : ()); - $ar->join; - push(@err, "\$?=$?") if $?; - $lei->child_error($?, "@$cmd failed: @err") if @err; + my $err = $@ ? ": $@" : ''; + $lei->child_error($?, "@$cmd failed$err") if $err || $?; +} + +sub oid2eml { # git->cat_async cb + my ($bref, $oid, $type, $size, $self) = @_; + if ($type eq 'blob') { + $self->input_eml_cb(PublicInbox::Eml->new($bref)); + } else { + warn "W: $oid is type=$type\n"; + } +} + +sub each_ibx_eml_unindexed { + my ($self, $ibx, @args) = @_; + $ibx->isa('PublicInbox::Inbox') or return $self->{lei}->fail(<<EOM); +unindexed extindex $ibx->{topdir} not supported +EOM + require PublicInbox::SearchIdx; + my $n = $ibx->max_git_epoch; + my @g = defined($n) ? map { $ibx->git_epoch($_) } (0..$n) : ($ibx->git); + my $sync = { D => {}, ibx => $ibx }; # D => {} filters out deletes + my ($f, $at, $ct, $oid, $cmt); + for my $git (grep defined, @g) { + my $s = PublicInbox::SearchIdx::log2stack($sync, $git, 'HEAD'); + while (($f, $at, $ct, $oid, $cmt) = $s->pop_rec) { + $git->cat_async($oid, \&oid2eml, $self) if $f eq 'm'; + } + $git->cleanup; # wait all + } +} + +sub each_ibx_eml { + my ($self, $ibx, @args) = @_; # TODO: is @args used at all? + my $over = $ibx->over or return each_ibx_eml_unindexed(@_); + my $git = $ibx->git; + my $prev = 0; + my $smsg; + my $ids = $over->ids_after(\$prev); + while (@$ids) { + for (@$ids) { + $smsg = $over->get_art($_) // next; + $git->cat_async($smsg->{blob}, \&oid2eml, $self); + } + $ids = $over->ids_after(\$prev); + } + $git->cat_async_wait; } sub input_path_url { @@ -149,7 +195,7 @@ sub input_path_url { $ifmt = lc($1); } elsif ($input =~ /\.(?:patch|eml)\z/i) { $ifmt = 'eml'; - } elsif (-f $input && $input =~ m{\A(?:.+)/(?:new|cur)/([^/]+)\z}) { + } elsif ($input =~ m{\A(?:.+)/(?:new|cur)/([^/]+)\z} && -f $input) { my $bn = $1; my $fl = PublicInbox::MdirReader::maildir_basename_flags($bn); return if index($fl, 'T') >= 0; @@ -163,6 +209,10 @@ sub input_path_url { my $devfd = $lei->path_to_fd($input) // return; if ($devfd >= 0) { $self->input_fh($ifmt, $lei->{$devfd}, $input, @args); + } elsif ($devfd < 0 && $input =~ m{\A(.+/)([0-9]+)\z} && -f $input) { + my ($dn, $n) = ($1, $2); + my $mhr = PublicInbox::MHreader->new($dn, $lei->{3}); + $mhr->mh_read_one($n, $self->can('input_mh_cb'), $self); } elsif (-f $input && $ifmt eq 'eml') { open my $fh, '<', $input or return $lei->fail("open($input): $!"); @@ -177,12 +227,9 @@ sub input_path_url { $mbl->{fh} = PublicInbox::MboxReader::zsfxcat($in, $zsfx, $lei); } - local $PublicInbox::DS::in_loop = 0 if $zsfx; # dwaitpid + local $PublicInbox::DS::in_loop = 0 if $zsfx; # awaitpid $self->input_fh($ifmt, $mbl->{fh}, $input, @args); - } elsif (-d _ && (-d "$input/cur" || -d "$input/new")) { - return $lei->fail(<<EOM) if $ifmt && $ifmt ne 'maildir'; -$input appears to be a maildir, not $ifmt -EOM + } elsif (-d _ && $ifmt eq 'maildir') { my $mdr = PublicInbox::MdirReader->new; if (my $pmd = $self->{pmd}) { $mdr->maildir_each_file($input, @@ -193,15 +240,24 @@ EOM $self->can('input_maildir_cb'), $self, @args); } + } elsif (-d _ && $ifmt eq 'mh') { + my $mhr = PublicInbox::MHreader->new($input.'/', $lei->{3}); + $mhr->{sort} = $lei->{opt}->{sort} // [ 'sequence']; + $mhr->mh_each_eml($self->can('input_mh_cb'), $self, @args); + } elsif (-d _ && $ifmt =~ /\A(?:v1|v2)\z/) { + my $ibx = PublicInbox::Inbox->new({inboxdir => $input}); + each_ibx_eml($self, $ibx, @args); + } elsif (-d _ && $ifmt eq 'extindex') { + my $esrch = PublicInbox::ExtSearch->new($input); + each_ibx_eml($self, $esrch, @args); } elsif ($self->{missing_ok} && !-e $input) { # don't ->fail if ($lei->{cmd} eq 'p2q') { my $fp = [ qw(git format-patch --stdout -1), $input ]; my $rdr = { 2 => $lei->{2} }; my $fh = popen_rd($fp, undef, $rdr); eval { $self->input_fh('eml', $fh, $input, @args) }; - my @err = ($@ ? $@ : ()); - close($fh) or push @err, "\$?=$?"; - $lei->child_error($?, "@$fp failed: @err") if @err; + my $err = $@ ? ": $@" : ''; + $lei->child_error($?, "@$fp failed$err") if $err || $?; } else { $self->folder_missing("$ifmt:$input"); } @@ -257,6 +313,17 @@ sub prepare_http_input ($$$) { $self->{"-curl-$url"} = [ @curl_opt, $uri ]; # for handle_http_input } +sub add_dir ($$$$) { + my ($lei, $istate, $ifmt, $input) = @_; + if ($istate->{-may_sync}) { + $$input = "$ifmt:".$lei->abs_path($$input); + push @{$istate->{-sync}->{ok}}, $$input if $istate->{-sync}; + } else { + substr($$input, 0, 0) = "$ifmt:"; # prefix + } + push @{$istate->{$ifmt}}, $$input; +} + sub prepare_inputs { # returns undef on error my ($self, $lei, $inputs) = @_; my $in_fmt = $lei->{opt}->{'in-format'}; @@ -270,7 +337,8 @@ sub prepare_inputs { # returns undef on error push @{$sync->{no}}, '/dev/stdin' if $sync; } my $net = $lei->{net}; # NetWriter may be created by l2m - my (@f, @md); + my @f; + my $istate = { -sync => $sync, -may_sync => $may_sync }; # e.g. Maildir:/home/user/Mail/ or imaps://example.com/INBOX for my $input (@$inputs) { my $input_path = $input; @@ -290,26 +358,24 @@ sub prepare_inputs { # returns undef on error --in-format=$in_fmt and `$ifmt:' conflict } - if ($ifmt =~ /\A(?:maildir|mh)\z/i) { - push @{$sync->{ok}}, $input if $sync; - } else { - push @{$sync->{no}}, $input if $sync; - } + ($sync && $ifmt !~ /\A(?:maildir|mh)\z/i) and + push(@{$sync->{no}}, $input); my $devfd = $lei->path_to_fd($input_path) // return; if ($devfd >= 0 || (-f $input_path || -p _)) { require PublicInbox::MboxLock; require PublicInbox::MboxReader; PublicInbox::MboxReader->reads($ifmt) or return $lei->fail("$ifmt not supported"); - } elsif (-d $input_path) { - $ifmt eq 'maildir' or return - $lei->fail("$ifmt not supported"); - $may_sync and $input = 'maildir:'. - $lei->abs_path($input_path); - push @md, $input; - } elsif ($self->{missing_ok} && !-e _) { + } elsif (-d $input_path) { # TODO extindex + $ifmt =~ /\A(?:maildir|mh|v1|v2|extindex)\z/ or + return$lei->fail("$ifmt not supported"); + $input = $input_path; + add_dir $lei, $istate, $ifmt, \$input; + } elsif ($self->{missing_ok} && + $ifmt =~ /\A(?:maildir|mh)\z/ && + !-e $input_path) { # for "lei rm-watch" on missing Maildir - $may_sync and $input = 'maildir:'. + $may_sync and $input = "$ifmt:". $lei->abs_path($input_path); } else { my $m = "Unable to handle $input"; @@ -322,7 +388,7 @@ sub prepare_inputs { # returns undef on error $input is `eml', not --in-format=$in_fmt push @{$sync->{no}}, $input if $sync; - } elsif (-f $input && $input =~ m{\A(.+)/(new|cur)/([^/]+)\z}) { + } elsif ($input =~ m{\A(.+)/(new|cur)/([^/]+)\z} && -f $input) { # single file in a Maildir my ($mdir, $nc, $bn) = ($1, $2, $3); my $other = $mdir . ($nc eq 'new' ? '/cur' : '/new'); @@ -334,25 +400,41 @@ $input is `eml', not --in-format=$in_fmt if ($sync) { $input = $lei->abs_path($mdir) . "/$nc/$bn"; - push @{$sync->{ok}}, $input if $sync; + push @{$sync->{ok}}, $input; } require PublicInbox::MdirReader; } else { my $devfd = $lei->path_to_fd($input) // return; - if ($devfd >= 0 || -f $input || -p _) { + if ($devfd < 0 && $input =~ m{\A(.+)/([0-9]+)\z} && + -f $input) { # single file in MH dir + my ($mh, $n) = ($1, $2); + lc($in_fmt//'eml') eq 'eml' or + return $lei->fail(<<""); +$input is `eml', not --in-format=$in_fmt + + if ($sync) { + $input = $lei->abs_path($mh)."/$n"; + push @{$sync->{ok}}, $input; + } + require PublicInbox::MHreader; + } elsif ($devfd >= 0 || -f $input || -p _) { push @{$sync->{no}}, $input if $sync; push @f, $input; } elsif (-d "$input/new" && -d "$input/cur") { - if ($may_sync) { - $input = 'maildir:'. - $lei->abs_path($input); - push @{$sync->{ok}}, $input if $sync; - } - push @md, $input; + add_dir $lei, $istate, 'maildir', \$input; + } elsif (-e "$input/inbox.lock") { + add_dir $lei, $istate, 'v2', \$input; + } elsif (-e "$input/ssoma.lock") { + add_dir $lei, $istate, 'v1', \$input; + } elsif (-e "$input/ei.lock") { + add_dir $lei, $istate, 'extindex', \$input; + } elsif (-f "$input/.mh_sequences") { + add_dir $lei, $istate, 'mh', \$input; } elsif ($self->{missing_ok} && !-e $input) { if ($lei->{cmd} eq 'p2q') { # will run "git format-patch" } elsif ($may_sync) { # for lei rm-watch + # FIXME: support MH, here $input = 'maildir:'. $lei->abs_path($input); } @@ -380,20 +462,29 @@ $input is `eml', not --in-format=$in_fmt $lei->{auth} //= PublicInbox::LeiAuth->new; $lei->{net} //= $net; } - if (scalar(@md)) { + if (my $md = $istate->{maildir}) { require PublicInbox::MdirReader; if ($self->can('pmdir_cb')) { require PublicInbox::LeiPmdir; $self->{pmd} = PublicInbox::LeiPmdir->new($lei, $self); } + grep(!m!\Amaildir:/!i, @$md) and die "BUG: @$md (no pfx)"; # start watching Maildirs ASAP if ($may_sync && $lei->{sto}) { - grep(!m!\Amaildir:/!i, @md) and die "BUG: @md (no pfx)"; - $lei->lms(1)->lms_write_prepare->add_folders(@md); + $lei->lms(1)->lms_write_prepare->add_folders(@$md); $lei->refresh_watches; } } + if (my $mh = $istate->{mh}) { + require PublicInbox::MHreader; + grep(!m!\Amh:!i, @$mh) and die "BUG: @$mh (no pfx)"; + if ($may_sync && $lei->{sto}) { + $lei->lms(1)->lms_write_prepare->add_folders(@$mh); + # $lei->refresh_watches; TODO + } + } + require PublicInbox::ExtSearch if $istate->{extindex}; $self->{inputs} = $inputs; } @@ -408,7 +499,7 @@ sub process_inputs { } # always commit first, even on error partial work is acceptable for # lei <import|tag|convert> - my $wait = $self->{lei}->{sto}->wq_do('done') if $self->{lei}->{sto}; + $self->{lei}->sto_barrier_request; $self->{lei}->fail($err) if $err; } @@ -432,23 +523,22 @@ sub input_only_net_merge_all_done { # for update_xvmd -> update_vmd # returns something like { "+L" => [ @Labels ], ... } sub vmd_mod_extract { - my $argv = $_[-1]; - my $vmd_mod = {}; - my @new_argv; + my ($lei, $argv) = @_; + my (@new_argv, @err); for my $x (@$argv) { if ($x =~ /\A(\+|\-)(kw|L):(.+)\z/) { my ($op, $pfx, $val) = ($1, $2, $3); if (my $err = $ERR{$pfx}->($val)) { - push @{$vmd_mod->{err}}, $err; + push @err, $err; } else { # set "+kw", "+L", "-L", "-kw" - push @{$vmd_mod->{$op.$pfx}}, $val; + push @{$lei->{vmd_mod}->{$op.$pfx}}, $val; } } else { push @new_argv, $x; } } @$argv = @new_argv; - $vmd_mod; + @err; } 1; diff --git a/lib/PublicInbox/LeiInspect.pm b/lib/PublicInbox/LeiInspect.pm index d7775d4b..576ab2c7 100644 --- a/lib/PublicInbox/LeiInspect.pm +++ b/lib/PublicInbox/LeiInspect.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # "lei inspect" general purpose inspector for stuff in SQLite and @@ -12,7 +12,6 @@ use parent qw(PublicInbox::IPC); use PublicInbox::Config; use PublicInbox::MID qw(mids); use PublicInbox::NetReader qw(imap_uri nntp_uri); -use POSIX qw(strftime); use PublicInbox::LeiOverview; *iso8601 = \&PublicInbox::LeiOverview::iso8601; @@ -97,7 +96,6 @@ sub _inspect_doc ($$) { my $term = ($1 // ''); push @{$ent->{terms}->{$term}}, $tn; } - @$_ = sort(@$_) for values %{$ent->{terms} // {}}; $cur = $doc->values_begin; $end = $doc->values_end; for (; $cur != $end; $cur++) { @@ -235,7 +233,8 @@ sub inspect_argv { # via wq_do $lei->{1}->autoflush(0); $lei->out('[') if $multi; while (defined(my $x = shift @$argv)) { - inspect1($lei, $x, scalar(@$argv)) or return; + eval { inspect1($lei, $x, scalar(@$argv)) or return }; + warn "E: $@\n" if $@; } $lei->out(']') if $multi; } @@ -250,21 +249,13 @@ sub inspect_start ($$) { $self->wq_close; } -sub ins_add { # InputPipe->consume callback - my ($lei) = @_; # $_[1] = $rbuf - if (defined $_[1]) { - $_[1] eq '' and return eval { - my $str = delete $lei->{istr}; - $str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; - my $eml = PublicInbox::Eml->new(\$str); - inspect_start($lei, [ - 'blob:'.$lei->git_oid($eml)->hexdigest, - map { "mid:$_" } @{mids($eml)} ]); - }; - $lei->{istr} .= $_[1]; - } else { - $lei->fail("error reading stdin: $!"); - } +sub do_inspect { # lei->do_env cb + my ($lei) = @_; + my $str = delete $lei->{stdin_buf}; + PublicInbox::Eml::strip_from($str); + my $eml = PublicInbox::Eml->new(\$str); + inspect_start($lei, [ 'blob:'.$lei->git_oid($eml)->hexdigest, + map { "mid:$_" } @{mids($eml)} ]); } sub lei_inspect { @@ -281,8 +272,7 @@ sub lei_inspect { return $lei->fail(<<'') if @argv; no args allowed on command-line with --stdin - require PublicInbox::InputPipe; - PublicInbox::InputPipe::consume($lei->{0}, \&ins_add, $lei); + $lei->slurp_stdin(\&do_inspect); } else { inspect_start($lei, \@argv); } diff --git a/lib/PublicInbox/LeiLcat.pm b/lib/PublicInbox/LeiLcat.pm index 8d89cb73..274a9605 100644 --- a/lib/PublicInbox/LeiLcat.pm +++ b/lib/PublicInbox/LeiLcat.pm @@ -122,17 +122,11 @@ could not extract Message-ID from $x @q ? join(' OR ', @q) : $lei->fail("no Message-ID in: @argv"); } -sub _stdin { # PublicInbox::InputPipe::consume callback for --stdin - my ($lei) = @_; # $_[1] = $rbuf - $_[1] // return $lei->fail("error reading stdin: $!"); - return $lei->{mset_opt}->{qstr} .= $_[1] if $_[1] ne ''; - eval { - $lei->fchdir; - my @argv = split(/\s+/, $lei->{mset_opt}->{qstr}); - $lei->{mset_opt}->{qstr} = extract_all($lei, @argv) or return; - $lei->_start_query; - }; - $lei->fail($@) if $@; +sub do_lcat { # lei->do_env cb + my ($lei) = @_; + my @argv = split(/\s+/, delete($lei->{stdin_buf})); + $lei->{mset_opt}->{qstr} = extract_all($lei, @argv) or return; + $lei->_start_query; } sub lei_lcat { @@ -151,9 +145,7 @@ sub lei_lcat { return $lei->fail(<<'') if @argv; no args allowed on command-line with --stdin - require PublicInbox::InputPipe; - PublicInbox::InputPipe::consume($lei->{0}, \&_stdin, $lei); - return; + return $lei->slurp_stdin(\&do_lcat); } $lei->{mset_opt}->{qstr} = extract_all($lei, @argv) or return; $lei->_start_query; diff --git a/lib/PublicInbox/LeiLsExternal.pm b/lib/PublicInbox/LeiLsExternal.pm index dd2eb2e7..2cdd0c4d 100644 --- a/lib/PublicInbox/LeiLsExternal.pm +++ b/lib/PublicInbox/LeiLsExternal.pm @@ -5,6 +5,7 @@ package PublicInbox::LeiLsExternal; use strict; use v5.10.1; +use PublicInbox::Config qw(glob2re); # TODO: does this need JSON output? sub lei_ls_external { @@ -12,7 +13,8 @@ sub lei_ls_external { my $do_glob = !$lei->{opt}->{globoff}; # glob by default my ($OFS, $ORS) = $lei->{opt}->{z} ? ("\0", "\0\0") : (" ", "\n"); $filter //= '*'; - my $re = $do_glob ? $lei->glob2re($filter) : undef; + my $re = $do_glob ? glob2re($filter) : undef; + $re .= '/?\\z' if defined $re; $re //= index($filter, '/') < 0 ? qr!/\Q$filter\E/?\z! : # exact basename match qr/\Q$filter\E/; # grep -F semantics diff --git a/lib/PublicInbox/LeiLsMailSource.pm b/lib/PublicInbox/LeiLsMailSource.pm index 50799270..ab6c1e60 100644 --- a/lib/PublicInbox/LeiLsMailSource.pm +++ b/lib/PublicInbox/LeiLsMailSource.pm @@ -19,7 +19,8 @@ sub input_path_url { # overrides LeiInput version if ($url =~ m!\Aimaps?://!i) { my $uri = PublicInbox::URIimap->new($url); my $sec = $lei->{net}->can('uri_section')->($uri); - my $mic = $lei->{net}->mic_get($uri); + my $mic = $lei->{net}->mic_get($uri) or + return $lei->err("E: $uri"); my $l = $mic->folders_hash($uri->path); # server-side filter @$l = map { $_->[2] } # undo Schwartzian transform below: sort { $a->[0] cmp $b->[0] || $a->[1] <=> $b->[1] } @@ -39,8 +40,13 @@ sub input_path_url { # overrides LeiInput version } } elsif ($url =~ m!\A(?:nntps?|s?news)://!i) { my $uri = PublicInbox::URInntps->new($url); - my $nn = $lei->{net}->nn_get($uri); - my $l = $nn->newsgroups($uri->group); # name => description + my $nn = $lei->{net}->nn_get($uri) or + return $lei->err("E: $uri"); + # $l = name => description + my $l = $nn->newsgroups($uri->group) // return $lei->err(<<EOM); +E: $uri LIST NEWSGROUPS: ${\($lei->{net}->ndump($nn->message))} +E: login may be required, try adding `-c nntp.debug' to your command +EOM my $sec = $lei->{net}->can('uri_section')->($uri); if ($json) { my $all = $nn->list; diff --git a/lib/PublicInbox/LeiLsMailSync.pm b/lib/PublicInbox/LeiLsMailSync.pm index 2b167b1d..1400d488 100644 --- a/lib/PublicInbox/LeiLsMailSync.pm +++ b/lib/PublicInbox/LeiLsMailSync.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # front-end for the "lei ls-mail-sync" sub-command @@ -6,13 +6,17 @@ package PublicInbox::LeiLsMailSync; use strict; use v5.10.1; use PublicInbox::LeiMailSync; +use PublicInbox::Config qw(glob2re); sub lei_ls_mail_sync { my ($lei, $filter) = @_; my $lms = $lei->lms or return; my $opt = $lei->{opt}; - my $re = $opt->{globoff} ? undef : $lei->glob2re($filter // '*'); - $re //= qr/\Q$filter\E/; + my $re = $opt->{globoff} ? undef : glob2re($filter // '*'); + $re .= '/?\\z' if defined $re; + $re //= index($filter, '/') < 0 ? + qr!/\Q$filter\E/?\z! : # exact basename match + qr/\Q$filter\E/; # grep -F semantics my @f = $lms->folders; @f = $opt->{'invert-match'} ? grep(!/$re/, @f) : grep(/$re/, @f); if ($opt->{'local'} && !$opt->{remote}) { diff --git a/lib/PublicInbox/LeiMailDiff.pm b/lib/PublicInbox/LeiMailDiff.pm index 2b4cfd9e..af6ecf82 100644 --- a/lib/PublicInbox/LeiMailDiff.pm +++ b/lib/PublicInbox/LeiMailDiff.pm @@ -4,74 +4,27 @@ # The "lei mail-diff" sub-command, diffs input contents against # the first message of input package PublicInbox::LeiMailDiff; -use strict; -use v5.10.1; -use parent qw(PublicInbox::IPC PublicInbox::LeiInput); -use File::Temp 0.19 (); # 0.19 for ->newdir -use PublicInbox::Spawn qw(spawn which); -use PublicInbox::MsgIter qw(msg_part_text); -use File::Path qw(remove_tree); -use PublicInbox::ContentHash qw(content_digest); +use v5.12; +use parent qw(PublicInbox::IPC PublicInbox::LeiInput PublicInbox::MailDiff); +use PublicInbox::Spawn qw(run_wait); require PublicInbox::LeiRediff; -use Data::Dumper (); - -sub write_part { # Eml->each_part callback - my ($ary, $self) = @_; - my ($part, $depth, $idx) = @$ary; - if ($idx ne '1' || $self->{lei}->{opt}->{'raw-header'}) { - open my $fh, '>', "$self->{curdir}/$idx.hdr" or die "open: $!"; - print $fh ${$part->{hdr}} or die "print $!"; - close $fh or die "close $!"; - } - my $ct = $part->content_type || 'text/plain'; - my ($s, $err) = msg_part_text($part, $ct); - my $sfx = defined($s) ? 'txt' : 'bin'; - open my $fh, '>', "$self->{curdir}/$idx.$sfx" or die "open: $!"; - print $fh ($s // $part->body) or die "print $!"; - close $fh or die "close $!"; -} - -sub dump_eml ($$$) { - my ($self, $dir, $eml) = @_; - local $self->{curdir} = $dir; - mkdir $dir or die "mkdir($dir): $!"; - $eml->each_part(\&write_part, $self); - - open my $fh, '>', "$dir/content_digest" or die "open: $!"; - my $dig = PublicInbox::ContentDigestDbg->new($fh); - local $Data::Dumper::Useqq = 1; - local $Data::Dumper::Terse = 1; - content_digest($eml, $dig); - print $fh "\n", $dig->hexdigest, "\n" or die "print $!"; - close $fh or die "close: $!"; -} - -sub prep_a ($$) { - my ($self, $eml) = @_; - $self->{tmp} = File::Temp->newdir('lei-mail-diff-XXXX', TMPDIR => 1); - dump_eml($self, "$self->{tmp}/a", $eml); -} sub diff_a ($$) { my ($self, $eml) = @_; - ++$self->{nr}; - my $dir = "$self->{tmp}/N$self->{nr}"; - dump_eml($self, $dir, $eml); + my $dir = "$self->{tmp}/N".(++$self->{nr}); + $self->dump_eml($dir, $eml); my $cmd = [ qw(git diff --no-index) ]; my $lei = $self->{lei}; PublicInbox::LeiRediff::_lei_diff_prepare($lei, $cmd); push @$cmd, qw(-- a), "N$self->{nr}"; my $rdr = { -C => "$self->{tmp}" }; @$rdr{1, 2} = @$lei{1, 2}; - my $pid = spawn($cmd, $lei->{env}, $rdr); - waitpid($pid, 0); - $lei->child_error($?) if $?; # for git diff --exit-code - File::Path::remove_tree($self->{curdir}); + run_wait($cmd, $lei->{env}, $rdr) and $lei->child_error($?); } sub input_eml_cb { # used by PublicInbox::LeiInput::input_fh my ($self, $eml) = @_; - $self->{tmp} ? diff_a($self, $eml) : prep_a($self, $eml); + $self->{tmp} ? diff_a($self, $eml) : $self->prep_a($eml); } sub lei_mail_diff { @@ -82,24 +35,10 @@ sub lei_mail_diff { $lei->{opt}->{color} //= $isatty; $lei->start_pager if $isatty; $lei->{-err_type} = 'non-fatal'; + $self->{-raw_hdr} = $lei->{opt}->{'raw-header'}; $lei->wq1_start($self); } no warnings 'once'; *net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done; - -package PublicInbox::ContentDigestDbg; # cf. PublicInbox::ContentDigest -use strict; -use v5.10.1; -use Data::Dumper; - -sub new { bless { dig => Digest::SHA->new(256), fh => $_[1] }, __PACKAGE__ } - -sub add { - $_[0]->{dig}->add($_[1]); - print { $_[0]->{fh} } Dumper([split(/^/sm, $_[1])]) or die "print $!"; -} - -sub hexdigest { $_[0]->{dig}->hexdigest; } - 1; diff --git a/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm index 665206a8..c498421c 100644 --- a/lib/PublicInbox/LeiMailSync.pm +++ b/lib/PublicInbox/LeiMailSync.pm @@ -6,9 +6,12 @@ package PublicInbox::LeiMailSync; use strict; use v5.10.1; use parent qw(PublicInbox::Lock); +use PublicInbox::Compat qw(uniqstr); use DBI qw(:sql_types); # SQL_BLOB use PublicInbox::ContentHash qw(git_sha); use Carp (); +use PublicInbox::Git qw(%HEXLEN2SHA); +use PublicInbox::IO qw(read_all); sub dbh_new { my ($self) = @_; @@ -339,6 +342,17 @@ SELECT $op(uid) FROM blob2num WHERE fid = ? $ret; } +# must be called with lock +sub _forget_fids ($;@) { + my $dbh = shift; + $dbh->begin_work; + for my $t (qw(blob2name blob2num folders)) { + my $sth = $dbh->prepare_cached("DELETE FROM $t WHERE fid = ?"); + $sth->execute($_) for @_; + } + $dbh->commit; +} + # returns a { location => [ list-of-ids-or-names ] } mapping sub locations_for { my ($self, $oidbin) = @_; @@ -379,18 +393,28 @@ sub locations_for { $sth = $dbh->prepare('SELECT loc FROM folders WHERE fid = ? LIMIT 1'); my $ret = {}; + my $drop_fids = $dbh->{ReadOnly} ? undef : {}; while (my ($fid, $ids) = each %fid2id) { $sth->execute($fid); my ($loc) = $sth->fetchrow_array; unless (defined $loc) { + my $del = ''; + if ($drop_fids) { + $del = ' (deleting)'; + $drop_fids->{$fid} = $fid; + } my $oidhex = unpack('H*', $oidbin); - warn "E: fid=$fid for $oidhex unknown:\n", map { - 'E: '.(ref() ? $$_ : "#$_")."\n"; + warn "E: fid=$fid for $oidhex stale/unknown:\n", map { + 'E: '.(ref() ? $$_ : "#$_")."$del\n"; } @$ids; next; } $ret->{$loc} = $ids; } + if ($drop_fids && scalar(values %$drop_fids)) { + my $lk = $self->lock_for_scope; + _forget_fids($self->{dbh}, values %$drop_fids); + } scalar(keys %$ret) ? $ret : undef; } @@ -401,9 +425,13 @@ sub folders { my $re; if (defined($pfx[0])) { $sql .= ' WHERE loc REGEXP ?'; # DBD::SQLite uses perlre - $re = !!$pfx[1] ? '.*' : ''; - $re .= quotemeta($pfx[0]); - $re .= '.*'; + if (ref($pfx[0])) { # assume qr// "Regexp" + $re = $pfx[0]; + } else { + $re = !!$pfx[1] ? '.*' : ''; + $re .= quotemeta($pfx[0]); + $re .= '.*'; + } } my $sth = ($self->{dbh} //= dbh_new($self))->prepare($sql); $sth->bind_param(1, $re) if defined($re); @@ -411,15 +439,24 @@ sub folders { map { $_->[0] } @{$sth->fetchall_arrayref}; } +sub blob_mismatch ($$$) { + my ($f, $oidhex, $rawref) = @_; + my $sha = $HEXLEN2SHA{length($oidhex)}; + my $got = git_sha($sha, $rawref)->hexdigest; + $got eq $oidhex ? undef : warn("$f changed $oidhex => $got\n"); +} + sub local_blob { my ($self, $oidhex, $vrfy) = @_; my $dbh = $self->{dbh} //= dbh_new($self); + my $oidbin = pack('H*', $oidhex); + my $b2n = $dbh->prepare(<<''); SELECT f.loc,b.name FROM blob2name b LEFT JOIN folders f ON b.fid = f.fid WHERE b.oidbin = ? - $b2n->bind_param(1, pack('H*', $oidhex), SQL_BLOB); + $b2n->bind_param(1, $oidbin, SQL_BLOB); $b2n->execute; while (my ($d, $n) = $b2n->fetchrow_array) { substr($d, 0, length('maildir:')) = ''; @@ -432,19 +469,28 @@ WHERE b.oidbin = ? my $f = "$d/$x/$n"; open my $fh, '<', $f or next; # some (buggy) Maildir writers are non-atomic: - next unless -s $fh; - local $/; - my $raw = <$fh>; - if ($vrfy) { - my $got = git_sha(1, \$raw)->hexdigest; - if ($got ne $oidhex) { - warn "$f changed $oidhex => $got\n"; - next; - } - } + my $raw = read_all($fh, -s $fh // next); + next if $vrfy && blob_mismatch $f, $oidhex, \$raw; return \$raw; } } + + # MH, except `uid' is not always unique (can be packed) + $b2n = $dbh->prepare(<<''); +SELECT f.loc,b.uid FROM blob2num b +LEFT JOIN folders f ON b.fid = f.fid +WHERE b.oidbin = ? AND f.loc REGEXP '^mh:/' + + $b2n->bind_param(1, $oidbin, SQL_BLOB); + $b2n->execute; + while (my ($f, $n) = $b2n->fetchrow_array) { + $f =~ s/\Amh://s or die "BUG: not MH: $f"; + $f .= "/$n"; + open my $fh, '<', $f or next; + my $raw = read_all($fh, -s $fh // next); + next if blob_mismatch $f, $oidhex, \$raw; + return \$raw; + } undef; } @@ -520,20 +566,19 @@ EOM --all=@no not accepted (must be `local' and/or `remote') EOM } - my (%seen, @inc); my @all = $self->folders; for my $ok (@ok) { if ($ok eq 'local') { - @inc = grep(!m!\A[a-z0-9\+]+://!i, @all); + push @$folders, grep(!m!\A[a-z0-9\+]+://!i, @all); } elsif ($ok eq 'remote') { - @inc = grep(m!\A[a-z0-9\+]+://!i, @all); + push @$folders, grep(m!\A[a-z0-9\+]+://!i, @all); } elsif ($ok ne '') { return $lei->fail("--all=$all not understood"); } else { - @inc = @all; + push @$folders, @all; } - push(@$folders, (grep { !$seen{$_}++ } @inc)); } + @$folders = uniqstr @$folders; scalar(@$folders) || $lei->fail(<<EOM); no --mail-sync folders known to lei EOM @@ -596,14 +641,10 @@ EOF sub forget_folders { my ($self, @folders) = @_; my $lk = $self->lock_for_scope; - for my $folder (@folders) { - my $fid = delete($self->{fmap}->{$folder}) // - fid_for($self, $folder) // next; - for my $t (qw(blob2name blob2num folders)) { - $self->{dbh}->do("DELETE FROM $t WHERE fid = ?", - undef, $fid); - } - } + _forget_fids($self->{dbh}, map { + delete($self->{fmap}->{$_}) // + fid_for($self, $_) // (); + } @folders); } # only used for changing canonicalization errors @@ -637,8 +678,8 @@ sub num_oidbin ($$$) { SELECT oidbin FROM blob2num WHERE fid = ? AND uid = ? ORDER BY _rowid_ EOM $sth->execute($fid, $uid); - my %uniq; # for public-inbox <= 1.7.0 - grep { !$uniq{$_}++ } map { $_->[0] } @{$sth->fetchall_arrayref}; + # for public-inbox <= 1.7.0: + uniqstr(map { $_->[0] } @{$sth->fetchall_arrayref}); } sub name_oidbin ($$$) { @@ -655,8 +696,7 @@ EOM $sth->bind_param(2, $nm, SQL_VARCHAR); $sth->execute; my @old = map { $_->[0] } @{$sth->fetchall_arrayref}; - my %uniq; # for public-inbox <= 1.7.0 - grep { !$uniq{$_}++ } (@bin, @old); + uniqstr @bin, @old # for public-inbox <= 1.7.0 } sub imap_oidhex { diff --git a/lib/PublicInbox/LeiMirror.pm b/lib/PublicInbox/LeiMirror.pm index e20d30b4..08e61e4b 100644 --- a/lib/PublicInbox/LeiMirror.pm +++ b/lib/PublicInbox/LeiMirror.pm @@ -1,32 +1,49 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # "lei add-external --mirror" support (also "public-inbox-clone"); package PublicInbox::LeiMirror; -use strict; -use v5.10.1; +use v5.12; use parent qw(PublicInbox::IPC); -use PublicInbox::Config; -use PublicInbox::AutoReap; use IO::Uncompress::Gunzip qw(gunzip $GunzipError); use IO::Compress::Gzip qw(gzip $GzipError); -use PublicInbox::Spawn qw(popen_rd spawn); +use PublicInbox::Spawn qw(spawn run_wait run_die run_qx); +use PublicInbox::IO qw(write_file); +use File::Path (); use File::Temp (); +use File::Spec (); use Fcntl qw(SEEK_SET O_CREAT O_EXCL O_WRONLY); use Carp qw(croak); +use URI; +use PublicInbox::Config qw(glob2re); +use PublicInbox::Inbox; +use PublicInbox::LeiCurl; +use PublicInbox::OnDestroy; +use PublicInbox::SHA qw(sha256_hex sha_all); +use POSIX qw(strftime); +use PublicInbox::Admin qw(fmt_localtime); +use autodie qw(chdir chmod close open pipe readlink + seek symlink sysopen sysseek truncate unlink); -sub _wq_done_wait { # dwaitpid callback (via wq_eof) - my ($arg, $pid) = @_; - my ($mrr, $lei) = @$arg; - my $f = "$mrr->{dst}/mirror.done"; +our $LIVE; # pid => callback +our $FGRP_TODO; # objstore -> [[ to resume ], [ to clone ]] +our $TODO; # reference => [ non-fgrp mirror objects ] +our @PUH; # post-update hooks + +sub keep_going ($) { + $LIVE && (!$_[0]->{lei}->{child_error} || + $_[0]->{lei}->{opt}->{'keep-going'}); +} + +sub _wq_done_wait { # awaitpid cb (via wq_eof) + my ($pid, $mrr, $lei) = @_; if ($?) { $lei->child_error($?); - } elsif (!unlink($f)) { - warn("unlink($f): $!\n") unless $!{ENOENT}; - } else { - if ($lei->{cmd} ne 'public-inbox-clone') { - $lei->lazy_cb('add-external', '_finish_' - )->($lei, $mrr->{dst}); + } elsif (!$lei->{child_error}) { + if (!$mrr->{dry_run} && $lei->{cmd} ne 'public-inbox-clone') { + require PublicInbox::LeiAddExternal; + PublicInbox::LeiAddExternal::_finish_add_external( + $lei, $mrr->{dst}); } $lei->qerr("# mirrored $mrr->{src} => $mrr->{dst}"); } @@ -35,20 +52,27 @@ sub _wq_done_wait { # dwaitpid callback (via wq_eof) # for old installations without manifest.js.gz sub try_scrape { - my ($self) = @_; + my ($self, $fallback_manifest) = @_; my $uri = URI->new($self->{src}); my $lei = $self->{lei}; my $curl = $self->{curl} //= PublicInbox::LeiCurl->new($lei) or return; my $cmd = $curl->for_uri($lei, $uri, '--compressed'); my $opt = { 0 => $lei->{0}, 2 => $lei->{2} }; - my $fh = popen_rd($cmd, undef, $opt); - my $html = do { local $/; <$fh> } // die "read(curl $uri): $!"; - close($fh) or return $lei->child_error($?, "@$cmd failed"); + my $html = run_qx($cmd, undef, $opt); + return $lei->child_error($?, "@$cmd failed") if $?; # we grep with URL below, we don't want Subject/From headers - # making us clone random URLs + # making us clone random URLs. This assumes remote instances + # prior to public-inbox 1.7.0 + # 5b96edcb1e0d8252 (www: move mirror instructions to /text/, 2021-08-28) my @html = split(/<hr>/, $html); my @urls = ($html[-1] =~ m!\bgit clone --mirror ([a-z\+]+://\S+)!g); + if (!@urls && $fallback_manifest) { + warn <<EOM; +W: failed to extract URLs from $uri, trying manifest.js.gz... +EOM + return start_clone_url($self); + } my $url = $uri->as_string; chop($url) eq '/' or die "BUG: $uri not canonicalized"; @@ -57,9 +81,12 @@ sub try_scrape { if (my @v2_urls = grep(m!\A\Q$url\E/[0-9]+\z!, @urls)) { my %v2_epochs = map { my ($n) = (m!/([0-9]+)\z!); - $n => URI->new($_) + $n => [ URI->new($_), '' ] } @v2_urls; # uniq - return clone_v2($self, \%v2_epochs); + clone_v2_prep($self, \%v2_epochs); + delete local $lei->{opt}->{epoch}; + clone_all($self); + return; } # filter out common URLs served by WWW (e.g /$MSGID/T/) @@ -84,56 +111,92 @@ sub clone_cmd { # e.g.: git -c http.proxy=socks5h://127.0.0.1:9050 push(@cmd, '-c', $_) for @{$lei->{opt}->{c} // []}; push @cmd, qw(clone --mirror); - push @cmd, '-q' if $lei->{opt}->{quiet}; + push @cmd, '-q' if $lei->{opt}->{quiet} || + ($lei->{opt}->{jobs} // 1) > 1; push @cmd, '-v' if $lei->{opt}->{verbose}; # XXX any other options to support? - # --reference is tricky with multiple epochs... + # --reference is tricky with multiple epochs, but handled + # automatically if using manifest.js.gz @cmd; } -sub ft_rename ($$$) { - my ($ft, $dst, $open_mode) = @_; - my $fn = $ft->filename; - my @st = stat($dst); +sub ft_rename ($$$;$) { + my ($ft, $dst, $open_mode, $fh) = @_; + my @st = stat($fh // $dst); my $mode = @st ? ($st[2] & 07777) : ($open_mode & ~umask); - chmod($mode, $ft) or croak "E: chmod $fn: $!"; - rename($fn, $dst) or croak "E: rename($fn => $ft): $!"; + chmod($mode, $ft); + require File::Copy; + File::Copy::mv($ft->filename, $dst) or croak "E: mv($ft => $dst): $!"; $ft->unlink_on_destroy(0); } -sub _get_txt { # non-fatal - my ($self, $endpoint, $file, $mode) = @_; - my $uri = URI->new($self->{src}); +sub do_reap ($;$) { + my ($self, $jobs) = @_; + $jobs //= $self->{-jobs} //= $self->{lei}->{opt}->{jobs} // 1; + $jobs = 1 if $jobs < 1; + while (keys(%$LIVE) >= $jobs) { + my $pid = waitpid(-1, 0) // die "waitpid(-1): $!"; + if (my $x = delete $LIVE->{$pid}) { + my $cb = shift @$x; + $cb->(@$x) if $cb; + } else { + warn "reaped unknown PID=$pid ($?)\n"; + } + } +} + +sub _get_txt_start { # non-fatal + my ($self, $endpoint, $fini) = @_; + my $uri = URI->new($self->{cur_src} // $self->{src}); my $lei = $self->{lei}; my $path = $uri->path; chop($path) eq '/' or die "BUG: $uri not canonicalized"; $uri->path("$path/$endpoint"); - my $ft = File::Temp->new(TEMPLATE => "$file-XXXX", DIR => $self->{dst}); + my $f = (split(m!/!, $endpoint))[-1]; + my $ft = File::Temp->new(TEMPLATE => "$f-XXXX", TMPDIR => 1); my $opt = { 0 => $lei->{0}, 1 => $lei->{1}, 2 => $lei->{2} }; - my $cmd = $self->{curl}->for_uri($lei, $uri, - qw(--compressed -R -o), $ft->filename); - my $cerr = run_reap($lei, $cmd, $opt); - return "$uri missing" if ($cerr >> 8) == 22; - return "# @$cmd failed (non-fatal)" if $cerr; - ft_rename($ft, "$self->{dst}/$file", $mode); + my $cmd = $self->{curl}->for_uri($lei, $uri, qw(--compressed -R -o), + $ft->filename); + do_reap($self); + $lei->qerr("# @$cmd"); + return if $self->{dry_run}; + $self->{"-get_txt.$endpoint"} = [ $ft, $cmd, $uri ]; + $LIVE->{spawn($cmd, undef, $opt)} = + [ \&_get_txt_done, $self, $endpoint, $fini ]; +} + +sub _get_txt_done { # returns true on error (non-fatal), undef on success + my ($self, $endpoint) = @_; + my ($fh, $cmd, $uri) = @{delete $self->{"-get_txt.$endpoint"}}; + my $cerr = $?; + $? = 0; # don't influence normal lei exit + return warn("$uri missing\n") if ($cerr >> 8) == 22; + return warn("# @$cmd failed (non-fatal)\n") if $cerr; + seek($fh, 0, SEEK_SET); + $self->{"mtime.$endpoint"} = (stat($fh))[9]; + $self->{"txt.$endpoint"} = PublicInbox::IO::read_all $fh, -s _; undef; # success } -# tries the relatively new /$INBOX/_/text/config/raw endpoint -sub _try_config { +sub _write_inbox_config { my ($self) = @_; - my $dst = $self->{dst}; - if (!-d $dst || !mkdir($dst)) { - require File::Path; - File::Path::mkpath($dst); - -d $dst or die "mkpath($dst): $!\n"; - } - my $err = _get_txt($self, - qw(_/text/config/raw inbox.config.example), 0444); - return warn($err, "\n") if $err; - my $f = "$self->{dst}/inbox.config.example"; - my $cfg = PublicInbox::Config->git_config_dump($f, $self->{lei}->{2}); - my $ibx = $self->{ibx} = {}; + my $buf = delete($self->{'txt._/text/config/raw'}) // return; + my $dst = $self->{cur_dst} // $self->{dst}; + my $f = "$dst/inbox.config.example"; + my $mtime = delete $self->{'mtime._/text/config/raw'}; + if (CORE::sysopen(my $fh, $f, O_CREAT|O_EXCL|O_WRONLY)) { + print $fh $buf; + chmod(0444 & ~umask, $fh); + $fh->flush or die "flush($f): $!"; + if (defined $mtime) { + utime($mtime, $mtime, $fh) or die "utime($f): $!"; + } + } elsif (!$!{EEXIST}) { + die "open($f): $!"; + } + my $cfg = PublicInbox::Config->git_config_dump($f, + { 2 => $self->{lei}->{2} }); + my $ibx = $self->{ibx} = {}; # for indexing for my $sec (grep(/\Apublicinbox\./, @{$cfg->{-section_order}})) { for (qw(address newsgroup nntpmirror)) { $ibx->{$_} = $cfg->{"$sec.$_"}; @@ -143,35 +206,31 @@ sub _try_config { sub set_description ($) { my ($self) = @_; - my $f = "$self->{dst}/description"; - open my $fh, '+>>', $f or die "open($f): $!"; - seek($fh, 0, SEEK_SET) or die "seek($f): $!"; - chomp(my $d = do { local $/; <$fh> } // die "read($f): $!"); - if ($d eq '($INBOX_DIR/description missing)' || - $d =~ /^Unnamed repository/ || $d !~ /\S/) { - seek($fh, 0, SEEK_SET) or die "seek($f): $!"; - truncate($fh, 0) or die "truncate($f): $!"; - print $fh "mirror of $self->{src}\n" or die "print($f): $!"; - close $fh or die "close($f): $!"; + my $dst = $self->{cur_dst} // $self->{dst}; + chomp(my $orig = PublicInbox::IO::try_cat("$dst/description")); + my $d = $orig; + while (defined($d) && ($d =~ m!^\(\$INBOX_DIR/description missing\)! || + $d =~ /^Unnamed repository/ || $d !~ /\S/)) { + $d = delete($self->{'txt.description'}); } + $d //= 'mirror of '.($self->{cur_src} // $self->{src}); + atomic_write($dst, 'description', $d."\n") if $d ne $orig; } sub index_cloned_inbox { my ($self, $iv) = @_; my $lei = $self->{lei}; - my $err = _get_txt($self, qw(description description), 0666); - warn($err, "\n") if $err; # non fatal - eval { set_description($self) }; - warn $@ if $@; # n.b. public-inbox-clone works w/o (SQLite || Xapian) # lei is useless without Xapian + SQLite if ($lei->{cmd} ne 'public-inbox-clone') { + require PublicInbox::InboxWritable; + require PublicInbox::Admin; my $ibx = delete($self->{ibx}) // { address => [ 'lei@example.com' ], version => $iv, }; - $ibx->{inboxdir} = $self->{dst}; + $ibx->{inboxdir} = $self->{cur_dst} // $self->{dst}; PublicInbox::Inbox->new($ibx); PublicInbox::InboxWritable->new($ibx); my $opt = {}; @@ -179,46 +238,405 @@ sub index_cloned_inbox { my ($k) = ($sw =~ /\A([\w-]+)/); $opt->{$k} = $lei->{opt}->{$k}; } - # force synchronous dwaitpid for v2: + # force synchronous awaitpid for v2: local $PublicInbox::DS::in_loop = 0; - my $cfg = PublicInbox::Config->new(undef, $lei->{2}); + my $cfg = PublicInbox::Config->new(undef, { 2 => $lei->{2} }); my $env = PublicInbox::Admin::index_prepare($opt, $cfg); local %ENV = (%ENV, %$env) if $env; PublicInbox::Admin::progress_prepare($opt, $lei->{2}); PublicInbox::Admin::index_inbox($ibx, undef, $opt); } - open my $x, '>', "$self->{dst}/mirror.done"; # for _wq_done_wait + return if defined $self->{cur_dst}; # one of many repos to clone } sub run_reap { my ($lei, $cmd, $opt) = @_; $lei->qerr("# @$cmd"); - my $ar = PublicInbox::AutoReap->new(spawn($cmd, undef, $opt)); - $ar->join; - my $ret = $?; + my $ret = run_wait($cmd, undef, $opt); $? = 0; # don't let it influence normal exit $ret; } -sub clone_v1 { +sub start_cmd { + my ($self, $cmd, $opt, $fini) = @_; + do_reap($self); + $self->{lei}->qerr("# @$cmd"); + return if $self->{dry_run}; + $LIVE->{spawn($cmd, undef, $opt)} = [ \&reap_cmd, $self, $cmd, $fini ] +} + +sub fetch_args ($$) { + my ($lei, $opt) = @_; + my @cmd; # (git --git-dir=...) to be added by caller + $opt->{$_} = $lei->{$_} for (0..2); + # we support "-c $key=$val" for arbitrary git config options + # e.g.: git -c http.proxy=socks5h://127.0.0.1:9050 + push(@cmd, '-c', $_) for @{$lei->{opt}->{c} // []}; + push @cmd, 'fetch'; + push @cmd, '-q' if $lei->{opt}->{quiet} || + ($lei->{opt}->{jobs} // 1) > 1; + push @cmd, '-v' if $lei->{opt}->{verbose}; + push(@cmd, '-p') if $lei->{opt}->{prune}; + PublicInbox::Git::git_version() ge v2.29.0 and + push(@cmd, '--no-write-fetch-head'); + @cmd; +} + +sub upr { # feed `git update-ref --stdin -z' verbosely + my ($lei, $w, $op, @rest) = @_; # ($ref, $oid) = @rest + $lei->qerr("# $op @rest") if $lei->{opt}->{verbose}; + print $w "$op ", join("\0", @rest, '') or die "print(w): $!"; +} + +sub start_update_ref { + my ($fgrp) = @_; + pipe(my $r, my $w); + my $cmd = [ 'git', "--git-dir=$fgrp->{cur_dst}", + qw(update-ref --stdin -z) ]; + my $pack = on_destroy \&satellite_done, $fgrp; + start_cmd($fgrp, $cmd, { 0 => $r, 2 => $fgrp->{lei}->{2} }, $pack); + close $r; + $fgrp->{dry_run} ? undef : $w; +} + +sub upref_warn { warn "E: close(update-ref --stdin): $! (need git 1.8.5+)\n" } + +sub fgrp_update { + my ($fgrp) = @_; + return if !keep_going($fgrp); + my $srcfh = delete $fgrp->{srcfh} or return; + my $dstfh = delete $fgrp->{dstfh} or return; + seek($srcfh, 0, SEEK_SET); + seek($dstfh, 0, SEEK_SET); + my %src = map { chomp; split /\0/ } PublicInbox::IO::read_all $srcfh; + my %dst = map { chomp; split /\0/ } PublicInbox::IO::read_all $dstfh; + $srcfh = $dstfh = undef; + my $w = start_update_ref($fgrp) or return; + my $lei = $fgrp->{lei}; + my $ndel; + for my $ref (keys %dst) { + my $new = delete $src{$ref}; + my $old = $dst{$ref}; + if (defined $new) { + $new eq $old or + upr($lei, $w, 'update', $ref, $new, $old); + } else { + upr($lei, $w, 'delete', $ref, $old); + ++$ndel; + } + } + # git's ref files backend doesn't allow directory/file conflicts + # between `delete' and `create' ops: + if ($ndel && scalar(keys %src)) { + $fgrp->{-create_refs} = \%src; + } else { + while (my ($ref, $oid) = each %src) { + upr($lei, $w, 'create', $ref, $oid); + } + } + $w->close or upref_warn(); +} + +sub satellite_done { + my ($fgrp) = @_; + if (my $create = delete $fgrp->{-create_refs}) { + my $w = start_update_ref($fgrp) or return; + while (my ($ref, $oid) = each %$create) { + upr($fgrp->{lei}, $w, 'create', $ref, $oid); + } + $w->close or upref_warn(); + } else { + pack_refs($fgrp, $fgrp->{cur_dst}); + run_puh($fgrp); + } +} + +sub pack_refs { + my ($self, $git_dir) = @_; + my $cmd = [ 'git', "--git-dir=$git_dir", qw(pack-refs --all --prune) ]; + start_cmd($self, $cmd, { 2 => $self->{lei}->{2} }); +} + +sub unlink_fetch_head ($) { + my ($git_dir) = @_; + return if CORE::unlink("$git_dir/FETCH_HEAD") || $!{ENOENT}; + warn "W: unlink($git_dir/FETCH_HEAD): $!"; +} + +sub fgrpv_done { + my ($fgrpv) = @_; + return if !$LIVE; + my $first = $fgrpv->[0] // die 'BUG: no fgrpv->[0]'; + return if !keep_going($first); + unlink_fetch_head($first->{-osdir}) if !$first->{dry_run}; + pack_refs($first, $first->{-osdir}); # objstore refs always packed + for my $fgrp (@$fgrpv) { + my $rn = $fgrp->{-remote}; + my %opt = ( 2 => $fgrp->{lei}->{2} ); + my $update_ref = on_destroy \&fgrp_update, $fgrp; + my $src = [ 'git', "--git-dir=$fgrp->{-osdir}", 'for-each-ref', + "--format=refs/%(refname:lstrip=3)%00%(objectname)", + "refs/remotes/$rn/" ]; + open(my $sfh, '+>', undef); + $fgrp->{srcfh} = $sfh; + start_cmd($fgrp, $src, { %opt, 1 => $sfh }, $update_ref); + my $dst = [ 'git', "--git-dir=$fgrp->{cur_dst}", 'for-each-ref', + '--format=%(refname)%00%(objectname)' ]; + open(my $dfh, '+>', undef); + $fgrp->{dstfh} = $dfh; + start_cmd($fgrp, $dst, { %opt, 1 => $dfh }, $update_ref); + } +} + +sub fgrp_fetch_all { my ($self) = @_; + my $todo = $FGRP_TODO; + $FGRP_TODO = \'BUG on further use'; + keys(%$todo) or return; + + # Rely on the fgrptmp remote groups in the config file rather + # than listing all remotes since the remote name list may exceed + # system argv limits: + my $grp = 'fgrptmp'; + + my @git = (@{$self->{-torsocks}}, 'git'); + my $j = $self->{lei}->{opt}->{jobs}; + my $opt = {}; + my @fetch = do { + local $self->{lei}->{opt}->{jobs} = 1; + (fetch_args($self->{lei}, $opt), qw(--no-tags --multiple)); + }; + push(@fetch, "-j$j") if $j; + while (my ($osdir, $fgrp_old_new) = each %$todo) { + my $f = "$osdir/config"; + return if !keep_going($self); + my ($old, $new) = @$fgrp_old_new; + @$old = sort { $b->{-sort} <=> $a->{-sort} } @$old; + # $new is ordered by {references} + my $cmd = ['git', "--git-dir=$osdir", qw(config -f), $f ]; + + # clobber settings from previous run atomically + for ("remotes.$grp", 'fetch.hideRefs') { + my $c = [ @$cmd, '--unset-all', $_ ]; + $self->{lei}->qerr("# @$c"); + next if $self->{dry_run}; + run_wait($c, undef, $opt); + die "E: @$c \$?=$?" if ($? && ($? >> 8) != 5); + } + + # permanent configs: + my $cfg = PublicInbox::Config->git_config_dump($f); + for my $fgrp (@$old, @$new) { + my $u = $fgrp->{-uri} // die 'BUG: no {-uri}'; + my $rn = $fgrp->{-remote} // die 'BUG: no {-remote}'; + for ("url=$u", "fetch=+refs/*:refs/remotes/$rn/*", + 'tagopt=--no-tags') { + my ($k, $v) = split(/=/, $_, 2); + $k = "remote.$rn.$k"; + next if ($cfg->{$k} // '') eq $v; + my $c = [@$cmd, $k, $v]; + $fgrp->{lei}->qerr("# @$c"); + next if $fgrp->{dry_run}; + run_die($c, undef, $opt); + } + } + + if (!$self->{dry_run}) { + # update the config atomically via O_APPEND while + # respecting git-config locking + sysopen(my $lk, "$f.lock", O_CREAT|O_EXCL|O_WRONLY); + open my $fh, '>>', $f; + $fh->autoflush(1); + my $buf = ''; + if (@$old) { + $buf = "[fetch]\n\thideRefs = refs\n"; + $buf .= join('', map { + "\thideRefs = !refs/remotes/" . + "$_->{-remote}/\n"; + } @$old); + } + $buf .= join('', "[remotes]\n", + (map { "\t$grp = $_->{-remote}\n" } @$old), + (map { "\t$grp = $_->{-remote}\n" } @$new)); + print $fh $buf or die "print($f): $!"; + close $fh; + unlink("$f.lock"); + } + $cmd = [ @git, "--git-dir=$osdir", @fetch, $grp ]; + push @$old, @$new; + my $end = on_destroy \&fgrpv_done, $old; + start_cmd($self, $cmd, $opt, $end); + } +} + +# keep this idempotent for future use by public-inbox-fetch +sub forkgroup_prep { + my ($self, $uri) = @_; + $self->{-ent} // return; + my $os = $self->{-objstore} // return; + my $fg = $self->{-ent}->{forkgroup} // return; + my $dir = "$os/$fg.git"; + if (!-d $dir && !$self->{dry_run}) { + PublicInbox::Import::init_bare($dir); + write_file '+>>', "$dir/config", <<EOM; +[repack] + useDeltaIslands = true +[pack] + island = refs/remotes/([^/]+)/ +EOM + } + my $key = $self->{-key} // die 'BUG: no -key'; + my $rn = substr(sha256_hex($key), 0, 16); + if (!-d $self->{cur_dst} && !$self->{dry_run}) { + PublicInbox::Import::init_bare($self->{cur_dst}); + write_file '+>>', "$self->{cur_dst}/config", <<EOM; +; rely on the "$rn" remote in the +; $fg fork group for fetches +; only uncomment the following iff you detach from fork groups +; [remote "origin"] +; url = $uri +; fetch = +refs/*:refs/* +; mirror = true +EOM + } + if (!$self->{dry_run}) { + my $alt = File::Spec->rel2abs("$dir/objects"); + my $o = "$self->{cur_dst}/objects"; + my $l = File::Spec->abs2rel($alt, File::Spec->rel2abs($o)); + open my $fh, '+>>', my $f = "$o/info/alternates"; + seek($fh, 0, SEEK_SET); # Perl did SEEK_END when it saw '>>' + my $seen = grep /\A\Q$l\E\n/, PublicInbox::IO::read_all $fh; + print $fh "$l\n" if !$seen; + close $fh; + } + bless { + %$self, -osdir => $dir, -remote => $rn, -uri => $uri + }, __PACKAGE__; +} + +sub fp_done { + my ($self, $cmd, $cb, @arg) = @_; + if ($?) { + $self->{lei}->err("@$cmd failed (\$?=$?) (non-fatal)"); + $? = 0; # don't let it influence normal exit + } + return if !keep_going($self); + my $fh = delete $self->{-show_ref} // die 'BUG: no show-ref output'; + sysseek($fh, 0, SEEK_SET); + $self->{-ent} // die 'BUG: no -ent'; + my $A = $self->{-ent}->{fingerprint} // die 'BUG: no fingerprint'; + my $B = sha_all(1, $fh)->hexdigest; + return $cb->($self, @arg) if $A ne $B; + $self->{lei}->qerr("# $self->{-key} up-to-date"); +} + +sub cmp_fp_do { + my ($self, $cb, @arg) = @_; + # $cb is either resume_fetch or fgrp_enqueue + $self->{-ent} // return $cb->($self, @arg); + my $new = $self->{-ent}->{fingerprint} // return $cb->($self, @arg); + my $key = $self->{-key} // die 'BUG: no -key'; + if (my $cur_ent = $self->{-local_manifest}->{$key}) { + # runs go_fetch->DESTROY run if eq + return if $cur_ent->{fingerprint} eq $new; + } + my $dst = $self->{cur_dst} // $self->{dst}; + my $cmd = ['git', "--git-dir=$dst", 'show-ref']; + my $opt = { 2 => $self->{lei}->{2} }; + open($opt->{1}, '+>', undef); + $self->{-show_ref} = $opt->{1}; + do_reap($self); + $self->{lei}->qerr("# @$cmd"); + $LIVE->{spawn($cmd, undef, $opt)} = [ \&fp_done, $self, $cmd, + $cb, @arg ]; +} + +sub resume_fetch { + my ($self, $uri, $fini) = @_; + return if !keep_going($self); + my $dst = $self->{cur_dst} // $self->{dst}; + my @git = ('git', "--git-dir=$dst"); + my $opt = { 2 => $self->{lei}->{2} }; + my $rn = 'random'.int(rand(1 << 30)); + for ("url=$uri", "fetch=+refs/*:refs/*", 'mirror=true') { + push @git, '-c', "remote.$rn.$_"; + } + my $cmd = [ @{$self->{-torsocks}}, @git, + fetch_args($self->{lei}, $opt), $rn ]; + push @$cmd, '-P' if $self->{lei}->{prune}; # --prune-tags implied + my $run_puh = on_destroy \&run_puh, $self, $fini; + ++$self->{chg}->{nr_chg}; + start_cmd($self, $cmd, $opt, $run_puh); +} + +sub fgrp_enqueue { + my ($fgrp, $end) = @_; # $end calls fgrp_fetch_all + return if !keep_going($fgrp); + ++$fgrp->{chg}->{nr_chg}; + my $dst = $FGRP_TODO->{$fgrp->{-osdir}} //= [ [], [] ]; # [ old, new ] + push @{$dst->[defined($fgrp->{-sort} ? 0 : 1)]}, $fgrp; +} + +sub clone_v1 { + my ($self, $end) = @_; my $lei = $self->{lei}; my $curl = $self->{curl} //= PublicInbox::LeiCurl->new($lei) or return; - my $uri = URI->new($self->{src}); + my $uri = URI->new($self->{cur_src} // $self->{src}); + my $path = $uri->path; + $path =~ s!/*\z!! and $uri->path($path); defined($lei->{opt}->{epoch}) and die "$uri is a v1 inbox, --epoch is not supported\n"; - my $pfx = $curl->torsocks($lei, $uri) or return; - my $cmd = [ @$pfx, clone_cmd($lei, my $opt = {}), - $uri->as_string, $self->{dst} ]; - my $cerr = run_reap($lei, $cmd, $opt); - return $lei->child_error($cerr, "@$cmd failed") if $cerr; - _try_config($self); - write_makefile($self->{dst}, 1); - index_cloned_inbox($self, 1); + $self->{-torsocks} //= $curl->torsocks($lei, $uri) or return; + my $dst = $self->{cur_dst} // $self->{dst}; + my $resume = -d $dst; + if ($resume) { # respect read-only cloned w/ --epoch= + my @st = stat(_); # for root + if (!-w _ || !($st[2] & 0222)) { + warn "# skipping $dst, not writable\n"; + return; + } + } + my $fini = on_destroy \&v1_done, $self; + if (my $fgrp = forkgroup_prep($self, $uri)) { + $fgrp->{-fini} = $fini; + if ($resume) { + $fgrp->{-sort} = $fgrp->{-ent}->{modified}; + cmp_fp_do($fgrp, \&fgrp_enqueue, $end); + } else { # new repo, save for last + fgrp_enqueue($fgrp, $end); + } + } elsif ($resume) { + cmp_fp_do($self, \&resume_fetch, $uri, $fini); + } else { # normal clone + my $cmd = [ @{$self->{-torsocks}}, + clone_cmd($lei, my $opt = {}), "$uri", $dst ]; + if (defined($self->{-ent})) { + if (defined(my $ref = $self->{-ent}->{reference})) { + -e "$self->{dst}$ref" and + push @$cmd, '--reference', + "$self->{dst}$ref"; + } + } + ++$self->{chg}->{nr_chg}; + start_cmd($self, $cmd, $opt, + on_destroy(\&run_puh, $self, $fini)); + } + if (!$self->{-is_epoch} && $lei->{opt}->{'inbox-config'} =~ + /\A(?:always|v1)\z/s && + !-f "$dst/inbox.config.example") { + _get_txt_start($self, '_/text/config/raw', $fini); + } + + my $d = $self->{-ent} ? $self->{-ent}->{description} : undef; + utf8::encode($self->{'txt.description'} = $d) if defined $d; + (!defined($d) && !$end) and + _get_txt_start($self, 'description', $fini); + + $end or do_reap($self, 1); # for non-manifest clone } sub parse_epochs ($$) { - my ($opt_epochs, $v2_epochs) = @_; # $epcohs "LOW..HIGH" + my ($opt_epochs, $v2_epochs) = @_; # $epochs "LOW..HIGH" $opt_epochs // return; # undef => all epochs my ($lo, $dotdot, $hi, @extra) = split(/(\.\.)/, $opt_epochs); undef($lo) if ($lo // '') eq ''; @@ -260,12 +678,14 @@ EOM $want } -sub init_placeholder ($$) { - my ($src, $edst) = @_; +sub init_placeholder ($$$) { + my ($src, $edst, $ent) = @_; PublicInbox::Import::init_bare($edst); - my $f = "$edst/config"; - open my $fh, '>>', $f or die "open($f): $!"; - print $fh <<EOM or die "print($f): $!"; + my @owner = defined($ent->{owner}) ? (<<EOM) : (); +[gitweb] + owner = $ent->{owner} +EOM + write_file '>>', "$edst/config", <<EOM, @owner; [remote "origin"] url = $src fetch = +refs/*:refs/* @@ -273,20 +693,196 @@ sub init_placeholder ($$) { ; This git epoch was created read-only and "public-inbox-fetch" ; will not fetch updates for it unless write permission is added. +; Hint: chmod +w $edst EOM - close $fh or die "close:($f): $!"; + my %map = (head => 'HEAD', description => undef); + while (my ($key, $fn) = each %map) { + my $val = $ent->{$key} // next; + $fn //= $key; + write_file '>', "$edst/$fn", $val; + } +} + +sub reap_cmd { # async, called via SIGCHLD + my ($self, $cmd) = @_; + my $cerr = $?; + $? = 0; # don't let it influence normal exit + $self->{lei}->child_error($cerr, "@$cmd failed (\$?=$cerr)") if $cerr; +} + +sub up_fp_done { + my ($self) = @_; + return if !keep_going($self); + my $fh = delete $self->{-show_ref_up} // die 'BUG: no show-ref output'; + sysseek($fh, 0, SEEK_SET); + $self->{-ent} // die 'BUG: no -ent'; + my $A = $self->{-ent}->{fingerprint} // die 'BUG: no fingerprint'; + my $B = sha_all(1, $fh)->hexdigest; + return if $A eq $B; + $self->{-ent}->{fingerprint} = $B; + push @{$self->{chg}->{fp_mismatch}}, $self->{-key}; +} + +sub atomic_write ($$$) { + my ($dn, $bn, $raw) = @_; + my $ft = File::Temp->new(DIR => $dn, TEMPLATE => "$bn-XXXX"); + print $ft $raw; + $ft->flush or die "flush($ft): $!"; + ft_rename($ft, "$dn/$bn", 0666); } -sub clone_v2 ($$;$) { +sub run_next_puh { + my ($self) = @_; + my $puh = shift @{$self->{-puh_todo}} // return delete($self->{-fini}); + my $fini = on_destroy \&run_next_puh, $self; + my $cmd = [ @$puh, ($self->{cur_dst} // $self->{dst}) ]; + my $opt = +{ map { $_ => $self->{lei}->{$_} } (0..2) }; + start_cmd($self, $cmd, undef, $opt, $fini); +} + +sub run_puh { + my ($self, $fini) = @_; + $self->{-fini} = $fini; + @{$self->{-puh_todo}} = @PUH; + run_next_puh($self); +} + +# modifies the to-be-written manifest entry, and sets values from it, too +sub update_ent { + my ($self) = @_; + my $key = $self->{-key} // die 'BUG: no -key'; + my $new = $self->{-ent}->{fingerprint}; + my $cur = $self->{-local_manifest}->{$key}->{fingerprint} // "\0"; + my $dst = $self->{cur_dst} // $self->{dst}; + if (defined($new) && $new ne $cur) { + my $cmd = ['git', "--git-dir=$dst", 'show-ref']; + my $opt = { 2 => $self->{lei}->{2} }; + open($opt->{1}, '+>', undef); + $self->{-show_ref_up} = $opt->{1}; + my $done = on_destroy \&up_fp_done, $self; + start_cmd($self, $cmd, $opt, $done); + } + $new = $self->{-ent}->{head}; + $cur = $self->{-local_manifest}->{$key}->{head} // "\0"; + if (defined($new) && $new ne $cur) { + # n.b. grokmirror writes raw contents to $dst/HEAD w/o locking + my $cmd = [ 'git', "--git-dir=$dst" ]; + if ($new =~ s/\Aref: //) { + push @$cmd, qw(symbolic-ref HEAD), $new; + } elsif ($new =~ /\A[a-f0-9]{40,}\z/) { + push @$cmd, qw(update-ref --no-deref HEAD), $new; + } else { + undef $cmd; + warn "W: $key: {head} => `$new' not understood\n"; + } + start_cmd($self, $cmd, { 2 => $self->{lei}->{2} }) if $cmd; + } + if (my $symlinks = $self->{-ent}->{symlinks}) { + my $top = File::Spec->rel2abs($self->{dst}); + push @{$self->{-new_symlinks}}, @$symlinks; + for my $p (@$symlinks) { + my $ln = "$top/$p"; + $ln =~ tr!/!/!s; + my (undef, $dn, $bn) = File::Spec->splitpath($ln); + File::Path::mkpath($dn); + my $tgt = "$top/$key"; + $tgt = File::Spec->abs2rel($tgt, $dn); + if (lstat($ln)) { + if (-l _) { + next if readlink($ln) eq $tgt; + unlink($ln); + } else { + push @{$self->{chg}->{badlink}}, $p; + } + } + symlink($tgt, $ln); + ++$self->{chg}->{nr_chg}; + } + } + if (defined(my $t = $self->{-ent}->{modified})) { + my ($dn, $bn) = ("$dst/info/web", 'last-modified'); + my $orig = PublicInbox::IO::try_cat("$dn/$bn"); + $t = strftime('%F %T', gmtime($t))." +0000\n"; + File::Path::mkpath($dn); + atomic_write($dn, $bn, $t) if $orig ne $t; + } + + $new = $self->{-ent}->{owner} // return; + $cur = $self->{-local_manifest}->{$key}->{owner} // "\0"; + return if $cur eq $new; + utf8::encode($new); # to octets + my $cmd = [ qw(git config -f), "$dst/config", 'gitweb.owner', $new ]; + start_cmd($self, $cmd, { 2 => $self->{lei}->{2} }); +} + +sub v1_done { # called via OnDestroy + my ($self) = @_; + return if $self->{dry_run} || !keep_going($self); + _write_inbox_config($self); + my $dst = $self->{cur_dst} // $self->{dst}; + unlink_fetch_head($dst); + update_ent($self) if $self->{-ent}; + my $o = "$dst/objects"; + if (CORE::open(my $fh, '<', my $fn = "$o/info/alternates")) {; + my $base = File::Spec->rel2abs($o); + my @l = <$fh>; + my $ft; + for (@l) { + next unless m!\A/!; + $_ = File::Spec->abs2rel($_, $base); + $ft //= File::Temp->new(TEMPLATE => '.XXXX', + DIR => "$o/info"); + } + if ($ft) { + print $ft @l; + $ft->flush or die "flush($ft): $!"; + ft_rename($ft, $fn, 0666, $fh); + } + } + eval { set_description($self) }; + warn $@ if $@; + return if ($self->{-is_epoch} || + $self->{lei}->{opt}->{'inbox-config'} ne 'always'); + write_makefile($dst, 1); + index_cloned_inbox($self, 1); +} + +sub v2_done { # called via OnDestroy + my ($self) = @_; + return if $self->{dry_run} || !keep_going($self); + my $dst = $self->{cur_dst} // $self->{dst}; + require PublicInbox::Lock; + my $lk = PublicInbox::Lock->new("$dst/inbox.lock"); + my $lck = $lk->lock_for_scope; + _write_inbox_config($self); + require PublicInbox::MultiGit; + my $mg = PublicInbox::MultiGit->new($dst, 'all.git', 'git'); + $mg->fill_alternates; + for my $i ($mg->git_epochs) { $mg->epoch_cfg_set($i) } + for my $edst (@{delete($self->{-read_only}) // []}) { + my @st = stat($edst) or die "stat($edst): $!"; + chmod($st[2] & 0555, $edst); + } + write_makefile($dst, 2); + undef $lck; # unlock + eval { set_description($self) }; + warn $@ if $@; + index_cloned_inbox($self, 2); +} + +sub clone_v2_prep ($$;$) { my ($self, $v2_epochs, $m) = @_; # $m => manifest.js.gz hashref my $lei = $self->{lei}; my $curl = $self->{curl} //= PublicInbox::LeiCurl->new($lei) or return; - my $pfx = $curl->torsocks($lei, (values %$v2_epochs)[0]) or return; - my $dst = $self->{dst}; + my $first_uri = (map { $_->[0] } values %$v2_epochs)[0]; + $self->{-torsocks} //= $curl->torsocks($lei, $first_uri) or return; + my $dst = $self->{cur_dst} // $self->{dst}; my $want = parse_epochs($lei->{opt}->{epoch}, $v2_epochs); - my (@src_edst, @read_only, @skip_nr); + my $task = $m ? bless { %$self }, __PACKAGE__ : $self; + my (@skip, $desc); + my $fini = on_destroy \&v2_done, $task; for my $nr (sort { $a <=> $b } keys %$v2_epochs) { - my $uri = $v2_epochs->{$nr}; + my ($uri, $key) = @{$v2_epochs->{$nr}}; my $src = $uri->as_string; my $edst = $dst; $src =~ m!/([0-9]+)(?:\.git)?\z! or die <<""; @@ -294,60 +890,48 @@ failed to extract epoch number from $src $1 + 0 == $nr or die "BUG: <$uri> miskeyed $1 != $nr"; $edst .= "/git/$nr.git"; + my $ent; + if ($m) { + $ent = $m->{$key} // + die("BUG: `$key' not in manifest.js.gz"); + if (defined(my $d = $ent->{description})) { + $d =~ s/ \[epoch [0-9]+\]\z//s; + $desc = $d; + } + } if (!$want || $want->{$nr}) { - push @src_edst, $src, $edst; + my $etask = bless { %$task, -key => $key }, __PACKAGE__; + $etask->{-ent} = $ent; # may have {reference} + $etask->{cur_src} = $src; + $etask->{cur_dst} = $edst; + $etask->{-is_epoch} = $fini; + my $ref = $ent->{reference} // ''; + push @{$TODO->{$ref}}, $etask; + $self->{any_want}->{$key} = 1; } else { # create a placeholder so users only need to chmod +w - init_placeholder($src, $edst); - push @read_only, $edst; - push @skip_nr, $nr; + init_placeholder($src, $edst, $ent); + push @{$task->{-read_only}}, $edst; + push @skip, $key; } } - if (@skip_nr) { # filter out the epochs we skipped - my $re = join('|', @skip_nr); - my @del = grep(m!/git/$re\.git\z!, keys %$m); - delete @$m{@del}; - $self->{-culled_manifest} = 1; - } - my $lk = bless { lock_path => "$dst/inbox.lock" }, 'PublicInbox::Lock'; - _try_config($self); - my $on_destroy = $lk->lock_for_scope($$); - my @cmd = clone_cmd($lei, my $opt = {}); - while (my ($src, $edst) = splice(@src_edst, 0, 2)) { - my $cmd = [ @$pfx, @cmd, $src, $edst ]; - my $cerr = run_reap($lei, $cmd, $opt); - return $lei->child_error($cerr, "@$cmd failed") if $cerr; - } - require PublicInbox::MultiGit; - my $mg = PublicInbox::MultiGit->new($dst, 'all.git', 'git'); - $mg->fill_alternates; - for my $i ($mg->git_epochs) { $mg->epoch_cfg_set($i) } - for my $edst (@read_only) { - my @st = stat($edst) or die "stat($edst): $!"; - chmod($st[2] & 0555, $edst) or die "chmod(a-w, $edst): $!"; + # filter out the epochs we skipped + $self->{chg}->{manifest} = 1 if $m && delete(@$m{@skip}); + + $self->{dry_run} or File::Path::mkpath($dst); + + if ($lei->{opt}->{'inbox-config'} =~ /\A(?:always|v2)\z/s && + !-f "$dst/inbox.config.example") { + _get_txt_start($task, '_/text/config/raw', $fini); } - write_makefile($self->{dst}, 2); - undef $on_destroy; # unlock - index_cloned_inbox($self, 2); -} -# PSGI mount prefixes and manifest.js.gz prefixes don't always align... -sub deduce_epochs ($$) { - my ($m, $path) = @_; - my ($v1_ent, @v2_epochs); - my $path_pfx = ''; - $path =~ s!/+\z!!; - do { - $v1_ent = $m->{$path}; - @v2_epochs = grep(m!\A\Q$path\E/git/[0-9]+\.git\z!, keys %$m); - } while (!defined($v1_ent) && !@v2_epochs && - $path =~ s!\A(/[^/]+)/!/! and $path_pfx .= $1); - ($path_pfx, $v1_ent ? $path : undef, @v2_epochs); + defined($desc) ? ($task->{'txt.description'} = $desc) : + _get_txt_start($task, 'description', $fini); } sub decode_manifest ($$$) { my ($fh, $fn, $uri) = @_; my $js; - my $gz = do { local $/; <$fh> } // die "slurp($fn): $!"; + my $gz = PublicInbox::IO::read_all $fh; gunzip(\$gz => \$js, MultiStream => 1) or die "gunzip($uri): $GunzipError\n"; my $m = eval { PublicInbox::Config->json->decode($js) }; @@ -356,61 +940,318 @@ sub decode_manifest ($$$) { $m; } +sub load_current_manifest ($) { + my ($self) = @_; + my $fn = $self->{-manifest} // return; + if (CORE::open(my $fh, '<', $fn)) { + decode_manifest($fh, $fn, $fn); + } elsif ($!{ENOENT}) { # non-fatal, we can just do it slowly + $self->{-initial_clone} or + warn "W: open($fn): $! (non-fatal)\n"; + undef; + } else { + die "E: open($fn): $!\n"; + } +} + +sub multi_inbox ($$$) { + my ($self, $path, $m) = @_; + my $incl = $self->{lei}->{opt}->{include}; + my $excl = $self->{lei}->{opt}->{exclude}; + + # assuming everything not v2 is v1, for now + my @v1 = sort grep(!m!.+/git/[0-9]+\.git\z!, keys %$m); + my @v2_epochs = sort grep(m!.+/git/[0-9]+\.git\z!, keys %$m); + my $v2 = {}; + + for (@v2_epochs) { + m!\A(/.+)/git/[0-9]+\.git\z! or die "BUG: $_"; + push @{$v2->{$1}}, $_; + } + my $n = scalar(keys %$v2) + scalar(@v1); + my @orig = defined($incl // $excl) ? (keys %$v2, @v1) : (); + if (defined $incl) { + my $re = '(?:'.join('\\z|', map { + glob2re($_) // qr/\A\Q$_\E/ + } @$incl).'\\z)'; + my @gone = delete @$v2{grep(!/$re/, keys %$v2)}; + delete @$m{map { @$_ } @gone} and $self->{chg}->{manifest} = 1; + delete @$m{grep(!/$re/, @v1)} and $self->{chg}->{manifest} = 1; + @v1 = grep(/$re/, @v1); + } + if (defined $excl) { + my $re = '(?:'.join('\\z|', map { + glob2re($_) // qr/\A\Q$_\E/ + } @$excl).'\\z)'; + my @gone = delete @$v2{grep(/$re/, keys %$v2)}; + delete @$m{map { @$_ } @gone} and $self->{chg}->{manifest} = 1; + delete @$m{grep(/$re/, @v1)} and $self->{chg}->{manifest} = 1; + @v1 = grep(!/$re/, @v1); + } + my $ret; # { v1 => [ ... ], v2 => { "/$inbox_name" => [ epochs ] }} + $ret->{v1} = \@v1 if @v1; + $ret->{v2} = $v2 if keys %$v2; + $ret //= @orig ? "Nothing to clone, available repositories:\n\t". + join("\n\t", sort @orig) + : "Nothing available to clone\n"; + my $path_pfx = ''; + + # PSGI mount prefixes and manifest.js.gz prefixes don't always align... + if (@v2_epochs) { + until (grep(m!\A\Q$$path\E/git/[0-9]+\.git\z!, + @v2_epochs) == @v2_epochs) { + $$path =~ s!\A(/[^/]+)/!/! or last; + $path_pfx .= $1; + } + } elsif (@v1) { + while (!defined($m->{$$path}) && $$path =~ s!\A(/[^/]+)/!/!) { + $path_pfx .= $1; + } + } + ($path_pfx, $n, $ret); +} + +sub clone_all { + my ($self, $m) = @_; + my $todo = $TODO; + $TODO = \'BUG on further use'; + my $end = on_destroy \&fgrp_fetch_all, $self; + { + my $nodep = delete $todo->{''}; + + # do not download unwanted deps + my $any_want = delete $self->{any_want}; + my @unwanted = grep { !$any_want->{$_} } keys %$todo; + my @nodep = delete(@$todo{@unwanted}); + push(@$nodep, @$_) for @nodep; + + # handle no-dependency repos, first + for (@$nodep) { + clone_v1($_, $end); + return if !keep_going($self); + } + } + # resolve references, deepest, first: + while (scalar keys %$todo) { + for my $x (keys %$todo) { + my ($nr, $nxt); + # resolve multi-level references + while ($m && defined($nxt = $m->{$x}->{reference})) { + exists($todo->{$nxt}) or last; + if (++$nr > 1000) { + $m->{$x}->{reference} = undef; + $m->{$nxt}->{reference} = undef; + warn <<EOM +E: dependency loop detected (`$x' => `$nxt'), breaking +EOM + } + $x = $nxt; + } + my $y = delete $todo->{$x} // next; # already done + for (@$y) { + clone_v1($_, $end); + return if !keep_going($self); + } + last; # restart %$todo iteration + } + } + + # $end->DESTROY will call fgrp_fetch_all once all references + # in $LIVE are gone, and do_reap will eventually drain $LIVE + $end = undef; + do_reap($self, 1); +} + +sub dump_manifest ($$) { + my ($m, $ft) = @_; + # write the smaller manifest if epochs were skipped so + # users won't have to delete manifest if they +w an + # epoch they no longer want to skip + my $json = PublicInbox::Config->json->encode($m); + my $mtime = (stat($ft))[9]; + seek($ft, 0, SEEK_SET); + truncate($ft, 0); + gzip(\$json => $ft) or die "gzip($ft): $GzipError"; + $ft->flush or die "flush($ft): $!"; + utime($mtime, $mtime, "$ft") or die "utime(..., $ft): $!"; +} + +sub dump_project_list ($$) { + my ($self, $m) = @_; + my $f = $self->{'-project-list'}; + my $old = defined($f) ? PublicInbox::IO::try_cat($f) : ''; + my %new; + + open my $dh, '<', '.'; + if (!$self->{dry_run} || -d $self->{dst}) { + chdir($self->{dst}); + } + my @local = grep { -e $_ ? ($new{$_} = undef) : 1 } split(/\n/s, $old); + chdir($dh); + + $new{substr($_, 1)} = 1 for keys %$m; # drop leading '/' + my @list = sort keys %new; + my @remote = grep { !defined($new{$_}) } @list; + my %lnk = map { substr($_, 1) => undef } @{$self->{-new_symlinks}}; + @remote = grep { !exists($lnk{$_}) } @remote; + + if (@remote) { + warn <<EOM; +The following local repositories are ignored/gone from $self->{src}: +EOM + warn "\t", $_, "\n" for @remote; + + if ($self->{lei}->{opt}->{purge} && !$self->{dry_run}) { + my $o = {}; + $o->{verbose} = 1 if $self->{lei}->{opt}->{verbose}; + my $dst = $self->{dst}; + File::Path::remove_tree(map { "$dst/$_" } @remote, $o); + my %rm = map { $_ => undef } @remote; + @list = grep { !exists($rm{$_}) } @list; + $self->{lei}->qerr('# purged '); + } + } + if (defined($f) && @local) { + warn <<EOM; +The following repos in $f no longer exist on the filesystem: +EOM + warn "\t", $_, "\n" for @local; + } + $self->{chg}->{nr_chg} += scalar(@remote) + scalar(@local); + return if !defined($f) || $self->{dry_run}; + my (undef, $dn, $bn) = File::Spec->splitpath($f); + my $new = join("\n", @list, ''); + atomic_write($dn, $bn, $new) if $new ne $old; +} + +# FIXME: this gets confused by single inbox instance w/ global manifest.js.gz sub try_manifest { my ($self) = @_; my $uri = URI->new($self->{src}); my $lei = $self->{lei}; my $curl = $self->{curl} //= PublicInbox::LeiCurl->new($lei) or return; + $self->{-torsocks} //= $curl->torsocks($lei, $uri) or return; my $path = $uri->path; chop($path) eq '/' or die "BUG: $uri not canonicalized"; - $uri->path($path . '/manifest.js.gz'); - my $pdir = $lei->rel2abs($self->{dst}); - $pdir =~ s!/[^/]+/?\z!!; - my $ft = File::Temp->new(TEMPLATE => 'm-XXXX', - UNLINK => 1, DIR => $pdir, SUFFIX => '.tmp'); - my $fn = $ft->filename; - my ($bn) = ($fn =~ m!/([^/]+)\z!); - my $cmd = $curl->for_uri($lei, $uri, '-R', '-o', $bn); - my $opt = { -C => $pdir }; - $opt->{$_} = $lei->{$_} for (0..2); - my $cerr = run_reap($lei, $cmd, $opt); + my $rmf = $lei->{opt}->{'remote-manifest'} // '/manifest.js.gz'; + if ($rmf =~ m!\A[^/:]+://!) { + $uri = URI->new($rmf); + } else { + $rmf = "/$rmf" if index($rmf, '/') != 0; + $uri->path($path.$rmf); + } + my $manifest = $self->{-manifest} // "$self->{dst}/manifest.js.gz"; + my %opt = (UNLINK => 1, SUFFIX => '.tmp', TMPDIR => 1); + if (!$self->{dry_run} && $manifest =~ m!\A(.+?)/[^/]+\z! and -d $1) { + $opt{DIR} = $1; # allows fast rename(2) w/o EXDEV + delete $opt{TMPDIR}; + } + my $ft = File::Temp->new(TEMPLATE => '.manifest-XXXX', %opt); + my $cmd = $curl->for_uri($lei, $uri, qw(-R -o), $ft->filename); + push(@$cmd, '-z', $manifest) if -f $manifest; + my $mf_url = "$uri"; + %opt = map { $_ => $lei->{$_} } (0..2); + my $cerr = run_reap($lei, $cmd, \%opt); if ($cerr) { return try_scrape($self) if ($cerr >> 8) == 22; # 404 missing return $lei->child_error($cerr, "@$cmd failed"); } - my $m = eval { decode_manifest($ft, $fn, $uri) }; + + # bail out if curl -z/--timecond hit 304 Not Modified, $ft will be empty + if (-f $manifest && !-s $ft) { + $lei->child_error(127 << 8) if $lei->{opt}->{'exit-code'}; + return $lei->qerr("# $manifest unchanged"); + } + + my $m = eval { decode_manifest($ft, $ft, $uri) }; if ($@) { warn $@; return try_scrape($self); } - my ($path_pfx, $v1_path, @v2_epochs) = deduce_epochs($m, $path); - if (@v2_epochs) { - # It may be possible to have v1 + v2 in parallel someday: - warn(<<EOM) if defined $v1_path; -# `$v1_path' appears to be a v1 inbox while v2 epochs exist: -# @v2_epochs -# ignoring $v1_path (use --inbox-version=1 to force v1 instead) + local $self->{chg} = {}; + local $self->{-local_manifest} = load_current_manifest($self); + local $self->{-new_symlinks} = []; + my ($path_pfx, $n, $multi) = multi_inbox($self, \$path, $m); + return $lei->child_error(0, $multi) if !ref($multi); + my $v2 = delete $multi->{v2}; + if ($v2) { + for my $name (sort keys %$v2) { + my $epochs = delete $v2->{$name}; + my %v2_epochs = map { + $uri->path($n > 1 ? $path_pfx.$path.$_ + : $path_pfx.$_); + my ($e) = ("$uri" =~ m!/([0-9]+)\.git\z!); + $e // die "no [0-9]+\.git in `$uri'"; + $e => [ $uri->clone, $_ ]; + } @$epochs; + ("$uri" =~ m!\A(.+/)git/[0-9]+\.git\z!) or + die "BUG: `$uri' !~ m!/git/[0-9]+.git!"; + local $self->{cur_src} = $1; + local $self->{cur_dst} = $self->{dst}; + if ($n > 1 && $uri->path =~ m!\A\Q$path_pfx$path\E/(.+)/ + git/[0-9]+\.git\z!x) { + $self->{cur_dst} .= "/$1"; + } + index($self->{cur_dst}, "\n") >= 0 and die <<EOM; +E: `$self->{cur_dst}' must not contain newline EOM - my %v2_epochs = map { - $uri->path($path_pfx.$_); - my ($n) = ("$uri" =~ m!/([0-9]+)\.git\z!); - $n => $uri->clone - } @v2_epochs; - clone_v2($self, \%v2_epochs, $m); - } elsif (defined $v1_path) { - clone_v1($self); - } else { - die "E: confused by <$uri>, possible matches:\n\t", - join(', ', sort keys %$m), "\n"; + clone_v2_prep($self, \%v2_epochs, $m); + return if !keep_going($self); + } } - if (delete $self->{-culled_manifest}) { # set by clone_v2 - # write the smaller manifest if epochs were skipped so - # users won't have to delete manifest if they +w an - # epoch they no longer want to skip - my $json = PublicInbox::Config->json->encode($m); - gzip(\$json => $fn) or die "gzip: $GzipError"; + if (my $v1 = delete $multi->{v1}) { + my $p = $path_pfx.$path; + chop($p) if substr($p, -1, 1) eq '/'; + $uri->path($p); + for my $name (@$v1) { + my $task = bless { %$self }, __PACKAGE__; + $task->{-ent} = $m->{$name} // + die("BUG: no `$name' in manifest"); + $task->{cur_src} = "$uri"; + $task->{cur_dst} = $task->{dst}; + $task->{-key} = $name; + if ($n > 1) { + $task->{cur_dst} .= $name; + $task->{cur_src} .= $name; + } + index($task->{cur_dst}, "\n") >= 0 and die <<EOM; +E: `$task->{cur_dst}' must not contain newline +EOM + $task->{cur_src} .= '/'; + my $dep = $task->{-ent}->{reference} // ''; + push @{$TODO->{$dep}}, $task; # for clone_all + $self->{any_want}->{$name} = 1; + } + } + delete local $lei->{opt}->{epoch} if defined($v2); + clone_all($self, $m); + return if !keep_going($self); + + # set by clone_v2_prep/-I/--exclude + my $mis = delete $self->{chg}->{fp_mismatch}; + if ($mis) { + my $t = fmt_localtime((stat($ft))[9]); + warn <<EOM; +W: Fingerprints for the following repositories do not match +W: $mf_url @ $t: +W: These repositories may have updated since $t: +EOM + warn "\t", $_, "\n" for @$mis; + warn <<EOM if !$self->{lei}->{opt}->{prune}; +W: The above fingerprints may never match without --prune +EOM + } + if ((delete($self->{chg}->{manifest}) || $mis) && !$self->{dry_run}) { + dump_manifest($m => $ft); } - ft_rename($ft, "$self->{dst}/manifest.js.gz", 0666); + my $bad = delete $self->{chg}->{badlink}; + warn(<<EOM, map { ("\t", $_, "\n") } @$bad) if $bad; +W: The following exist and have not been converted to symlinks +EOM + dump_project_list($self, $m); + ft_rename($ft, $manifest, 0666) if !$self->{dry_run}; + !$self->{chg}->{nr_chg} && $lei->{opt}->{'exit-code'} and + $lei->child_error(127 << 8); } sub start_clone_url { @@ -419,19 +1260,45 @@ sub start_clone_url { die "TODO: non-HTTP/HTTPS clone of $self->{src} not supported, yet"; } -sub do_mirror { # via wq_io_do +sub do_mirror { # via wq_io_do or public-inbox-clone my ($self) = @_; my $lei = $self->{lei}; + $self->{dry_run} = 1 if $lei->{opt}->{'dry-run'}; umask($lei->{client_umask}) if defined $lei->{client_umask}; + $self->{-initial_clone} = 1 if !-d $self->{dst}; + local @PUH; + if (defined(my $puh = $lei->{opt}->{'post-update-hook'})) { + require Text::ParseWords; + @PUH = map { [ Text::ParseWords::shellwords($_) ] } @$puh; + } eval { - my $iv = $lei->{opt}->{'inbox-version'}; - if (defined $iv) { - return clone_v1($self) if $iv == 1; - return try_scrape($self) if $iv == 2; - die "bad --inbox-version=$iv\n"; + my $ic = $lei->{opt}->{'inbox-config'} //= 'always'; + $ic =~ /\A(?:v1|v2|always|never)\z/s or die <<""; +--inbox-config must be one of `always', `v2', `v1', or `never' + + # we support these switches with '' (empty string). + # defaults match example conf distributed with grokmirror + my @pairs = qw(objstore objstore manifest manifest.js.gz + project-list projects.list); + while (@pairs) { + my ($k, $default) = splice(@pairs, 0, 2); + my $v = $lei->{opt}->{$k} // next; + $v = $default if $v eq ''; + $v = "$self->{dst}/$v" if $v !~ m!\A\.{0,2}/!; + $self->{"-$k"} = $v; } - return start_clone_url($self) if $self->{src} =~ m!://!; - die "TODO: cloning local directories not supported, yet"; + + local $LIVE = {}; + local $TODO = {}; + local $FGRP_TODO = {}; + my $iv = $lei->{opt}->{'inbox-version'} // + return start_clone_url($self); + return clone_v1($self) if $iv == 1; + die "bad --inbox-version=$iv\n" if $iv != 2; + die <<EOM if $self->{src} !~ m!://!; +cloning local v2 inboxes not supported +EOM + try_scrape($self, 1); }; $lei->fail($@) if $@; } @@ -439,14 +1306,6 @@ sub do_mirror { # via wq_io_do sub start { my ($cls, $lei, $src, $dst) = @_; my $self = bless { src => $src, dst => $dst }, $cls; - if ($src =~ m!https?://!) { - require URI; - require PublicInbox::LeiCurl; - } - require PublicInbox::Lock; - require PublicInbox::Inbox; - require PublicInbox::Admin; - require PublicInbox::InboxWritable; $lei->request_umask; my ($op_c, $ops) = $lei->workers_start($self, 1); $lei->{wq1} = $self; @@ -464,8 +1323,8 @@ sub ipc_atfork_child { sub write_makefile { my ($dir, $ibx_ver) = @_; my $f = "$dir/Makefile"; - if (sysopen my $fh, $f, O_CREAT|O_EXCL|O_WRONLY) { - print $fh <<EOM or die "print($f) $!"; + if (CORE::sysopen my $fh, $f, O_CREAT|O_EXCL|O_WRONLY) { + print $fh <<EOM; # This is a v$ibx_ver public-inbox, see the public-inbox-v$ibx_ver-format(5) # manpage for more information on the format. This Makefile is # intended as a familiar wrapper for users unfamiliar with @@ -479,7 +1338,7 @@ sub write_makefile { # so you may edit it freely with your own convenience targets # and notes. public-inbox-fetch will recreate it if removed. EOM - print $fh <<'EOM' or die "print($f): $!"; + print $fh <<'EOM'; # the default target: help : @echo Common targets: @@ -489,6 +1348,7 @@ help : @echo Rarely needed targets: @echo ' make reindex - may be needed for new features/bugfixes' @echo ' make compact - rewrite Xapian storage to save space' + @echo ' make index - initial index after clone' fetch : public-inbox-fetch @@ -505,14 +1365,16 @@ update : echo 'public-inbox index not initialized'; \ echo 'see public-inbox-index(1) man page'; \ fi +index : + public-inbox-index reindex : public-inbox-index --reindex compact : public-inbox-compact -.PHONY : help fetch update reindex compact +.PHONY : help fetch update index reindex compact EOM - close $fh or die "close($f): $!"; + close $fh; } else { die "open($f): $!" unless $!{EEXIST}; } diff --git a/lib/PublicInbox/LeiNoteEvent.pm b/lib/PublicInbox/LeiNoteEvent.pm index 8581bd9a..8d900d0c 100644 --- a/lib/PublicInbox/LeiNoteEvent.pm +++ b/lib/PublicInbox/LeiNoteEvent.pm @@ -60,6 +60,18 @@ sub maildir_event { # via wq_nonblock_do } # else: eml_from_path already warns } +sub _mh_cb { # mh_read_one cb + my ($dir, $bn, $kw, $eml, $self, $state) = @_; +} + +sub mh_event { # via wq_nonblock_do + my ($self, $folder, $bn, $state) = @_; + my $dir = substr($folder, 3); + require PublicInbox::MHreader; # if we forked early + my $mhr = PublicInbox::MHreader->new($dir, $self->{lei}->{3}); + $mhr->mh_read_one($bn, \&_mh_cb, $self, $state); +} + sub lei_note_event { my ($lei, $folder, $new_cur, $bn, $fn, @rest) = @_; die "BUG: unexpected: @rest" if @rest; @@ -72,11 +84,14 @@ sub lei_note_event { $lms->arg2folder($lei, [ $folder ]); my $state = $cfg->get_1("watch.$folder.state") // 'tag-rw'; return if $state eq 'pause'; - return $lms->clear_src($folder, \$bn) if $new_cur eq ''; + if ($new_cur eq '') { + my $id = $folder =~ /\Amaildir:/ ? \$bn : $bn + 0; + return $lms->clear_src($folder, $id); + } $lms->lms_pause; $lei->ale; # prepare $sto->write_prepare($lei); - require PublicInbox::MdirReader; + require PublicInbox::MHreader if $folder =~ /\Amh:/; # optimistic my $self = $cfg->{-lei_note_event} //= do { my $wq = bless { lms => $lms }, __PACKAGE__; # MUAs such as mutt can trigger massive rename() storms so @@ -91,12 +106,15 @@ sub lei_note_event { $lei->{lne} = $wq; }; if ($folder =~ /\Amaildir:/i) { + require PublicInbox::MdirReader; my $fl = PublicInbox::MdirReader::maildir_basename_flags($bn) // return; return if index($fl, 'T') >= 0; my $kw = PublicInbox::MdirReader::flags2kw($fl); my $vmd = { kw => $kw, sync_info => [ $folder, \$bn ] }; $self->wq_nonblock_do('maildir_event', $fn, $vmd, $state); + } elsif ($folder =~ /\Amh:/) { + $self->wq_nonblock_do('mh_event', $folder, $bn, $state); } # else: TODO: imap } diff --git a/lib/PublicInbox/LeiOverview.pm b/lib/PublicInbox/LeiOverview.pm index 066c40bd..0529bbe4 100644 --- a/lib/PublicInbox/LeiOverview.pm +++ b/lib/PublicInbox/LeiOverview.pm @@ -41,8 +41,8 @@ sub detect_fmt ($) { my ($dst) = @_; if ($dst =~ m!\A([:/]+://)!) { die "$1 support not implemented, yet\n"; - } elsif (!-e $dst || -d _) { - 'maildir'; # the default TODO: MH? + } elsif (!-e $dst || -d _) { # maildir is the default TODO: MH + -e "$dst/inbox.lock" ? 'v2' : 'maildir'; } elsif (-f _ || -p _) { die "unable to determine mbox family of $dst\n"; } else { @@ -212,7 +212,8 @@ sub ovv_each_smsg_cb { # runs in wq worker usually sub { my ($smsg, $mitem, $eml) = @_; $smsg->{pct} = get_pct($mitem) if $mitem; - $l2m->wq_io_do('write_mail', [], $smsg, $eml); + eval { $l2m->wq_io_do('write_mail', [], $smsg, $eml) }; + $lei->fail($@) if $@ && !$!{ECONNRESET} && !$!{EPIPE}; } } elsif ($self->{fmt} =~ /\A(concat)?json\z/ && $lei->{opt}->{pretty}) { my $EOR = ($1//'') eq 'concat' ? "\n}" : "\n},"; diff --git a/lib/PublicInbox/LeiP2q.pm b/lib/PublicInbox/LeiP2q.pm index 610adb78..68faa016 100644 --- a/lib/PublicInbox/LeiP2q.pm +++ b/lib/PublicInbox/LeiP2q.pm @@ -189,7 +189,7 @@ sub lei_p2q { # the "lei patch-to-query" entry point sub ipc_atfork_child { my ($self) = @_; PublicInbox::LeiInput::input_only_atfork_child($self); - PublicInbox::OnDestroy->new($$, \&emit_query, $self); + on_destroy \&emit_query, $self; } no warnings 'once'; diff --git a/lib/PublicInbox/LeiQuery.pm b/lib/PublicInbox/LeiQuery.pm index c998e5c0..eadf811f 100644 --- a/lib/PublicInbox/LeiQuery.pm +++ b/lib/PublicInbox/LeiQuery.pm @@ -1,11 +1,10 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # handles "lei q" command and provides internals for # several other sub-commands (up, lcat, ...) package PublicInbox::LeiQuery; -use strict; -use v5.10.1; +use v5.12; sub prep_ext { # externals_each callback my ($lxs, $exclude, $loc) = @_; @@ -17,6 +16,8 @@ sub _start_query { # used by "lei q" and "lei up" require PublicInbox::LeiOverview; PublicInbox::LeiOverview->new($self) or return; my $opt = $self->{opt}; + require PublicInbox::OverIdx; # loads DBI + PublicInbox::OverIdx::fork_ok($opt); my ($xj, $mj) = split(/,/, $opt->{jobs} // ''); (defined($xj) && $xj ne '' && $xj !~ /\A[1-9][0-9]*\z/) and die "`$xj' search jobs must be >= 1\n"; @@ -37,8 +38,11 @@ sub _start_query { # used by "lei q" and "lei up" $lms->lms_write_prepare->lms_pause; # just create } } - $l2m and $l2m->{-wq_nr_workers} //= $mj // - int($nproc * 0.75 + 0.5); # keep some CPU for git + $l2m and $l2m->{-wq_nr_workers} //= $mj // do { + # keep some CPU for git, and don't overload IMAP destinations + my $n = int($nproc * 0.75 + 0.5); + $self->{net} && $n > 4 ? 4 : $n; + }; # descending docid order is cheapest, MUA controls sorting order $self->{mset_opt}->{relevance} //= -2 if $l2m || $opt->{threads}; @@ -55,18 +59,19 @@ sub _start_query { # used by "lei q" and "lei up" $lxs->do_query($self); } -sub qstr_add { # PublicInbox::InputPipe::consume callback for --stdin - my ($lei) = @_; # $_[1] = $rbuf - $_[1] // $lei->fail("error reading stdin: $!"); - return $lei->{mset_opt}->{qstr} .= $_[1] if $_[1] ne ''; - eval { - $lei->fchdir; - $lei->{mset_opt}->{q_raw} = $lei->{mset_opt}->{qstr}; - $lei->{lse}->query_approxidate($lei->{lse}->git, - $lei->{mset_opt}->{qstr}); - _start_query($lei); - }; - $lei->fail($@) if $@; +sub do_qry { # do_env cb + my ($lei) = @_; + $lei->{mset_opt}->{q_raw} = $lei->{mset_opt}->{qstr} + = delete $lei->{stdin_buf}; + $lei->{lse}->query_approxidate($lei->{lse}->git, + $lei->{mset_opt}->{qstr}); + _start_query($lei); +} + +# make the URI||PublicInbox::{Inbox,ExtSearch} a config-file friendly string +sub cfg_ext ($) { + my ($x) = @_; + $x->isa('URI') ? "$x" : ($x->{inboxdir} // $x->{topdir}); } sub lxs_prepare { @@ -84,21 +89,32 @@ sub lxs_prepare { $lxs->prepare_external($self->{lse}); } if (@only) { + my $only; for my $loc (@only) { my @loc = $self->get_externals($loc) or return; - $lxs->prepare_external($_) for @loc; + for (@loc) { + my $x = $lxs->prepare_external($_); + push(@$only, cfg_ext($x)) if $x; + } } + $opt->{only} = $only if $only; } else { - my (@ilocals, @iremotes); + my (@ilocals, @iremotes, $incl); for my $loc (@{$opt->{include} // []}) { my @loc = $self->get_externals($loc) or return; - $lxs->prepare_external($_) for @loc; + for (@loc) { + my $x = $lxs->prepare_external($_); + push(@$incl, cfg_ext($x)) if $x; + } @ilocals = @{$lxs->{locals} // []}; @iremotes = @{$lxs->{remotes} // []}; } + $opt->{include} = $incl if $incl; # --external is enabled by default, but allow --no-external if ($opt->{external} //= 1) { my $ex = $self->canonicalize_excludes($opt->{exclude}); + my @excl = keys %$ex; + $opt->{exclude} = \@excl if scalar(@excl); $self->externals_each(\&prep_ext, $lxs, $ex); $opt->{remote} //= !($lxs->locals - $opt->{'local'}); $lxs->{locals} = \@ilocals if !$opt->{'local'}; @@ -137,9 +153,7 @@ sub lei_q { return $self->fail(<<'') if @argv; no query allowed on command-line with --stdin - require PublicInbox::InputPipe; - PublicInbox::InputPipe::consume($self->{0}, \&qstr_add, $self); - return; + return $self->slurp_stdin(\&do_qry); } chomp(@argv) and $self->qerr("# trailing `\\n' removed"); $mset_opt{q_raw} = [ @argv ]; # copy @@ -151,6 +165,8 @@ no query allowed on command-line with --stdin # shell completion helper called by lei__complete sub _complete_q { my ($self, @argv) = @_; + join('', @argv) =~ /\bL:\S*\z/ and + return eval { $self->_lei_store->search->all_terms('L') }; my @cur; my $cb = $self->lazy_cb(qw(forget-external _complete_)); while (@argv) { diff --git a/lib/PublicInbox/LeiRediff.pm b/lib/PublicInbox/LeiRediff.pm index c312d90f..35728330 100644 --- a/lib/PublicInbox/LeiRediff.pm +++ b/lib/PublicInbox/LeiRediff.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # The "lei rediff" sub-command, regenerates diffs with new options @@ -7,7 +7,7 @@ use strict; use v5.10.1; use parent qw(PublicInbox::IPC PublicInbox::LeiInput); use File::Temp 0.19 (); # 0.19 for ->newdir -use PublicInbox::Spawn qw(spawn which); +use PublicInbox::Spawn qw(run_wait popen_wr which); use PublicInbox::MsgIter qw(msg_part_text); use PublicInbox::ViewDiff; use PublicInbox::LeiBlob; @@ -82,6 +82,7 @@ sub _lei_diff_prepare ($$) { push @$cmd, $c ? "-$c" : "--$o"; } } + push(@$cmd, "-O$opt->{'order-file'}") if $opt->{'order-file'}; } sub diff_ctxq ($$) { @@ -113,65 +114,51 @@ EOM if (!$rw->{-tmp}) { my $d = "$self->{rdtmp}/for_tree.git"; -d $d or PublicInbox::Import::init_bare($d); - my $f = "$d/objects/info/alternates"; # always overwrite - open my $fh, '>', $f or die "open $f: $!"; - for my $git (@{$self->{gits}}) { - print $fh $git->git_path('objects'),"\n"; - } - close $fh or die "close $f: $!"; + # always overwrite + PublicInbox::IO::write_file '>', "$d/objects/info/alternates", + map { $_->git_path('objects')."\n" } @{$self->{gits}}; $rw = PublicInbox::Git->new($d); } - pipe(my ($r, $w)) or die "pipe: $!"; - my $pid = spawn(['git', "--git-dir=$rw->{git_dir}", + my $w = popen_wr(['git', "--git-dir=$rw->{git_dir}", qw(fast-import --quiet --done --date-format=raw)], - $lei->{env}, { 2 => $lei->{2}, 0 => $r }); - close $r or die "close r fast-import: $!"; + $lei->{env}, { 2 => $lei->{2} }); print $w $ta, "\n", $tb, "\ndone\n" or die "print fast-import: $!"; - close $w or die "close w fast-import: $!"; - waitpid($pid, 0); - die "fast-import failed: \$?=$?" if $?; + $w->close or die "close w fast-import: \$?=$? \$!=$!"; my $cmd = [ 'diff' ]; _lei_diff_prepare($lei, $cmd); $lei->qerr("# git @$cmd"); push @$cmd, qw(A B); unshift @$cmd, 'git', "--git-dir=$rw->{git_dir}"; - $pid = spawn($cmd, $lei->{env}, { 2 => $lei->{2}, 1 => $lei->{1} }); - waitpid($pid, 0); - $lei->child_error($?) if $?; # for git diff --exit-code + run_wait($cmd, $lei->{env}, { 2 => $lei->{2}, 1 => $lei->{1} }) and + $lei->child_error($?); # for git diff --exit-code undef; } -sub wait_requote ($$$) { # OnDestroy callback - my ($lei, $pid, $old_1) = @_; - $lei->{1} = $old_1; # closes stdin of `perl -pE 's/^/> /'` - waitpid($pid, 0) == $pid or die "BUG(?) waitpid: \$!=$! \$?=$?"; - $lei->child_error($?) if $?; -} +# awaitpid callback +sub wait_requote { $_[1]->child_error($?) if $? } -sub requote ($$) { +sub requote ($$) { # '> ' prefix(es) lei->{1} my ($lei, $pfx) = @_; - pipe(my($r, $w)) or die "pipe: $!"; - my $rdr = { 0 => $r, 1 => $lei->{1}, 2 => $lei->{2} }; + my $opt = { 1 => $lei->{1}, 2 => $lei->{2} }; # $^X (perl) is overkill, but maybe there's a weird system w/o sed - my $pid = spawn([$^X, '-pE', "s/^/$pfx/"], $lei->{env}, $rdr); - my $old_1 = $lei->{1}; - $w->autoflush(1); + my $w = popen_wr([$^X, '-pe', "s/^/$pfx/"], $lei->{env}, $opt, + \&wait_requote, $lei); binmode $w, ':utf8'; - $lei->{1} = $w; - PublicInbox::OnDestroy->new(\&wait_requote, $lei, $pid, $old_1); + $w; } sub extract_oids { # Eml each_part callback my ($ary, $self) = @_; + my $lei = $self->{lei}; my ($p, undef, $idx) = @$ary; - $self->{lei}->out($p->header_obj->as_string, "\n"); + $lei->out($p->header_obj->as_string, "\n"); my ($s, undef) = msg_part_text($p, $p->content_type || 'text/plain'); defined $s or return; - my $rq; - if ($self->{dqre} && $s =~ s/$self->{dqre}//g) { # '> ' prefix(es) - $rq = requote($self->{lei}, $1) if $self->{lei}->{opt}->{drq}; - } + + $self->{dqre} && $s =~ s/$self->{dqre}//g && $lei->{opt}->{drq} and + local $lei->{1} = requote($lei, $1); + my @top = split($PublicInbox::ViewDiff::EXTRACT_DIFFS, $s); undef $s; my $blobs = $self->{blobs}; # blobs to resolve @@ -268,7 +255,7 @@ sub lei_rediff { if ($lxs->remotes) { require PublicInbox::LeiRemote; $lei->{curl} //= which('curl') or return - $lei->fail('curl needed for', $lxs->remotes); + $lei->fail('curl needed for '.join(', ',$lxs->remotes)); } $lei->ale->refresh_externals($lxs, $lei); my $self = bless { diff --git a/lib/PublicInbox/LeiRefreshMailSync.pm b/lib/PublicInbox/LeiRefreshMailSync.pm index a60a9a5e..dde23274 100644 --- a/lib/PublicInbox/LeiRefreshMailSync.pm +++ b/lib/PublicInbox/LeiRefreshMailSync.pm @@ -60,7 +60,7 @@ sub input_path_url { # overrides PublicInbox::LeiInput::input_path_url $self->folder_missing($$uri); } } else { die "BUG: $input not supported" } - $self->{lei}->sto_done_request; + $self->{lei}->sto_barrier_request; } sub lei_refresh_mail_sync { diff --git a/lib/PublicInbox/LeiReindex.pm b/lib/PublicInbox/LeiReindex.pm new file mode 100644 index 00000000..3f109f33 --- /dev/null +++ b/lib/PublicInbox/LeiReindex.pm @@ -0,0 +1,49 @@ +# Copyright all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei reindex" command to reindex everything in lei/store +package PublicInbox::LeiReindex; +use v5.12; +use parent qw(PublicInbox::IPC); + +sub reindex_full { + my ($lei) = @_; + my $sto = $lei->{sto}; + my $max = $sto->search->over(1)->max; + $lei->qerr("# reindexing 1..$max"); + $sto->wq_do('reindex_art', $_) for (1..$max); +} + +sub reindex_store { # via wq_do + my ($self) = @_; + my ($lei, $argv) = delete @$self{qw(lei argv)}; + if (!@$argv) { + reindex_full($lei); + } +} + +sub lei_reindex { + my ($lei, @argv) = @_; + my $sto = $lei->_lei_store or return $lei->fail('nothing indexed'); + $sto->write_prepare($lei); + my $self = bless { lei => $lei, argv => \@argv }, __PACKAGE__; + my ($op_c, $ops) = $lei->workers_start($self, 1); + $lei->{wq1} = $self; + $lei->wait_wq_events($op_c, $ops); + $self->wq_do('reindex_store'); + $self->wq_close; +} + +sub _lei_wq_eof { # EOF callback for main lei daemon + my ($lei) = @_; + $lei->{sto}->wq_do('reindex_done'); + $lei->wq_eof; +} + +sub ipc_atfork_child { + my ($self) = @_; + $self->{lei}->_lei_atfork_child; + $self->SUPER::ipc_atfork_child; +} + +1; diff --git a/lib/PublicInbox/LeiRemote.pm b/lib/PublicInbox/LeiRemote.pm index 54750062..d6fc40a4 100644 --- a/lib/PublicInbox/LeiRemote.pm +++ b/lib/PublicInbox/LeiRemote.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Make remote externals HTTP(S) inboxes behave like @@ -12,7 +12,6 @@ use IO::Uncompress::Gunzip; use PublicInbox::MboxReader; use PublicInbox::Spawn qw(popen_rd); use PublicInbox::LeiCurl; -use PublicInbox::AutoReap; use PublicInbox::ContentHash qw(git_sha); sub new { @@ -22,7 +21,7 @@ sub new { sub isrch { $_[0] } # SolverGit expcets this -sub _each_mboxrd_eml { # callback for MboxReader->mboxrd +sub each_mboxrd_eml { # callback for MboxReader->mboxrd my ($eml, $self) = @_; my $lei = $self->{lei}; my $xoids = $lei->{ale}->xoids_for($eml, 1); @@ -47,14 +46,13 @@ sub mset { $uri->query_form(q => $qstr, x => 'm', r => 1); # r=1: relevance my $cmd = $curl->for_uri($self->{lei}, $uri); $self->{lei}->qerr("# $cmd"); - my ($fh, $pid) = popen_rd($cmd, undef, { 2 => $lei->{2} }); - my $ar = PublicInbox::AutoReap->new($pid); $self->{smsg} = []; - $fh = IO::Uncompress::Gunzip->new($fh, MultiStream => 1); - PublicInbox::MboxReader->mboxrd($fh, \&_each_mboxrd_eml, $self); - my $wait = $self->{lei}->{sto}->wq_do('done'); - $ar->join; - $lei->child_error($?) if $?; + my $fh = popen_rd($cmd, undef, { 2 => $lei->{2} }); + $fh = IO::Uncompress::Gunzip->new($fh, MultiStream=>1, AutoClose=>1); + eval { PublicInbox::MboxReader->mboxrd($fh, \&each_mboxrd_eml, $self) }; + my $err = $@ ? ": $@" : ''; + my $wait = $self->{lei}->{sto}->wq_do('barrier'); + $lei->child_error($?, "@$cmd failed$err") if $err || $?; $self; # we are the mset (and $ibx, and $self) } @@ -69,9 +67,16 @@ sub base_url { "$_[0]->{uri}" } sub smsg_eml { my ($self, $smsg) = @_; - if (my $bref = $self->{lei}->ale->git->cat_file($smsg->{blob})) { - return PublicInbox::Eml->new($bref); - } + my $bref = $self->{lei}->ale->git->cat_file($smsg->{blob}) // do { + my $lms = $self->{lei}->lms; + ($lms ? $lms->local_blob($smsg->{blob}, 1) : undef) // do { + my $sto = $self->{lei}->{sto} // + $self->{lei}->_lei_store; + $sto && $sto->{-wq_s1} ? + $sto->wq_do('cat_blob', $smsg->{blob}) : undef; + } + }; + return PublicInbox::Eml->new($bref) if $bref; warn("E: $self->{uri} $smsg->{blob} gone <$smsg->{mid}>\n"); undef; } diff --git a/lib/PublicInbox/LeiRmWatch.pm b/lib/PublicInbox/LeiRmWatch.pm index c0f336f0..19bee3ab 100644 --- a/lib/PublicInbox/LeiRmWatch.pm +++ b/lib/PublicInbox/LeiRmWatch.pm @@ -14,7 +14,7 @@ sub lei_rm_watch { my $self = bless { missing_ok => 1 }, __PACKAGE__; $self->prepare_inputs($lei, \@argv) or return; for my $w (@{$self->{inputs}}) { - $lei->_config('--remove-section', "watch.$w"); + $lei->_config('--remove-section', "watch.$w") or return; } delete $lei->{cfg}; # force reload $lei->refresh_watches; diff --git a/lib/PublicInbox/LeiSavedSearch.pm b/lib/PublicInbox/LeiSavedSearch.pm index 1d13aef6..9ae9dcdb 100644 --- a/lib/PublicInbox/LeiSavedSearch.pm +++ b/lib/PublicInbox/LeiSavedSearch.pm @@ -1,10 +1,9 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # pretends to be like LeiDedupe and also PublicInbox::Inbox package PublicInbox::LeiSavedSearch; -use strict; -use v5.10.1; +use v5.12; use parent qw(PublicInbox::Lock); use PublicInbox::Git; use PublicInbox::OverIdx; @@ -13,7 +12,9 @@ use PublicInbox::Config; use PublicInbox::Spawn qw(run_die); use PublicInbox::ContentHash qw(git_sha); use PublicInbox::MID qw(mids_for_index); -use Digest::SHA qw(sha256_hex); +use PublicInbox::SHA qw(sha256_hex); +use File::Temp (); +use IO::Handle (); our $LOCAL_PFX = qr!\A(?:maildir|mh|mbox.+|mmdf|v2):!i; # TODO: put in LeiToMail? # move this to PublicInbox::Config if other things use it: @@ -76,20 +77,17 @@ sub list { my $lss_dir = $lei->share_path.'/saved-searches'; return () unless -d $lss_dir; # TODO: persist the cache? Use another format? - my $f = $lei->cache_dir."/saved-tmp.$$.".time.'.config'; - open my $fh, '>', $f or die "open $f: $!"; + my $fh = File::Temp->new(TEMPLATE => 'lss_list-XXXX', TMPDIR => 1) or + die "File::Temp->new: $!"; print $fh "[include]\n"; for my $p (glob("$lss_dir/*/lei.saved-search")) { print $fh "\tpath = ", cquote_val($p), "\n"; } - close $fh or die "close $f: $!"; - my $cfg = $lei->cfg_dump($f); - unlink($f); + $fh->flush or die "flush: $fh"; + my $cfg = $lei->cfg_dump($fh->filename); my $out = $cfg ? $cfg->get_all('lei.q.output') : []; - map {; - s!$LOCAL_PFX!!; - $_; - } @$out + s!$LOCAL_PFX!! for @$out;; + @$out; } sub translate_dedupe ($$) { @@ -247,10 +245,10 @@ sub git { $_[0]->{git} //= PublicInbox::Git->new($_[0]->{ale}->git->{git_dir}) } sub pause_dedupe { my ($self) = @_; - git($self)->cleanup; - my $lockfh = delete $self->{lockfh}; # from lock_for_scope_fast; - my $oidx = delete($self->{oidx}) // return; - $oidx->commit_lazy; + my ($git, $oidx) = delete @$self{qw(git oidx)}; + $git->cleanup if $git; + $oidx->commit_lazy if $oidx; + delete $self->{lockfh}; # from lock_for_scope_fast; } sub reset_dedupe { @@ -299,7 +297,6 @@ no warnings 'once'; *smsg_by_mid = \&PublicInbox::Inbox::smsg_by_mid; *msg_by_mid = \&PublicInbox::Inbox::msg_by_mid; *modified = \&PublicInbox::Inbox::modified; -*recent = \&PublicInbox::Inbox::recent; *max_git_epoch = *nntp_usable = *msg_by_path = \&mm; # undef *isrch = *search = \&mm; # TODO *DESTROY = \&pause_dedupe; diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm index 936c2751..684668c5 100644 --- a/lib/PublicInbox/LeiSearch.pm +++ b/lib/PublicInbox/LeiSearch.pm @@ -9,6 +9,7 @@ use parent qw(PublicInbox::ExtSearch); # PublicInbox::Search->reopen use PublicInbox::Search qw(xap_terms); use PublicInbox::ContentHash qw(content_digest content_hash git_sha); use PublicInbox::MID qw(mids mids_for_index); +use PublicInbox::Compat qw(uniqstr); use Carp qw(croak); sub _msg_kw { # retry_reopen callback @@ -44,20 +45,16 @@ sub oidbin_keywords { sub _xsmsg_vmd { # retry_reopen my ($self, $smsg, $want_label) = @_; my $xdb = $self->xdb; # set {nshard}; - my (%kw, %L, $doc, $x); - $kw{flagged} = 1 if delete($smsg->{lei_q_tt_flagged}); + my (@kw, @L, $doc, $x); + @kw = qw(flagged) if delete($smsg->{lei_q_tt_flagged}); my @num = $self->over->blob_exists($smsg->{blob}); for my $num (@num) { # there should only be one... $doc = $xdb->get_document($self->num2docid($num)); - $x = xap_terms('K', $doc); - %kw = (%kw, %$x); - if ($want_label) { # JSON/JMAP only - $x = xap_terms('L', $doc); - %L = (%L, %$x); - } + push @kw, xap_terms('K', $doc); + push @L, xap_terms('L', $doc) if $want_label # JSON/JMAP only } - $smsg->{kw} = [ sort keys %kw ] if scalar(keys(%kw)); - $smsg->{L} = [ sort keys %L ] if scalar(keys(%L)); + @{$smsg->{kw}} = sort(uniqstr(@kw)) if @kw; + @{$smsg->{L}} = uniqstr(@L) if @L; } # lookup keywords+labels for external messages @@ -106,6 +103,8 @@ sub xoids_for { for my $o (@overs) { my ($id, $prev); while (my $cur = $o->next_by_mid($mid, \$id, \$prev)) { + # {bytes} may be '' from old bug + $cur->{bytes} = 1 if $cur->{bytes} eq ''; next if $cur->{bytes} == 0 || $xoids->{$cur->{blob}}; $git->cat_async($cur->{blob}, \&_cmp_1st, @@ -158,20 +157,6 @@ sub kw_changed { join("\0", @$new_kw_sorted) eq $cur_kw ? 0 : 1; } -sub all_terms { - my ($self, $pfx) = @_; - my $xdb = $self->xdb; - my $cur = $xdb->allterms_begin($pfx); - my $end = $xdb->allterms_end($pfx); - my %ret; - for (; $cur != $end; $cur++) { - my $tn = $cur->get_termname; - index($tn, $pfx) == 0 and - $ret{substr($tn, length($pfx))} = undef; - } - wantarray ? (sort keys %ret) : \%ret; -} - sub qparse_new { my ($self) = @_; my $qp = $self->SUPER::qparse_new; # PublicInbox::Search diff --git a/lib/PublicInbox/LeiSelfSocket.pm b/lib/PublicInbox/LeiSelfSocket.pm index 860020cb..0e15bc7c 100644 --- a/lib/PublicInbox/LeiSelfSocket.pm +++ b/lib/PublicInbox/LeiSelfSocket.pm @@ -5,31 +5,27 @@ # This receives what script/lei receives, but isn't connected # to an interactive terminal so I'm not sure what to do with it... package PublicInbox::LeiSelfSocket; -use strict; -use v5.10.1; +use v5.12; use parent qw(PublicInbox::DS); use Data::Dumper; $Data::Dumper::Useqq = 1; # should've been the Perl default :P use PublicInbox::Syscall qw(EPOLLIN); -use PublicInbox::Spawn; -my $recv_cmd; +use PublicInbox::IPC; sub new { my ($cls, $r) = @_; - my $self = bless { sock => $r }, $cls; + my $self = bless {}, $cls; $r->blocking(0); - no warnings 'once'; - $recv_cmd = $PublicInbox::LEI::recv_cmd; $self->SUPER::new($r, EPOLLIN); } sub event_step { my ($self) = @_; - my (@fds) = $recv_cmd->($self->{sock}, my $buf, 4096 * 33); + my ($buf, @fds); + @fds = $PublicInbox::IPC::recv_cmd->($self->{sock}, $buf, 4096 * 33); if (scalar(@fds) == 1 && !defined($fds[0])) { return if $!{EAGAIN}; die "recvmsg: $!" unless $!{ECONNRESET}; - $buf = ''; } else { # just in case open so perl can auto-close them: for (@fds) { open my $fh, '+<&=', $_ }; } diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm index 66049dfe..b2da2bc3 100644 --- a/lib/PublicInbox/LeiStore.pm +++ b/lib/PublicInbox/LeiStore.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Local storage (cache/memo) for lei(1), suitable for personal/private @@ -27,12 +27,16 @@ use PublicInbox::MDA; use PublicInbox::Spawn qw(spawn); use PublicInbox::MdirReader; use PublicInbox::LeiToMail; -use File::Temp (); +use PublicInbox::Compat qw(uniqstr); +use PublicInbox::OnDestroy; +use File::Temp qw(tmpnam); use POSIX (); use IO::Handle (); # ->autoflush use Sys::Syslog qw(syslog openlog); use Errno qw(EEXIST ENOENT); use PublicInbox::Syscall qw(rename_noreplace); +use PublicInbox::LeiStoreErr; +use PublicInbox::DS qw(add_uniq_timer); sub new { my (undef, $dir, $opt) = @_; @@ -77,7 +81,7 @@ sub importer { delete $self->{im}; $im->done; undef $im; - $self->checkpoint; + $self->barrier; $max = $self->{priv_eidx}->{mg}->git_epochs + 1; } my (undef, $tl) = eidx_init($self); # acquire lock @@ -107,17 +111,32 @@ sub search { PublicInbox::LeiSearch->new($_[0]->{priv_eidx}->{topdir}); } +sub cat_blob { + my ($self, $oid) = @_; + $self->{im} ? $self->{im}->cat_blob($oid) : undef; +} + +sub schedule_commit { + my ($self, $sec) = @_; + add_uniq_timer($self->{priv_eidx}->{topdir}, $sec, \&barrier, $self); +} + # follows the stderr file sub _tail_err { my ($self) = @_; - print { $self->{-err_wr} } readline($self->{-tmp_err}); + my $err = $self->{-tmp_err} // return; + $err->clearerr; # clear EOF marker + my @msg = readline($err); + PublicInbox::LeiStoreErr::emit($self->{-err_wr}, @msg) and return; + # syslog is the last resort if lei-daemon broke + syslog('warning', '%s', $_) for @msg; } sub eidx_init { my ($self) = @_; my $eidx = $self->{priv_eidx}; my $tl = wantarray && $self->{-err_wr} ? - PublicInbox::OnDestroy->new($$, \&_tail_err, $self) : + on_destroy(\&_tail_err, $self) : undef; $eidx->idx_init({-private => 1}); # acquires lock wantarray ? ($eidx, $tl) : $eidx; @@ -255,13 +274,13 @@ sub remove_eml_vmd { # remove just the VMD sub _lms_rw ($) { # it is important to have eidx processes open before lms my ($self) = @_; - my ($eidx, $tl) = eidx_init($self); - $self->{lms} //= do { + $self->{lms} // do { require PublicInbox::LeiMailSync; + my ($eidx, $tl) = eidx_init($self); my $f = "$self->{priv_eidx}->{topdir}/mail_sync.sqlite3"; my $lms = PublicInbox::LeiMailSync->new($f); $lms->lms_write_prepare; - $lms; + $self->{lms} = $lms; }; } @@ -324,15 +343,55 @@ sub _add_vmd ($$$$) { sub _docids_and_maybe_kw ($$) { my ($self, $docids) = @_; return $docids unless wantarray; - my $kw = {}; + my (@kw, $idx, @tmp); for my $num (@$docids) { # likely only 1, unless ContentHash changes # can't use ->search->msg_keywords on uncommitted docs - my $idx = $self->{priv_eidx}->idx_shard($num); - my $tmp = eval { $idx->ipc_do('get_terms', 'K', $num) }; - if ($@) { warn "#$num get_terms: $@" } - else { @$kw{keys %$tmp} = values(%$tmp) }; + $idx = $self->{priv_eidx}->idx_shard($num); + @tmp = eval { $idx->ipc_do('get_terms', 'K', $num) }; + $@ ? warn("#$num get_terms: $@") : push(@kw, @tmp); + } + @kw = sort(uniqstr(@kw)) if @$docids > 1; + ($docids, \@kw); +} + +sub _reindex_1 { # git->cat_async callback + my ($bref, $hex, $type, $size, $smsg) = @_; + my $self = delete $smsg->{-sto}; + my ($eidx, $tl) = eidx_init($self); + $bref //= _lms_rw($self)->local_blob($hex, 1); + if ($bref) { + my $eml = PublicInbox::Eml->new($bref); + $smsg->{-merge_vmd} = 1; # preserve existing keywords + $eidx->idx_shard($smsg->{num})->index_eml($eml, $smsg); + } elsif ($type eq 'missing') { + # pre-release/buggy lei may've indexed external-only msgs, + # try to correct that, here + warn("E: missing $hex, culling (ancient lei artifact?)\n"); + $smsg->{to} = $smsg->{cc} = $smsg->{from} = ''; + $smsg->{bytes} = 0; + $eidx->{oidx}->update_blob($smsg, ''); + my $eml = PublicInbox::Eml->new("\r\n\r\n"); + $eidx->idx_shard($smsg->{num})->index_eml($eml, $smsg); + } else { + warn("E: $type $hex\n"); } - ($docids, [ sort keys %$kw ]); +} + +sub reindex_art { + my ($self, $art) = @_; + my ($eidx, $tl) = eidx_init($self); + my $smsg = $eidx->{oidx}->get_art($art) // return; + return if $smsg->{bytes} == 0; # external-only message + $smsg->{-sto} = $self; + $eidx->git->cat_async($smsg->{blob} // die("no blob (#$art)"), + \&_reindex_1, $smsg); +} + +sub reindex_done { + my ($self) = @_; + my ($eidx, $tl) = eidx_init($self); + $eidx->git->async_wait_all; + # ->done to be called via sto_barrier_request } sub add_eml { @@ -347,8 +406,14 @@ sub add_eml { _lms_rw($self)->set_src($smsg->oidbin, @{$vmd->{sync_info}}); } unless ($im_mark) { # duplicate blob returns undef - return unless wantarray; + return unless wantarray || $vmd; my @docids = $oidx->blob_exists($smsg->{blob}); + if ($vmd) { + for my $docid (@docids) { + my $idx = $eidx->idx_shard($docid); + _add_vmd($self, $idx, $docid, $vmd); + } + } return _docids_and_maybe_kw $self, \@docids; } @@ -506,13 +571,11 @@ sub set_xvmd { sto_export_kw($self, $smsg->{num}, $vmd); } -sub checkpoint { - my ($self, $wait) = @_; - if (my $im = $self->{im}) { - $wait ? $im->barrier : $im->checkpoint; - } - delete $self->{lms}; - $self->{priv_eidx}->checkpoint($wait); +sub check_done { + my ($self) = @_; + $self->git->_active ? + add_uniq_timer("$self-check_done", 5, \&check_done, $self) : + done($self); } sub xchg_stderr { @@ -520,32 +583,42 @@ sub xchg_stderr { _tail_err($self) if $self->{-err_wr}; my $dir = $self->{priv_eidx}->{topdir}; return unless -e $dir; - my $old = delete $self->{-tmp_err}; - my $pfx = POSIX::strftime('%Y%m%d%H%M%S', gmtime(time)); - my $err = File::Temp->new(TEMPLATE => "$pfx.$$.err-XXXX", - SUFFIX => '.err', DIR => $dir); - open STDERR, '>>', $err->filename or die "dup2: $!"; + delete $self->{-tmp_err}; + my ($err, $name) = tmpnam(); + open STDERR, '>>', $name or die "dup2: $!"; + unlink($name); STDERR->autoflush(1); # shared with shard subprocesses $self->{-tmp_err} = $err; # separate file description for RO access undef; } -sub done { - my ($self, $sock_ref) = @_; - my $err = ''; - if (my $im = delete($self->{im})) { - eval { $im->done }; - if ($@) { - $err .= "import done: $@\n"; - warn $err; - } +sub _commit ($$) { + my ($self, $cmd) = @_; # cmd is 'done' or 'barrier' + my ($errfh, $lei_sock) = @$self{0, 1}; # via sto_barrier_request + my @err; + if ($self->{im}) { + eval { $self->{im}->$cmd }; + push(@err, "E: import $cmd: $@\n") if $@; } delete $self->{lms}; - $self->{priv_eidx}->done; # V2Writable::done + eval { $self->{priv_eidx}->$cmd }; + push(@err, "E: priv_eidx $cmd: $@\n") if $@; + print { $errfh // \*STDERR } @err; + send($lei_sock, 'child_error 256', 0) if @err && $lei_sock; xchg_stderr($self); - die $err if $err; + die @err if @err; + # $lei_sock goes out-of-scope and script/lei can terminate } +sub barrier { + my ($self) = @_; + _commit $self, 'barrier'; + add_uniq_timer("$self-check_done", 5, \&check_done, $self); + undef; +} + +sub done { _commit $_[0], 'done' } + sub ipc_atfork_child { my ($self) = @_; my $lei = $self->{lei}; @@ -564,16 +637,15 @@ sub recv_and_run { $self->SUPER::recv_and_run(@args); } -sub _sto_atexit { # dwaitpid callback - my ($args, $pid) = @_; - my $self = $args->[0]; +sub _sto_atexit { # awaitpid cb + my ($pid) = @_; warn "lei/store PID:$pid died \$?=$?\n" if $?; } sub write_prepare { my ($self, $lei) = @_; $lei // die 'BUG: $lei not passed'; - unless ($self->{-ipc_req}) { + unless ($self->{-wq_s1}) { my $dir = $lei->store_path; substr($dir, -length('/lei/store'), 10, ''); pipe(my ($r, $w)) or die "pipe: $!"; @@ -581,13 +653,12 @@ sub write_prepare { # Mail we import into lei are private, so headers filtered out # by -mda for public mail are not appropriate local @PublicInbox::MDA::BAD_HEADERS = (); + local $SIG{ALRM} = 'IGNORE'; $self->wq_workers_start("lei/store $dir", 1, $lei->oldset, { lei => $lei, -err_wr => $w, to_close => [ $r ], - }); - $self->wq_wait_async(\&_sto_atexit); # outlives $lei - require PublicInbox::LeiStoreErr; + }, \&_sto_atexit); PublicInbox::LeiStoreErr->new($r, $lei); } $lei->{sto} = $self; diff --git a/lib/PublicInbox/LeiStoreErr.pm b/lib/PublicInbox/LeiStoreErr.pm index cc085fdc..c8bc72b6 100644 --- a/lib/PublicInbox/LeiStoreErr.pm +++ b/lib/PublicInbox/LeiStoreErr.pm @@ -1,38 +1,60 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # forwards stderr from lei/store process to any lei clients using # the same store, falls back to syslog if no matching clients exist. package PublicInbox::LeiStoreErr; -use strict; -use v5.10.1; +use v5.12; use parent qw(PublicInbox::DS); -use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); +use PublicInbox::Syscall qw(EPOLLIN); use Sys::Syslog qw(openlog syslog closelog); use IO::Handle (); # ->blocking +use Time::HiRes (); +use autodie qw(open); +our $err_wr; + +# We don't want blocked stderr on clients to block lei/store or lei-daemon. +# We can't make stderr non-blocking since it can break MUAs or anything +# lei might spawn. So we setup a timer to wake us up after a second if +# printing to a user's stderr hasn't completed, yet. Unfortunately, +# EINTR alone isn't enough since Perl auto-restarts writes on signals, +# so to interrupt writes to clients with blocked stderr, we dup the +# error output to $err_wr ahead-of-time and close $err_wr in the +# SIGALRM handler to ensure `print' gets aborted: + +sub abort_err_wr { close($err_wr) if $err_wr; undef $err_wr } + +sub emit ($@) { + my ($efh, @msg) = @_; + open(local $err_wr, '>&', $efh); # fdopen(dup(fileno($efh)), "w") + local $SIG{ALRM} = \&abort_err_wr; + Time::HiRes::alarm(1.0, 0.1); + my $ret = print $err_wr @msg; + Time::HiRes::alarm(0); + $ret; +} sub new { my ($cls, $rd, $lei) = @_; my $self = bless { sock => $rd, store_path => $lei->store_path }, $cls; $rd->blocking(0); - $self->SUPER::new($rd, EPOLLIN | EPOLLONESHOT); + $self->SUPER::new($rd, EPOLLIN); # level-trigger } sub event_step { my ($self) = @_; - my $rbuf = $self->{rbuf} // \(my $x = ''); - $self->do_read($rbuf, 8192, length($$rbuf)) or return; - my $cb; + my $n = sysread($self->{sock}, my $buf, 8192); + return ($!{EAGAIN} ? 0 : $self->close) if !defined($n); + return $self->close if !$n; my $printed; - for my $lei (values %PublicInbox::DS::DescriptorMap) { - $cb = $lei->can('store_path') // next; + for my $lei (grep defined, @PublicInbox::DS::FD_MAP) { + my $cb = $lei->can('store_path') // next; next if $cb->($lei) ne $self->{store_path}; - my $err = $lei->{2} // next; - print $err $$rbuf and $printed = 1; + emit($lei->{2} // next, $buf) and $printed = 1; } if (!$printed) { openlog('lei/store', 'pid,nowait,nofatal,ndelay', 'user'); - for my $l (split(/\n/, $$rbuf)) { syslog('warning', '%s', $l) } + for my $l (split(/\n/, $buf)) { syslog('warning', '%s', $l) } closelog(); # don't share across fork } } diff --git a/lib/PublicInbox/LeiSucks.pm b/lib/PublicInbox/LeiSucks.pm index 8e866fc9..ddb3faf7 100644 --- a/lib/PublicInbox/LeiSucks.pm +++ b/lib/PublicInbox/LeiSucks.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Undocumented hidden command somebody might discover if they're @@ -7,11 +7,12 @@ package PublicInbox::LeiSucks; use strict; use v5.10.1; -use Digest::SHA (); +use PublicInbox::SHA qw(sha1_hex); use Config; use POSIX (); use PublicInbox::Config; use PublicInbox::IPC; +use PublicInbox::IO qw(read_all); sub lei_sucks { my ($lei, @argv) = @_; @@ -54,13 +55,13 @@ sub lei_sucks { } else { push @out, "Xapian not available: $@\n"; } - my $dig = Digest::SHA->new(1); push @out, "public-inbox blob OIDs of loaded features:\n"; for my $m (grep(m{^PublicInbox/}, sort keys %INC)) { my $f = $INC{$m} // next; # lazy require failed (missing dep) - $dig->add('blob '.(-s $f)."\0"); - $dig->addfile($f); - push @out, ' '.$dig->hexdigest.' '.$m."\n"; + open my $fh, '<', $f or do { warn "open($f): $!"; next }; + my $size = -s $fh; + my $hex = sha1_hex("blob $size\0".read_all($fh, $size)); + push @out, ' '.$hex.' '.$m."\n"; } push @out, <<'EOM'; Let us know how it sucks! Please include the above and any other diff --git a/lib/PublicInbox/LeiTag.pm b/lib/PublicInbox/LeiTag.pm index 8ce96a10..da8caeb7 100644 --- a/lib/PublicInbox/LeiTag.pm +++ b/lib/PublicInbox/LeiTag.pm @@ -1,21 +1,21 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # handles "lei tag" command package PublicInbox::LeiTag; -use strict; -use v5.10.1; +use v5.12; use parent qw(PublicInbox::IPC PublicInbox::LeiInput); use PublicInbox::InboxWritable qw(eml_from_path); +use PublicInbox::OnDestroy; sub input_eml_cb { # used by PublicInbox::LeiInput::input_fh my ($self, $eml) = @_; if (my $xoids = $self->{lse}->xoids_for($eml) // # tries LeiMailSync $self->{lei}->{ale}->xoids_for($eml)) { $self->{lei}->{sto}->wq_do('update_xvmd', $xoids, $eml, - $self->{vmd_mod}); + $self->{lei}->{vmd_mod}); } else { - ++$self->{unimported}; + ++$self->{-nr_unimported}; } } @@ -31,11 +31,8 @@ sub lei_tag { # the "lei tag" method my $sto = $lei->_lei_store(1)->write_prepare($lei); my $self = bless {}, __PACKAGE__; $lei->ale; # refresh and prepare - my $vmd_mod = $self->vmd_mod_extract(\@argv); - return $lei->fail(join("\n", @{$vmd_mod->{err}})) if $vmd_mod->{err}; - $self->{vmd_mod} = $vmd_mod; # before LeiPmdir->new in prepare_inputs $self->prepare_inputs($lei, \@argv) or return; - grep(defined, @$vmd_mod{qw(+kw +L -L -kw)}) or + grep(defined, @{$lei->{vmd_mod}}{qw(+kw +L -L -kw)}) or return $lei->fail('no keywords or labels specified'); $lei->{-err_type} = 'non-fatal'; $lei->wq1_start($self); @@ -43,8 +40,8 @@ sub lei_tag { # the "lei tag" method sub note_unimported { my ($self) = @_; - my $n = $self->{unimported} or return; - $self->{lei}->{pkt_op_p}->pkt_do('incr', 'unimported', $n); + my $n = $self->{-nr_unimported} or return; + $self->{lei}->{pkt_op_p}->pkt_do('incr', -nr_unimported => $n); } sub ipc_atfork_child { @@ -52,7 +49,7 @@ sub ipc_atfork_child { PublicInbox::LeiInput::input_only_atfork_child($self); $self->{lse} = $self->{lei}->{sto}->search; # this goes out-of-scope at worker process exit: - PublicInbox::OnDestroy->new($$, \¬e_unimported, $self); + on_destroy \¬e_unimported, $self; } # Workaround bash word-splitting s to ['kw', ':', 'keyword' ...] diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm index 2aa3977e..5481b5e4 100644 --- a/lib/PublicInbox/LeiToMail.pm +++ b/lib/PublicInbox/LeiToMail.pm @@ -7,12 +7,15 @@ use strict; use v5.10.1; use parent qw(PublicInbox::IPC); use PublicInbox::Eml; -use PublicInbox::ProcessPipe; +use PublicInbox::IO; +use PublicInbox::Git; use PublicInbox::Spawn qw(spawn); -use Symbol qw(gensym); +use PublicInbox::Import; use IO::Handle; # ->autoflush use Fcntl qw(SEEK_SET SEEK_END O_CREAT O_EXCL O_WRONLY); use PublicInbox::Syscall qw(rename_noreplace); +use autodie qw(pipe open seek close); +use Carp qw(croak); my %kw2char = ( # Maildir characters draft => 'D', @@ -54,8 +57,7 @@ sub _mbox_hdr_buf ($$$) { } my $buf = delete $eml->{hdr}; - # fixup old bug from import (pre-a0c07cba0e5d8b6a) - $$buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + PublicInbox::Eml::strip_from($$buf); my $ident = $smsg->{blob} // 'lei'; if (defined(my $pct = $smsg->{pct})) { $ident .= "=$pct" } @@ -132,40 +134,41 @@ sub eml2mboxcl2 { } sub git_to_mail { # git->cat_async callback - my ($bref, $oid, $type, $size, $arg) = @_; - $type // return; # called by git->async_abort - my ($write_cb, $smsg) = @$arg; - if ($type eq 'missing' && $smsg->{-lms_rw}) { - if ($bref = $smsg->{-lms_rw}->local_blob($oid, 1)) { + my ($bref, $oid, $type, $size, $smsg) = @_; + $type // return; # called by PublicInbox::Git::close + return if $PublicInbox::Git::in_cleanup; + my $self = delete $smsg->{l2m} // croak "BUG: no l2m (type=$type)"; + $self->{lei} // croak "BUG: no {lei} (type=$type)"; + eval { + if ($type eq 'missing' && + ($bref = $self->{-lms_rw}->local_blob($oid, 1))) { $type = 'blob'; $size = length($$bref); } - } - return warn("W: $oid is $type (!= blob)\n") if $type ne 'blob'; - return warn("E: $oid is empty\n") unless $size; - die "BUG: expected=$smsg->{blob} got=$oid" if $smsg->{blob} ne $oid; - $write_cb->($bref, $smsg); + $type eq 'blob' or return $self->{lei}->child_error(0, + "W: $oid is $type (!= blob)"); + $size or return $self->{lei}->child_error(0,"E: $oid is empty"); + $smsg->{blob} eq $oid or die "BUG: expected=$smsg->{blob}"; + $smsg->{bytes} ||= $size; + $self->{wcb}->($bref, $smsg); + }; + $self->{lei}->fail("$@ (oid=$oid)") if $@; } -sub reap_compress { # dwaitpid callback - my ($lei, $pid) = @_; - my $cmd = delete $lei->{"pid.$pid"}; - return if $? == 0; - $lei->fail("@$cmd failed", $? >> 8); +sub reap_compress { # awaitpid callback + my ($pid, $lei, $cmd, $old_out) = @_; + $lei->{1} = $old_out; + $lei->fail($?, "@$cmd failed") if $?; } -sub _post_augment_mbox { # open a compressor process from top-level process +sub _post_augment_mbox { # open a compressor process from top-level lei-daemon my ($self, $lei) = @_; my $zsfx = $self->{zsfx} or return; my $cmd = PublicInbox::MboxReader::zsfx2cmd($zsfx, undef, $lei); my ($r, $w) = @{delete $lei->{zpipe}}; my $rdr = { 0 => $r, 1 => $lei->{1}, 2 => $lei->{2}, pgid => 0 }; - my $pid = spawn($cmd, undef, $rdr); - my $pp = gensym; - my $dup = bless { "pid.$pid" => $cmd }, ref($lei); - $dup->{$_} = $lei->{$_} for qw(2 sock); - tie *$pp, 'PublicInbox::ProcessPipe', $pid, $w, \&reap_compress, $dup; - $lei->{1} = $pp; + $lei->{1} = PublicInbox::IO::attach_pid($w, spawn($cmd, undef, $rdr), + \&reap_compress, $lei, $cmd, $lei->{1}); } # --augment existing output destination, with deduplication @@ -197,7 +200,7 @@ sub _mbox_write_cb ($$) { sub { # for git_to_mail my ($buf, $smsg, $eml) = @_; $eml //= PublicInbox::Eml->new($buf); - ++$lei->{-nr_seen}; + ++$self->{-nr_seen}; return if $dedupe->is_dup($eml, $smsg); $lse->xsmsg_vmd($smsg) if $lse; $smsg->{-recent} = 1 if $set_recent; @@ -208,7 +211,7 @@ sub _mbox_write_cb ($$) { my $lk = $ovv->lock_for_scope; $lei->out($$buf); } - ++$lei->{-nr_write}; + ++$self->{-nr_write}; } } @@ -259,7 +262,7 @@ sub _buf2maildir ($$$$) { $tmp = $dst.'tmp/'.$rand.$common; } while (!($ok = sysopen($fh, $tmp, O_CREAT|O_EXCL|O_WRONLY)) && $!{EEXIST} && ($rand = _rand.',')); - if ($ok && print $fh $$buf and close($fh)) { + if ($ok && print $fh $$buf and $fh->close) { $dst .= $dir; # 'new/' or 'cur/' $rand = ''; do { @@ -293,7 +296,7 @@ sub _maildir_write_cb ($$) { my ($bref, $smsg, $eml) = @_; $dst // return $lei->fail; # dst may be undef-ed in last run - ++$lei->{-nr_seen}; + ++$self->{-nr_seen}; return if $dedupe && $dedupe->is_dup($eml // PublicInbox::Eml->new($$bref), $smsg); @@ -301,7 +304,7 @@ sub _maildir_write_cb ($$) { my $n = _buf2maildir($dst, $bref // \($eml->as_string), $smsg, $dir); $lms->set_src($smsg->oidbin, $out, $n) if $lms; - ++$lei->{-nr_write}; + ++$self->{-nr_write}; } } @@ -310,8 +313,11 @@ sub _imap_write_cb ($$) { my $dedupe = $lei->{dedupe}; $dedupe->prepare_dedupe if $dedupe; my $append = $lei->{net}->can('imap_append'); - my $uri = $self->{uri}; - my $mic = $lei->{net}->mic_get($uri); + my $uri = $self->{uri} // die 'BUG: no {uri}'; + my $mic = $lei->{net}->mic_get($uri) // die <<EOM; +E: $uri connection failed. +E: Consider using `--jobs ,1' to limit IMAP connections +EOM my $folder = $uri->mailbox; $uri->uidvalidity($mic->uidvalidity($folder)); my $lse = $lei->{lse}; # may be undef @@ -321,7 +327,7 @@ sub _imap_write_cb ($$) { my ($bref, $smsg, $eml) = @_; $mic // return $lei->fail; # mic may be undef-ed in last run - ++$lei->{-nr_seen}; + ++$self->{-nr_seen}; return if $dedupe && $dedupe->is_dup($eml // PublicInbox::Eml->new($$bref), $smsg); @@ -334,7 +340,7 @@ sub _imap_write_cb ($$) { # imap_append returns UID if IMAP server has UIDPLUS extension ($lms && $uid =~ /\A[0-9]+\z/) and $lms->set_src($smsg->oidbin, $$uri, $uid + 0); - ++$lei->{-nr_write}; + ++$self->{-nr_write}; } } @@ -362,13 +368,14 @@ sub _v2_write_cb ($$) { my ($self, $lei) = @_; my $dedupe = $lei->{dedupe}; $dedupe->prepare_dedupe if $dedupe; + # only call in worker + $PublicInbox::Import::DROP_UNIQUE_UNSUB = $lei->{-drop_unique_unsub}; sub { # for git_to_mail my ($bref, $smsg, $eml) = @_; $eml //= PublicInbox::Eml->new($bref); - ++$lei->{-nr_seen}; + ++$self->{-nr_seen}; return if $dedupe && $dedupe->is_dup($eml, $smsg); - $lei->{v2w}->wq_do('add', $eml); # V2Writable->add - ++$lei->{-nr_write}; + $lei->{v2w}->add($eml) and ++$self->{-nr_write}; } } @@ -392,9 +399,16 @@ sub new { "$dst exists and is not a directory\n"; $lei->{ovv}->{dst} = $dst .= '/' if substr($dst, -1) ne '/'; $lei->{opt}->{save} //= \1 if $lei->{cmd} eq 'q'; + } elsif ($fmt eq 'mh') { + -e $dst && !-d _ and die + "$dst exists and is not a directory\n"; + $lei->{ovv}->{dst} = $dst .= '/' if substr($dst, -1) ne '/'; + $lei->{opt}->{save} //= \1 if $lei->{cmd} eq 'q'; } elsif (substr($fmt, 0, 4) eq 'mbox') { require PublicInbox::MboxReader; - $self->can("eml2$fmt") or die "bad mbox format: $fmt\n"; + $self->can("eml2$fmt") or die <<EOM; +E: bad mbox format: $fmt (did you mean: mboxrd, mboxo, mboxcl, or mboxcl2?) +EOM $self->{base_type} = 'mbox'; if ($lei->{cmd} eq 'q' && (($lei->path_to_fd($dst) // -1) < 0) && @@ -428,7 +442,7 @@ sub new { ($lei->{opt}->{dedupe}//'') eq 'oid'; $self->{base_type} = 'v2'; $self->{-wq_nr_workers} = 1; # v2 has shards - $lei->{opt}->{save} = \1; + $lei->{opt}->{save} //= \1 if $lei->{cmd} eq 'q'; $dst = $lei->{ovv}->{dst} = $lei->abs_path($dst); @conflict = qw(mua sort); } else { @@ -438,6 +452,8 @@ sub new { (-d $dst || (-e _ && !-w _)) and die "$dst exists and is not a writable file\n"; } + $lei->{input_opt} and # lei_convert sets this + @conflict = grep { !$lei->{input_opt}->{$_} } @conflict; my @err = map { defined($lei->{opt}->{$_}) ? "--$_" : () } @conflict; die "@err incompatible with $fmt\n" if @err; $self->{dst} = $dst; @@ -456,15 +472,10 @@ sub new { sub _pre_augment_maildir { my ($self, $lei) = @_; my $dst = $lei->{ovv}->{dst}; - for my $x (qw(tmp new cur)) { - my $d = $dst.$x; - next if -d $d; - require File::Path; - File::Path::mkpath($d); - -d $d or die "$d is not a directory"; - } + require File::Path; + File::Path::make_path(map { $dst.$_ } qw(tmp new cur)); # for utime, so no opendir - open $self->{poke_dh}, '<', "${dst}cur" or die "open ${dst}cur: $!"; + open $self->{poke_dh}, '<', "${dst}cur"; } sub clobber_dst_prepare ($;$) { @@ -544,11 +555,11 @@ sub _pre_augment_text { $out = $lei->{$devfd}; } else { # normal-looking path if (-p $dst) { - open $out, '>', $dst or die "open($dst): $!"; + open $out, '>', $dst; } elsif (-f _ || !-e _) { # text allows augment, HTML/Atom won't my $mode = $lei->{opt}->{augment} ? '>>' : '>'; - open $out, $mode, $dst or die "open($mode, $dst): $!"; + open $out, $mode, $dst; } else { die "$dst is not a file or FIFO\n"; } @@ -567,7 +578,7 @@ sub _pre_augment_mbox { $out = $lei->{$devfd}; } else { # normal-looking path if (-p $dst) { - open $out, '>', $dst or die "open($dst): $!"; + open $out, '>', $dst; } elsif (-f _ || !-e _) { require PublicInbox::MboxLock; my $m = $lei->{opt}->{'lock'} // @@ -580,7 +591,7 @@ sub _pre_augment_mbox { $lei->{old_1} = $lei->{1}; # keep for spawning MUA } # Perl does SEEK_END even with O_APPEND :< - $self->{seekable} = seek($out, 0, SEEK_SET); + $self->{seekable} = $out->seek(0, SEEK_SET); if (!$self->{seekable} && !$!{ESPIPE} && !defined($devfd)) { die "seek($dst): $!\n"; } @@ -594,7 +605,7 @@ sub _pre_augment_mbox { $lei->{dedupe} && $lei->{dedupe}->can('reset_dedupe'); } if ($self->{zsfx} = PublicInbox::MboxReader::zsfx($dst)) { - pipe(my ($r, $w)) or die "pipe: $!"; + pipe(my $r, my $w); $lei->{zpipe} = [ $r, $w ]; $lei->{ovv}->{lock_path} and die 'BUG: unexpected {ovv}->{lock_path}'; @@ -606,6 +617,17 @@ sub _pre_augment_mbox { undef; } +sub finish_output { + my ($self, $lei) = @_; + my $out = delete $lei->{1} // die 'BUG: no lei->{1}'; + my $old = delete $lei->{old_1} or return; # path only + $lei->{1} = $old; + return if $out->close; # reaps gzip|pigz|xz|bzip2 + my $msg = "E: Error closing $lei->{ovv}->{dst}"; + $? ? $lei->child_error($?) : ($msg .= " ($!)"); + die $msg; +} + sub _do_augment_mbox { my ($self, $lei) = @_; return unless $self->{seekable}; @@ -622,7 +644,7 @@ sub _do_augment_mbox { if (my $zsfx = $self->{zsfx}) { $rd = PublicInbox::MboxReader::zsfxcat($out, $zsfx, $lei); } else { - open($rd, '+>>&', $out) or die "dup: $!"; + open($rd, '+>>&', $out); } my $dedupe; if ($opt->{augment}) { @@ -642,16 +664,10 @@ sub _do_augment_mbox { PublicInbox::MboxReader->$fmt($rd, \&_augment, $lei); } # maybe some systems don't honor O_APPEND, Perl does this: - seek($out, 0, SEEK_END) or die "seek $dst: $!"; + seek($out, 0, SEEK_END); $dedupe->pause_dedupe if $dedupe; } -sub v2w_done_wait { # dwaitpid callback - my ($arg, $pid) = @_; - my ($v2w, $lei) = @$arg; - $lei->child_error($?, "error for $v2w->{ibx}->{inboxdir}") if $?; -} - sub _pre_augment_v2 { my ($self, $lei) = @_; my $dir = $self->{dst}; @@ -672,19 +688,21 @@ sub _pre_augment_v2 { }); } PublicInbox::InboxWritable->new($ibx, @creat); + local $PublicInbox::Import::DROP_UNIQUE_UNSUB; # only for workers + PublicInbox::Import::load_config(PublicInbox::Config->new, sub { + $lei->x_it(shift); + die "E: can't write v2 inbox with broken config\n"; + }); + $lei->{-drop_unique_unsub} = $PublicInbox::Import::DROP_UNIQUE_UNSUB; $ibx->init_inbox if @creat; - my $v2w = $ibx->importer; - $v2w->wq_workers_start("lei/v2w $dir", 1, $lei->oldset, {lei => $lei}); - $v2w->wq_wait_async(\&v2w_done_wait, $lei); - $lei->{v2w} = $v2w; + $lei->{v2w} = $ibx->importer; return if !$lei->{opt}->{shared}; my $d = "$lei->{ale}->{git}->{git_dir}/objects"; - my $al = "$dir/git/0.git/objects/info/alternates"; - open my $fh, '+>>', $al or die "open($al): $!"; - seek($fh, 0, SEEK_SET) or die "seek($al): $!"; - grep(/\A\Q$d\E\n/, <$fh>) and return; - print $fh "$d\n" or die "print($al): $!"; - close $fh or die "close($al): $!"; + open my $fh, '+>>', my $f = "$dir/git/0.git/objects/info/alternates"; + seek($fh, 0, SEEK_SET); # Perl did SEEK_END when it saw '>>' + my $seen = grep /\A\Q$d\E\n/, PublicInbox::IO::read_all $fh; + print $fh "$d\n" if !$seen; + close $fh; } sub pre_augment { # fast (1 disk seek), runs in same process as post_augment @@ -701,16 +719,32 @@ sub do_augment { # slow, runs in wq worker $m->($self, $lei); } +sub post_augment_call ($$$$) { + my ($self, $lei, $m, $post_augment_done) = @_; + eval { $m->($self, $lei) }; + $lei->{post_augment_err} = $@ if $@; # for post_augment_done +} + # fast (spawn compressor or mkdir), runs in same process as pre_augment sub post_augment { - my ($self, $lei, @args) = @_; + my ($self, $lei, $post_augment_done) = @_; $self->{-au_noted}++ and $lei->qerr("# writing to $self->{dst} ..."); - my $wait = $lei->{opt}->{'import-before'} ? - $lei->{sto}->wq_do('checkpoint', 1) : 0; # _post_augment_mbox my $m = $self->can("_post_augment_$self->{base_type}") or return; - $m->($self, $lei, @args); + + # --import-before is only for lei-(q|lcat), not lei-convert + $lei->{opt}->{'import-before'} or + return post_augment_call $self, $lei, $m, $post_augment_done; + + # we can't deal with post_augment until import-before commits: + require PublicInbox::EOFpipe; + my @io = @$lei{qw(2 sock)}; + pipe(my $r, $io[2]); + PublicInbox::EOFpipe->new($r, \&post_augment_call, + $self, $lei, $m, $post_augment_done); + $lei->{sto}->wq_io_do('barrier', \@io); + # _post_augment_* && post_augment_done run when barrier is complete } # called by every single l2m worker process @@ -749,7 +783,8 @@ sub do_post_auth { $au_peers->[1] = undef; sysread($au_peers->[0], my $barrier1, 1); } - $self->{wcb} = $self->write_cb($lei); + eval { $self->{wcb} = $self->write_cb($lei) }; + $lei->fail($@) if $@; if ($au_peers) { # wait for peer l2m to set write_cb $au_peers->[3] = undef; sysread($au_peers->[2], my $barrier2, 1); @@ -786,22 +821,29 @@ sub poke_dst { sub write_mail { # via ->wq_io_do my ($self, $smsg, $eml) = @_; - return $self->{wcb}->(undef, $smsg, $eml) if $eml; - $smsg->{-lms_rw} = $self->{-lms_rw}; - $self->{git}->cat_async($smsg->{blob}, \&git_to_mail, - [$self->{wcb}, $smsg]); + if ($eml) { + eval { $self->{wcb}->(undef, $smsg, $eml) }; + $self->{lei}->fail("blob=$smsg->{blob} $@") if $@; + } else { + $smsg->{l2m} = $self; + $self->{git}->cat_async($smsg->{blob}, \&git_to_mail, $smsg); + } } sub wq_atexit_child { my ($self) = @_; local $PublicInbox::DS::in_loop = 0; # waitpid synchronously my $lei = $self->{lei}; - delete $self->{wcb}; $lei->{ale}->git->async_wait_all; - my ($nr_w, $nr_s) = delete(@$lei{qw(-nr_write -nr_seen)}); - $nr_s or return; + my ($nr_w, $nr_s) = delete(@$self{qw(-nr_write -nr_seen)}); + if (my $v2w = delete $lei->{v2w}) { + eval { $v2w->done }; + $lei->child_error($?, "E: $@ ($v2w->{ibx}->{inboxdir})") if $@; + } + delete $self->{wcb}; + (($nr_w //= 0) + ($nr_s //= 0)) or return; return if $lei->{early_mua} || !$lei->{-progress} || !$lei->{pkt_op_p}; - $lei->{pkt_op_p}->pkt_do('l2m_progress', $nr_w, $nr_s); + $lei->{pkt_op_p}->pkt_do('incr', -nr_write => $nr_w, -nr_seen => $nr_s) } # runs on a 1s timer in lei-daemon diff --git a/lib/PublicInbox/LeiUp.pm b/lib/PublicInbox/LeiUp.pm index b8a98360..9931f017 100644 --- a/lib/PublicInbox/LeiUp.pm +++ b/lib/PublicInbox/LeiUp.pm @@ -1,16 +1,16 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # "lei up" - updates the result of "lei q --save" package PublicInbox::LeiUp; -use strict; -use v5.10.1; +use v5.12; # n.b. we use LeiInput to setup IMAP auth use parent qw(PublicInbox::IPC PublicInbox::LeiInput); -use PublicInbox::LeiSavedSearch; +use PublicInbox::LeiSavedSearch; # OverIdx use PublicInbox::DS; use PublicInbox::PktOp; use PublicInbox::LeiFinmsg; +use PublicInbox::LEI; my $REMOTE_RE = qr!\A(?:imap|http)s?://!i; # http(s) will be for JMAP sub up1 ($$) { @@ -32,8 +32,10 @@ sub up1 ($$) { my $rawstr = $lss->{-cfg}->{'lei.internal.rawstr'} // (scalar(@$q) == 1 && substr($q->[0], -1) eq "\n"); if ($rawstr) { - scalar(@$q) > 1 and - die "$f: lei.q has multiple values (@$q) (out=$out)\n"; + die <<EOM if scalar(@$q) > 1; +$f: lei.q has multiple values (@$q) (out=$out) +$f: while lei.internal.rawstr is set +EOM $lse->query_approxidate($lse->git, $mset_opt->{qstr} = $q->[0]); } else { $mset_opt->{qstr} = $lse->query_argv_to_string($lse->git, $q); @@ -75,6 +77,7 @@ sub redispatch_all ($$) { my $upq = [ (@{$self->{o_local} // []}, @{$self->{o_remote} // []}) ]; return up1($lei, $upq->[0]) if @$upq == 1; # just one, may start MUA + PublicInbox::OverIdx::fork_ok($lei->{opt}); # FIXME: this is also used per-query, see lei->_start_query my $j = $lei->{opt}->{jobs} || do { my $n = $self->detect_nproc // 1; @@ -89,7 +92,6 @@ sub redispatch_all ($$) { $op_c->{ops} = { '' => [ $lei->can('dclose'), $lei ] }; my @first_batch = splice(@$upq, 0, $j); # initial parallelism $lei->{-upq} = $upq; - $lei->{daemon_pid} = $$; $lei->event_step_init; # wait for client disconnects for my $out (@first_batch) { PublicInbox::DS::requeue( @@ -162,9 +164,8 @@ sub _complete_up { # lei__complete hook map { $match_cb->($_) } PublicInbox::LeiSavedSearch::list($lei); } -sub _wq_done_wait { # dwaitpid callback - my ($arg, $pid) = @_; - my ($wq, $lei) = @$arg; +sub _wq_done_wait { # awaitpid cb + my ($pid, $wq, $lei) = @_; $lei->child_error($?, 'auth failure') if $? } @@ -172,8 +173,7 @@ no warnings 'once'; *ipc_atfork_child = \&PublicInbox::LeiInput::input_only_atfork_child; package PublicInbox::LeiUp1; # for redispatch_all -use strict; -use v5.10.1; +use v5.12; sub nxt ($$$) { my ($lei, $out, $op_p) = @_; @@ -210,8 +210,8 @@ sub event_step { # runs via PublicInbox::DS::requeue sub DESTROY { my ($self) = @_; + return if ($PublicInbox::LEI::daemon_pid // -1) != $$; my $lei = $self->{lei}; # the original, from lei_up - return if $lei->{daemon_pid} != $$; my $sock = delete $self->{unref_on_destroy}; my $s = $lei->{-socks} // []; @$s = grep { $_ != $sock } @$s; diff --git a/lib/PublicInbox/LeiViewText.pm b/lib/PublicInbox/LeiViewText.pm index 53555467..c7d72c71 100644 --- a/lib/PublicInbox/LeiViewText.pm +++ b/lib/PublicInbox/LeiViewText.pm @@ -72,12 +72,11 @@ sub new { my $self = bless { %{$lei->{opt}}, -colored => \&uncolored }, $cls; $self->{-quote_reply} = 1 if $fmt eq 'reply'; return $self unless $self->{color} //= -t $lei->{1}; - my $cmd = [ qw(git config -z --includes -l) ]; - my ($r, $pid) = popen_rd($cmd, undef, { 2 => $lei->{2} }); + my @cmd = qw(git config -z --includes -l); # reuse normal git config + my $r = popen_rd(\@cmd, undef, { 2 => $lei->{2} }); my $cfg = PublicInbox::Config::config_fh_parse($r, "\0", "\n"); - waitpid($pid, 0); - if ($?) { - warn "# git-config failed, no color (non-fatal)\n"; + if (!$r->close) { + warn "# @cmd failed, no color (non-fatal \$?=$?)\n"; return $self; } $self->{-colored} = \&my_colored; diff --git a/lib/PublicInbox/LeiWatch.pm b/lib/PublicInbox/LeiWatch.pm index 35267b58..b30e5152 100644 --- a/lib/PublicInbox/LeiWatch.pm +++ b/lib/PublicInbox/LeiWatch.pm @@ -1,13 +1,12 @@ # Copyright all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# represents a Maildir or IMAP "watch" item +# represents a Maildir, MH or IMAP "watch" item package PublicInbox::LeiWatch; -use strict; -use v5.10.1; +use v5.12; use parent qw(PublicInbox::IPC); -# "url" may be something like "maildir:/path/to/dir" +# "url" may be something like "maildir:/path/to/dir" or "mh:/path/to/dir" sub new { bless { url => $_[1] }, $_[0] } 1; diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm index 6f877019..43dedd10 100644 --- a/lib/PublicInbox/LeiXSearch.pm +++ b/lib/PublicInbox/LeiXSearch.pm @@ -12,15 +12,17 @@ use PublicInbox::DS qw(now); use File::Temp 0.19 (); # 0.19 for ->newdir use File::Spec (); use PublicInbox::Search qw(xap_terms); -use PublicInbox::Spawn qw(popen_rd spawn which); -use PublicInbox::MID qw(mids); +use PublicInbox::Spawn qw(popen_rd popen_wr which); +use PublicInbox::MID qw(mids mid_escape); use PublicInbox::Smsg; -use PublicInbox::AutoReap; use PublicInbox::Eml; use PublicInbox::LEI; use Fcntl qw(SEEK_SET F_SETFL O_APPEND O_RDWR); use PublicInbox::ContentHash qw(git_sha); use POSIX qw(strftime); +use autodie qw(close open read seek truncate); +use PublicInbox::Syscall qw($F_SETPIPE_SZ); +use PublicInbox::OnDestroy; sub new { my ($class) = @_; @@ -103,13 +105,6 @@ sub smsg_for { $smsg; } -sub recent { - my ($self, $qstr, $opt) = @_; - $opt //= {}; - $opt->{relevance} //= -2; - $self->mset($qstr //= 'z:1..', $opt); -} - sub over {} sub _check_mset_limit ($$$) { @@ -128,26 +123,16 @@ sub _mset_more ($$) { $size >= $mo->{limit} && (($mo->{offset} += $size) < $mo->{total}); } -# $startq will EOF when do_augment is done augmenting and allow +# $startq will see `q' in do_post_augment -> start_mua if spawning MUA. +# Otherwise $startq will EOF when do_augment is done augmenting and allow # query_combined_mset and query_thread_mset to proceed. sub wait_startq ($) { my ($lei) = @_; - my $startq = delete $lei->{startq} or return; - while (1) { - my $n = sysread($startq, my $do_augment_done, 1); - if (defined $n) { - return if $n == 0; # no MUA - if ($do_augment_done eq 'q') { - $lei->{opt}->{quiet} = 1; - delete $lei->{opt}->{verbose}; - delete $lei->{-progress}; - } else { - die "BUG: do_augment_done=`$do_augment_done'"; - } - return; - } - die "wait_startq: $!" unless $!{EINTR}; - } + read(delete($lei->{startq}) // return, my $buf, 1) or return; # EOF + die "BUG: wrote `$buf' to au_done" if $buf ne 'q'; + $lei->{opt}->{quiet} = 1; + delete $lei->{opt}->{verbose}; + delete $lei->{-progress}; } sub mset_progress { @@ -162,71 +147,62 @@ sub mset_progress { } } -sub l2m_progress { - my ($lei, $nr_write, $nr_seen) = @_; - $lei->{-nr_write} += $nr_write; - $lei->{-nr_seen} += $nr_seen; -} - sub query_one_mset { # for --threads and l2m w/o sort my ($self, $ibxish) = @_; - local $0 = "$0 query_one_mset"; my $lei = $self->{lei}; my ($srch, $over) = ($ibxish->search, $ibxish->over); my $dir = $ibxish->{inboxdir} // $ibxish->{topdir}; return warn("$dir not indexed by Xapian\n") unless ($srch && $over); bless $srch, 'PublicInbox::LeiSearch'; # for ->qparse_new my $mo = { %{$lei->{mset_opt}} }; # copy + local $0 = "$0 1 $mo->{qstr}"; my $mset; my $each_smsg = $lei->{ovv}->ovv_each_smsg_cb($lei); my $can_kw = !!$ibxish->can('msg_keywords'); my $threads = $lei->{opt}->{threads} // 0; my $fl = $threads > 1 ? 1 : undef; + my $mid = $lei->{opt}->{'thread-id'}; + $mo->{threadid} = $over->mid2tid($mid) if defined $mid; my $lss = $lei->{lss}; - my $maxk = "external.$dir.maxuid"; - my $stop_at = $lss ? $lss->{-cfg}->{$maxk} : undef; - if (defined $stop_at) { - ref($stop_at) and - return warn("$maxk=$stop_at has multiple values\n"); - ($stop_at =~ /[^0-9]/) and - return warn("$maxk=$stop_at not numeric\n"); - } + my $maxk = "external.$dir.maxuid"; # max of previous, so our min + my $min = $lss ? ($lss->{-cfg}->{$maxk} // 0) : 0; + ref($min) and return warn("$maxk=$min has multiple values\n"); + ($min =~ /[^0-9]/) and return warn("$maxk=$min not numeric\n"); my $first_ids; do { - $mset = $srch->mset($mo->{qstr}, $mo); + $mset = eval { $srch->mset($mo->{qstr}, $mo) }; + return $lei->child_error(22 << 8, "E: $@") if $@; # 22 from curl mset_progress($lei, $dir, $mo->{offset} + $mset->size, $mset->get_matches_estimated); wait_startq($lei); # wait for keyword updates my $ids = $srch->mset_to_artnums($mset, $mo); - @$ids = grep { $_ > $stop_at } @$ids if defined($stop_at); my $i = 0; if ($threads) { # copy $ids if $lss since over->expand_thread # shifts @{$ctx->{ids}} $first_ids = [ @$ids ] if $lss; - my $ctx = { ids => $ids }; - my %n2item = map { ($ids->[$i++], $_) } $mset->items; - while ($over->expand_thread($ctx)) { - for my $n (@{$ctx->{xids}}) { + my $ctx = { ids => $ids, min => $min }; + my %n2item = map { $ids->[$i++] => $_ } $mset->items; + while ($over->expand_thread($ctx)) { # fills {xids} + for my $n (@{delete $ctx->{xids}}) { my $smsg = $over->get_art($n) or next; - my $mitem = delete $n2item{$n}; + my $mi = delete $n2item{$n}; next if $smsg->{bytes} == 0; - if ($mitem && $can_kw) { - mitem_kw($srch, $smsg, $mitem, - $fl); - } elsif ($mitem && $fl) { + if ($mi && $can_kw) { + mitem_kw($srch, $smsg, $mi, $fl) + } elsif ($mi && $fl) { # call ->xsmsg_vmd, later $smsg->{lei_q_tt_flagged} = 1; } - $each_smsg->($smsg, $mitem); + $each_smsg->($smsg, $mi); } - @{$ctx->{xids}} = (); } } else { $first_ids = $ids; - my @items = $mset->items; + my @items = $mset->items; # parallel with @$ids for my $n (@$ids) { my $mitem = $items[$i++]; + next if $n <= $min; my $smsg = $over->get_art($n) or next; next if $smsg->{bytes} == 0; mitem_kw($srch, $smsg, $mitem, $fl) if $can_kw; @@ -236,7 +212,6 @@ sub query_one_mset { # for --threads and l2m w/o sort } while (_mset_more($mset, $mo)); _check_mset_limit($lei, $dir, $mset); if ($lss && scalar(@$first_ids)) { - undef $stop_at; my $max = $first_ids->[0]; $lss->cfg_set($maxk, $max); undef $lss; @@ -247,16 +222,17 @@ sub query_one_mset { # for --threads and l2m w/o sort sub query_combined_mset { # non-parallel for non-"--threads" users my ($self) = @_; - local $0 = "$0 query_combined_mset"; my $lei = $self->{lei}; my $mo = { %{$lei->{mset_opt}} }; + local $0 = "$0 C $mo->{qstr}"; my $mset; for my $loc (locals($self)) { attach_external($self, $loc); } my $each_smsg = $lei->{ovv}->ovv_each_smsg_cb($lei); do { - $mset = $self->mset($mo->{qstr}, $mo); + $mset = eval { $self->mset($mo->{qstr}, $mo) }; + return $lei->child_error(22 << 8, "E: $@") if $@; # 22 from curl mset_progress($lei, 'xsearch', $mo->{offset} + $mset->size, $mset->get_matches_estimated); wait_startq($lei); # wait for keyword updates @@ -325,8 +301,9 @@ sub fudge_qstr_time ($$$) { $rft = $diff; } $lr -= ($rft || (48 * 60 * 60)); + require PublicInbox::Admin; $lei->qerr("# $uri limiting to ". - strftime('%Y-%m-%d %k:%M %z', localtime($lr)). ' and newer'); + PublicInbox::Admin::fmt_localtime($lr).' and newer'); # this should really be rt: (received-time), but no stable # public-inbox releases support it, yet. my $dt = 'dt:'.strftime('%Y%m%d%H%M%S', gmtime($lr)).'..'; @@ -339,67 +316,69 @@ sub fudge_qstr_time ($$$) { sub query_remote_mboxrd { my ($self, $uris) = @_; - local $0 = "$0 query_remote_mboxrd"; local $SIG{TERM} = sub { exit(0) }; # for DESTROY (File::Temp, $reap) my $lei = $self->{lei}; my $opt = $lei->{opt}; - chomp(my $qstr = $lei->{mset_opt}->{qstr}); - $qstr =~ s/[ \n\t]+/ /sg; # make URLs less ugly + my $qstr = $lei->{mset_opt}->{qstr}; + local $0 = "$0 R $qstr"; my @qform = (x => 'm'); push(@qform, t => 1) if $opt->{threads}; - my $verbose = $opt->{verbose}; - my $reap_tail; - my $cerr = File::Temp->new(TEMPLATE => 'curl.err-XXXX', TMPDIR => 1); - fcntl($cerr, F_SETFL, O_APPEND|O_RDWR) or warn "set O_APPEND: $!"; + open my $cerr, '+>', undef; my $rdr = { 2 => $cerr }; - if ($verbose) { - # spawn a process to force line-buffering, otherwise curl + my @lbf_tee; + if ($opt->{verbose}) { + # spawn a line-buffered tee(1) script, otherwise curl # will write 1 character at-a-time and parallel outputs # mmmaaayyy llloookkk llliiikkkeee ttthhhiiisss - my $o = { 1 => $lei->{2}, 2 => $lei->{2} }; - my $pid = spawn(['tail', '-f', $cerr->filename], undef, $o); - $reap_tail = PublicInbox::AutoReap->new($pid); + # (n.b. POSIX tee(1) cannot do any buffering) + my $o = { 1 => $cerr, 2 => $lei->{2} }; + delete $rdr->{2}; + @lbf_tee = ([ $^X, qw(-w -p -e), <<'' ], undef, $o); +BEGIN { $| = 1; use IO::Handle; STDERR->autoflush(1); } +print STDERR $_; + } my $curl = PublicInbox::LeiCurl->new($lei, $self->{curl}) or return; push @$curl, '-s', '-d', ''; my $each_smsg = $lei->{ovv}->ovv_each_smsg_cb($lei); $self->{import_sto} = $lei->{sto} if $lei->{opt}->{'import-remote'}; + if (defined(my $mid = $opt->{'thread-id'})) { + $mid = mid_escape($mid); + for my $uri (@$uris) { + $uri->path($uri->path.$mid.'/'); + } + } for my $uri (@$uris) { $lei->{-current_url} = $uri->as_string; - $lei->{-nr_remote_eml} = 0; my $start = time; my ($q, $key) = fudge_qstr_time($lei, $uri, $qstr); $uri->query_form(@qform, q => $q); my $cmd = $curl->for_uri($lei, $uri); $lei->qerr("# $cmd"); - my ($fh, $pid) = popen_rd($cmd, undef, $rdr); - my $reap_curl = PublicInbox::AutoReap->new($pid); - $fh = IO::Uncompress::Gunzip->new($fh, MultiStream => 1); - PublicInbox::MboxReader->mboxrd($fh, \&each_remote_eml, $self, - $lei, $each_smsg); - if (delete($self->{-sto_imported})) { - my $wait = $self->{import_sto}->wq_do('done'); - } - $reap_curl->join; - if ($? == 0) { - # don't update if no results, maybe MTA is down - my $nr = $lei->{-nr_remote_eml}; + $rdr->{2} //= popen_wr(@lbf_tee) if @lbf_tee; + my $fh = popen_rd($cmd, undef, $rdr); + $fh = IO::Uncompress::Gunzip->new($fh, + MultiStream => 1, AutoClose => 1); + eval { + PublicInbox::MboxReader->mboxrd($fh, \&each_remote_eml, + $self, $lei, $each_smsg); + }; + my ($exc, $code) = ($@, $?); + $lei->sto_barrier_request if delete($self->{-sto_imported}); + die "E: $exc" if $exc && !$code; + my $nr = delete $lei->{-nr_remote_eml} // 0; + if (!$code) { # don't update if no results, maybe MTA is down $lei->{lss}->cfg_set($key, $start) if $key && $nr; mset_progress($lei, $lei->{-current_url}, $nr, $nr); next; } - my $err; - if (-s $cerr) { - seek($cerr, 0, SEEK_SET) // - warn "seek($cmd stderr): $!"; - $err = do { local $/; <$cerr> } // - warn "read($cmd stderr): $!"; - truncate($cerr, 0) // warn "truncate($cmd stderr): $!"; - } - $err //= ''; - next if (($? >> 8) == 22 && $err =~ /\b404\b/); + delete($rdr->{2})->close if @lbf_tee; + seek($cerr, 0, SEEK_SET); + read($cerr, my $err, -s $cerr); + truncate($cerr, 0); + next if (($code >> 8) == 22 && $err =~ /\b404\b/); $uri->query_form(q => $qstr); - $lei->child_error($?, "E: <$uri> $err"); + $lei->child_error($code, "E: <$uri> `$cmd` failed"); } undef $each_smsg; $lei->{ovv}->ovv_atexit_child($lei); @@ -407,9 +386,8 @@ sub query_remote_mboxrd { sub git { $_[0]->{git} // die 'BUG: git uninitialized' } -sub xsearch_done_wait { # dwaitpid callback - my ($arg, $pid) = @_; - my ($wq, $lei) = @$arg; +sub xsearch_done_wait { # awaitpid cb + my ($pid, $wq, $lei) = @_; return if !$?; my $s = $? & 127; return $lei->child_error($?) if $s == 13 || $s == 15; @@ -418,75 +396,60 @@ sub xsearch_done_wait { # dwaitpid callback sub query_done { # EOF callback for main daemon my ($lei) = @_; - local $PublicInbox::LEI::current_lei = $lei; - eval { - my $l2m = delete $lei->{l2m}; - delete $lei->{lxs}; - ($lei->{opt}->{'mail-sync'} && !$lei->{sto}) and - warn "BUG: {sto} missing with --mail-sync"; - $lei->sto_done_request if $lei->{sto}; - if (my $v2w = delete $lei->{v2w}) { - my $wait = $v2w->wq_do('done'); # may die - $v2w->wq_close; - } - $lei->{ovv}->ovv_end($lei); - if ($l2m) { # close() calls LeiToMail reap_compress - if (my $out = delete $lei->{old_1}) { - if (my $mbout = $lei->{1}) { - close($mbout) or die <<""; -Error closing $lei->{ovv}->{dst}: \$!=$! \$?=$? - - } - $lei->{1} = $out; - } - if ($l2m->lock_free) { - $l2m->poke_dst; - $lei->poke_mua; - } else { # mbox users - delete $l2m->{mbl}; # drop dotlock - } + my $l2m = delete $lei->{l2m}; + delete $lei->{lxs}; + ($lei->{opt}->{'mail-sync'} && !$lei->{sto}) and + warn "BUG: {sto} missing with --mail-sync"; + $lei->sto_barrier_request; + $lei->{ovv}->ovv_end($lei); + if ($l2m) { # close() calls LeiToMail reap_compress + $l2m->finish_output($lei); + if ($l2m->lock_free) { + $l2m->poke_dst; + $lei->poke_mua; + } else { # mbox users + delete $l2m->{mbl}; # drop dotlock } - if ($lei->{-progress}) { - my $tot = $lei->{-mset_total} // 0; - my $nr_w = $lei->{-nr_write} // 0; - my $d = ($lei->{-nr_seen} // 0) - $nr_w; - my $x = "$tot matches"; - $x .= ", $d duplicates" if $d; - if ($l2m) { - my $m = "# $nr_w written to " . - "$lei->{ovv}->{dst} ($x)"; - $nr_w ? $lei->qfin($m) : $lei->qerr($m); - } else { - $lei->qerr("# $x"); - } + } + my $nr_w = delete($lei->{-nr_write}) // 0; + my $nr_dup = (delete($lei->{-nr_seen}) // 0) - $nr_w; + if ($lei->{-progress}) { + my $tot = $lei->{-mset_total} // 0; + my $x = "$tot matches"; + $x .= ", $nr_dup duplicates" if $nr_dup; + if ($l2m) { + my $m = "# $nr_w written to " . + "$lei->{ovv}->{dst} ($x)"; + $nr_w ? $lei->qfin($m) : $lei->qerr($m); + } else { + $lei->qerr("# $x"); } - $lei->start_mua if $l2m && !$l2m->lock_free; - $lei->dclose; - }; - $lei->fail($@) if $@; + } + $lei->start_mua if $l2m && !$l2m->lock_free; + $lei->dclose; } -sub do_post_augment { +sub post_augment_done { # via on_destroy in top-level lei-daemon my ($lei) = @_; - local $PublicInbox::LEI::current_lei = $lei; - my $l2m = $lei->{l2m} or return; # client disconnected - eval { - $lei->fchdir; - $l2m->post_augment($lei); - }; - my $err = $@; + my $err = delete $lei->{post_augment_err}; if ($err) { if (my $lxs = delete $lei->{lxs}) { - $lxs->wq_kill('-TERM'); + $lxs->wq_kill(-POSIX::SIGTERM()); $lxs->wq_close; } $lei->fail("$err"); } if (!$err && delete $lei->{early_mua}) { # non-augment case - eval { $lei->start_mua }; + eval { $lei->start_mua }; # may trigger wait_startq $lei->fail($@) if $@; } - close(delete $lei->{au_done}); # triggers wait_startq in lei_xsearch + close(delete $lei->{au_done}); # trigger wait_startq if start_mua didn't +} + +sub do_post_augment { + my ($lei) = @_; + my $l2m = $lei->{l2m} or return; # client disconnected + $l2m->post_augment($lei, on_destroy(\&post_augment_done, $lei)); } sub incr_post_augment { # called whenever an l2m shard finishes augment @@ -509,7 +472,9 @@ sub concurrency { sub start_query ($$) { # always runs in main (lei-daemon) process my ($self, $lei) = @_; local $PublicInbox::LEI::current_lei = $lei; - if ($self->{opt_threads} || ($lei->{l2m} && !$self->{opt_sort})) { + if ($lei->{opt}->{threads} || + defined($lei->{opt}->{'thread-id'}) || + ($lei->{l2m} && !$lei->{opt}->{'sort'})) { for my $ibxish (locals($self)) { $self->wq_io_do('query_one_mset', [], $ibxish); } @@ -531,7 +496,7 @@ sub start_query ($$) { # always runs in main (lei-daemon) process } sub incr_start_query { # called whenever an l2m shard starts do_post_auth - my ($self, $lei) = @_; + my ($lei, $self) = @_; my $l2m = $lei->{l2m}; return if ++$self->{nr_start_query} != $l2m->{-wq_nr_workers}; start_query($self, $lei); @@ -546,17 +511,20 @@ sub ipc_atfork_child { sub do_query { my ($self, $lei) = @_; my $l2m = $lei->{l2m}; + my $qstr = \($lei->{mset_opt}->{qstr}); + chomp $$qstr; + $$qstr =~ s/[ \n\t]+/ /sg; # make URLs and $0 less ugly my $ops = { - 'sigpipe_handler' => [ $lei ], - 'fail_handler' => [ $lei ], - 'do_post_augment' => [ \&do_post_augment, $lei ], - 'incr_post_augment' => [ \&incr_post_augment, $lei ], + sigpipe_handler => [ $lei ], + fail_handler => [ $lei ], + do_post_augment => [ \&do_post_augment, $lei ], + incr_post_augment => [ \&incr_post_augment, $lei ], '' => [ \&query_done, $lei ], - 'mset_progress' => [ \&mset_progress, $lei ], - 'l2m_progress' => [ \&l2m_progress, $lei ], - 'x_it' => [ $lei ], - 'child_error' => [ $lei ], - 'incr_start_query' => [ $self, $lei ], + mset_progress => [ \&mset_progress, $lei ], + incr => [ $lei ], + x_it => [ $lei ], + child_error => [ $lei ], + incr_start_query => [ \&incr_start_query, $lei, $self ], }; $lei->{auth}->op_merge($ops, $l2m, $lei) if $l2m && $lei->{auth}; my $end = $lei->pkt_op_pair; @@ -569,7 +537,6 @@ sub do_query { if ($lei->{opt}->{augment} && delete $lei->{early_mua}) { $lei->start_mua; } - my $F_SETPIPE_SZ = $^O eq 'linux' ? 1031 : undef; if ($l2m->{-wq_nr_workers} > 1 && $l2m->{base_type} =~ /\A(?:maildir|mbox)\z/) { # setup two barriers to coordinate ->has_entries @@ -581,20 +548,19 @@ sub do_query { $l2m->{au_peers} = [ $a_r, $a_w, $b_r, $b_w ]; } $l2m->wq_workers_start('lei2mail', undef, - $lei->oldset, { lei => $lei }); - $l2m->wq_wait_async(\&xsearch_done_wait, $lei); + $lei->oldset, { lei => $lei }, + \&xsearch_done_wait, $lei); pipe($lei->{startq}, $lei->{au_done}) or die "pipe: $!"; fcntl($lei->{startq}, $F_SETPIPE_SZ, 4096) if $F_SETPIPE_SZ; delete $l2m->{au_peers}; + close(delete $l2m->{-wq_s2}); # share wq_s1 with lei_xsearch } $self->wq_workers_start('lei_xsearch', undef, - $lei->oldset, { lei => $lei }); - $self->wq_wait_async(\&xsearch_done_wait, $lei); + $lei->oldset, { lei => $lei }, + \&xsearch_done_wait, $lei); my $op_c = delete $lei->{pkt_op_c}; delete $lei->{pkt_op_p}; @$end = (); - $self->{opt_threads} = $lei->{opt}->{threads}; - $self->{opt_sort} = $lei->{opt}->{'sort'}; $self->{-do_lcat} = !!(delete $lei->{lcat_todo}); if ($l2m) { $l2m->net_merge_all_done($lei) unless $lei->{auth}; @@ -612,34 +578,40 @@ sub add_uri { require IO::Uncompress::Gunzip; require PublicInbox::LeiCurl; push @{$self->{remotes}}, $uri; + $uri; } else { warn "curl missing, ignoring $uri\n"; + undef; } } +# returns URI or PublicInbox::Inbox-like object sub prepare_external { my ($self, $loc, $boost) = @_; # n.b. already ordered by boost if (ref $loc) { # already a URI, or PublicInbox::Inbox-like object return add_uri($self, $loc) if $loc->can('scheme'); + # fall-through on Inbox-like objects } elsif ($loc =~ m!\Ahttps?://!) { require URI; return add_uri($self, URI->new($loc)); - } elsif (-f "$loc/ei.lock") { + } elsif (-f "$loc/ei.lock" && -d "$loc/ALL.git/objects") { require PublicInbox::ExtSearch; die "`\\n' not allowed in `$loc'\n" if index($loc, "\n") >= 0; $loc = PublicInbox::ExtSearch->new($loc); - } elsif (-f "$loc/inbox.lock" || -d "$loc/public-inbox") { + } elsif ((-f "$loc/inbox.lock" && -d "$loc/all.git/objects") || + (-d "$loc/public-inbox" && -d "$loc/objects")) { die "`\\n' not allowed in `$loc'\n" if index($loc, "\n") >= 0; require PublicInbox::Inbox; # v2, v1 $loc = bless { inboxdir => $loc }, 'PublicInbox::Inbox'; } elsif (!-e $loc) { warn "W: $loc gone, perhaps run: lei forget-external $loc\n"; - return; + return undef; } else { warn "W: $loc ignored, unable to determine external type\n"; - return; + return undef; } push @{$self->{locals}}, $loc; + $loc; } sub _lcat_i { # LeiMailSync->each_src iterator callback diff --git a/lib/PublicInbox/Limiter.pm b/lib/PublicInbox/Limiter.pm new file mode 100644 index 00000000..a8d08fc3 --- /dev/null +++ b/lib/PublicInbox/Limiter.pm @@ -0,0 +1,50 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +package PublicInbox::Limiter; +use v5.12; +use PublicInbox::Spawn; + +sub new { + my ($class, $max) = @_; + bless { + # 32 is same as the git-daemon connection limit + max => $max || 32, + running => 0, + run_queue => [], + # RLIMIT_CPU => undef, + # RLIMIT_DATA => undef, + # RLIMIT_CORE => undef, + }, $class; +} + +sub setup_rlimit { + my ($self, $name, $cfg) = @_; + for my $rlim (@PublicInbox::Spawn::RLIMITS) { + my $k = lc($rlim); + $k =~ tr/_//d; + $k = "publicinboxlimiter.$name.$k"; + my $v = $cfg->{$k} // next; + my @rlimit = split(/\s*,\s*/, $v); + if (scalar(@rlimit) == 1) { + push @rlimit, $rlimit[0]; + } elsif (scalar(@rlimit) != 2) { + warn "could not parse $k: $v\n"; + } + my $inf = $v =~ /\binfinity\b/i ? + $PublicInbox::Spawn::RLIMITS{RLIM_INFINITY} // eval { + require BSD::Resource; + BSD::Resource::RLIM_INFINITY(); + } // do { + warn "BSD::Resource missing for $rlim"; + next; + } : undef; + for my $i (0..$#rlimit) { + next if $rlimit[$i] ne 'INFINITY'; + $rlimit[$i] = $inf; + } + $self->{$rlim} = \@rlimit; + } +} + +1; diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm index 2ac74e2a..306a57e7 100644 --- a/lib/PublicInbox/Linkify.pm +++ b/lib/PublicInbox/Linkify.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # two-step linkification. @@ -11,8 +11,8 @@ # Maybe this could be done more efficiently... package PublicInbox::Linkify; use strict; -use warnings; -use Digest::SHA qw/sha1_hex/; +use v5.10.1; +use PublicInbox::SHA qw(sha1_hex); use PublicInbox::Hval qw(ascii_html mid_href); use PublicInbox::MID qw($MID_EXTRACT); @@ -68,23 +68,22 @@ sub linkify_1 { # salt this, as this could be exploited to show # links in the HTML which don't show up in the raw mail. my $key = sha1_hex($url . $SALT); - + $key =~ tr/0-9/A-J/; # no digits for YAML highlight $_[0]->{$key} = $url; - $beg . 'PI-LINK-'. $key . $end; + $beg . 'LINKIFY' . $key . $end; ^geo; $_[1]; } sub linkify_2 { - # Added "PI-LINK-" prefix to avoid false-positives on git commits - $_[1] =~ s!\bPI-LINK-([a-f0-9]{40})\b! + # Added "LINKIFY" prefix to avoid false-positives on git commits + $_[1] =~ s!\bLINKIFY([a-fA-J]{40})\b! my $key = $1; my $url = $_[0]->{$key}; if (defined $url) { "<a\nhref=\"$url\">$url</a>"; - } else { - # false positive or somebody tried to mess with us - $key; + } else { # false positive or somebody tried to mess with us + 'LINKIFY'.$key; } !ge; $_[1]; @@ -102,20 +101,20 @@ sub linkify_mids { # salt this, as this could be exploited to show # links in the HTML which don't show up in the raw mail. my $key = sha1_hex($html . $SALT); + $key =~ tr/0-9/A-J/; my $repl = qq(<<a\nhref="$pfx/$href/">$html</a>>); $repl .= qq{ (<a\nhref="$pfx/$href/raw">raw</a>)} if $raw; $self->{$key} = $repl; - 'PI-LINK-'. $key; + 'LINKIFY'.$key; !ge; $$str = ascii_html($$str); - $$str =~ s!\bPI-LINK-([a-f0-9]{40})\b! + $$str =~ s!\bLINKIFY([a-fA-J]{40})\b! my $key = $1; my $repl = $_[0]->{$key}; if (defined $repl) { $repl; - } else { - # false positive or somebody tried to mess with us - $key; + } else { # false positive or somebody tried to mess with us + 'LINKIFY'.$key; } !ge; } diff --git a/lib/PublicInbox/Listener.pm b/lib/PublicInbox/Listener.pm index 7cedc349..c83901b2 100644 --- a/lib/PublicInbox/Listener.pm +++ b/lib/PublicInbox/Listener.pm @@ -1,14 +1,15 @@ -# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Used by -nntpd for listen sockets package PublicInbox::Listener; -use strict; +use v5.12; use parent 'PublicInbox::DS'; use Socket qw(SOL_SOCKET SO_KEEPALIVE IPPROTO_TCP TCP_NODELAY); use IO::Handle; use PublicInbox::Syscall qw(EPOLLIN EPOLLEXCLUSIVE); use Errno qw(EAGAIN ECONNABORTED); +our $MULTI_ACCEPT = 0; # Warn on transient errors, mostly resource limitations. # EINTR would indicate the failure to set NonBlocking in systemd or similar @@ -16,37 +17,35 @@ my %ERR_WARN = map {; eval("Errno::$_()") => $_ } qw(EMFILE ENFILE ENOBUFS ENOMEM EINTR); -sub new ($$$) { - my ($class, $s, $cb) = @_; +sub new { + my ($class, $s, $cb, $multi_accept) = @_; setsockopt($s, SOL_SOCKET, SO_KEEPALIVE, 1); setsockopt($s, IPPROTO_TCP, TCP_NODELAY, 1); # ignore errors on non-TCP listen($s, 2**31 - 1); # kernel will clamp my $self = bless { post_accept => $cb }, $class; + $self->{multi_accept} = $multi_accept //= $MULTI_ACCEPT; $self->SUPER::new($s, EPOLLIN|EPOLLEXCLUSIVE); } sub event_step { my ($self) = @_; my $sock = $self->{sock} or return; - - # no loop here, we want to fairly distribute clients - # between multiple processes sharing the same socket - # XXX our event loop needs better granularity for - # a single accept() here to be, umm..., acceptable - # on high-traffic sites. - if (my $addr = accept(my $c, $sock)) { - IO::Handle::blocking($c, 0); # no accept4 :< - eval { $self->{post_accept}->($c, $addr, $sock) }; - warn "E: $@\n" if $@; - } elsif ($! == EAGAIN || $! == ECONNABORTED) { - # EAGAIN is common and likely - # ECONNABORTED is common with bad connections - return; - } elsif (my $sym = $ERR_WARN{int($!)}) { - warn "W: accept(): $! ($sym)\n"; - } else { - warn "BUG?: accept(): $!\n"; - } + my $n = $self->{multi_accept}; + do { + if (my $addr = accept(my $c, $sock)) { + IO::Handle::blocking($c, 0); # no accept4 :< + eval { $self->{post_accept}->($c, $addr, $sock) }; + warn "E: $@\n" if $@; + } elsif ($! == EAGAIN || $! == ECONNABORTED) { + # EAGAIN is common and likely + # ECONNABORTED is common with bad connections + return; + } elsif (my $sym = $ERR_WARN{int($!)}) { + return warn "W: accept(): $! ($sym)\n"; + } else { + return warn "BUG?: accept(): $!\n"; + } + } while ($n--); } 1; diff --git a/lib/PublicInbox/Lock.pm b/lib/PublicInbox/Lock.pm index 0ee2a8bd..7162d80e 100644 --- a/lib/PublicInbox/Lock.pm +++ b/lib/PublicInbox/Lock.pm @@ -1,60 +1,66 @@ -# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# Base class for per-inbox locking +# Base class for per-inbox locking, subclassed by several +# only uses {lock_path} and {lockfh} fields package PublicInbox::Lock; -use strict; -use v5.10.1; -use Fcntl qw(:flock :DEFAULT); +use v5.12; +use Fcntl qw(LOCK_UN LOCK_EX O_RDWR O_CREAT); use Carp qw(croak); use PublicInbox::OnDestroy; +use Errno qw(EINTR); +use autodie qw(close sysopen syswrite); + +sub xflock ($$) { + until (flock($_[0], $_[1])) { return if $! != EINTR } + 1; +} + +sub new { bless { lock_path => $_[1] }, $_[0] } # we only acquire the flock if creating or reindexing; # PublicInbox::Import already has the lock on its own. sub lock_acquire { my ($self) = @_; - my $lock_path = $self->{lock_path}; - croak 'already locked '.($lock_path // '(undef)') if $self->{lockfh}; - return unless defined($lock_path); - sysopen(my $lockfh, $lock_path, O_RDWR|O_CREAT) or - croak "failed to open $lock_path: $!\n"; - flock($lockfh, LOCK_EX) or croak "lock $lock_path failed: $!\n"; - $self->{lockfh} = $lockfh; + my $fn = $self->{lock_path}; + croak 'already locked '.($fn // '(undef)') if $self->{lockfh}; + $fn // return; + sysopen(my $fh, $fn, O_RDWR|O_CREAT); + xflock($fh, LOCK_EX) or croak "LOCK_EX $fn: $!"; + $self->{lockfh} = $fh; } sub lock_release { my ($self, $wake) = @_; - defined(my $lock_path = $self->{lock_path}) or return; - my $lockfh = delete $self->{lockfh} or croak "not locked: $lock_path"; - - syswrite($lockfh, '.') if $wake; - - flock($lockfh, LOCK_UN) or croak "unlock $lock_path failed: $!\n"; - close $lockfh or croak "close $lock_path failed: $!\n"; + my $fn = $self->{lock_path} // return; + my $fh = delete $self->{lockfh} or croak "not locked: $fn"; + syswrite($fh, '.') if $wake; + xflock($fh, LOCK_UN) or croak "LOCK_UN $fn: $!"; + close $fh; # may detect errors } # caller must use return value sub lock_for_scope { - my ($self, @single_pid) = @_; + my ($self) = @_; lock_acquire($self) or return; # lock_path not set - PublicInbox::OnDestroy->new(@single_pid, \&lock_release, $self); + on_destroy \&lock_release, $self; } sub lock_acquire_fast { - $_[0]->{lockfh} or return lock_acquire($_[0]); - flock($_[0]->{lockfh}, LOCK_EX) or croak "lock (fast) failed: $!"; + my $fh = $_[0]->{lockfh} or return lock_acquire($_[0]); + xflock($fh, LOCK_EX) or croak "LOCK_EX $_[0]->{lock_path}: $!"; } sub lock_release_fast { - flock($_[0]->{lockfh} // return, LOCK_UN) or - croak "unlock (fast) $_[0]->{lock_path}: $!"; + xflock($_[0]->{lockfh} // return, LOCK_UN) or + croak "LOCK_UN $_[0]->{lock_path}: $!" } # caller must use return value sub lock_for_scope_fast { - my ($self, @single_pid) = @_; + my ($self) = @_; lock_acquire_fast($self) or return; # lock_path not set - PublicInbox::OnDestroy->new(@single_pid, \&lock_release_fast, $self); + on_destroy \&lock_release_fast, $self; } 1; diff --git a/lib/PublicInbox/MHreader.pm b/lib/PublicInbox/MHreader.pm new file mode 100644 index 00000000..16e505a2 --- /dev/null +++ b/lib/PublicInbox/MHreader.pm @@ -0,0 +1,104 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# MH reader, based on Lib/mailbox.py in cpython source +package PublicInbox::MHreader; +use v5.12; +use PublicInbox::InboxWritable qw(eml_from_path); +use PublicInbox::OnDestroy; +use PublicInbox::IO qw(try_cat); +use PublicInbox::MdirSort; +use Carp qw(carp); +use autodie qw(chdir closedir opendir); + +my %FL2OFF = ( # mh_sequences key => our keyword + replied => 0, + flagged => 1, + unseen => 2, # negate +); +my @OFF2KW = qw(answered flagged); # [2] => unseen (negated) + +sub new { + my ($cls, $dir, $cwdfh) = @_; + if (substr($dir, -1) ne '/') { # TODO: do this earlier + carp "W: appending `/' to `$dir' (fix caller)\n"; + $dir .= '/'; + } + bless { dir => $dir, cwdfh => $cwdfh }, $cls; +} + +sub read_mh_sequences ($) { # caller must chdir($self->{dir}) + my ($self) = @_; + my ($fl, $off, @n); + my @seq = ('', '', ''); + for (split /\n+/s, try_cat('.mh_sequences')) { + ($fl, @n) = split /[: \t]+/; + $off = $FL2OFF{$fl} // do { warn <<EOM; +W: unknown `$fl' in $self->{dir}.mh_sequences (ignoring) +EOM + next; + }; + @n = grep /\A[0-9]+\z/s, @n; # don't stat, yet + if (@n) { + @n = sort { $b <=> $a } @n; # to avoid resize + my $buf = ''; + vec($buf, $_, 1) = 1 for @n; + $seq[$off] = $buf; + } + } + \@seq; +} + +sub mh_each_file { + my ($self, $efcb, @arg) = @_; + opendir(my $dh, my $dir = $self->{dir}); + my $restore = on_destroy \&chdir, $self->{cwdfh}; + chdir($dh); + my $sort = $self->{sort}; + if (defined $sort && "@$sort" ne 'none') { + my @sort = map { + my @tmp = $_ eq '' ? ('sequence') : split(/[, ]/); + # sorting by name alphabetically makes no sense for MH: + for my $k (@tmp) { + s/\A(\-|\+|)(?:name|)\z/$1sequence/; + } + @tmp; + } @$sort; + my @n = grep /\A[0-9]+\z/s, readdir $dh; + mdir_sort \@n, \@sort; + $efcb->($dir, $_, $self, @arg) for @n; + } else { + while (readdir $dh) { # perl v5.12+ to set $_ on readdir + $efcb->($dir, $_, $self, @arg) if /\A[0-9]+\z/s; + } + } + closedir $dh; # may die +} + +sub kw_for ($$) { + my ($self, $n) = @_; + my $seq = $self->{mh_seq} //= read_mh_sequences($self); + my @kw = map { vec($seq->[$_], $n, 1) ? $OFF2KW[$_] : () } (0, 1); + vec($seq->[2], $n, 1) or push @kw, 'seen'; + \@kw; +} + +sub _file2eml { # mh_each_file / mh_read_one cb + my ($dir, $n, $self, $ucb, @arg) = @_; + my $eml = eml_from_path($n); + $ucb->($dir, $n, kw_for($self, $n), $eml, @arg) if $eml; +} + +sub mh_each_eml { + my ($self, $ucb, @arg) = @_; + mh_each_file($self, \&_file2eml, $ucb, @arg); +} + +sub mh_read_one { + my ($self, $n, $ucb, @arg) = @_; + my $restore = on_destroy \&chdir, $self->{cwdfh}; + chdir(my $dir = $self->{dir}); + _file2eml($dir, $n, $self, $ucb, @arg); +} + +1; diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm index 35b517e0..36c05855 100644 --- a/lib/PublicInbox/MID.pm +++ b/lib/PublicInbox/MID.pm @@ -1,15 +1,15 @@ -# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Various Message-ID-related functions. package PublicInbox::MID; use strict; -use warnings; -use base qw/Exporter/; +use v5.10.1; # TODO: check unicode_strings compat for v5.12 +use parent qw(Exporter); our @EXPORT_OK = qw(mid_clean id_compress mid2path mid_escape MID_ESC mids references mids_for_index mids_in $MID_EXTRACT); use URI::Escape qw(uri_escape_utf8); -use Digest::SHA qw/sha1_hex/; +use PublicInbox::SHA qw(sha1_hex); require PublicInbox::Address; use constant { ID_MAX => 40, # SHA-1 hex length for HTML id anchors @@ -92,8 +92,7 @@ sub references ($) { my ($hdr) = @_; my @mids; foreach my $f (qw(References In-Reply-To)) { - my @v = $hdr->header_raw($f); - foreach my $v (@v) { + for my $v ($hdr->header_raw($f)) { push(@mids, ($v =~ /$MID_EXTRACT/g)); } } @@ -104,8 +103,7 @@ sub references ($) { my %addr = ( y => 1, n => 1 ); foreach my $f (qw(To From Cc)) { - my @v = $hdr->header_raw($f); - foreach my $v (@v) { + for my $v ($hdr->header_raw($f)) { $addr{$_} = 1 for (PublicInbox::Address::emails($v)); } } @@ -117,7 +115,7 @@ sub uniq_mids ($;$) { my @ret; $seen ||= {}; foreach my $mid (@$mids) { - $mid =~ tr/\n\t\r//d; + $mid =~ tr/\n\t\r\0//d; if (length($mid) > MAX_MID_SIZE) { warn "Message-ID: <$mid> too long, truncating\n"; $mid = substr($mid, 0, MAX_MID_SIZE); @@ -127,7 +125,7 @@ sub uniq_mids ($;$) { \@ret; } -# RFC3986, section 3.3: +# RFC3986, section 3.3 (pathnames only): sub MID_ESC () { '^A-Za-z0-9\-\._~!\$\&\'\(\)\*\+,;=:@' } sub mid_escape ($) { uri_escape_utf8($_[0], MID_ESC) } diff --git a/lib/PublicInbox/MailDiff.pm b/lib/PublicInbox/MailDiff.pm new file mode 100644 index 00000000..125360fe --- /dev/null +++ b/lib/PublicInbox/MailDiff.pm @@ -0,0 +1,137 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +package PublicInbox::MailDiff; +use v5.12; +use File::Temp 0.19 (); # 0.19 for ->newdir +use PublicInbox::ContentHash qw(content_digest); +use PublicInbox::MsgIter qw(msg_part_text); +use PublicInbox::ViewDiff qw(flush_diff); +use PublicInbox::GitAsyncCat; +use PublicInbox::ContentDigestDbg; +use PublicInbox::Qspawn; +use PublicInbox::IO qw(write_file); +use autodie qw(close mkdir); + +sub write_part { # Eml->each_part callback + my ($ary, $self) = @_; + my ($part, $depth, $idx) = @$ary; + if ($idx ne '1' || $self->{-raw_hdr}) { # lei mail-diff --raw-header + write_file '>', "$self->{curdir}/$idx.hdr", ${$part->{hdr}}; + } + my $ct = $part->content_type || 'text/plain'; + my ($s, $err) = msg_part_text($part, $ct); + my $sfx = defined($s) ? 'txt' : 'bin'; + $s //= $part->body; + $s =~ s/\r\n/\n/gs; # TODO: consider \r+\n to match View + $s =~ s/\s*\z//s; + write_file '>:utf8', "$self->{curdir}/$idx.$sfx", $s, "\n"; +} + +# public +sub dump_eml ($$$) { + my ($self, $dir, $eml) = @_; + local $self->{curdir} = $dir; + mkdir $dir; + $eml->each_part(\&write_part, $self); + my $fh = write_file '>', "$dir/content_digest"; + my $dig = PublicInbox::ContentDigestDbg->new($fh); + content_digest($eml, $dig); + say $fh "\n", $dig->hexdigest; + close $fh; +} + +# public +sub prep_a ($$) { + my ($self, $eml) = @_; + $self->{tmp} = File::Temp->newdir('mail-diff-XXXX', TMPDIR => 1); + dump_eml($self, "$self->{tmp}/a", $eml); +} + +# WWW-specific stuff below (TODO: split out for non-lei) + +sub next_smsg ($) { + my ($self) = @_; + my $ctx = $self->{ctx}; + my $over = $ctx->{ibx}->over; + $self->{smsg} = $over ? $over->next_by_mid(@{$self->{next_arg}}) + : $ctx->gone('over'); + if (!$self->{smsg}) { + $ctx->write('</pre>', $ctx->_html_end); + return $ctx->close; + } + PublicInbox::DS::requeue($self) if $ctx->{env}->{'pi-httpd.async'}; +} + +sub emit_msg_diff { + my ($bref, $self) = @_; # bref is `git diff' output + require PublicInbox::Hval; + PublicInbox::Hval::utf8_maybe($$bref); + + # will be escaped to `•' in HTML + $self->{ctx}->{ibx}->{obfuscate} and + PublicInbox::Hval::obfuscate_addrs($self->{ctx}->{ibx}, + $$bref, "\x{2022}"); + print { $self->{ctx}->{zfh} } '</pre><hr><pre>' if $self->{nr} > 1; + flush_diff($self->{ctx}, $bref); + next_smsg($self); +} + +sub do_diff { + my ($self, $eml) = @_; + my $n = 'N'.(++$self->{nr}); + my $dir = "$self->{tmp}/$n"; + $self->dump_eml($dir, $eml); + my $cmd = [ qw(git diff --no-index --no-color -- a), $n ]; + my $opt = { -C => "$self->{tmp}", quiet => 1 }; + my $qsp = PublicInbox::Qspawn->new($cmd, undef, $opt); + $qsp->psgi_qx($self->{ctx}->{env}, undef, \&emit_msg_diff, $self); +} + +sub diff_msg_i { + my ($self, $eml) = @_; + if ($eml) { + if ($self->{tmp}) { # 2nd..last message + do_diff($self, $eml); + } else { # first message: + prep_a($self, $eml); + next_smsg($self); + } + } else { + warn "W: $self->{smsg}->{blob} missing\n"; + next_smsg($self); + } +} + +sub diff_msg_i_async { + my ($bref, $oid, $type, $size, $self) = @_; + diff_msg_i($self, $bref ? PublicInbox::Eml->new($bref) : undef); +} + +sub event_step { + my ($self) = @_; + eval { + my $ctx = $self->{ctx}; + if ($ctx->{env}->{'pi-httpd.async'}) { + ibx_async_cat($ctx->{ibx}, $self->{smsg}->{blob}, + \&diff_msg_i_async, $self); + } else { + diff_msg_i($self, $ctx->{ibx}->smsg_eml($self->{smsg})); + } + }; + if ($@) { + warn "E: $@"; + delete $self->{smsg}; + $self->{ctx}->close; + } +} + +sub begin_mail_diff { + my ($self) = @_; + if ($self->{ctx}->{env}->{'pi-httpd.async'}) { + PublicInbox::DS::requeue($self); + } else { + event_step($self) while $self->{smsg}; + } +} + +1; diff --git a/lib/PublicInbox/ManifestJsGz.pm b/lib/PublicInbox/ManifestJsGz.pm index d5048a96..be5d5f2a 100644 --- a/lib/PublicInbox/ManifestJsGz.pm +++ b/lib/PublicInbox/ManifestJsGz.pm @@ -1,10 +1,10 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# generates manifest.js.gz for grokmirror(1) +# generates manifest.js.gz for grokmirror(1) via PublicInbox::WWW +# This doesn't parse manifest.js.gz (that happens in LeiMirror) package PublicInbox::ManifestJsGz; -use strict; -use v5.10.1; +use v5.12; use parent qw(PublicInbox::WwwListing); use PublicInbox::Config; use IO::Compress::Gzip qw(gzip); @@ -82,8 +82,9 @@ sub response { $ctx->can('list_match_i'), $re, $ctx); sub { $ctx->{-wcb} = $_[0]; # HTTP server callback - $ctx->{env}->{'pi-httpd.async'} ? - $iter->event_step : $iter->each_section; + ($ctx->{www}->{pi_cfg}->ALL || + !$ctx->{env}->{'pi-httpd.async'}) ? + $iter->each_section : $iter->event_step; } } diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index e65f38f0..17893a09 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -1,10 +1,10 @@ -# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Streaming interface for mboxrd HTTP responses # See PublicInbox::GzipFilter for details. package PublicInbox::Mbox; -use strict; +use v5.12; use parent 'PublicInbox::GzipFilter'; use PublicInbox::MID qw/mid_escape/; use PublicInbox::Hval qw/to_filename/; @@ -19,12 +19,10 @@ sub getline { my $smsg = $ctx->{smsg} or return; my $ibx = $ctx->{ibx}; my $eml = delete($ctx->{eml}) // $ibx->smsg_eml($smsg) // return; - my $n = $ctx->{smsg} = $ibx->over->next_by_mid(@{$ctx->{next_arg}}); - $ctx->zmore(msg_hdr($ctx, $eml)); - if ($n) { - $ctx->translate(msg_body($eml)); + if (($ctx->{smsg} = $ibx->over->next_by_mid(@{$ctx->{next_arg}}))) { + $ctx->translate(msg_hdr($ctx, $eml), msg_body($eml)); } else { # last message - $ctx->zflush(msg_body($eml)); + $ctx->zflush(msg_hdr($ctx, $eml), msg_body($eml)); } } @@ -33,8 +31,8 @@ sub async_next { my ($http) = @_; # PublicInbox::HTTP my $ctx = $http->{forward} or return; # client aborted eval { - my $smsg = $ctx->{smsg} or return $ctx->close; - $ctx->smsg_blob($smsg); + my $smsg = $ctx->{smsg} // return $ctx->close; + $ctx->smsg_blob($smsg) if $smsg; }; warn "E: $@" if $@; } @@ -45,8 +43,7 @@ sub async_eml { # for async_blob_cb # next message $ctx->{smsg} = $ctx->{ibx}->over->next_by_mid(@{$ctx->{next_arg}}); local $ctx->{eml} = $eml; # for mbox_hdr - $ctx->zmore(msg_hdr($ctx, $eml)); - $ctx->write(msg_body($eml)); + $ctx->write(msg_hdr($ctx, $eml), msg_body($eml)); } sub mbox_hdr ($) { @@ -82,7 +79,6 @@ sub no_over_raw ($) { # /$INBOX/$MESSAGE_ID/raw sub emit_raw { my ($ctx) = @_; - $ctx->{base_url} = $ctx->{ibx}->base_url($ctx->{env}); my $over = $ctx->{ibx}->over or return no_over_raw($ctx); my ($id, $prev); my $mip = $ctx->{next_arg} = [ $ctx->{mid}, \$id, \$prev ]; @@ -93,17 +89,15 @@ sub emit_raw { sub msg_hdr ($$) { my ($ctx, $eml) = @_; - my $header_obj = $eml->header_obj; - # drop potentially confusing headers, ssoma already should've dropped - # Lines and Content-Length - foreach my $d (qw(Lines Bytes Content-Length Status)) { - $header_obj->header_set($d); + # drop potentially confusing headers, various importers should've + # already dropped these, but we can't trust stuff we've cloned + for my $d (qw(Lines Bytes Content-Length Status)) { + $eml->header_set($d); } - my $crlf = $header_obj->crlf; - my $buf = $header_obj->as_string; - # fixup old bug from import (pre-a0c07cba0e5d8b6a) - $buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + my $crlf = $eml->crlf; + my $buf = $eml->header_obj->as_string; + PublicInbox::Eml::strip_from($buf); "From mboxrd\@z Thu Jan 1 00:00:00 1970" . $crlf . $buf . $crlf; } @@ -165,6 +159,7 @@ sub all_ids_cb { } $ctx->{ids} = $ids = $over->ids_after(\($ctx->{prev})); } while (@$ids); + undef; } sub mbox_all_ids { @@ -181,75 +176,98 @@ sub mbox_all_ids { PublicInbox::MboxGz::mbox_gz($ctx, \&all_ids_cb, 'all'); } -sub results_cb { - my ($ctx) = @_; - my $over = $ctx->{ibx}->over or return $ctx->gone('over'); - while (1) { - while (defined(my $num = shift(@{$ctx->{ids}}))) { - my $smsg = $over->get_art($num) or next; - return $smsg; - } - # refill result set, deprioritize since there's many results - my $srch = $ctx->{ibx}->isrch or return $ctx->gone('search'); - my $mset = $srch->mset($ctx->{query}, $ctx->{qopts}); - my $size = $mset->size or return; - $ctx->{qopts}->{offset} += $size; - $ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts}); - $ctx->{-low_prio} = 1; +my $refill_ids_cb = sub { # async_mset cb + my ($ctx, $http, $mset, $err) = @_; + $http = undef unless $ctx->{-really_async}; + if ($err) { + warn "E: $err"; + $ctx->close if $http; # our async httpd + return; } -} - -sub results_thread_cb { - my ($ctx) = @_; + # refill result set, deprioritize since there's many results + my $size = $mset->size or do { + $ctx->close if $http; + $ctx->{-mbox_done} = 1; + return; + }; + $ctx->{qopts}->{offset} += $size; + $ctx->{ids} = $ctx->{srch}->mset_to_artnums($mset, $ctx->{qopts}); + $ctx->{-low_prio} = 1; # true + return if !$http; + eval { + my $smsg = results_cb($ctx) // return $ctx->close; + return if !$smsg; # '' wait for async_mset + $ctx->smsg_blob($ctx->{smsg} = $smsg); + }; + warn "E: $@" if $@; +}; +sub results_cb { # async_next or MboxGz->getline cb + my ($ctx, $http) = @_; my $over = $ctx->{ibx}->over or return $ctx->gone('over'); while (1) { - while (defined(my $num = shift(@{$ctx->{xids}}))) { + my $ids = $ctx->{xids} // $ctx->{ids}; + while (defined(my $num = shift(@$ids))) { my $smsg = $over->get_art($num) or next; return $smsg; } - - # refills ctx->{xids} - next if $over->expand_thread($ctx); - - # refill result set, deprioritize since there's many results - my $srch = $ctx->{ibx}->isrch or return $ctx->gone('search'); - my $mset = $srch->mset($ctx->{query}, $ctx->{qopts}); - my $size = $mset->size or return; - $ctx->{qopts}->{offset} += $size; - $ctx->{ids} = $srch->mset_to_artnums($mset, $ctx->{qopts}); - $ctx->{-low_prio} = 1; + next if $ctx->{xids} && $over->expand_thread($ctx); + return '' if $ctx->{srch}->async_mset(@$ctx{qw(query qopts)}, + $refill_ids_cb, $ctx, $http); + return if $ctx->{-mbox_done}; } +} +sub mbox_qry_cb { # async_mset cb + my ($ctx, $q, $mset, $err) = @_; + my $wcb = delete $ctx->{wcb}; + if ($err) { + warn "E: $err"; + return $wcb->([500, [qw(Content-Type text/plain)], + [ "Internal server error\n" ]]) + } + $ctx->{qopts}->{offset} = $mset->size or + return $wcb->([404, [qw(Content-Type text/plain)], + ["No results found\n"]]); + $ctx->{ids} = $ctx->{srch}->mset_to_artnums($mset, $ctx->{qopts}); + my $fn; + if ($q->{t} && $ctx->{srch}->has_threadid) { + $ctx->{xids} = []; # triggers over->expand_thread + $fn = "results-thread-$ctx->{query}"; + } else { + $fn = "results-$ctx->{query}"; + } + require PublicInbox::MboxGz; + my $res = PublicInbox::MboxGz::mbox_gz($ctx, \&results_cb, $fn); + ref($res) eq 'CODE' ? $res->($wcb) : $wcb->($res); } sub mbox_all { my ($ctx, $q) = @_; - my $q_string = $q->{'q'}; - return mbox_all_ids($ctx) if $q_string !~ /\S/; - my $srch = $ctx->{ibx}->isrch or + my $qstr = $q->{'q'}; + return mbox_all_ids($ctx) if $qstr !~ /\S/; + my $srch = $ctx->{srch} = $ctx->{ibx}->isrch or return PublicInbox::WWW::need($ctx, 'Search'); - my $over = $ctx->{ibx}->over or - return PublicInbox::WWW::need($ctx, 'Overview'); + my $opt = $ctx->{qopts} = { relevance => -2 }; # ORDER BY docid DESC - my $qopts = $ctx->{qopts} = { relevance => -2 }; # ORDER BY docid DESC - $qopts->{threads} = 1 if $q->{t}; - $srch->query_approxidate($ctx->{ibx}->git, $q_string); - my $mset = $srch->mset($q_string, $qopts); - $qopts->{offset} = $mset->size or - return [404, [qw(Content-Type text/plain)], - ["No results found\n"]]; - $ctx->{query} = $q_string; - $ctx->{ids} = $srch->mset_to_artnums($mset, $qopts); - require PublicInbox::MboxGz; - my $fn; - if ($q->{t} && $srch->has_threadid) { - $fn = 'results-thread-'.$q_string; - PublicInbox::MboxGz::mbox_gz($ctx, \&results_thread_cb, $fn); - } else { - $fn = 'results-'.$q_string; - PublicInbox::MboxGz::mbox_gz($ctx, \&results_cb, $fn); + # {threadid} limits results to a given thread + # {threads} collapses results from messages in the same thread, + # allowing us to use ->expand_thread w/o duplicates in our own code + if (defined($ctx->{mid})) { + my $over = ($ctx->{ibx}->{isrch} ? + $ctx->{ibx}->{isrch}->{es}->over : + $ctx->{ibx}->over) or + return PublicInbox::WWW::need($ctx, 'Overview'); + $opt->{threadid} = $over->mid2tid($ctx->{mid}); } + $opt->{threads} = 1 if $q->{t}; + $srch->query_approxidate($ctx->{ibx}->git, $qstr); + $ctx->{query} = $qstr; + sub { # called by PSGI server + $ctx->{wcb} = $_[0]; # PSGI server supplied write cb + $srch->async_mset($qstr, $opt, \&mbox_qry_cb, $ctx, $q) and + $ctx->{-really_async} = 1; + }; } 1; diff --git a/lib/PublicInbox/MboxGz.pm b/lib/PublicInbox/MboxGz.pm index 3ed33867..90e69c09 100644 --- a/lib/PublicInbox/MboxGz.pm +++ b/lib/PublicInbox/MboxGz.pm @@ -1,7 +1,7 @@ -# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::MboxGz; -use strict; +use v5.12; use parent 'PublicInbox::GzipFilter'; use PublicInbox::Eml; use PublicInbox::Hval qw/to_filename/; @@ -13,8 +13,8 @@ sub async_next ($) { my ($http) = @_; # PublicInbox::HTTP my $ctx = $http->{forward} or return; eval { - $ctx->{smsg} = $ctx->{cb}->($ctx) or return $ctx->close; - $ctx->smsg_blob($ctx->{smsg}); + my $smsg = $ctx->{cb}->($ctx, $http) // return $ctx->close; + $smsg and $ctx->smsg_blob($ctx->{smsg} = $smsg); }; warn "E: $@" if $@; } @@ -22,7 +22,6 @@ sub async_next ($) { sub mbox_gz { my ($self, $cb, $fn) = @_; $self->{cb} = $cb; - $self->{base_url} = $self->{ibx}->base_url($self->{env}); $self->{gz} = PublicInbox::GzipFilter::gzip_or_die(); $fn = to_filename($fn // '') // 'no-subject'; # http://www.iana.org/assignments/media-types/application/gzip @@ -38,8 +37,7 @@ sub getline { my $cb = $self->{cb} or return; while (my $smsg = $cb->($self)) { my $eml = $self->{ibx}->smsg_eml($smsg) or next; - $self->zmore(msg_hdr($self, $eml)); - return $self->translate(msg_body($eml)); + return $self->translate(msg_hdr($self, $eml), msg_body($eml)); } # signal that we're done and can return undef next call: delete $self->{cb}; diff --git a/lib/PublicInbox/MboxLock.pm b/lib/PublicInbox/MboxLock.pm index 856b1e21..5e373873 100644 --- a/lib/PublicInbox/MboxLock.pm +++ b/lib/PublicInbox/MboxLock.pm @@ -1,20 +1,24 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Various mbox locking methods package PublicInbox::MboxLock; -use strict; -use v5.10.1; -use PublicInbox::OnDestroy; +use v5.12; +use PublicInbox::OnDestroy (); use Fcntl qw(:flock F_SETLK F_SETLKW F_RDLCK F_WRLCK O_CREAT O_EXCL O_WRONLY SEEK_SET); use Carp qw(croak); use PublicInbox::DS qw(now); # ugh... +use autodie qw(chdir opendir unlink); our $TMPL = do { - if ($^O eq 'linux') { \'s @32' } - elsif ($^O =~ /bsd/) { \'@20 s @256' } # n.b. @32 may be enough... - else { eval { require File::FcntlLock; 1 } } + if ($^O eq 'linux') { + \'s @32' + } elsif ($^O =~ /bsd/ || $^O eq 'dragonfly') { + \'@20 s @256' # n.b. @32 may be enough... + } else { + eval { require File::FcntlLock; 1 } + } }; # This order matches Debian policy on Linux systems. @@ -58,16 +62,13 @@ sub acq_dotlock { rand(0xffffffff), $pid, time); if (sysopen(my $fh, $tmp, O_CREAT|O_EXCL|O_WRONLY)) { if (link($tmp, $dot_lock)) { - unlink($tmp) or die "unlink($tmp): $!"; + unlink($tmp); $self->{".lock$pid"} = $dot_lock; - if (substr($dot_lock, 0, 1) ne '/') { - opendir(my $dh, '.') or - die "opendir . $!"; - $self->{dh} = $dh; - } + substr($dot_lock, 0, 1) eq '/' or + opendir($self->{dh}, '.'); return; } - unlink($tmp) or die "unlink($tmp): $!"; + unlink($tmp); select(undef, undef, undef, $self->{delay}); } else { croak "open $tmp (for $dot_lock): $!" if !$!{EXIST}; @@ -83,18 +84,20 @@ sub acq_flock { my $end = now + $self->{timeout}; do { return if flock($self->{fh}, $op); - select(undef, undef, undef, $self->{delay}); + if ($!{EWOULDBLOCK}) { + select(undef, undef, undef, $self->{delay}); + } elsif (!$!{EINTR}) { + croak "flock($self->{f} ($self->{fh}): $!"; + } } while (now < $end); die "flock timeout $self->{f}: $!\n"; } sub acq { my ($cls, $f, $rw, $methods) = @_; - my $fh; - unless (open $fh, $rw ? '+>>' : '<', $f) { - croak "open($f): $!" if $rw || !$!{ENOENT}; - } - my $self = bless { f => $f, fh => $fh, rw => $rw }, $cls; + my $self = bless { f => $f, rw => $rw }, $cls; + my $ok = open $self->{fh}, $rw ? '+>>' : '<', $f; + croak "open($f): $!" if !$ok && ($rw || !$!{ENOENT}); my $m = "@$methods"; if ($m ne 'none') { my @m = map { @@ -116,20 +119,16 @@ sub acq { $self; } -sub _fchdir { chdir($_[0]) } # OnDestroy callback - sub DESTROY { my ($self) = @_; - if (my $f = $self->{".lock$$"}) { - my $x; - if (my $dh = delete $self->{dh}) { - opendir my $c, '.' or die "opendir . $!"; - $x = PublicInbox::OnDestroy->new(\&_fchdir, $c); - chdir($dh) or die "chdir (for $f): $!"; - } - unlink($f) or die "unlink($f): $! (lock stolen?)"; - undef $x; + my $f = $self->{".lock$$"} or return; + my $od; + if (my $dh = delete $self->{dh}) { + opendir my $c, '.'; + $od = PublicInbox::OnDestroy::all \&chdir, $c; + chdir($dh); } + CORE::unlink($f) or die "unlink($f): $! (lock stolen?)"; } 1; diff --git a/lib/PublicInbox/MboxReader.pm b/lib/PublicInbox/MboxReader.pm index beffabe8..3d78ca23 100644 --- a/lib/PublicInbox/MboxReader.pm +++ b/lib/PublicInbox/MboxReader.pm @@ -1,10 +1,10 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# reader for mbox variants we support +# reader for mbox variants we support (and also sets up commands for writing) package PublicInbox::MboxReader; use strict; -use v5.10.1; +use v5.10.1; # check regexps before v5.12 use Data::Dumper; $Data::Dumper::Useqq = 1; # should've been the default, for bad data @@ -29,7 +29,7 @@ sub _mbox_from { my @raw; while (defined(my $r = read($mbfh, $buf, 65536, length($buf)))) { if ($r == 0) { # close here to check for "curl --fail" - close($mbfh) or die "error closing mbox: \$?=$? $!"; + $mbfh->close or die "error closing mbox: \$?=$? $!"; @raw = ($buf); } else { @raw = split(/$from_strict/mos, $buf, -1); @@ -88,12 +88,12 @@ sub _mbox_cl ($$$;@) { my $buf = ''; while (defined(my $r = read($mbfh, $buf, 65536, length($buf)))) { if ($r == 0) { # detect "curl --fail" - close($mbfh) or + $mbfh->close or die "error closing mboxcl/mboxcl2: \$?=$? $!"; undef $mbfh; } while (my $hdr = _extract_hdr(\$buf)) { - $$hdr =~ s/\A[\r\n]*From [^\n]*\n//s or + PublicInbox::Eml::strip_from($$hdr) or die "E: no 'From ' line in:\n", Dumper($hdr); my $eml = PublicInbox::Eml->new($hdr); next unless $eml->raw_size; @@ -141,10 +141,9 @@ sub reads { # all of these support -c for stdout and -d for decompression, # mutt is commonly distributed with hooks for gz, bz2 and xz, at least -# { foo => '' } means "--foo" is passed to the command-line, -# otherwise { foo => '--bar' } passes "--bar" +# { foo => '' } means "--foo" is passed to the command-line my %zsfx2cmd = ( - gz => [ qw(GZIP pigz gzip) ], + gz => [ qw(GZIP pigz gzip), { rsyncable => '' } ], bz2 => [ 'bzip2', {} ], xz => [ 'xz', {} ], # don't add new entries here unless MUA support is widely available @@ -173,28 +172,9 @@ sub zsfx2cmd ($$$) { } $cmd[0] // die join(' or ', @info)." missing for .$zsfx"; - # not all gzip support --rsyncable, FreeBSD gzip doesn't even exit - # with an error code - if (!$decompress && $cmd[0] =~ m!/gzip\z! && !defined($cmd_opt)) { - pipe(my ($r, $w)) or die "pipe: $!"; - open my $null, '+>', '/dev/null' or die "open: $!"; - my $rdr = { 0 => $null, 1 => $null, 2 => $w }; - my $tst = [ $cmd[0], '--rsyncable' ]; - my $pid = PublicInbox::Spawn::spawn($tst, undef, $rdr); - close $w; - my $err = do { local $/; <$r> }; - waitpid($pid, 0) == $pid or die "BUG: waitpid: $!"; - $cmd_opt = $err ? {} : { rsyncable => '' }; - push(@$x, $cmd_opt); - } - for my $bool (keys %$cmd_opt) { - my $switch = $cmd_opt->{$bool} // next; - push @cmd, '--'.($switch || $bool); - } - for my $key (qw(rsyncable)) { # support compression level? - my $switch = $cmd_opt->{$key} // next; - my $val = $lei->{opt}->{$key} // next; - push @cmd, $switch, $val; + # only for --rsyncable. TODO: support compression level? + for my $key (keys %$cmd_opt) { + push @cmd, '--'.$key if $lei->{opt}->{$key}; } \@cmd; } diff --git a/lib/PublicInbox/MdirReader.pm b/lib/PublicInbox/MdirReader.pm index dbb74d6d..2981b058 100644 --- a/lib/PublicInbox/MdirReader.pm +++ b/lib/PublicInbox/MdirReader.pm @@ -1,14 +1,14 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# Maildirs for now, MH eventually +# Maildirs only (PublicInbox::MHreader exists, now) # ref: https://cr.yp.to/proto/maildir.html # https://wiki2.dovecot.org/MailboxFormat/Maildir package PublicInbox::MdirReader; use strict; use v5.10.1; use PublicInbox::InboxWritable qw(eml_from_path); -use Digest::SHA qw(sha256_hex); +use PublicInbox::SHA qw(sha256_hex); # returns Maildir flags from a basename ('' for no flags, undef for invalid) sub maildir_basename_flags { diff --git a/lib/PublicInbox/MdirSort.pm b/lib/PublicInbox/MdirSort.pm new file mode 100644 index 00000000..6bd9fb6c --- /dev/null +++ b/lib/PublicInbox/MdirSort.pm @@ -0,0 +1,46 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# used for sorting MH (and (TODO) Maildir) names +# TODO: consider sort(1) to parallelize sorting of gigantic directories +package PublicInbox::MdirSort; +use v5.12; +use Time::HiRes (); +use parent qw(Exporter); +use Fcntl qw(S_ISREG); +our @EXPORT = qw(mdir_sort); +my %ST = (sequence => 0, size => 1, atime => 2, mtime => 3, ctime => 4); + +sub mdir_sort ($$;$) { + my ($ent, $sort, $max) = @_; + my @st; + my @ent = map { + @st = Time::HiRes::stat $_; + # name, size, {a,m,c}time + S_ISREG($st[2]) ? [ $_, @st[7..10] ] : (); + } @$ent; + @ent = grep { $_->[1] <= $max } @ent if $max; + use sort 'stable'; + for my $s (@$sort) { + if ($s =~ /\A(\-|\+|)name\z/) { + if ($1 eq '-') { + @ent = sort { $b->[0] cmp $a->[0] } @ent; + } else { + @ent = sort { $a->[0] cmp $b->[0] } @ent; + } + } elsif ($s =~ /\A(\-|\+|) + (sequence|size|ctime|mtime|atime)\z/x) { + my $key = $ST{$2}; + if ($1 eq '-') { + @ent = sort { $b->[$key] <=> $a->[$key] } @ent; + } else { + @ent = sort { $a->[$key] <=> $b->[$key] } @ent; + } + } else { + die "E: unrecognized sort parameter: `$s'"; + } + } + @$ent = map { $_->[0] } @ent; +} + +1; diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm index 76b33b16..6708527d 100644 --- a/lib/PublicInbox/MiscIdx.pm +++ b/lib/PublicInbox/MiscIdx.pm @@ -5,7 +5,7 @@ # Things indexed include: # * inboxes themselves # * epoch information -# * (maybe) git code repository information +# * (maybe) git code repository information (not commits) # Expect ~100K-1M documents with no parallelism opportunities, # so no sharding, here. # @@ -72,7 +72,7 @@ sub remove_eidx_key { } for my $docid (@docids) { $xdb->delete_document($docid); - warn "I: remove inbox docid #$docid ($eidx_key)\n"; + warn "# remove inbox docid #$docid ($eidx_key)\n"; } } diff --git a/lib/PublicInbox/MsgTime.pm b/lib/PublicInbox/MsgTime.pm index 5ee087fd..bbc9a007 100644 --- a/lib/PublicInbox/MsgTime.pm +++ b/lib/PublicInbox/MsgTime.pm @@ -1,11 +1,11 @@ -# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Various date/time-related functions package PublicInbox::MsgTime; +use v5.10.1; # unicode_strings in 5.12 may not work... use strict; -use warnings; -use base qw(Exporter); +use parent qw(Exporter); our @EXPORT_OK = qw(msg_timestamp msg_datestamp); use Time::Local qw(timegm); my @MoY = qw(january february march april may june @@ -125,10 +125,7 @@ sub str2date_zone ($) { # but we want to keep "git fsck" happy. # "-1200" is the furthest westermost zone offset, # but git fast-import is liberal so we use "-1400" - if ($zone >= 1400 || $zone <= -1400) { - warn "bogus TZ offset: $zone, ignoring and assuming +0000\n"; - $zone = '+0000'; - } + $zone = '+0000' if $zone >= 1400 || $zone <= -1400; [$ts, $zone]; } @@ -138,50 +135,38 @@ sub time_response ($) { } sub msg_received_at ($) { - my ($hdr) = @_; # PublicInbox::Eml - my @recvd = $hdr->header_raw('Received'); - my ($ts); - foreach my $r (@recvd) { + my ($eml) = @_; + my $ts; + for my $r ($eml->header_raw('Received')) { $r =~ /\s*([0-9]+\s+[a-zA-Z]+\s+[0-9]{2,4}\s+ [0-9]+[^0-9][0-9]+(?:[^0-9][0-9]+) - \s+([\+\-][0-9]+))/sx or next; + \s+(?:[\+\-][0-9]+))/sx or next; $ts = eval { str2date_zone($1) } and return $ts; - my $mid = $hdr->header_raw('Message-ID'); - warn "no date in $mid Received: $r\n"; } undef; } sub msg_date_only ($) { - my ($hdr) = @_; # PublicInbox::Eml - my @date = $hdr->header_raw('Date'); - my ($ts); - foreach my $d (@date) { + my ($eml) = @_; + my $ts; + for my $d ($eml->header_raw('Date')) { $ts = eval { str2date_zone($d) } and return $ts; - if ($@) { - my $mid = $hdr->header_raw('Message-ID'); - warn "bad Date: $d in $mid: $@\n"; - } } undef; } # Favors Received header for sorting globally sub msg_timestamp ($;$) { - my ($hdr, $fallback) = @_; # PublicInbox::Eml - my $ret; - $ret = msg_received_at($hdr) and return time_response($ret); - $ret = msg_date_only($hdr) and return time_response($ret); - time_response([ $fallback // time, '+0000' ]); + my ($eml, $fallback) = @_; + time_response(msg_received_at($eml) // msg_date_only($eml) // + [ $fallback // time, '+0000' ]); } # Favors the Date: header for display and sorting within a thread sub msg_datestamp ($;$) { - my ($hdr, $fallback) = @_; # PublicInbox::Eml - my $ret; - $ret = msg_date_only($hdr) and return time_response($ret); - $ret = msg_received_at($hdr) and return time_response($ret); - time_response([ $fallback // time, '+0000' ]); + my ($eml, $fallback) = @_; # PublicInbox::Eml + time_response(msg_date_only($eml) // msg_received_at($eml) // + [ $fallback // time, '+0000' ]); } 1; diff --git a/lib/PublicInbox/MultiGit.pm b/lib/PublicInbox/MultiGit.pm index 9429a00c..b7691806 100644 --- a/lib/PublicInbox/MultiGit.pm +++ b/lib/PublicInbox/MultiGit.pm @@ -5,10 +5,12 @@ package PublicInbox::MultiGit; use strict; use v5.10.1; -use PublicInbox::Spawn qw(run_die); +use PublicInbox::Spawn qw(run_die run_qx); use PublicInbox::Import; use File::Temp 0.19; use List::Util qw(max); +use PublicInbox::IO qw(read_all); +use autodie qw(chmod close rename); sub new { my ($cls, $topdir, $all, $epfx) = @_; @@ -31,7 +33,7 @@ sub read_alternates { qr!\A\Q../../$self->{epfx}\E/([0-9]+)\.git/objects\z! : undef; $$moderef = (stat($fh))[2] & 07777; - for my $rel (split(/^/m, do { local $/; <$fh> })) { + for my $rel (split(/^/m, read_all($fh, -s _))) { chomp(my $dir = $rel); my $score; if (defined($is_edir) && $dir =~ $is_edir) { @@ -67,12 +69,10 @@ sub write_alternates { my $out = join('', sort { $alt->{$b} <=> $alt->{$a} } keys %$alt); my $info_dir = "$all_dir/objects/info"; my $fh = File::Temp->new(TEMPLATE => 'alt-XXXX', DIR => $info_dir); - my $f = $fh->filename; - print $fh $out, @new or die "print($f): $!"; - chmod($mode, $fh) or die "fchmod($f): $!"; - close $fh or die "close($f): $!"; - my $fn = "$info_dir/alternates"; - rename($f, $fn) or die "rename($f, $fn): $!"; + print $fh $out, @new; + chmod($mode, $fh); + close $fh; + rename($fh->filename, "$info_dir/alternates"); $fh->unlink_on_destroy(0); } @@ -108,8 +108,13 @@ sub fill_alternates { sub epoch_cfg_set { my ($self, $epoch_nr) = @_; - run_die([qw(git config -f), epoch_dir($self)."/$epoch_nr.git/config", - 'include.path', "../../$self->{all}/config" ]); + my $f = epoch_dir($self)."/$epoch_nr.git/config"; + my $v = "../../$self->{all}/config"; + if (-r $f) { + chomp(my $x = run_qx([qw(git config -f), $f, 'include.path'])); + return if $x eq $v; + } + run_die([qw(git config -f), $f, 'include.path', $v ]); } sub add_epoch { diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index ef01f448..603cf094 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -15,7 +15,7 @@ use PublicInbox::MID qw(mid_escape $MID_EXTRACT); use PublicInbox::Eml; use POSIX qw(strftime); use PublicInbox::DS qw(now); -use Digest::SHA qw(sha1_hex); +use PublicInbox::SHA qw(sha1_hex); use Time::Local qw(timegm timelocal); use PublicInbox::GitAsyncCat; use PublicInbox::Address; @@ -72,9 +72,8 @@ sub process_line ($$) { my $res = eval { $req->($self, @args) }; my $err = $@; if ($err && $self->{sock}) { - local $/ = "\n"; - chomp($l); - err($self, 'error from: %s (%s)', $l, $err); + $l =~ s/\r?\n//s; + warn("error from: $l ($err)\n"); $res = \"503 program fault - command not performed\r\n"; } defined($res) ? $self->write($res) : 0; @@ -189,7 +188,7 @@ sub listgroup_range_i { my ($self, $beg, $end) = @_; my $r = $self->{ibx}->mm(1)->msg_range($beg, $end, 'num'); scalar(@$r) or return; - $self->msg_more(join("\r\n", @$r, '')); + $self->msg_more(join('', map { "$_->[0]\r\n" } @$r)); 1; } @@ -524,8 +523,7 @@ sub msg_hdr_write ($$) { set_nntp_headers($eml, $smsg); my $hdr = $eml->{hdr} // \(my $x = ''); - # fixup old bug from import (pre-a0c07cba0e5d8b6a) - $$hdr =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + PublicInbox::Eml::strip_from($$hdr); $$hdr =~ s/(?<!\r)\n/\r\n/sg; # Alpine barfs without this # for leafnode compatibility, we need to ensure Message-ID headers @@ -538,7 +536,11 @@ sub blob_cb { # called by git->cat_async via ibx_async_cat my ($bref, $oid, $type, $size, $smsg) = @_; my $self = $smsg->{nntp}; my $code = $smsg->{nntp_code}; - if (!defined($oid)) { + if (!defined($type)) { + warn "E: git aborted on $oid / $smsg->{blob} ". + $self->{-ibx}->{inboxdir}; + return $self->close; + } elsif ($type ne 'blob') { # it's possible to have TOCTOU if an admin runs # public-inbox-(edit|purge), just move onto the next message warn "E: $smsg->{blob} missing in $smsg->{-ibx}->{inboxdir}\n"; @@ -945,11 +947,6 @@ sub cmd_xpath ($$) { '223 '.join(' ', sort(@paths))."\r\n"; } -sub err ($$;@) { - my ($self, $fmt, @args) = @_; - printf { $self->{nntpd}->{err} } $fmt."\n", @args; -} - sub out ($$;@) { my ($self, $fmt, @args) = @_; printf { $self->{nntpd}->{out} } $fmt."\n", @args; diff --git a/lib/PublicInbox/NetNNTPSocks.pm b/lib/PublicInbox/NetNNTPSocks.pm index 8495204a..d27efba1 100644 --- a/lib/PublicInbox/NetNNTPSocks.pm +++ b/lib/PublicInbox/NetNNTPSocks.pm @@ -1,14 +1,13 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# wrap Net::NNTP client with SOCKS support +# wrap Net::NNTP client with SOCKS support. Convoluted, but AFAIK this +# is the only way to get SOCKS working with Net::NNTP w/o LD_PRELOAD. package PublicInbox::NetNNTPSocks; -use strict; -use v5.10.1; +use v5.12; use Net::NNTP; -our %OPT; +our %OPT; # used to pass options between ->new_socks and our ->new our @ISA = qw(IO::Socket::Socks); -my @SOCKS_KEYS = qw(ProxyAddr ProxyPort SocksVersion SocksDebug SocksResolve); # use this instead of Net::NNTP->new if using Proxy* sub new_socks { @@ -17,17 +16,21 @@ sub new_socks { local @Net::NNTP::ISA = (qw(Net::Cmd), __PACKAGE__); local %OPT = map {; defined($opt{$_}) ? ($_ => $opt{$_}) : () - } @SOCKS_KEYS; - Net::NNTP->new(%opt); # this calls our new() below: + } qw(ProxyAddr ProxyPort SocksVersion SocksDebug SocksResolve); + no warnings 'uninitialized'; # needed for $SOCKS_ERROR + my $ret = Net::NNTP->new(%opt); # calls PublicInbox::NetNNTPSocks::new + return $ret if $ret || $!{EINTR}; + $ret // die "errors: \$!=$! SOCKS=", + eval('$IO::Socket::Socks::SOCKS_ERROR // ""'), + ', SSL=', + (eval('IO::Socket::SSL->errstr') // ''), "\n"; } # called by Net::NNTP->new sub new { my ($self, %opt) = @_; @OPT{qw(ConnectAddr ConnectPort)} = @opt{qw(PeerAddr PeerPort)}; - my $ret = $self->SUPER::new(%OPT) or - die 'SOCKS error: '.eval('$IO::Socket::Socks::SOCKS_ERROR'); - $ret; + $self->SUPER::new(%OPT); } 1; diff --git a/lib/PublicInbox/NetReader.pm b/lib/PublicInbox/NetReader.pm index c1af03a3..ec18818b 100644 --- a/lib/PublicInbox/NetReader.pm +++ b/lib/PublicInbox/NetReader.pm @@ -3,8 +3,7 @@ # common reader code for IMAP and NNTP (and maybe JMAP) package PublicInbox::NetReader; -use strict; -use v5.10.1; +use v5.12; use parent qw(Exporter PublicInbox::IPC); use PublicInbox::Eml; use PublicInbox::Config; @@ -15,7 +14,7 @@ our @EXPORT = qw(uri_section imap_uri nntp_uri); sub ndump { require Data::Dumper; - Data::Dumper->new(\@_)->Useqq(1)->Terse(1)->Dump; + Data::Dumper->new([ $_[-1] ])->Useqq(1)->Terse(1)->Dump; } # returns the git config section name, e.g [imap "imaps://user@example.com"] @@ -41,10 +40,27 @@ EOM die "$val not understood (only socks5h:// is supported)\n"; } +# gives an arrayref suitable for the Mail::IMAPClient Ssl or Starttls arg +sub mic_tls_opt ($$) { + my ($o, $hostname) = @_; + require IO::Socket::SSL; + $o = {} if !ref($o); + $o->{SSL_hostname} //= $hostname; + [ map { ($_, $o->{$_}) } keys %$o ]; +} + +sub set_ssl_verify_mode ($$) { + my ($o, $bool) = @_; + require IO::Socket::SSL; + $o->{SSL_verify_mode} = $bool ? IO::Socket::SSL::SSL_VERIFY_PEER() : + IO::Socket::SSL::SSL_VERIFY_NONE(); +} + sub mic_new ($$$$) { my ($self, $mic_arg, $sec, $uri) = @_; my %mic_arg = (%$mic_arg, Keepalive => 1); my $sa = $self->{cfg_opt}->{$sec}->{-proxy_cfg} || $self->{-proxy_cli}; + my ($mic, $s, $t); if ($sa) { # this `require' needed for worker[1..Inf], since socks_args # only got called in worker[0] @@ -53,16 +69,37 @@ sub mic_new ($$$$) { $opt{SocksDebug} = 1 if $mic_arg{Debug}; $opt{ConnectAddr} = delete $mic_arg{Server}; $opt{ConnectPort} = delete $mic_arg{Port}; - my $s = IO::Socket::Socks->new(%opt) or die - "E: <$uri> ".eval('$IO::Socket::Socks::SOCKS_ERROR'); - if ($mic_arg->{Ssl}) { # for imaps:// - require IO::Socket::SSL; - $s = IO::Socket::SSL->start_SSL($s) or die - "E: <$uri> ".(IO::Socket::SSL->errstr // ''); - } + do { + $! = 0; + $s = IO::Socket::Socks->new(%opt); + } until ($s || !$!{EINTR} || $self->{quit}); + return if $self->{quit}; + $s or die "E: <$uri> ".eval('$IO::Socket::Socks::SOCKS_ERROR'); $mic_arg{Socket} = $s; + if (my $o = delete $mic_arg{Ssl}) { # for imaps:// + $o = mic_tls_opt($o, $opt{ConnectAddr}); + do { + $! = 0; + $t = IO::Socket::SSL->start_SSL($s, @$o); + } until ($t || !$!{EINTR} || $self->{quit}); + return if $self->{quit}; + $t or die "E: <$uri> ".(IO::Socket::SSL->errstr // ''); + $mic_arg{Socket} = $t; + } elsif ($o = $mic_arg{Starttls}) { + # Mail::IMAPClient will use this: + $mic_arg{Starttls} = mic_tls_opt($o, $opt{ConnectAddr}); + } + } elsif ($mic_arg{Ssl} || $mic_arg{Starttls}) { + for my $f (qw(Ssl Starttls)) { + my $o = $mic_arg{$f} or next; + $mic_arg{$f} = mic_tls_opt($o, $mic_arg{Server}); + } } - PublicInbox::IMAPClient->new(%mic_arg); + do { + $! = 0; + $mic = PublicInbox::IMAPClient->new(%mic_arg); + } until ($mic || !$!{EINTR} || $self->{quit}); + $mic; } sub auth_anon_cb { '' }; # for Mail::IMAPClient::Authcallback @@ -72,6 +109,7 @@ sub onion_hint ($$) { $uri->host =~ /\.onion\z/i or return "\n"; my $t = $uri->isa('PublicInbox::URIimap') ? 'imap' : 'nntp'; my $url = PublicInbox::Config::squote_maybe(uri_section($uri)); + my $scheme = $uri->scheme; my $set_cfg = 'lei config'; if (!$lei) { # public-inbox-watch my $f = PublicInbox::Config::squote_maybe( @@ -87,6 +125,10 @@ try configuring a socks5h:// proxy: url=$url $set_cfg $t.$dq\$url$dq.proxy socks5h://127.0.0.1:9050 +git 2.26+ users may instead rely on `*' to match all .onion URLs: + + $set_cfg '$t.$scheme://*.onion.proxy' socks5h://127.0.0.1:9050 + ...before retrying your current command EOM } @@ -122,9 +164,9 @@ sub mic_for ($$$$) { # mic = Mail::IMAPClient Server => $host, %$common, # may set Starttls, Compress, Debug .... }; - $mic_arg->{Ssl} = 1 if $uri->scheme eq 'imaps'; require PublicInbox::IMAPClient; my $mic = mic_new($self, $mic_arg, $sec, $uri); + return if $self->{quit}; ($mic && $mic->IsConnected) or die "E: <$uri> new: $@".onion_hint($lei, $uri); @@ -175,17 +217,20 @@ sub mic_for ($$$$) { # mic = Mail::IMAPClient $mic; } -sub nn_new ($$$) { - my ($nn_arg, $nntp_cfg, $uri) = @_; +sub nn_new ($$$$) { + my ($self, $nn_arg, $nntp_cfg, $uri) = @_; my $nn; + my ($Net_NNTP, $new) = qw(Net::NNTP new); if (defined $nn_arg->{ProxyAddr}) { require PublicInbox::NetNNTPSocks; + ($Net_NNTP, $new) = qw(PublicInbox::NetNNTPSocks new_socks); $nn_arg->{SocksDebug} = 1 if $nn_arg->{Debug}; - eval { $nn = PublicInbox::NetNNTPSocks->new_socks(%$nn_arg) }; - die "E: <$uri> $@\n" if $@; - } else { - $nn = Net::NNTP->new(%$nn_arg) or return; } + do { + $! = 0; + $nn = $Net_NNTP->$new(%$nn_arg); + } until ($nn || !$!{EINTR} || $self->{quit}); + $nn // return; setsockopt($nn, Socket::SOL_SOCKET(), Socket::SO_KEEPALIVE(), 1); # default to using STARTTLS if it's available, but allow @@ -195,19 +240,19 @@ sub nn_new ($$$) { try_starttls($nn_arg->{Host})) { # soft fail by default $nn->starttls or warn <<""; -W: <$uri> STARTTLS tried and failed (not requested) +W: <$uri> STARTTLS tried and failed (not requested): ${\(ndump($nn->message))} } elsif ($nntp_cfg->{starttls}) { # hard fail if explicitly configured $nn->starttls or die <<""; -E: <$uri> STARTTLS requested and failed +E: <$uri> STARTTLS requested and failed: ${\(ndump($nn->message))} } } elsif ($nntp_cfg->{starttls}) { $nn->can('starttls') or die "E: <$uri> Net::NNTP too old for STARTTLS\n"; $nn->starttls or die <<""; -E: <$uri> STARTTLS requested and failed +E: <$uri> STARTTLS requested and failed: ${\(ndump($nn->message))} } $nn; @@ -242,25 +287,32 @@ sub nn_for ($$$$) { # nn = Net::NNTP $nn_arg->{SSL} = 1 if $uri->secure; # snews == nntps my $sa = $self->{-proxy_cli}; %$nn_arg = (%$nn_arg, %$sa) if $sa; - my $nn = nn_new($nn_arg, $nntp_cfg, $uri) or - die "E: <$uri> new: $@".onion_hint($lei, $uri); + my $nn = nn_new($self, $nn_arg, $nntp_cfg, $uri); + return if $self->{quit}; + $nn // die "E: <$uri> new: $@".onion_hint($lei, $uri); if ($cred) { - $cred->fill($lei) unless defined($p); # may prompt user here + $p //= do { + $cred->fill($lei); # may prompt user here + $cred->{password}; + }; if ($nn->authinfo($u, $p)) { push @{$nntp_cfg->{-postconn}}, [ 'authinfo', $u, $p ]; } else { - warn "E: <$uri> AUTHINFO $u XXXX failed\n"; + warn <<EOM; +E: <$uri> AUTHINFO $u XXXX: ${\(ndump($nn->message))} +EOM $nn = undef; } } - - if ($nntp_cfg->{compress}) { + if ($nn && $nntp_cfg->{compress}) { # https://rt.cpan.org/Ticket/Display.html?id=129967 if ($nn->can('compress')) { if ($nn->compress) { push @{$nntp_cfg->{-postconn}}, [ 'compress' ]; } else { - warn "W: <$uri> COMPRESS failed\n"; + warn <<EOM; +W: <$uri> COMPRESS: ${\(ndump($nn->message))} +EOM } } else { delete $nntp_cfg->{compress}; @@ -304,14 +356,6 @@ sub cfg_intvl ($$$) { } } -sub cfg_bool ($$$) { - my ($cfg, $key, $url) = @_; - my $orig = $cfg->urlmatch($key, $url) // return; - my $bool = $cfg->git_bool($orig); - warn "W: $key=$orig for $url is not boolean\n" unless defined($bool); - $bool; -} - # flesh out common IMAP-specific data structures sub imap_common_init ($;$) { my ($self, $lei) = @_; @@ -329,11 +373,12 @@ sub imap_common_init ($;$) { # knobs directly for Mail::IMAPClient->new for my $k (qw(Starttls Debug Compress)) { - my $bool = cfg_bool($cfg, "imap.$k", $$uri) // next; - $mic_common->{$sec}->{$k} = $bool; + my $v = $cfg->urlmatch('--bool', "imap.$k", $$uri); + $mic_common->{$sec}->{$k} = $v if defined $v; } my $to = cfg_intvl($cfg, 'imap.timeout', $$uri); $mic_common->{$sec}->{Timeout} = $to if $to; + $mic_common->{$sec}->{Ssl} = 1 if $uri->scheme eq 'imaps'; # knobs we use ourselves: my $sa = socks_args($cfg->urlmatch('imap.Proxy', $$uri)); @@ -343,11 +388,18 @@ sub imap_common_init ($;$) { $self->{cfg_opt}->{$sec}->{$k} = $to; } my $k = 'imap.fetchBatchSize'; - my $bs = $cfg->urlmatch($k, $$uri) // next; - if ($bs =~ /\A([0-9]+)\z/ && $bs > 0) { - $self->{cfg_opt}->{$sec}->{batch_size} = $bs; - } else { - warn "$k=$bs is not a positive integer\n"; + if (defined(my $bs = $cfg->urlmatch($k, $$uri))) { + ($bs =~ /\A([0-9]+)\z/ && $bs > 0) ? + ($self->{cfg_opt}->{$sec}->{batch_size} = $bs) : + warn("$k=$bs is not a positive integer\n"); + } + my $v = $cfg->urlmatch(qw(--bool imap.sslVerify), $$uri); + if (defined $v) { + my $cur = $mic_common->{$sec} //= {}; + $cur->{Starttls} //= 1 if !$cur->{Ssl}; + for my $f (grep { $cur->{$_} } qw(Ssl Starttls)) { + set_ssl_verify_mode($cur->{$f} = {}, $v); + } } } # make sure we can connect and cache the credentials in memory @@ -356,8 +408,9 @@ sub imap_common_init ($;$) { my $sec = uri_section($orig_uri); my $uri = PublicInbox::URIimap->new("$sec/"); my $mic = $mics->{$sec} //= - mic_for($self, $uri, $mic_common, $lei) // - die "Unable to continue\n"; + mic_for($self, $uri, $mic_common, $lei); + return if $self->{quit}; + $mic // die "Unable to continue\n"; next unless $self->isa('PublicInbox::NetWriter'); next if $self->{-skip_creat}; my $dst = $orig_uri->mailbox // next; @@ -383,7 +436,7 @@ sub nntp_common_init ($;$) { my $args = $nn_common->{$sec} //= {}; # Debug and Timeout are passed to Net::NNTP->new - my $v = cfg_bool($cfg, 'nntp.Debug', $$uri); + my $v = $cfg->urlmatch(qw(--bool nntp.Debug), $$uri); $args->{Debug} = $v if defined $v; my $to = cfg_intvl($cfg, 'nntp.Timeout', $$uri); $args->{Timeout} = $to if $to; @@ -392,9 +445,11 @@ sub nntp_common_init ($;$) { # Net::NNTP post-connect commands for my $k (qw(starttls compress)) { - $v = cfg_bool($cfg, "nntp.$k", $$uri) // next; - $self->{cfg_opt}->{$sec}->{$k} = $v; + $v = $cfg->urlmatch('--bool', "nntp.$k", $$uri); + $self->{cfg_opt}->{$sec}->{$k} = $v if defined $v; } + $v = $cfg->urlmatch(qw(--bool nntp.sslVerify), $$uri); + set_ssl_verify_mode($args, $v) if defined $v; # -watch internal option for my $k (qw(pollInterval)) { @@ -685,7 +740,13 @@ sub mic_get { } my $mic = mic_new($self, $mic_arg, $sec, $uri); $cached //= {}; # invalid placeholder if no cache enabled - $mic && $mic->IsConnected ? ($cached->{$sec} = $mic) : undef; + if ($mic && $mic->IsConnected) { + $cached->{$sec} = $mic; + } else { + warn 'IMAP LastError: ',$mic->LastError, "\n" if $mic; + warn "IMAP errno: $!\n" if $!; + undef; + } } sub imap_each { @@ -717,7 +778,7 @@ sub nn_get { my $nn_arg = $self->{net_arg}->{$sec} or die "BUG: no Net::NNTP->new arg for $sec"; my $nntp_cfg = $self->{cfg_opt}->{$sec}; - $nn = nn_new($nn_arg, $nntp_cfg, $uri) or return; + $nn = nn_new($self, $nn_arg, $nntp_cfg, $uri) or return; if (my $postconn = $nntp_cfg->{-postconn}) { for my $m_arg (@$postconn) { my ($method, @args) = @$m_arg; @@ -759,14 +820,12 @@ sub _nntp_fetch_all ($$$) { $beg = $num_a if defined($num_a) && $num_a > $beg && $num_a <= $end; $end = $num_b if defined($num_b) && $num_b >= $beg && $num_b < $end; $end = $beg if defined($num_a) && !defined($num_b); - my ($err, $art, $last_art, $kw); # kw stays undef, no keywords in NNTP - unless ($self->{quiet}) { - warn "# $uri fetching ARTICLE $beg..$end\n"; - } + my ($err, $last_art, $kw); # kw stays undef, no keywords in NNTP + warn "# $uri fetching ARTICLE $beg..$end\n" if !$self->{quiet}; my $n = $self->{max_batch}; - for ($beg..$end) { + for my $art ($beg..$end) { last if $self->{quit}; - $art = $_; + local $0 = "#$art $group $sec"; if (--$n < 0) { run_commit_cb($self); $itrk->update_last(0, $last_art) if $itrk; diff --git a/lib/PublicInbox/NetWriter.pm b/lib/PublicInbox/NetWriter.pm index 4a1f34f6..7917ef89 100644 --- a/lib/PublicInbox/NetWriter.pm +++ b/lib/PublicInbox/NetWriter.pm @@ -1,10 +1,9 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # common writer code for IMAP (and later, JMAP) package PublicInbox::NetWriter; -use strict; -use v5.10.1; +use v5.12; use parent qw(PublicInbox::NetReader); use PublicInbox::Smsg; use PublicInbox::MsgTime qw(msg_timestamp); diff --git a/lib/PublicInbox/OnDestroy.pm b/lib/PublicInbox/OnDestroy.pm index 615bc450..4301edff 100644 --- a/lib/PublicInbox/OnDestroy.pm +++ b/lib/PublicInbox/OnDestroy.pm @@ -1,21 +1,31 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::OnDestroy; +use v5.12; +use parent qw(Exporter); +use autodie qw(fork); +our @EXPORT = qw(on_destroy); +our $fork_gen = 0; -sub new { - shift; # ($class, $cb, @args) - bless [ @_ ], __PACKAGE__; +# either parent or child is expected to exit or exec shortly after this: +sub fork_tmp () { + my $pid = fork; + ++$fork_gen if $pid == 0; + $pid; } +# all children +sub all (@) { bless [ undef, @_ ], __PACKAGE__ } + +# same process +sub on_destroy (@) { bless [ $fork_gen, @_ ], __PACKAGE__ } + +sub cancel { @{$_[0]} = () } + sub DESTROY { - my ($cb, @args) = @{$_[0]}; - if (!ref($cb) && $cb) { - my $pid = $cb; - return if $pid != $$; - $cb = shift @args; - } - $cb->(@args) if $cb; + my ($fgen, $cb, @args) = @{$_[0]}; + $cb->(@args) if ($cb && ($fgen // $fork_gen) == $fork_gen); } 1; diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm index d6409b2a..3b7d49f5 100644 --- a/lib/PublicInbox/Over.pm +++ b/lib/PublicInbox/Over.pm @@ -12,6 +12,7 @@ use DBD::SQLite; use PublicInbox::Smsg; use Compress::Zlib qw(uncompress); use constant DEFAULT_LIMIT => 1000; +use List::Util (); # for max sub dbh_new { my ($self, $rw) = @_; @@ -81,7 +82,13 @@ sub dbh_close { } } -sub dbh ($) { $_[0]->{dbh} //= $_[0]->dbh_new } # dbh_new may be subclassed +sub dbh ($) { + my ($self) = @_; + $self->{dbh} // do { + my $dbh = $self->dbh_new; # dbh_new may be subclassed + $self->{dbh} = $dbh; + } +} sub load_from_row ($;$) { my ($smsg, $cull) = @_; @@ -193,15 +200,22 @@ ORDER BY $sort_col DESC # TODO separate strict and loose matches here once --reindex # is fixed to preserve `tid' properly push @$msgs, @$loose; + + # we wanted to retrieve the latest loose messages; but preserve + # chronological ordering for threading /$INBOX/$MSGID/[tT]/ + $sort_col eq 'ds' and + @$msgs = sort { $a->{ds} <=> $b->{ds} } @$msgs; } ($nr, $msgs); } # strict `tid' matches, only, for thread-expanded mbox.gz search results -# and future CLI interface +# and lei # returns true if we have IDs, undef if not sub expand_thread { my ($self, $ctx) = @_; + # previous maxuid for LeiSavedSearch is our min: + my $lss_min = $ctx->{min} // 0; my $dbh = dbh($self); do { defined(my $num = $ctx->{ids}->[0]) or return; @@ -214,7 +228,7 @@ SELECT num FROM over WHERE tid = ? AND num > ? ORDER BY num ASC LIMIT 1000 my $xids = $dbh->selectcol_arrayref($sql, undef, $tid, - $ctx->{prev} // 0); + List::Util::max($ctx->{prev} // 0, $lss_min)); if (scalar(@$xids)) { $ctx->{prev} = $xids->[-1]; $ctx->{xids} = $xids; @@ -253,9 +267,12 @@ SELECT ts,ds,ddd FROM over WHERE $s sub get_art { my ($self, $num) = @_; # caching $sth ourselves is faster than prepare_cached - my $sth = $self->{-get_art} //= dbh($self)->prepare(<<''); + my $sth = $self->{-get_art} // do { + my $sth = dbh($self)->prepare(<<''); SELECT num,tid,ds,ts,ddd FROM over WHERE num = ? LIMIT 1 + $self->{-get_art} = $sth; + }; $sth->execute($num); my $smsg = $sth->fetchrow_hashref; $smsg ? load_from_row($smsg) : undef; @@ -282,13 +299,35 @@ SELECT eidx_key FROM inboxes WHERE ibx_id = ? $rows; } +sub mid2tid { + my ($self, $mid) = @_; + my $dbh = dbh($self); + + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT id FROM msgid WHERE mid = ? LIMIT 1 + + $sth->execute($mid); + my $id = $sth->fetchrow_array or return; + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT num FROM id2num WHERE id = ? AND num > ? +ORDER BY num ASC LIMIT 1 + + $sth->execute($id, 0); + my $num = $sth->fetchrow_array or return; + $sth = $dbh->prepare(<<''); +SELECT tid FROM over WHERE num = ? LIMIT 1 + + $sth->execute($num); + $sth->fetchrow_array; +} + sub next_by_mid { my ($self, $mid, $id, $prev) = @_; my $dbh = dbh($self); unless (defined $$id) { my $sth = $dbh->prepare_cached(<<'', undef, 1); - SELECT id FROM msgid WHERE mid = ? LIMIT 1 +SELECT id FROM msgid WHERE mid = ? LIMIT 1 $sth->execute($mid); $$id = $sth->fetchrow_array; diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index e7c96e14..4f8533f7 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI @@ -17,6 +17,7 @@ use PublicInbox::MID qw/id_compress mids_for_index references/; use PublicInbox::Smsg qw(subject_normalized); use Compress::Zlib qw(compress); use Carp qw(croak); +use bytes (); # length sub dbh_new { my ($self) = @_; @@ -199,7 +200,7 @@ sub resolve_mid_to_tid { $tid // do { # create a new ghost my $id = mid2id($self, $mid); my $num = next_ghost_num($self); - $num < 0 or die "ghost num is non-negative: $num\n"; + $num < 0 or croak "BUG: ghost num is non-negative: $num\n"; $tid = next_tid($self); my $dbh = $self->{dbh}; $dbh->prepare_cached(<<'')->execute($num, $tid); @@ -263,7 +264,10 @@ sub ddd_for ($) { sub add_overview { my ($self, $eml, $smsg) = @_; - $smsg->{lines} = $eml->body_raw =~ tr!\n!\n!; + my $raw = $eml->body_raw; + $smsg->{lines} = $raw =~ tr!\n!\n!; + $smsg->{bytes} //= bytes::length $raw; + undef $raw; my $mids = mids_for_index($eml); my $refs = $smsg->parse_references($eml, $mids); $mids->[0] //= do { @@ -283,7 +287,7 @@ sub _add_over { my ($self, $smsg, $mid, $refs, $old_tid, $v) = @_; my $cur_tid = $smsg->{tid}; my $n = $smsg->{num}; - die "num must not be zero for $mid" if !$n; + croak "BUG: num must not be zero for $mid" if !$n; my $cur_valid = $cur_tid > $self->{min_tid}; if ($n > 0) { # regular mail @@ -454,7 +458,7 @@ sub rollback_lazy { sub dbh_close { my ($self) = @_; - die "in transaction" if $self->{txn}; + Carp::confess('BUG: in transaction') if $self->{txn}; $self->SUPER::dbh_close; } @@ -509,18 +513,18 @@ EOF next; } $pr->(<<EOM) if $pr; -I: ghost $r->{num} <$mid> THREADID=$r->{tid} culled +# ghost $r->{num} <$mid> THREADID=$r->{tid} culled EOM } delete_by_num($self, $r->{num}); } - $pr->("I: rethread culled $total ghosts\n") if $pr && $total; + $pr->("# rethread culled $total ghosts\n") if $pr && $total; } # used for cross-inbox search sub eidx_prep ($) { my ($self) = @_; - $self->{-eidx_prep} //= do { + $self->{-eidx_prep} // do { my $dbh = $self->dbh; $dbh->do(<<''); INSERT OR IGNORE INTO counter (key) VALUES ('eidx_docid') @@ -565,7 +569,7 @@ CREATE TABLE IF NOT EXISTS eidx_meta ( $dbh->do(<<''); CREATE TABLE IF NOT EXISTS eidxq (docid INTEGER PRIMARY KEY NOT NULL) - 1; + $self->{-eidx_prep} = 1; }; } @@ -670,4 +674,17 @@ sub vivify_xvmd { $smsg->{-vivify_xvmd} = \@vivify_xvmd; } +sub fork_ok { + state $fork_ok = eval("v$DBD::SQLite::sqlite_version") ge v3.8.3; + return 1 if $fork_ok; + my ($opt) = @_; + my @j = split(/,/, $opt->{jobs} // ''); + state $warned; + grep { $_ > 1 } @j and $warned //= warn(<<EOM); +DBD::SQLite version is v$DBD::SQLite::sqlite_version, need >= v3.8.3 for --jobs > 1 +EOM + $opt->{jobs} = '1,1'; + undef; +} + 1; diff --git a/lib/PublicInbox/POP3.pm b/lib/PublicInbox/POP3.pm index c993e558..06772069 100644 --- a/lib/PublicInbox/POP3.pm +++ b/lib/PublicInbox/POP3.pm @@ -41,15 +41,11 @@ use PublicInbox::IMAP; # for UID slice stuff use constant { LINE_MAX => 512, # XXX unsure + UID_SLICE => PublicInbox::IMAP::UID_SLICE, }; # XXX FIXME: duplicated stuff from NNTP.pm and IMAP.pm -sub err ($$;@) { - my ($self, $fmt, @args) = @_; - printf { $self->{pop3d}->{err} } $fmt."\n", @args; -} - sub out ($$;@) { my ($self, $fmt, @args) = @_; printf { $self->{pop3d}->{out} } $fmt."\n", @args; @@ -66,7 +62,7 @@ sub new { (bless { pop3d => $pop3d }, $cls)->greet($sock) } -# POP user is $UUID1@$NEWSGROUP.$SLICE +# POP user is $UUID1@$NEWSGROUP[.$SLICE][?QUERY_ARGS] sub cmd_user ($$) { my ($self, $mailbox) = @_; $self->{salt} // return \"-ERR already authed\r\n"; @@ -75,21 +71,26 @@ sub cmd_user ($$) { my $user = $1; $user =~ tr/-//d; # most have dashes, some (dbus-uuidgen) don't $user =~ m!\A[a-f0-9]{32}\z!i or return \"-ERR user has no UUID\r\n"; - my $slice; - $mailbox =~ s/\.([0-9]+)\z// and $slice = $1 + 0; + + my %l; + if ($mailbox =~ s/\?(.*)\z//) { # query args + for (split(/&+/, $1)) { + /\A(initial_limit|limit)=([0-9]+)\z/ and $l{$1} = $2; + } + $self->{limits} = \%l; + } + my $slice = $mailbox =~ s/\.([0-9]+)\z// ? $1 + 0 : undef; + my $ibx = $self->{pop3d}->{pi_cfg}->lookup_newsgroup($mailbox) // return \"-ERR $mailbox does not exist\r\n"; - my $uidmax = $ibx->mm(1)->num_highwater // 0; + my $uidmax = $self->{uidmax} = $ibx->mm(1)->num_highwater // 0; if (defined $slice) { - my $max = int($uidmax / PublicInbox::IMAP::UID_SLICE); + my $max = int($uidmax / UID_SLICE); my $tip = "$mailbox.$max"; return \"-ERR $mailbox.$slice does not exist ($tip does)\r\n" if $slice > $max; - $self->{uid_base} = $slice * PublicInbox::IMAP::UID_SLICE; $self->{slice} = $slice; - } else { # latest 50K messages - my $base = $uidmax - PublicInbox::IMAP::UID_SLICE; - $self->{uid_base} = $base < 0 ? 0 : $base; + } else { # latest messages: $self->{slice} = -1; } $self->{ibx} = $ibx; @@ -100,12 +101,27 @@ sub cmd_user ($$) { sub _login_ok ($) { my ($self) = @_; - if ($self->{pop3d}->lock_mailbox($self)) { - $self->{uid_max} = $self->{ibx}->over(1)->max; - \"+OK logged in\r\n"; - } else { - \"-ERR [IN-USE] unable to lock maildrop\r\n"; + $self->{pop3d}->lock_mailbox($self) or + return \"-ERR [IN-USE] unable to lock maildrop\r\n"; + + my $l = delete $self->{limits}; + $l = defined($self->{uid_dele}) ? $l->{limit} + : ($l->{initial_limit} // $l->{limit}); + my $uidmax = delete $self->{uidmax}; + if ($self->{slice} >= 0) { + $self->{uid_base} = $self->{slice} * UID_SLICE; + if (defined $l) { # n.b: the last slice is not full: + my $max = int($uidmax/UID_SLICE) == $self->{slice} ? + ($uidmax % UID_SLICE) : UID_SLICE; + my $off = $max - $l; + $self->{uid_base} += $off if $off > 0; + } + } else { # latest $l messages, or 1k if unspecified + my $base = $uidmax - ($l // 1000); + $self->{uid_base} = $base < 0 ? 0 : $base; } + $self->{uid_max} = $self->{ibx}->over(1)->max; + \"+OK logged in\r\n"; } sub cmd_apop { @@ -154,32 +170,32 @@ SELECT num,ddd FROM over WHERE num >= ? AND num <= ? ORDER BY num ASC $sth->execute($beg, $end); - do { - $m = $sth->fetchall_arrayref({}, 1000); + my $tot = 0; + while (defined($m = $sth->fetchall_arrayref({}, 1000))) { for my $x (@$m) { PublicInbox::Over::load_from_row($x); push(@cache, $x->{num}, $x->{bytes} + 0, $x->{blob}); undef $x; # saves ~1.5M memory w/ 50k messages + $tot += $cache[-2]; } - } while (scalar(@$m) && ($beg = $cache[-3] + 1)); - \@cache; + } + $self->{total_bytes} = $tot; + $self->{cache} = \@cache; } sub cmd_stat { my ($self) = @_; my $err; $err = need_txn($self) and return $err; - my $cache = $self->{cache} //= _stat_cache($self); - my $tot = 0; - for (my $i = 1; $i < scalar(@$cache); $i += 3) { $tot += $cache->[$i] } + my $cache = $self->{cache} // _stat_cache($self); my $nr = @$cache / 3 - ($self->{nr_dele} // 0); - "+OK $nr $tot\r\n"; + "+OK $nr $self->{total_bytes}\r\n"; } # for LIST and UIDL sub _list { my ($desc, $idx, $self, $msn) = @_; my $err; $err = need_txn($self) and return $err; - my $cache = $self->{cache} //= _stat_cache($self); + my $cache = $self->{cache} // _stat_cache($self); if (defined $msn) { my $base_off = ($msn - 1) * 3; my $val = $cache->[$base_off + $idx] // @@ -209,8 +225,9 @@ sub mark_dele ($$) { my $old = $self->{txn_max_uid} //= $uid; $self->{txn_max_uid} = $uid if $uid > $old; + $self->{total_bytes} -= $cache->[$base_off + 1]; $cache->[$base_off] = undef; # clobber UID - $cache->[$base_off + 1] = 0; # zero bytes (simplifies cmd_stat) + $cache->[$base_off + 1] = undef; # clobber bytes $cache->[$base_off + 2] = undef; # clobber oidhex ++$self->{nr_dele}; } @@ -220,7 +237,10 @@ sub retr_cb { # called by git->cat_async via ibx_async_cat my ($self, $off, $top_nr) = @$args; my $hex = $self->{cache}->[$off * 3 + 2] // die "BUG: no hex (oid=$oid)"; - if (!defined($oid)) { + if (!defined($type)) { + warn "E: git aborted on $oid / $hex $self->{ibx}->{inboxdir}"; + return $self->close; + } elsif ($type ne 'blob') { # it's possible to have TOCTOU if an admin runs # public-inbox-(edit|purge), just move onto the next message warn "E: $hex missing in $self->{ibx}->{inboxdir}\n"; @@ -238,7 +258,7 @@ sub retr_cb { # called by git->cat_async via ibx_async_cat my @tmp = split(/^/m, $bdy); $hdr .= join('', splice(@tmp, 0, $top_nr)); } elsif (exists $self->{expire}) { - $self->{expire} .= pack('S', $off + 1); + $self->{expire} .= pack('S', $off); } $$bref =~ s/^\./../gms; $$bref .= substr($$bref, -2, 2) eq "\r\n" ? ".\r\n" : "\r\n.\r\n"; @@ -252,7 +272,7 @@ sub cmd_retr { return \"-ERR lines must be a non-negative number\r\n" if (defined($top_nr) && $top_nr !~ /\A[0-9]+\z/); my $err; $err = need_txn($self) and return $err; - my $cache = $self->{cache} //= _stat_cache($self); + my $cache = $self->{cache} // _stat_cache($self); my $off = $msn - 1; my $hex = $cache->[$off * 3 + 2] // return \"-ERR no such message\r\n"; ${ibx_async_cat($self->{ibx}, $hex, \&retr_cb, @@ -272,7 +292,7 @@ sub cmd_rset { sub cmd_dele { my ($self, $msn) = @_; my $err; $err = need_txn($self) and return $err; - $self->{cache} //= _stat_cache($self); + $self->{cache} // _stat_cache($self); $msn =~ /\A[1-9][0-9]*\z/ or return \"-ERR no such message\r\n"; mark_dele($self, $msn - 1) ? \"+OK\r\n" : \"-ERR no such message\r\n"; } @@ -308,7 +328,7 @@ sub __cleanup_state { $self->{pop3d}->{-state_dbh}->prepare_cached(<<'')->execute($txn_id); DELETE FROM deletes WHERE txn_id = ? AND uid_dele = -1 - my $sth = $self->{pop3d}->{-state_dbh}->prepare_cached(<<''); + my $sth = $self->{pop3d}->{-state_dbh}->prepare_cached(<<'', undef, 1); SELECT COUNT(*) FROM deletes WHERE user_id = ? $sth->execute($user_id); @@ -350,7 +370,7 @@ UPDATE users SET last_seen = ? WHERE user_id = ? $self->{pop3d}->unlock_mailbox($self); } $self->write(\"+OK public-inbox POP3 server signing off\r\n"); - $self->close; + $self->shutdn; undef; } @@ -364,8 +384,8 @@ sub process_line ($$) { \"-ERR command not recognized\r\n"; my $err = $@; if ($err && $self->{sock}) { - chomp($l); - err($self, 'error from: %s (%s)', $l, $err); + $l =~ s/\r?\n//s; + warn("error from: $l ($err)\n"); $res = \"-ERR program fault - command not performed\r\n"; } defined($res) ? $self->write($res) : 0; diff --git a/lib/PublicInbox/POP3D.pm b/lib/PublicInbox/POP3D.pm index 7432a964..bd440434 100644 --- a/lib/PublicInbox/POP3D.pm +++ b/lib/PublicInbox/POP3D.pm @@ -13,27 +13,42 @@ use PublicInbox::POP3; use PublicInbox::Syscall; use File::Temp 0.19 (); # 0.19 for ->newdir use Fcntl qw(F_SETLK F_UNLCK F_WRLCK SEEK_SET); -my @FLOCK; -if ($^O eq 'linux' || $^O eq 'freebsd') { +my ($FLOCK_TMPL, @FLOCK_ORDER); +# are all BSDs the same "struct flock"? tested Free+Net+Open... +if ($^O =~ /\A(?:linux|dragonfly)\z/ || $^O =~ /bsd/) { require Config; my $off_t; + my @LE_pad = ('', ''); my $sz = $Config::Config{lseeksize}; - - if ($sz == 8 && eval('length(pack("q", 1)) == 8')) { $off_t = 'q' } - elsif ($sz == 4) { $off_t = 'l' } - else { warn "sizeof(off_t)=$sz requires File::FcntlLock\n" } - + if ($sz == 8) { + if (eval('length(pack("q", 1)) == 8')) { + $off_t = 'q'; + } elsif ($Config::Config{byteorder} == 1234) { # OpenBSD i386 + $off_t = 'l'; + @LE_pad = ('@8', '@16'); + } else { # I have no 32-bit BE machine to test on... + warn <<EOM; +Perl built with 64-bit file support but not 64-bit int (pack("q")) support. +byteorder=$Config::Config{byteorder} +EOM + } + } elsif ($sz == 4) { + $off_t = 'l'; + } else { + warn "sizeof(off_t)=$sz requires File::FcntlLock\n" + } if (defined($off_t)) { if ($^O eq 'linux') { - @FLOCK = ("ss\@8$off_t$off_t\@32", - qw(l_type l_whence l_start l_len)); - } elsif ($^O eq 'freebsd') { - @FLOCK = ("${off_t}${off_t}lss\@256", - qw(l_start l_len l_pid l_type l_whence)); + $FLOCK_TMPL = 'ss@8'.$off_t.$LE_pad[0].$off_t.'@32'; + @FLOCK_ORDER = qw(l_type l_whence l_start l_len); + } else { # *bsd including dragonfly + $FLOCK_TMPL = $off_t.$LE_pad[0].$off_t.$LE_pad[1]. + 'lss@256'; + @FLOCK_ORDER = qw(l_start l_len l_pid l_type l_whence); } } } -@FLOCK or eval { require File::FcntlLock } or +@FLOCK_ORDER or eval { require File::FcntlLock } or die "File::FcntlLock required for POP3 on $^O: $@\n"; sub new { @@ -140,9 +155,9 @@ sub _setlk ($%) { my ($self, %lk) = @_; $lk{l_pid} = 0; # needed for *BSD $lk{l_whence} = SEEK_SET; - if (@FLOCK) { + if (@FLOCK_ORDER) { fcntl($self->{txn_fh}, F_SETLK, - pack($FLOCK[0], @lk{@FLOCK[1..$#FLOCK]})); + pack($FLOCK_TMPL, @lk{@FLOCK_ORDER})); } else { my $fs = File::FcntlLock->new(%lk); $fs->lock($self->{txn_fh}, F_SETLK); @@ -156,6 +171,7 @@ sub lock_mailbox { my ($user_id, $ngid, $mbid, $txn_id); my $uuid = delete $pop3->{uuid}; $dbh->begin_work; + my $creat = 0; # 1. make sure the user exists, update `last_seen' my $sth = $dbh->prepare_cached(<<''); @@ -218,13 +234,13 @@ SELECT mailbox_id FROM mailboxes WHERE newsgroup_id = ? AND slice = ? $sth = $dbh->prepare_cached(<<''); INSERT OR IGNORE INTO deletes (user_id,mailbox_id) VALUES (?,?) - if ($sth->execute($user_id, $mbid) == 0) { + if ($sth->execute($user_id, $mbid) == 0) { # fetching into existing $sth = $dbh->prepare_cached(<<'', undef, 1); SELECT txn_id,uid_dele FROM deletes WHERE user_id = ? AND mailbox_id = ? $sth->execute($user_id, $mbid); ($txn_id, $pop3->{uid_dele}) = $sth->fetchrow_array; - } else { + } else { # new user/mailbox combo $txn_id = $dbh->last_insert_id(undef, undef, 'deletes', 'txn_id'); } diff --git a/lib/PublicInbox/PktOp.pm b/lib/PublicInbox/PktOp.pm index 4c434566..1bcdd799 100644 --- a/lib/PublicInbox/PktOp.pm +++ b/lib/PublicInbox/PktOp.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # op dispatch socket, reads a message, runs a sub @@ -6,12 +6,11 @@ # Used for lei_xsearch and maybe other things # "command" => [ $sub, @fixed_operands ] package PublicInbox::PktOp; -use strict; -use v5.10.1; +use v5.12; use parent qw(PublicInbox::DS); use Errno qw(EAGAIN ECONNRESET); use PublicInbox::Syscall qw(EPOLLIN); -use Socket qw(AF_UNIX MSG_EOR SOCK_SEQPACKET); +use Socket qw(AF_UNIX SOCK_SEQPACKET); use PublicInbox::IPC qw(ipc_freeze ipc_thaw); use Scalar::Util qw(blessed); @@ -32,7 +31,7 @@ sub pair { sub pkt_do { # for the producer to trigger event_step in consumer my ($self, $cmd, @args) = @_; - send($self->{op_p}, @args ? "$cmd\0".ipc_freeze(\@args) : $cmd, MSG_EOR) + send($self->{op_p}, @args ? "$cmd\0".ipc_freeze(\@args) : $cmd, 0) } sub event_step { @@ -55,7 +54,15 @@ sub event_step { my $op = $self->{ops}->{$cmd //= $msg}; if ($op) { my ($obj, @args) = (@$op, @pargs); - blessed($obj) ? $obj->$cmd(@args) : $obj->(@args); + if (blessed($args[0]) && $args[0]->can('do_env')) { + my $lei = shift @args; + $lei->do_env($obj, @args); + } elsif (blessed($obj)) { + $obj->can('do_env') ? $obj->do_env($cmd, @args) + : $obj->$cmd(@args); + } else { + $obj->(@args); + } } elsif ($msg ne '') { die "BUG: unknown message: `$cmd'"; } diff --git a/lib/PublicInbox/ProcessPipe.pm b/lib/PublicInbox/ProcessPipe.pm deleted file mode 100644 index 97e9c268..00000000 --- a/lib/PublicInbox/ProcessPipe.pm +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> -# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> - -# a tied handle for auto reaping of children tied to a pipe, see perltie(1) -package PublicInbox::ProcessPipe; -use strict; -use v5.10.1; -use Carp qw(carp); - -sub TIEHANDLE { - my ($class, $pid, $fh, $cb, $arg) = @_; - bless { pid => $pid, fh => $fh, ppid => $$, cb => $cb, arg => $arg }, - $class; -} - -sub BINMODE { binmode(shift->{fh}) } # for IO::Uncompress::Gunzip - -sub READ { read($_[0]->{fh}, $_[1], $_[2], $_[3] || 0) } - -sub READLINE { readline($_[0]->{fh}) } - -sub WRITE { - use bytes qw(length); - syswrite($_[0]->{fh}, $_[1], $_[2] // length($_[1]), $_[3] // 0); -} - -sub PRINT { - my $self = shift; - print { $self->{fh} } @_; -} - -sub FILENO { fileno($_[0]->{fh}) } - -sub _close ($;$) { - my ($self, $wait) = @_; - my $fh = delete $self->{fh}; - my $ret = defined($fh) ? close($fh) : ''; - my ($pid, $cb, $arg) = delete @$self{qw(pid cb arg)}; - return $ret unless defined($pid) && $self->{ppid} == $$; - if ($wait) { # caller cares about the exit status: - my $wp = waitpid($pid, 0); - if ($wp == $pid) { - $ret = '' if $?; - if ($cb) { - eval { $cb->($arg, $pid) }; - carp "E: cb(arg, $pid): $@" if $@; - } - } else { - carp "waitpid($pid, 0) = $wp, \$!=$!, \$?=$?"; - } - } else { # caller just undef-ed it, let event loop deal with it - require PublicInbox::DS; - PublicInbox::DS::dwaitpid($pid, $cb, $arg); - } - $ret; -} - -# if caller uses close(), assume they want to check $? immediately so -# we'll waitpid() synchronously. n.b. wantarray doesn't seem to -# propagate `undef' down to tied methods, otherwise I'd rely on that. -sub CLOSE { _close($_[0], 1) } - -# if relying on DESTROY, assume the caller doesn't care about $? and -# we can let the event loop call waitpid() whenever it gets SIGCHLD -sub DESTROY { - _close($_[0]); - undef; -} - -1; diff --git a/lib/PublicInbox/Qspawn.pm b/lib/PublicInbox/Qspawn.pm index 53d0ad55..0bf857c6 100644 --- a/lib/PublicInbox/Qspawn.pm +++ b/lib/PublicInbox/Qspawn.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Like most Perl modules in public-inbox, this is internal and @@ -25,9 +25,15 @@ # processes such as git-apply(1). package PublicInbox::Qspawn; -use strict; +use v5.12; use PublicInbox::Spawn qw(popen_rd); use PublicInbox::GzipFilter; +use Scalar::Util qw(blessed); +use PublicInbox::Limiter; +use PublicInbox::Aspawn qw(run_await); +use PublicInbox::Syscall qw(EPOLLIN); +use PublicInbox::InputPipe; +use Carp qw(carp confess); # n.b.: we get EAGAIN with public-inbox-httpd, and EINTR on other PSGI servers use Errno qw(EAGAIN EINTR); @@ -38,52 +44,45 @@ my $def_limiter; # $cmd is the command to spawn # $cmd_env is the environ for the child process (not PSGI env) # $opt can include redirects and perhaps other process spawning options -sub new ($$$;) { +# {qsp_err} is an optional error buffer callers may access themselves +sub new { my ($class, $cmd, $cmd_env, $opt) = @_; - bless { args => [ $cmd, $cmd_env, $opt ] }, $class; + bless { args => [ $cmd, $cmd_env, $opt ? { %$opt } : {} ] }, $class; } sub _do_spawn { my ($self, $start_cb, $limiter) = @_; - my $err; - my ($cmd, $cmd_env, $opt) = @{delete $self->{args}}; + my ($cmd, $cmd_env, $opt) = @{$self->{args}}; my %o = %{$opt || {}}; $self->{limiter} = $limiter; - foreach my $k (@PublicInbox::Spawn::RLIMITS) { - if (defined(my $rlimit = $limiter->{$k})) { - $o{$k} = $rlimit; - } + for my $k (@PublicInbox::Spawn::RLIMITS) { + $opt->{$k} = $limiter->{$k} // next; + } + $self->{-quiet} = 1 if $o{quiet}; + $limiter->{running}++; + if ($start_cb) { + eval { # popen_rd may die on EMFILE, ENFILE + $self->{rpipe} = popen_rd($cmd, $cmd_env, $opt, + \&waitpid_err, $self); + $start_cb->($self); # EPOLL_CTL_ADD may ENOSPC/ENOMEM + }; + } else { + eval { run_await($cmd, $cmd_env, $opt, \&wait_await, $self) }; + warn "E: $@" if $@; } - $self->{cmd} = $o{quiet} ? undef : $cmd; - eval { - # popen_rd may die on EMFILE, ENFILE - $self->{rpipe} = popen_rd($cmd, $cmd_env, \%o); - - die "E: $!" unless defined($self->{rpipe}); - - $limiter->{running}++; - $start_cb->($self); # EPOLL_CTL_ADD may ENOSPC/ENOMEM - }; finish($self, $@) if $@; } -sub child_err ($) { - my ($child_error) = @_; # typically $? - my $exitstatus = ($child_error >> 8) or return; - my $sig = $child_error & 127; - my $msg = "exit status=$exitstatus"; - $msg .= " signal=$sig" if $sig; - $msg; +sub psgi_status_err { # Qspawn itself is useful w/o PSGI + require PublicInbox::WwwStatic; + PublicInbox::WwwStatic::r($_[0] // 500); } -sub finalize ($$) { - my ($self, $err) = @_; - - my ($env, $qx_cb, $qx_arg, $qx_buf) = - delete @$self{qw(psgi_env qx_cb qx_arg qx_buf)}; +sub finalize ($) { + my ($self) = @_; - # done, spawn whatever's in the queue - my $limiter = $self->{limiter}; + # process is done, spawn whatever's in the queue + my $limiter = delete $self->{limiter} or return; my $running = --$limiter->{running}; if ($running < $limiter->{max}) { @@ -91,34 +90,69 @@ sub finalize ($$) { _do_spawn(@$next, $limiter); } } - - if ($err) { - if (defined $self->{err}) { - $self->{err} .= "; $err"; - } else { - $self->{err} = $err; - } - if ($env && $self->{cmd}) { - warn join(' ', @{$self->{cmd}}) . ": $err"; + if (my $err = $self->{_err}) { # set by finish or waitpid_err + utf8::decode($err); + if (my $dst = $self->{qsp_err}) { + $$dst .= $$dst ? " $err" : "; $err"; } + warn "E: @{$self->{args}->[0]}: $err\n" if !$self->{-quiet}; + } + + my ($env, $qx_cb_arg) = delete @$self{qw(psgi_env qx_cb_arg)}; + if ($qx_cb_arg) { + my $cb = shift @$qx_cb_arg; + eval { $cb->($self->{args}->[2]->{1}, @$qx_cb_arg) }; + return unless $@; + warn "E: $@"; # hope qspawn.wcb can handle it } - if ($qx_cb) { - eval { $qx_cb->($qx_buf, $qx_arg) }; - } elsif (my $wcb = delete $env->{'qspawn.wcb'}) { + return if $self->{passed}; # another command chained it + if (my $wcb = delete $env->{'qspawn.wcb'}) { # have we started writing, yet? - require PublicInbox::WwwStatic; - $wcb->(PublicInbox::WwwStatic::r(500)); + $wcb->(psgi_status_err($env->{'qspawn.fallback'})); } } -# callback for dwaitpid or ProcessPipe -sub waitpid_err { finalize($_[0], child_err($?)) } +sub waitpid_err { # callback for awaitpid + my (undef, $self) = @_; # $_[0]: pid + $self->{_err} = ''; # for defined check in ->finish + if ($?) { # XXX this may be redundant + my $status = $? >> 8; + my $sig = $? & 127; + $self->{_err} .= "exit status=$status"; + $self->{_err} .= " signal=$sig" if $sig; + } + finalize($self) if !$self->{rpipe}; +} + +sub wait_await { # run_await cb + my ($pid, $cmd, $cmd_env, $opt, $self) = @_; + waitpid_err($pid, $self); +} + +sub yield_chunk { # $_[-1] is sysread buffer (or undef) + my ($self, $ipipe) = @_; + if (!defined($_[-1])) { + warn "error reading body: $!"; + } elsif ($_[-1] eq '') { # normal EOF + $self->finish; + $self->{qfh}->close; + } elsif (defined($self->{qfh}->write($_[-1]))) { + return; # continue while HTTP client is reading our writes + } # else { # HTTP client disconnected + delete $self->{rpipe}; + $ipipe->close; +} sub finish ($;$) { my ($self, $err) = @_; - my $tied_pp = delete($self->{rpipe}) or return finalize($self, $err); - my PublicInbox::ProcessPipe $pp = tied *$tied_pp; - @$pp{qw(cb arg)} = (\&waitpid_err, $self); # for ->DESTROY + $self->{_err} //= $err; # only for $@ + + # we can safely finalize if pipe was closed before, or if + # {_err} is defined by waitpid_err. Deleting {rpipe} will + # trigger PublicInbox::IO::DESTROY -> waitpid_err, + # but it may not fire right away if inside the event loop. + my $closed_before = !delete($self->{rpipe}); + finalize($self) if $closed_before || defined($self->{_err}); } sub start ($$$) { @@ -130,137 +164,92 @@ sub start ($$$) { } } -sub psgi_qx_init_cb { - my ($self) = @_; - my $async = delete $self->{async}; - my ($r, $buf); - my $qx_fh = $self->{qx_fh}; -reread: - $r = sysread($self->{rpipe}, $buf, 65536); - if ($async) { - $async->async_pass($self->{psgi_env}->{'psgix.io'}, - $qx_fh, \$buf); - } elsif (defined $r) { - $r ? (print $qx_fh $buf) : event_step($self, undef); - } else { - return if $! == EAGAIN; # try again when notified - goto reread if $! == EINTR; - event_step($self, $!); - } -} - -sub psgi_qx_start { - my ($self) = @_; - if (my $async = $self->{psgi_env}->{'pi-httpd.async'}) { - # PublicInbox::HTTPD::Async->new(rpipe, $cb, cb_arg, $end_obj) - $self->{async} = $async->($self->{rpipe}, - \&psgi_qx_init_cb, $self, $self); - # init_cb will call ->async_pass or ->close - } else { # generic PSGI - psgi_qx_init_cb($self) while $self->{qx_fh}; - } -} - -# Similar to `backtick` or "qx" ("perldoc -f qx"), it calls $qx_cb with +# Similar to `backtick` or "qx" ("perldoc -f qx"), it calls @qx_cb_arg with # the stdout of the given command when done; but respects the given limiter # $env is the PSGI env. As with ``/qx; only use this when output is small # and safe to slurp. sub psgi_qx { - my ($self, $env, $limiter, $qx_cb, $qx_arg) = @_; + my ($self, $env, $limiter, @qx_cb_arg) = @_; $self->{psgi_env} = $env; - my $qx_buf = ''; - open(my $qx_fh, '+>', \$qx_buf) or die; # PerlIO::scalar - $self->{qx_cb} = $qx_cb; - $self->{qx_arg} = $qx_arg; - $self->{qx_fh} = $qx_fh; - $self->{qx_buf} = \$qx_buf; - $limiter ||= $def_limiter ||= PublicInbox::Qspawn::Limiter->new(32); - start($self, $limiter, \&psgi_qx_start); + $self->{qx_cb_arg} = \@qx_cb_arg; + $limiter ||= $def_limiter ||= PublicInbox::Limiter->new(32); + start($self, $limiter, undef); } -# this is called on pipe EOF to reap the process, may be called -# via PublicInbox::DS event loop OR via GetlineBody for generic -# PSGI servers. -sub event_step { - my ($self, $err) = @_; # $err: $! - warn "psgi_{return,qx} $err" if defined($err); - finish($self); - my ($fh, $qx_fh) = delete(@$self{qw(fh qx_fh)}); - $fh->close if $fh; # async-only (psgi_return) -} +sub yield_pass { + my ($self, $ipipe, $res) = @_; # $ipipe = InputPipe + my $env = $self->{psgi_env}; + my $wcb = delete $env->{'qspawn.wcb'} // confess('BUG: no qspawn.wcb'); + if (ref($res) eq 'CODE') { # chain another command + delete $self->{rpipe}; + $ipipe->close if $ipipe; + $res->($wcb); + $self->{passed} = 1; + return; # all done + } + confess("BUG: $res unhandled") if ref($res) ne 'ARRAY'; -sub rd_hdr ($) { - my ($self) = @_; - # typically used for reading CGI headers - # We also need to check EINTR for generic PSGI servers. - my $ret; - my $total_rd = 0; - my $hdr_buf = $self->{hdr_buf}; - my ($ph_cb, $ph_arg) = @{$self->{parse_hdr}}; - do { - my $r = sysread($self->{rpipe}, $$hdr_buf, 4096, - length($$hdr_buf)); - if (defined($r)) { - $total_rd += $r; - eval { $ret = $ph_cb->($total_rd, $hdr_buf, $ph_arg) }; - if ($@) { - warn "parse_hdr: $@"; - $ret = [ 500, [], [ "Internal error\n" ] ]; - } - } else { - # caller should notify us when it's ready: - return if $! == EAGAIN; - next if $! == EINTR; # immediate retry - warn "error reading header: $!"; - $ret = [ 500, [], [ "Internal error\n" ] ]; - } - } until (defined $ret); - delete $self->{parse_hdr}; # done parsing headers - $ret; + my $filter = blessed($res->[2]) && $res->[2]->can('attach') ? + pop(@$res) : delete($env->{'qspawn.filter'}); + $filter //= PublicInbox::GzipFilter::qsp_maybe($res->[1], $env); + + if (scalar(@$res) == 3) { # done early (likely error or static file) + delete $self->{rpipe}; + $ipipe->close if $ipipe; + $wcb->($res); # all done + return; + } + scalar(@$res) == 2 or confess("BUG: scalar(res) != 2: @$res"); + return ($wcb, $filter) if !$ipipe; # generic PSGI + # streaming response + my $qfh = $wcb->($res); # get PublicInbox::HTTP::(Chunked|Identity) + $qfh = $filter->attach($qfh) if $filter; + my ($bref) = @{delete $self->{yield_parse_hdr}}; + $qfh->write($$bref) if $$bref ne ''; + $self->{qfh} = $qfh; # keep $ipipe open } -sub psgi_return_init_cb { +sub parse_hdr_done ($$) { my ($self) = @_; - my $r = rd_hdr($self) or return; - my $env = $self->{psgi_env}; - my $filter = delete $env->{'qspawn.filter'} // - PublicInbox::GzipFilter::qsp_maybe($r->[1], $env); - - my $wcb = delete $env->{'qspawn.wcb'}; - my $async = delete $self->{async}; - if (scalar(@$r) == 3) { # error - if ($async) { - # calls rpipe->close && ->event_step - $async->close; - } else { - $self->{rpipe}->close; - event_step($self); + my ($ret, $err); + if (defined $_[-1]) { + my ($bref, $ph_cb, @ph_arg) = @{$self->{yield_parse_hdr}}; + $$bref .= $_[-1]; + $ret = eval { $ph_cb->(length($_[-1]), $bref, @ph_arg) }; + if (($err = $@)) { + $ret = psgi_status_err(); + } elsif (!$ret && $_[-1] eq '') { + $err = 'EOF'; + $ret = psgi_status_err(); } - $wcb->($r); - } elsif ($async) { - # done reading headers, handoff to read body - my $fh = $wcb->($r); # scalar @$r == 2 - $fh = $filter->attach($fh) if $filter; - $self->{fh} = $fh; - $async->async_pass($env->{'psgix.io'}, $fh, - delete($self->{hdr_buf})); - } else { # for synchronous PSGI servers - require PublicInbox::GetlineBody; - $r->[2] = PublicInbox::GetlineBody->new($self->{rpipe}, - \&event_step, $self, - ${$self->{hdr_buf}}, $filter); - $wcb->($r); + } else { + $err = "$!"; + $ret = psgi_status_err(); } + carp <<EOM if $err; +E: $err @{$self->{args}->[0]} ($self->{psgi_env}->{REQUEST_URI}) +EOM + $ret; # undef if headers incomplete +} + +sub ipipe_cb { # InputPipe callback + my ($ipipe, $self) = @_; # $_[-1] rbuf + if ($self->{qfh}) { # already streaming + yield_chunk($self, $ipipe, $_[-1]); + } elsif (my $res = parse_hdr_done($self, $_[-1])) { + yield_pass($self, $ipipe, $res); + } # else: headers incomplete, keep reading } -sub psgi_return_start { # may run later, much later... +sub _yield_start { # may run later, much later... my ($self) = @_; - if (my $async = $self->{psgi_env}->{'pi-httpd.async'}) { - # PublicInbox::HTTPD::Async->new(rpipe, $cb, $cb_arg, $end_obj) - $self->{async} = $async->($self->{rpipe}, - \&psgi_return_init_cb, $self, $self); - } else { # generic PSGI - psgi_return_init_cb($self) while $self->{parse_hdr}; + if ($self->{psgi_env}->{'pi-httpd.async'}) { + my $rpipe = $self->{rpipe}; + $rpipe->blocking(0); + PublicInbox::InputPipe::consume($rpipe, \&ipipe_cb, $self); + } else { + require PublicInbox::GetlineResponse; + PublicInbox::GetlineResponse::response($self); } } @@ -271,7 +260,7 @@ sub psgi_return_start { # may run later, much later... # $env->{'qspawn.wcb'} - the write callback from the PSGI server # optional, use this if you've already # captured it elsewhere. If not given, -# psgi_return will return an anonymous +# psgi_yield will return an anonymous # sub for the PSGI server to call # # $env->{'qspawn.filter'} - filter object, responds to ->attach for @@ -280,76 +269,33 @@ sub psgi_return_start { # may run later, much later... # # $limiter - the Limiter object to use (uses the def_limiter if not given) # -# $parse_hdr - Initial read function; often for parsing CGI header output. +# @parse_hdr_arg - Initial read cb+args; often for parsing CGI header output. # It will be given the return value of sysread from the pipe # and a string ref of the current buffer. Returns an arrayref # for PSGI responses. 2-element arrays in PSGI mean the # body will be streamed, later, via writes (push-based) to # psgix.io. 3-element arrays means the body is available # immediately (or streamed via ->getline (pull-based)). -sub psgi_return { - my ($self, $env, $limiter, $parse_hdr, $hdr_arg) = @_; + +sub psgi_yield { + my ($self, $env, $limiter, @parse_hdr_arg)= @_; $self->{psgi_env} = $env; - $self->{hdr_buf} = \(my $hdr_buf = ''); - $self->{parse_hdr} = [ $parse_hdr, $hdr_arg ]; - $limiter ||= $def_limiter ||= PublicInbox::Qspawn::Limiter->new(32); + $self->{yield_parse_hdr} = [ \(my $buf = ''), @parse_hdr_arg ]; + $limiter ||= $def_limiter ||= PublicInbox::Limiter->new(32); # the caller already captured the PSGI write callback from # the PSGI server, so we can call ->start, here: - $env->{'qspawn.wcb'} and - return start($self, $limiter, \&psgi_return_start); - - # the caller will return this sub to the PSGI server, so - # it can set the response callback (that is, for - # PublicInbox::HTTP, the chunked_wcb or identity_wcb callback), - # but other HTTP servers are supported: - sub { + $env->{'qspawn.wcb'} ? start($self, $limiter, \&_yield_start) : sub { + # the caller will return this sub to the PSGI server, so + # it can set the response callback (that is, for + # PublicInbox::HTTP, the chunked_wcb or identity_wcb callback), + # but other HTTP servers are supported: $env->{'qspawn.wcb'} = $_[0]; - start($self, $limiter, \&psgi_return_start); + start($self, $limiter, \&_yield_start); } } -package PublicInbox::Qspawn::Limiter; -use strict; -use warnings; - -sub new { - my ($class, $max) = @_; - bless { - # 32 is same as the git-daemon connection limit - max => $max || 32, - running => 0, - run_queue => [], - # RLIMIT_CPU => undef, - # RLIMIT_DATA => undef, - # RLIMIT_CORE => undef, - }, $class; -} - -sub setup_rlimit { - my ($self, $name, $cfg) = @_; - foreach my $rlim (@PublicInbox::Spawn::RLIMITS) { - my $k = lc($rlim); - $k =~ tr/_//d; - $k = "publicinboxlimiter.$name.$k"; - defined(my $v = $cfg->{$k}) or next; - my @rlimit = split(/\s*,\s*/, $v); - if (scalar(@rlimit) == 1) { - push @rlimit, $rlimit[0]; - } elsif (scalar(@rlimit) != 2) { - warn "could not parse $k: $v\n"; - } - eval { require BSD::Resource }; - if ($@) { - warn "BSD::Resource missing for $rlim"; - next; - } - foreach my $i (0..$#rlimit) { - next if $rlimit[$i] ne 'INFINITY'; - $rlimit[$i] = BSD::Resource::RLIM_INFINITY(); - } - $self->{$rlim} = \@rlimit; - } -} +no warnings 'once'; +*DESTROY = \&finalize; # ->finalize is idempotent 1; diff --git a/lib/PublicInbox/RepoAtom.pm b/lib/PublicInbox/RepoAtom.pm new file mode 100644 index 00000000..ab0f2fcc --- /dev/null +++ b/lib/PublicInbox/RepoAtom.pm @@ -0,0 +1,128 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# git log => Atom feed (cgit-compatible: $REPO/atom/[PATH]?h=$tip +package PublicInbox::RepoAtom; +use v5.12; +use parent qw(PublicInbox::GzipFilter); +use POSIX qw(strftime); +use URI::Escape qw(uri_escape); +use Scalar::Util (); +use PublicInbox::Hval qw(ascii_html utf8_maybe); + +# git for-each-ref and log use different format fields :< +my $ATOM_FMT = '--pretty=tformat:'.join('%n', + map { "%$_" } qw(H ct an ae at s b)).'%x00'; + +my $EACH_REF_FMT = '--format='.join(';', map { "\$r{'$_'}=%($_)" } qw( + objectname refname:short creator contents:subject contents:body + *subject *body)).'%00'; + +sub atom_ok { # parse_hdr for qspawn + my ($r, $bref, $ctx) = @_; + return [ 404, [], [ "Not Found\n"] ] if $r == 0; + bless $ctx, __PACKAGE__; + my $h = [ 'Content-Type' => 'application/atom+xml; charset=UTF-8' ]; + $ctx->{gz} = $ctx->can('gz_or_noop')->($h, $ctx->{env}); + my $title = ascii_html(delete $ctx->{-feed_title}); + my $desc = ascii_html($ctx->{git}->description); + my $url = ascii_html($ctx->{git}->base_url($ctx->{env})); + $ctx->{-base_url} = $url; + $ctx->zmore(<<EOM); +<?xml version="1.0"?> +<feed xmlns="http://www.w3.org/2005/Atom"> +<title>$title</title><subtitle>$desc</subtitle><link +rel="alternate" type="text/html" href="$url"/> +EOM + [ 200, $h, $ctx ]; # [2] is qspawn.filter +} + +# called by GzipFilter->close +sub zflush { $_[0]->SUPER::zflush('</feed>') } + +# called by GzipFilter->write or GetlineResponse->getline +sub translate { + my $self = shift; + $_[0] // return zflush($self); # getline caller + my @out; + my $lbuf = delete($self->{lbuf}) // shift; + $lbuf .= shift while @_; + my $is_tag = $self->{-is_tag}; + my ($H, $ct, $an, $ae, $at, $s, $bdy); + while ($lbuf =~ s/\A([^\0]+)\0\n//s) { + utf8_maybe($bdy = $1); + if ($is_tag) { + my %r; + eval "$bdy"; + for (qw(contents:subject contents:body)) { + $r{$_} =~ /\S/ or delete($r{$_}) + } + $H = $r{objectname}; + $s = $r{'contents:subject'} // $r{'*subject'}; + $bdy = $r{'contents:body'} // $r{'*body'}; + $s .= " ($r{'refname:short'})"; + $_ = ascii_html($_) for ($s, $bdy, $r{creator}); + ($an, $ae, $at) = split(/\s*&[gl]t;\s*/, $r{creator}); + $at =~ s/ .*\z//; # no TZ + $ct = $at = strftime('%Y-%m-%dT%H:%M:%SZ', gmtime($at)); + } else { + $bdy = ascii_html($bdy); + ($H, $ct, $an, $ae, $at, $s, $bdy) = + split(/\n/, $bdy, 7); + $at = strftime('%Y-%m-%dT%H:%M:%SZ', gmtime($at)); + $ct = strftime('%Y-%m-%dT%H:%M:%SZ', gmtime($ct)); + } + $bdy //= ''; + push @out, <<""; +<entry><title>$s</title><updated>$ct</updated><author><name>$an</name> +<email>$ae</email></author><published>$at</published><link +rel="alternate" type="text/html" href="$self->{-base_url}$H/s/" +/><id>$H</id> + + push @out, <<'', $bdy, '</pre></div></content>' if $bdy ne ''; +<content type="xhtml"><div +xmlns="http://www.w3.org/1999/xhtml"><pre style="white-space:pre-wrap"> + + push @out, '</entry>'; + } + $self->{lbuf} = $lbuf; + chomp @out; + @out ? $self->SUPER::translate(@out) : ''; # not EOF, yet +} + +# $REPO/tags.atom endpoint +sub srv_tags_atom { + my ($ctx) = @_; + my $max = 50; # TODO configurable + my @cmd = ('git', "--git-dir=$ctx->{git}->{git_dir}", + qw(for-each-ref --sort=-creatordate), "--count=$max", + '--perl', $EACH_REF_FMT, 'refs/tags'); + $ctx->{-feed_title} = "$ctx->{git}->{nick} tags"; + my $qsp = PublicInbox::Qspawn->new(\@cmd); + $ctx->{-is_tag} = 1; + $qsp->psgi_yield($ctx->{env}, undef, \&atom_ok, $ctx); +} + +sub srv_atom { + my ($ctx, $path) = @_; + return if index($path, '//') >= 0 || index($path, '/') == 0; + my $max = 50; # TODO configurable + my @cmd = ('git', "--git-dir=$ctx->{git}->{git_dir}", + qw(log --no-notes --no-color --no-abbrev), + $ATOM_FMT, "-$max"); + my $tip = $ctx->{qp}->{h}; # same as cgit + $ctx->{-feed_title} = $ctx->{git}->{nick}; + $ctx->{-feed_title} .= " $path" if $path ne ''; + if (defined($tip)) { + push @cmd, $tip; + $ctx->{-feed_title} .= ", $tip"; + } + # else: let git decide based on HEAD if $tip isn't defined + push @cmd, '--'; + push @cmd, $path if $path ne ''; + my $qsp = PublicInbox::Qspawn->new(\@cmd, undef, + { quiet => 1, 2 => $ctx->{lh} }); + $qsp->psgi_yield($ctx->{env}, undef, \&atom_ok, $ctx); +} + +1; diff --git a/lib/PublicInbox/RepoList.pm b/lib/PublicInbox/RepoList.pm new file mode 100644 index 00000000..39dc9c0b --- /dev/null +++ b/lib/PublicInbox/RepoList.pm @@ -0,0 +1,39 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +package PublicInbox::RepoList; +use v5.12; +use parent qw(PublicInbox::WwwStream); +use PublicInbox::Hval qw(ascii_html prurl fmt_ts); +require PublicInbox::CodeSearch; + +sub html_top_fallback { # WwwStream->html_repo_top + my ($ctx) = @_; + my $title = delete($ctx->{-title_html}) // + ascii_html("$ctx->{env}->{PATH_INFO}*"); + my $upfx = $ctx->{-upfx} // ''; + "<html><head><title>$title</title>" . + $ctx->{www}->style($upfx) . '</head><body>'; +} + +sub html ($$$) { + my ($wcr, $ctx, $re) = @_; + my $cr = $wcr->{pi_cfg}->{-coderepos}; + my @nicks = grep(m!$re!, keys %$cr) or return; # 404 + __PACKAGE__->html_init($ctx); + my $zfh = $ctx->zfh; + print $zfh "<pre>matching coderepos\n"; + my @recs = PublicInbox::CodeSearch::repos_sorted($wcr->{pi_cfg}, + @$cr{@nicks}); + my $env = $ctx->{env}; + for (@recs) { + my ($t, $git) = @$_; + my $nick = ascii_html("$git->{nick}"); + for my $u ($git->pub_urls($env)) { + $u = prurl($env, $u); + print $zfh "\n".fmt_ts($t).qq{ <a\nhref="$u">$nick</a>} + } + } + $ctx->html_done('</pre>'); +} + +1; diff --git a/lib/PublicInbox/RepoSnapshot.pm b/lib/PublicInbox/RepoSnapshot.pm new file mode 100644 index 00000000..4c372569 --- /dev/null +++ b/lib/PublicInbox/RepoSnapshot.pm @@ -0,0 +1,87 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# cgit-compatible /snapshot/ endpoint for WWW coderepos +package PublicInbox::RepoSnapshot; +use v5.12; +use PublicInbox::Qspawn; +use PublicInbox::ViewVCS; +use PublicInbox::WwwStatic qw(r); + +# Not using standard mime types since the compressed tarballs are +# special or do not match my /etc/mime.types. Choose what gitweb +# and cgit agree on for compatibility. +our %FMT_TYPES = ( + 'tar' => 'application/x-tar', + 'tar.gz' => 'application/x-gzip', + 'tar.bz2' => 'application/x-bzip2', + 'tar.xz' => 'application/x-xz', + 'zip' => 'application/x-zip', +); + +our %FMT_CFG = ( + 'tar.xz' => 'xz -c', + 'tar.bz2' => 'bzip2 -c', + # not supporting lz nor zstd for now to avoid format proliferation + # and increased cache overhead required to handle extra formats. +); + +my $SUFFIX = join('|', map { quotemeta } keys %FMT_TYPES); + +# TODO deal with tagged blobs + +sub archive_hdr { # parse_hdr for Qspawn + my ($r, $bref, $ctx) = @_; + $r or return [500, [qw(Content-Type text/plain Content-Length 0)], []]; + my $fn = "$ctx->{snap_pfx}.$ctx->{snap_fmt}"; + my $type = $FMT_TYPES{$ctx->{snap_fmt}} // + die "BUG: bad fmt: $ctx->{snap_fmt}"; + [ 200, [ 'Content-Type', "$type; charset=UTF-8", + 'Content-Disposition', qq(inline; filename="$fn"), + 'ETag', qq("$ctx->{etag}") ] ]; +} + +sub ver_check { # git->check_async callback + my (undef, $oid, $type, $size, $ctx) = @_; + return if defined $ctx->{etag}; + my $treeish = shift @{$ctx->{-try}} // die 'BUG: no {-try}'; + if ($type eq 'missing') { + scalar(@{$ctx->{-try}}) or + delete($ctx->{env}->{'qspawn.wcb'})->(r(404)); + } else { # found, done: + $ctx->{etag} = $oid; + my @cfg; + if (my $cmd = $FMT_CFG{$ctx->{snap_fmt}}) { + @cfg = ('-c', "tar.$ctx->{snap_fmt}.command=$cmd"); + } + my $qsp = PublicInbox::Qspawn->new(['git', @cfg, + "--git-dir=$ctx->{git}->{git_dir}", 'archive', + "--prefix=$ctx->{snap_pfx}/", + "--format=$ctx->{snap_fmt}", $treeish], undef, + { quiet => 1 }); + $qsp->psgi_yield($ctx->{env}, undef, \&archive_hdr, $ctx); + } +} + +sub srv { + my ($ctx, $fn) = @_; + return if $fn =~ /["\s]/s; + my $fmt = $ctx->{wcr}->{snapshots}; # TODO per-repo snapshots + $fn =~ s/\.($SUFFIX)\z//o and $fmt->{$1} or return; + $ctx->{snap_fmt} = $1; + my $pfx = $ctx->{git}->local_nick // return; + $pfx =~ s/(?:\.git)?\z/-/; + ($pfx) = ($pfx =~ m!([^/]+)\z!); + substr($fn, 0, length($pfx)) eq $pfx or return; + $ctx->{snap_pfx} = $fn; + my $v = $ctx->{snap_ver} = substr($fn, length($pfx), length($fn)); + # try without [vV] prefix, first + my @try = map { "$_$v" } ('', 'v', 'V'); # cf. cgit:ui-snapshot.c + @{$ctx->{-try}} = @try; + sub { + $ctx->{env}->{'qspawn.wcb'} = $_[0]; + PublicInbox::ViewVCS::do_check_async($ctx, \&ver_check, @try); + } +} + +1; diff --git a/lib/PublicInbox/RepoTree.pm b/lib/PublicInbox/RepoTree.pm new file mode 100644 index 00000000..5c73531a --- /dev/null +++ b/lib/PublicInbox/RepoTree.pm @@ -0,0 +1,99 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# cgit-compatible $REPO/tree/[PATH]?h=$tip redirector +package PublicInbox::RepoTree; +use v5.12; +use PublicInbox::ViewDiff qw(uri_escape_path); +use PublicInbox::WwwStatic qw(r); +use PublicInbox::Qspawn; +use PublicInbox::WwwStream qw(html_oneshot); +use PublicInbox::Hval qw(ascii_html utf8_maybe); + +sub rd_404_log { + my ($bref, $ctx) = @_; + my $path = $ctx->{-q_value_html} = ascii_html($ctx->{-path}); + my $tip = 'HEAD'; + $tip = ascii_html($ctx->{qp}->{h}) if defined($ctx->{qp}->{h}); + PublicInbox::WwwStream::html_init($ctx); + my $zfh = $ctx->{zfh}; + print $zfh "<pre>\$ git log -1 $tip -- $path\n"; + my $code = 200; + if ($$bref eq '') { + say $zfh "found no record of `$path' in git history in `$tip'"; + $ctx->{-has_srch} and + say $zfh 'perhaps try searching mail (above)'; + $code = 404; + } else { + my ($H, $h, $s_as) = split(/ /, $$bref, 3); + utf8_maybe($s_as); + my $x = uri_escape_path($ctx->{-path}); + $s_as = ascii_html($s_as); + print $zfh <<EOM; +found last record of `$path' in the following commit: + +<a href="$ctx->{-upfx}$H/s/?b=$x">$h</a> $s_as +EOM + } + my $res = $ctx->html_done; + $res->[0] = $code; + delete($ctx->{-wcb})->($res); +} + +sub find_missing { + my ($ctx) = @_; + if ($ctx->{-path} eq '') { + my $tip = 'HEAD'; + $tip = ascii_html($ctx->{qp}->{h}) if defined($ctx->{qp}->{h}); + PublicInbox::WwwStream::html_init($ctx); + print { $ctx->{zfh} } "<pre>`$tip' ref not found</pre>"; + my $res = $ctx->html_done; + $res->[0] = 404; + return delete($ctx->{-wcb})->($res); + } + my $cmd = ['git', "--git-dir=$ctx->{git}->{git_dir}", + qw(log --no-color -1), '--pretty=%H %h %s (%as)' ]; + push @$cmd, $ctx->{qp}->{h} if defined($ctx->{qp}->{h}); + push @$cmd, '--'; + push @$cmd, $ctx->{-path}; + my $qsp = PublicInbox::Qspawn->new($cmd, undef, + { quiet => 1, 2 => $ctx->{lh} }); + $qsp->psgi_qx($ctx->{env}, undef, \&rd_404_log, $ctx); +} + +sub tree_show { # git check_async callback + my (undef, $oid, $type, $size, $ctx) = @_; + return find_missing($ctx) if $type eq 'missing'; + + my $res = [ $ctx->{git}, $oid, $type, $size ]; + my ($bn) = ($ctx->{-path} =~ m!/?([^/]+)\z!); + if ($type eq 'blob') { + my $obj = ascii_html($ctx->{-obj}); + $ctx->{-q_value_html} = 'dfn:'.ascii_html($ctx->{-path}) . + ' dfpost:'.substr($oid, 0, 7); + $ctx->{-paths} = [ $bn, qq[(<a +href="$ctx->{-upfx}$oid/s/$bn">raw</a>) +\$ git show $obj\t# shows this blob on the CLI] ]; + } + PublicInbox::ViewVCS::solve_result($res, $ctx); +} + +sub srv_tree { + my ($ctx, $path) = @_; + return if index($path, '//') >= 0 || index($path, '/') == 0; + my $tip = $ctx->{qp}->{h} // 'HEAD'; + $ctx->{-upfx} = '../' x (($path =~ tr!/!/!) + 1); + $path =~ s!/\z!!; + my $obj = $ctx->{-obj} = "$tip:$path"; + $ctx->{-path} = $path; + + # "\n" breaks with `git cat-file --batch-check', and there's no + # legitimate use of "\n" in filenames anyways. + return if index($obj, "\n") >= 0; + sub { + $ctx->{-wcb} = $_[0]; # HTTP::{Chunked,Identity} + PublicInbox::ViewVCS::do_check_async($ctx, \&tree_show, $obj); + }; +} + +1; diff --git a/lib/PublicInbox/SHA.pm b/lib/PublicInbox/SHA.pm new file mode 100644 index 00000000..3fa8530e --- /dev/null +++ b/lib/PublicInbox/SHA.pm @@ -0,0 +1,67 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# OpenSSL exception added in commit 22711f81f4e79da6b796820e37803a05cae14645 +# (README: add OpenSSL exception, 2015-10-05) + +# Replaces most uses of Digest::SHA with OpenSSL via Net::SSLeay if +# possible. OpenSSL SHA-256 is nearly twice as fast as Digest::SHA on +# x86-64, and SHA-1 is a bit faster as well. +# I don't think we can implement Digest::SHA->clone with what Net::SSLeay +# gives us... (maybe EVP_MD_CTX_copy+EVP_MD_CTX_copy_ex need to be added +# to Net::SSLeay?) +package PublicInbox::SHA; +use v5.12; +require Exporter; +our @EXPORT_OK = qw(sha1_hex sha256_hex sha256 sha_all); +use autodie qw(sysread); +our @ISA; + +BEGIN { + push @ISA, 'Exporter'; + unless (eval(<<'EOM')) { +use Net::SSLeay 1.43; +my %SHA = ( + 1 => Net::SSLeay::EVP_sha1(), + 256 => Net::SSLeay::EVP_sha256(), +); + +sub new { + my ($cls, $n) = @_; + my $mdctx = Net::SSLeay::EVP_MD_CTX_create(); + Net::SSLeay::EVP_DigestInit($mdctx, $SHA{$n}) or + die "EVP_DigestInit $n: $!"; + bless \$mdctx, $cls; +} + +sub add { + my $self = shift; + Net::SSLeay::EVP_DigestUpdate($$self, $_) for @_; + $self; +} + +sub digest { Net::SSLeay::EVP_DigestFinal(${$_[0]}) }; +sub hexdigest { unpack('H*', Net::SSLeay::EVP_DigestFinal(${$_[0]})) } +sub DESTROY { Net::SSLeay::EVP_MD_CTX_destroy(${$_[0]}) }; + +sub sha1_hex { unpack('H*', Net::SSLeay::SHA1($_[0])) }; +sub sha256_hex { unpack('H*', Net::SSLeay::SHA256($_[0])) }; +*sha256 = \&Net::SSLeay::SHA256; +# end of eval +EOM + require Digest::SHA; # stdlib fallback + push @ISA, 'Digest::SHA'; + *sha1_hex = \&Digest::SHA::sha1_hex; + *sha256_hex = \&Digest::SHA::sha256_hex; + *sha256 = \&Digest::SHA::sha256; +} + +} # /BEGIN + +sub sha_all ($$) { + my ($n, $fh) = @_; + my ($dig, $buf) = (PublicInbox::SHA->new($n)); + while (sysread($fh, $buf, 65536)) { $dig->add($buf) } + $dig +} + +1; diff --git a/lib/PublicInbox/SaPlugin/ListMirror.pm b/lib/PublicInbox/SaPlugin/ListMirror.pm index 9acf86c0..06903cad 100644 --- a/lib/PublicInbox/SaPlugin/ListMirror.pm +++ b/lib/PublicInbox/SaPlugin/ListMirror.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # SpamAssassin rules useful for running a mailing list mirror. We want to: @@ -39,7 +39,11 @@ sub check_list_mirror_received { my $v = $pms->get($hdr) or next; local $/ = "\n"; chomp $v; - next if $v ne $hval; + if (ref($hval)) { + next if $v !~ $hval; + } else { + next if $v ne $hval; + } return 1 if $recvd !~ $host_re; } @@ -91,6 +95,8 @@ sub config_list_mirror { $host_glob =~ s!(.)!$patmap{$1} || "\Q$1"!ge; my $host_re = qr/\A\s*from\s+$host_glob(?:\s|$)/si; + (lc($hdr) eq 'list-id' && $hval =~ /<([^>]+)>/) and + $hval = qr/\A<\Q$1\E>\z/; push @{$self->{list_mirror_check}}, [ $hdr, $hval, $host_re, $addr ]; } diff --git a/lib/PublicInbox/SaPlugin/ListMirror.pod b/lib/PublicInbox/SaPlugin/ListMirror.pod index d931d762..e6a6c2ad 100644 --- a/lib/PublicInbox/SaPlugin/ListMirror.pod +++ b/lib/PublicInbox/SaPlugin/ListMirror.pod @@ -6,11 +6,11 @@ PublicInbox::SaPlugin::ListMirror - SpamAssassin plugin for mailing list mirrors loadplugin PublicInbox::SaPlugin::ListMirror -Declare some mailing lists based on the expected List-Id value, +Declare some mailing lists based on the expected List-ID value, expected servers, and mailing list address: - list_mirror List-Id <foo.example.com> *.example.com foo@example.com - list_mirror List-Id <bar.example.com> *.example.com bar@example.com + list_mirror List-ID <foo.example.com> *.example.com foo@example.com + list_mirror List-ID <bar.example.com> *.example.com bar@example.com Bump the score for messages which come from unexpected servers: @@ -43,14 +43,25 @@ C<allow_user_rules 1> =item list_mirror HEADER HEADER_VALUE HOSTNAME_GLOB [LIST_ADDRESS] -Declare a list based on an expected C<HEADER> matching C<HEADER_NAME> -exactly coming from C<HOSTNAME_GLOB>. C<LIST_ADDRESS> is optional, +Declare a list based on an expected C<HEADER> matching C<HEADER_VALUE> +coming from C<HOSTNAME_GLOB>. C<LIST_ADDRESS> is optional, but may specify the address of the mailing list being mirrored. -C<List-Id> or C<X-Mailing-List> are common values of C<HEADER> +C<List-ID> is the recommended value of C<HEADER> as most +mailing lists support it. An example of C<HEADER_VALUE> is C<E<lt>foo.example.orgE<gt>> -if C<HEADER> is C<List-Id>. +if C<HEADER> is C<List-ID>. + +As of public-inbox 2.0, using C<List-ID> as the C<HEADER> and a +C<HEADER_VALUE> contained by angle brackets (E<lt>list-idE<gt>), +matching is done in accordance with +L<RFC 2919|https://tools.ietf.org/html/rfc2919>. That is, +C<HEADER_VALUE> will be a case-insensitive substring match +and ignore the optional description C<phrase> as documented +in RFC 2919. + +All other C<HEADER> values use exact matches for backwards-compatibility. C<HOSTNAME_GLOB> may be a wildcard match for machines where mail can come from or an exact match. @@ -105,7 +116,7 @@ and L<http://4uok3hntl7oi7b4uf4rtfwefqeexfzil2w6kgk2jn5z2f764irre7byd.onion/meta =head1 COPYRIGHT -Copyright (C) 2016-2021 all contributors L<mailto:meta@public-inbox.org> +Copyright (C) all contributors L<mailto:meta@public-inbox.org> License: AGPL-3.0+ L<http://www.gnu.org/licenses/agpl-3.0.txt> diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 2feb3e13..fbdb48a3 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -11,6 +11,7 @@ our @EXPORT_OK = qw(retry_reopen int_val get_pct xap_terms); use List::Util qw(max); use POSIX qw(strftime); use Carp (); +our $XHC = 0; # defined but false # values for searching, changing the numeric value breaks # compatibility with old indices (so don't change them it) @@ -56,23 +57,47 @@ use constant { }; use PublicInbox::Smsg; -use PublicInbox::Over; +eval { require PublicInbox::Over }; our $QP_FLAGS; our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem Query); -our $Xap; # 'Search::Xapian' or 'Xapian' +our $Xap; # 'Xapian' or 'Search::Xapian' our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') # ENQ_DESCENDING and ENQ_ASCENDING weren't in SWIG Xapian.pm prior to 1.4.16, # let's hope the ABI is stable our $ENQ_DESCENDING = 0; our $ENQ_ASCENDING = 1; +our @MAIL_VMAP = ( + [ YYYYMMDD, 'd:'], + [ TS, 'rt:' ], + # these are undocumented for WWW, but lei and IMAP use them + [ DT, 'dt:' ], + [ BYTES, 'z:' ], + [ UID, 'uid:' ] +); +our @MAIL_NRP; + +# Getopt::Long spec, only short options for portability in C++ implementation +our @XH_SPEC = ( + 'a', # ascending sort + 'c', # code search + 'd=s@', # shard dirs + 'g=s', # git dir (with -c) + 'k=i', # sort column (like sort(1)) + 'm=i', # maximum number of results + 'o=i', # offset + 'r', # 1=relevance then column + 't', # collapse threads + 'A=s@', # prefixes + 'K=i', # timeout kill after i seconds + 'O=s', # eidx_key + 'T=i', # threadid +); sub load_xapian () { return 1 if defined $Xap; - # n.b. PI_XAPIAN is intended for development use only. We still - # favor Search::Xapian since that's what's available in current - # Debian stable (10.x) and derived distros. - for my $x (($ENV{PI_XAPIAN} // 'Search::Xapian'), 'Xapian') { + # n.b. PI_XAPIAN is intended for development use only + for my $x (($ENV{PI_XAPIAN} // 'Xapian'), 'Search::Xapian') { eval "require $x"; next if $@; @@ -85,8 +110,7 @@ sub load_xapian () { # NumberRangeProcessor was added in Xapian 1.3.6, # NumberValueRangeProcessor was removed for 1.5.0+, - # favor the older /Value/ variant since that's what our - # (currently) preferred Search::Xapian supports + # continue with the older /Value/ variant for now... $NVRP = $x.'::'.($x eq 'Xapian' && $xver ge v1.5 ? 'NumberRangeProcessor' : 'NumberValueRangeProcessor'); $X{$_} = $Xap.'::'.$_ for (keys %X); @@ -101,6 +125,7 @@ sub load_xapian () { # or make indexlevel=medium as default $QP_FLAGS = FLAG_PHRASE() | FLAG_BOOLEAN() | FLAG_LOVEHATE() | FLAG_WILDCARD(); + @MAIL_NRP = map { $NVRP->new(@$_) } @MAIL_VMAP; return 1; } undef; @@ -110,43 +135,50 @@ sub load_xapian () { # a prefix common in patch emails our $LANG = 'english'; +our %PATCH_BOOL_COMMON = ( + dfpre => 'XDFPRE', + dfpost => 'XDFPOST', + dfblob => 'XDFPRE XDFPOST', + patchid => 'XDFID', +); + # note: the non-X term prefix allocations are shared with # Xapian omega, see xapian-applications/omega/docs/termprefixes.rst my %bool_pfx_external = ( mid => 'Q', # Message-ID (full/exact), this is mostly uniQue lid => 'G', # newsGroup (or similar entity), just inside <> - dfpre => 'XDFPRE', - dfpost => 'XDFPOST', - dfblob => 'XDFPRE XDFPOST', - patchid => 'XDFID', + %PATCH_BOOL_COMMON ); -my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID'; -my %prob_prefix = ( - # for mairix compatibility +# for mairix compatibility +our $NON_QUOTED_BODY = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID'; +our %PATCH_PROB_COMMON = ( s => 'S', - m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial - l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial f => 'A', - t => 'XTO', - tc => 'XTO XCC', - c => 'XCC', - tcf => 'XTO XCC A', - a => 'XTO XCC A', - b => $non_quoted_body . ' XQUOT', - bs => $non_quoted_body . ' XQUOT S', + b => $NON_QUOTED_BODY . ' XQUOT', + bs => $NON_QUOTED_BODY . ' XQUOT S', n => 'XFN', q => 'XQUOT', - nq => $non_quoted_body, + nq => $NON_QUOTED_BODY, dfn => 'XDFN', dfa => 'XDFA', dfb => 'XDFB', dfhh => 'XDFHH', dfctx => 'XDFCTX', +); +my %prob_prefix = ( + m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial + l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial + t => 'XTO', + tc => 'XTO XCC', + c => 'XCC', + tcf => 'XTO XCC A', + a => 'XTO XCC A', + %PATCH_PROB_COMMON, # default: - '' => 'XM S A XQUOT XFN ' . $non_quoted_body, + '' => 'XM S A XQUOT XFN ' . $NON_QUOTED_BODY, ); # not documenting m: and mid: for now, the using the URLs works w/o Xapian @@ -190,33 +222,37 @@ sub xdir ($;$) { my ($self, $rdonly) = @_; if ($rdonly || !defined($self->{shard})) { $self->{xpfx}; - } else { # v2 + extindex only: + } else { # v2, extindex, cindex only: "$self->{xpfx}/$self->{shard}"; } } -# returns all shards as separate Xapian::Database objects w/o combining -sub xdb_shards_flat ($) { +# returns shard directories as an array of strings, does not verify existence +sub shard_dirs ($) { my ($self) = @_; my $xpfx = $self->{xpfx}; - my (@xdb, $slow_phrase); - load_xapian(); - $self->{qp_flags} //= $QP_FLAGS; - if ($xpfx =~ m!/xapian[0-9]+\z!) { - @xdb = ($X{Database}->new($xpfx)); - $self->{qp_flags} |= FLAG_PHRASE() if !-f "$xpfx/iamchert"; - } else { + if ($xpfx =~ m!/xapian[0-9]+\z!) { # v1 inbox + ($xpfx); + } else { # v2 inbox, eidx, cidx opendir(my $dh, $xpfx) or return (); # not initialized yet # We need numeric sorting so shard[0] is first for reading # Xapian metadata, if needed my $last = max(grep(/\A[0-9]+\z/, readdir($dh))) // return (); - for (0..$last) { - my $shard_dir = "$self->{xpfx}/$_"; - push @xdb, $X{Database}->new($shard_dir); - $slow_phrase ||= -f "$shard_dir/iamchert"; - } - $self->{qp_flags} |= FLAG_PHRASE() if !$slow_phrase; + map { "$xpfx/$_" } (0..$last); } +} + +# returns all shards as separate Xapian::Database objects w/o combining +sub xdb_shards_flat ($) { + my ($self) = @_; + load_xapian(); + $self->{qp_flags} //= $QP_FLAGS; + my $slow_phrase; + my @xdb = map { + $slow_phrase ||= -f "$_/iamchert"; + $X{Database}->new($_); # raises if missing + } shard_dirs($self); + $self->{qp_flags} |= FLAG_PHRASE() if !$slow_phrase; @xdb; } @@ -229,6 +265,12 @@ sub mdocid { int(($docid - 1) / $nshard) + 1; } +sub docids_to_artnums { + my $nshard = shift->{nshard}; + # XXX does array vs arrayref make a difference in modern Perls? + map { int(($_ - 1) / $nshard) + 1 } @_; +} + sub mset_to_artnums { my ($self, $mset) = @_; my $nshard = $self->{nshard}; @@ -277,42 +319,18 @@ sub date_parse_prepare { my $end = $range =~ s/([\)\s]*)\z// ? $1 : ''; my @r = split(/\.\./, $range, 2); - # expand "d:20101002" => "d:20101002..20101003" and like + # expand "dt:2010-10-02" => "dt:2010-10-02..2010-10-03" and like # n.b. git doesn't do YYYYMMDD w/o '-', it needs YYYY-MM-DD - # We upgrade "d:" to "dt:" to iff using approxidate + # We upgrade "d:" to "dt:" unconditionally if ($pfx eq 'd') { - my $fmt = "\0%Y%m%d"; - if (!defined($r[1])) { - if ($r[0] =~ /\A([0-9]{4})([0-9]{2})([0-9]{2})\z/) { - push @$to_parse, "$1-$2-$3"; - # we could've handled as-is, but we need - # to parse anyways for "d+" below - } else { - push @$to_parse, $r[0]; - if ($r[0] !~ /\A[0-9]{4}-[0-9]{2}-[0-9]{2}\z/) { - $pfx = 'dt'; - $fmt = "\0%Y%m%d%H%M%S"; - } - } - $r[0] = "$fmt+$#$to_parse\0"; - $r[1] = "$fmt+\0"; - } else { - for my $x (@r) { - next if $x eq '' || $x =~ /\A[0-9]{8}\z/; - push @$to_parse, $x; - if ($x !~ /\A[0-9]{4}-[0-9]{2}-[0-9]{2}\z/) { - $pfx = 'dt'; - } - $x = "$fmt$#$to_parse\0"; - } - if ($pfx eq 'dt') { - for (@r) { - s/\0%Y%m%d/\0%Y%m%d%H%M%S/; - s/\A([0-9]{8})\z/${1}000000/; - } - } - } - } elsif ($pfx eq 'dt') { + $pfx = 'dt'; + # upgrade YYYYMMDD to YYYYMMDDHHMMSS + $_ .= ' 00:00:00' for (grep(m!\A[0-9]{4}[^[:alnum:]] + [0-9]{2}[^[:alnum:]] + [0-9]{2}\z!x, @r)); + $_ .= '000000' for (grep(m!\A[0-9]{8}\z!, @r)); + } + if ($pfx eq 'dt') { if (!defined($r[1])) { # git needs gaps and not /\d{14}/ if ($r[0] =~ /\A([0-9]{4})([0-9]{2})([0-9]{2}) ([0-9]{2})([0-9]{2})([0-9]{2})\z/x) { @@ -329,7 +347,7 @@ sub date_parse_prepare { $x = "\0%Y%m%d%H%M%S$#$to_parse\0"; } } - } else { # "rt", let git interpret "YYYY", deal with Y10K later :P + } else { # (rt|ct), let git interpret "YYYY", deal with Y10K later :P for my $x (@r) { next if $x eq '' || $x =~ /\A[0-9]{5,}\z/; push @$to_parse, $x; @@ -388,13 +406,104 @@ sub query_approxidate { date_parse_finalize($git, $to_parse, $_[2]) if $to_parse; } -# read-only +# read-only, for mail only (codesearch has different rules) sub mset { - my ($self, $query_string, $opts) = @_; - $opts ||= {}; + my ($self, $qry_str, $opt) = @_; my $qp = $self->{qp} //= $self->qparse_new; - my $query = $qp->parse_query($query_string, $self->{qp_flags}); - _do_enquire($self, $query, $opts); + my $qry = $qp->parse_query($qry_str, $self->{qp_flags}); + if (defined(my $eidx_key = $opt->{eidx_key})) { + $qry = $X{Query}->new(OP_FILTER(), $qry, 'O'.$eidx_key); + } + if (defined(my $uid_range = $opt->{uid_range})) { + my $range = $X{Query}->new(OP_VALUE_RANGE(), UID, + sortable_serialise($uid_range->[0]), + sortable_serialise($uid_range->[1])); + $qry = $X{Query}->new(OP_FILTER(), $qry, $range); + } + if (defined(my $tid = $opt->{threadid})) { + $tid = sortable_serialise($tid); + $qry = $X{Query}->new(OP_FILTER(), $qry, + $X{Query}->new(OP_VALUE_RANGE(), THREADID, $tid, $tid)); + } + do_enquire($self, $qry, $opt, TS); +} + +sub xhc_start_maybe (@) { + require PublicInbox::XapClient; + my $xhc = PublicInbox::XapClient::start_helper(@_); + require PublicInbox::XhcMset if $xhc; + $xhc; +} + +sub xh_opt ($) { + my ($opt) = @_; + my $lim = $opt->{limit} || 50; + my @ret; + push @ret, '-o', $opt->{offset} if $opt->{offset}; + push @ret, '-m', $lim; + my $rel = $opt->{relevance} // 0; + if ($rel == -2) { # ORDER BY docid/UID (highest first) + push @ret, '-k', '-1'; + } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first) + push @ret, '-k', '-1'; + push @ret, '-a'; + } elsif ($rel == 0) { + push @ret, '-k', $opt->{sort_col} // TS; + push @ret, '-a' if $opt->{asc}; + } else { # rel > 0 + push @ret, '-r'; + push @ret, '-k', $opt->{sort_col} // TS; + push @ret, '-a' if $opt->{asc}; + } + push @ret, '-t' if $opt->{threads}; + push @ret, '-T', $opt->{threadid} if defined $opt->{threadid}; + push @ret, '-O', $opt->{eidx_key} if defined $opt->{eidx_key}; + @ret; +} + +# returns a true value if actually handled asynchronously, +# and a falsy value if handled synchronously +sub async_mset { + my ($self, $qry_str, $opt, $cb, @args) = @_; + if ($XHC) { # unconditionally retrieving pct + rank for now + xdb($self); # populate {nshards} + my @margs = ($self->xh_args, xh_opt($opt)); + my $ret = eval { + my $rd = $XHC->mkreq(undef, 'mset', @margs, $qry_str); + PublicInbox::XhcMset->maybe_new($rd, $self, $cb, @args); + }; + $cb->(@args, undef, $@) if $@; + $ret; + } else { # synchronous + my $mset = $self->mset($qry_str, $opt); + $cb->(@args, $mset); + undef; + } +} + +sub do_enquire { # shared with CodeSearch + my ($self, $qry, $opt, $col) = @_; + my $enq = $X{Enquire}->new(xdb($self)); + $enq->set_query($qry); + my $rel = $opt->{relevance} // 0; + if ($rel == -2) { # ORDER BY docid/UID (highest first) + $enq->set_weighting_scheme($X{BoolWeight}->new); + $enq->set_docid_order($ENQ_DESCENDING); + } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first) + $enq->set_weighting_scheme($X{BoolWeight}->new); + $enq->set_docid_order($ENQ_ASCENDING); + } elsif ($rel == 0) { + $enq->set_sort_by_value_then_relevance($col, !$opt->{asc}); + } else { # rel > 0 + $enq->set_sort_by_relevance_then_value($col, !$opt->{asc}); + } + + # `lei q -t / --threads' or JMAP collapseThreads; but don't collapse + # on `-tt' ({threads} > 1) which sets the Flagged|Important keyword + (($opt->{threads} // 0) == 1 && has_threadid($self)) and + $enq->set_collapse_key(THREADID); + retry_reopen($self, \&enquire_once, $enq, + $opt->{offset} || 0, $opt->{limit} || 50); } sub retry_reopen { @@ -421,50 +530,15 @@ sub retry_reopen { Carp::croak("Too many Xapian database modifications in progress\n"); } -sub _do_enquire { - my ($self, $query, $opts) = @_; - retry_reopen($self, \&_enquire_once, $query, $opts); -} - # returns true if all docs have the THREADID value sub has_threadid ($) { my ($self) = @_; (xdb($self)->get_metadata('has_threadid') // '') eq '1'; } -sub _enquire_once { # retry_reopen callback - my ($self, $query, $opts) = @_; - my $xdb = xdb($self); - if (defined(my $eidx_key = $opts->{eidx_key})) { - $query = $X{Query}->new(OP_FILTER(), $query, 'O'.$eidx_key); - } - if (defined(my $uid_range = $opts->{uid_range})) { - my $range = $X{Query}->new(OP_VALUE_RANGE(), UID, - sortable_serialise($uid_range->[0]), - sortable_serialise($uid_range->[1])); - $query = $X{Query}->new(OP_FILTER(), $query, $range); - } - my $enquire = $X{Enquire}->new($xdb); - $enquire->set_query($query); - $opts ||= {}; - my $rel = $opts->{relevance} // 0; - if ($rel == -2) { # ORDER BY docid/UID (highest first) - $enquire->set_weighting_scheme($X{BoolWeight}->new); - $enquire->set_docid_order($ENQ_DESCENDING); - } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first) - $enquire->set_weighting_scheme($X{BoolWeight}->new); - $enquire->set_docid_order($ENQ_ASCENDING); - } elsif ($rel == 0) { - $enquire->set_sort_by_value_then_relevance(TS, !$opts->{asc}); - } else { # rel > 0 - $enquire->set_sort_by_relevance_then_value(TS, !$opts->{asc}); - } - - # `mairix -t / --threads' or JMAP collapseThreads - if ($opts->{threads} && has_threadid($self)) { - $enquire->set_collapse_key(THREADID); - } - $enquire->get_mset($opts->{offset} || 0, $opts->{limit} || 50); +sub enquire_once { # retry_reopen callback + my (undef, $enq, $offset, $limit) = @_; + $enq->get_mset($offset, $limit); } sub mset_to_smsg { @@ -481,29 +555,27 @@ sub mset_to_smsg { # read-write sub stemmer { $X{Stem}->new($LANG) } -# read-only -sub qparse_new { +sub qp_init_common { my ($self) = @_; - - my $xdb = xdb($self); my $qp = $X{QueryParser}->new; $qp->set_default_op(OP_AND()); - $qp->set_database($xdb); + $qp->set_database(xdb($self)); $qp->set_stemmer(stemmer($self)); $qp->set_stemming_strategy(STEM_SOME()); my $cb = $qp->can('set_max_wildcard_expansion') // $qp->can('set_max_expansion'); # Xapian 1.5.0+ $cb->($qp, 100); - $cb = $qp->can('add_valuerangeprocessor') // - $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ - $cb->($qp, $NVRP->new(YYYYMMDD, 'd:')); - $cb->($qp, $NVRP->new(DT, 'dt:')); + $qp; +} - # for IMAP, undocumented for WWW and may be split off go away - $cb->($qp, $NVRP->new(BYTES, 'z:')); - $cb->($qp, $NVRP->new(TS, 'rt:')); - $cb->($qp, $NVRP->new(UID, 'uid:')); +# read-only +sub qparse_new { + my ($self) = @_; + my $qp = qp_init_common($self); + my $cb = $qp->can('add_valuerangeprocessor') // + $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ + $cb->($qp, $_) for @MAIL_NRP; while (my ($name, $prefix) = each %bool_pfx_external) { $qp->add_boolean_prefix($name, $_) foreach split(/ /, $prefix); } @@ -533,6 +605,40 @@ EOF $qp; } +sub generate_cxx () { # generates snippet for xap_helper.h + my $ret = <<EOM; +# line ${\__LINE__} "${\__FILE__}" +static NRP *mail_nrp[${\scalar(@MAIL_VMAP)}]; +static void mail_nrp_init(void) +{ +EOM + for (0..$#MAIL_VMAP) { + my $x = $MAIL_VMAP[$_]; + $ret .= qq{\tmail_nrp[$_] = new NRP($x->[0], "$x->[1]");\n} + } +$ret .= <<EOM; +} + +# line ${\__LINE__} "${\__FILE__}" +static void qp_init_mail_search(Xapian::QueryParser *qp) +{ + for (size_t i = 0; i < MY_ARRAY_SIZE(mail_nrp); i++) + qp->ADD_RP(mail_nrp[i]); +EOM + for my $name (sort keys %bool_pfx_external) { + for (split(/ /, $bool_pfx_external{$name})) { + $ret .= qq{\tqp->add_boolean_prefix("$name", "$_");\n} + } + } + # TODO: altid support + for my $name (sort keys %prob_prefix) { + for (split(/ /, $prob_prefix{$name})) { + $ret .= qq{\tqp->add_prefix("$name", "$_");\n} + } + } + $ret .= "}\n"; +} + sub help { my ($self) = @_; $self->{qp} //= $self->qparse_new; # parse altids @@ -560,24 +666,57 @@ sub get_pct ($) { # mset item sub xap_terms ($$;@) { my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty () - my %ret; my $end = $xdb_or_doc->termlist_end(@docid); my $cur = $xdb_or_doc->termlist_begin(@docid); + $cur->skip_to($pfx); + my (@ret, $tn); + my $pfxlen = length($pfx); for (; $cur != $end; $cur++) { - $cur->skip_to($pfx); - last if $cur == $end; - my $tn = $cur->get_termname; - $ret{substr($tn, length($pfx))} = undef if !index($tn, $pfx); + $tn = $cur->get_termname; + index($tn, $pfx) ? last : push(@ret, substr($tn, $pfxlen)); } - wantarray ? sort(keys(%ret)) : \%ret; + wantarray ? @ret : +{ map { $_ => undef } @ret }; } # get combined docid from over.num: -# (not generic Xapian, only works with our sharding scheme) +# (not generic Xapian, only works with our sharding scheme for mail) sub num2docid ($$) { my ($self, $num) = @_; my $nshard = $self->{nshard}; ($num - 1) * $nshard + $num % $nshard + 1; } +sub all_terms { + my ($self, $pfx) = @_; + my $cur = xdb($self)->allterms_begin($pfx); + my $end = $self->{xdb}->allterms_end($pfx); + my $pfxlen = length($pfx); + my @ret; + for (; $cur != $end; $cur++) { + push @ret, substr($cur->get_termname, $pfxlen); + } + wantarray ? @ret : +{ map { $_ => undef } @ret }; +} + +sub xh_args { # prep getopt args to feed to xap_helper.h socket + map { ('-d', $_) } shard_dirs($_[0]); +} + +sub docids_by_postlist ($$) { + my ($self, $q) = @_; + my $cur = $self->xdb->postlist_begin($q); + my $end = $self->{xdb}->postlist_end($q); + my @ids; + for (; $cur != $end; $cur++) { push(@ids, $cur->get_docid) }; + @ids; +} + +sub get_doc ($$) { + my ($self, $docid) = @_; + eval { $self->{xdb}->get_document($docid) } // do { + die $@ if $@ && ref($@) !~ /\bDocNotFoundError\b/; + undef; + } +} + 1; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index bdb84fc7..1cbf6d23 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -9,7 +9,8 @@ package PublicInbox::SearchIdx; use strict; use v5.10.1; -use parent qw(PublicInbox::Search PublicInbox::Lock Exporter); +use parent qw(PublicInbox::Search PublicInbox::Lock PublicInbox::Umask + Exporter); use PublicInbox::Eml; use PublicInbox::Search qw(xap_terms); use PublicInbox::InboxWritable; @@ -21,7 +22,7 @@ use POSIX qw(strftime); use Fcntl qw(SEEK_SET); use Time::Local qw(timegm); use PublicInbox::OverIdx; -use PublicInbox::Spawn qw(spawn); +use PublicInbox::Spawn qw(run_wait popen_rd); use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); use PublicInbox::Address; @@ -37,12 +38,13 @@ our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff : # typical 32-bit system: (($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024); use constant DEBUG => !!$ENV{DEBUG}; -my $BASE85 = qr/\A[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+\z/; +my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/; my $xapianlevels = qr/\A(?:full|medium)\z/; my $hex = '[a-f0-9]'; my $OID = $hex .'{40,}'; -my @VMD_MAP = (kw => 'K', L => 'L'); +my @VMD_MAP = (kw => 'K', L => 'L'); # value order matters our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/; +our $PATCHID_BROKEN; sub new { my ($class, $ibx, $creat, $shard) = @_; @@ -62,6 +64,7 @@ sub new { die("Invalid indexlevel $ibx->{indexlevel}\n"); } } + undef $PATCHID_BROKEN; # retry on new instances in case of upgrades $ibx = PublicInbox::InboxWritable->new($ibx); my $self = PublicInbox::Search->new($ibx); bless $self, $class; @@ -90,7 +93,7 @@ sub new { $self; } -sub need_xapian ($) { $_[0]->{indexlevel} =~ $xapianlevels } +sub need_xapian ($) { ($_[0]->{indexlevel} // 'full') =~ $xapianlevels } sub idx_release { my ($self, $wake) = @_; @@ -113,15 +116,15 @@ sub load_xapian_writable () { *sortable_serialise = $xap.'::sortable_serialise'; $DB_CREATE_OR_OPEN = eval($xap.'::DB_CREATE_OR_OPEN()'); $DB_OPEN = eval($xap.'::DB_OPEN()'); - my $ver = (eval($xap.'::major_version()') << 16) | - (eval($xap.'::minor_version()') << 8) | - eval($xap.'::revision()'); - if ($ver >= 0x10400) { + my $ver = eval 'v'.join('.', eval($xap.'::major_version()'), + eval($xap.'::minor_version()'), + eval($xap.'::revision()')); + if ($ver ge 1.4) { # new flags in Xapian 1.4 $DB_NO_SYNC = 0x4; $DB_DANGEROUS = 0x10; } # Xapian v1.2.21..v1.2.24 were missing close-on-exec on OFD locks - $X->{CLOEXEC_UNSET} = 1 if $ver >= 0x010215 && $ver <= 0x010218; + $X->{CLOEXEC_UNSET} = 1 if $ver ge v1.2.21 && $ver le v1.2.24; 1; } @@ -134,6 +137,7 @@ sub idx_acquire { load_xapian_writable(); $flag = $self->{creat} ? $DB_CREATE_OR_OPEN : $DB_OPEN; } + my $owner = $self->{ibx} // $self->{eidx} // $self; if ($self->{creat}) { require File::Path; $self->lock_acquire; @@ -145,14 +149,13 @@ sub idx_acquire { File::Path::mkpath($dir); require PublicInbox::Syscall; PublicInbox::Syscall::nodatacow_dir($dir); - $self->{-set_has_threadid_once} = 1; - if (($self->{ibx} // $self->{eidx})->{-dangerous}) { - $flag |= $DB_DANGEROUS; - } + # owner == self for CodeSearchIdx + $self->{-set_has_threadid_once} = 1 if $owner != $self; + $flag |= $DB_DANGEROUS if $owner->{-dangerous}; } } return unless defined $flag; - $flag |= $DB_NO_SYNC if ($self->{ibx} // $self->{eidx})->{-no_fsync}; + $flag |= $DB_NO_SYNC if $owner->{-no_fsync}; my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) }; croak "Failed opening $dir: $@" if $@; $self->{xdb} = $xdb; @@ -177,9 +180,8 @@ sub term_generator ($) { # write-only sub index_phrase ($$$$) { my ($self, $text, $wdf_inc, $prefix) = @_; - my $tg = term_generator($self); - $tg->index_text($text, $wdf_inc, $prefix); - $tg->increase_termpos; + term_generator($self)->index_text($text, $wdf_inc, $prefix); + $self->{term_generator}->increase_termpos; } sub index_text ($$$$) { @@ -188,8 +190,8 @@ sub index_text ($$$$) { if ($self->{indexlevel} eq 'full') { index_phrase($self, $text, $wdf_inc, $prefix); } else { - my $tg = term_generator($self); - $tg->index_text_without_positions($text, $wdf_inc, $prefix); + term_generator($self)->index_text_without_positions( + $text, $wdf_inc, $prefix); } } @@ -270,7 +272,7 @@ sub index_diff ($$$) { push @$xnq, shift(@l); # skip base85 and empty lines - while (@l && ($l[0] =~ /$BASE85/o || + while (@l && ($l[0] =~ /\A$BASE85\h*\z/o || $l[0] !~ /\S/)) { shift @l; } @@ -350,6 +352,52 @@ sub index_diff ($$$) { index_text($self, join("\n", @$xnq), 1, 'XNQ'); } +sub index_body_text { + my ($self, $doc, $sref) = @_; + my $rd; + # start patch-id in parallel + if ($$sref =~ /^(?:diff|---|\+\+\+) /ms && !$PATCHID_BROKEN) { + my $git = ($self->{ibx} // $self->{eidx} // $self)->git; + my $fh = PublicInbox::IO::write_file '+>:utf8', undef, $$sref; + $fh->flush or die "flush: $!"; + sysseek($fh, 0, SEEK_SET); + $rd = popen_rd($git->cmd(qw(patch-id --stable)), undef, + { 0 => $fh }); + } + + # split off quoted and unquoted blocks: + my @sections = PublicInbox::MsgIter::split_quotes($$sref); + undef $$sref; # free memory + for my $txt (@sections) { + if ($txt =~ /\A>/) { + if ($txt =~ /^[>\t ]+GIT binary patch\r?/sm) { + # get rid of Base-85 noise + $txt =~ s/^([>\h]+(?:literal|delta) + \x20[0-9]+\r?\n) + (?:[>\h]+$BASE85\h*\r?\n)+/$1/gsmx; + } + index_text($self, $txt, 0, 'XQUOT'); + } else { # does it look like a diff? + if ($txt =~ /^(?:diff|---|\+\+\+) /ms) { + index_diff($self, \$txt, $doc); + } else { + index_text($self, $txt, 1, 'XNQ'); + } + } + undef $txt; # free memory + } + if (defined $rd) { # reap `git patch-id' + (readline($rd) // '') =~ /\A([a-f0-9]{40,})/ and + $doc->add_term('XDFID'.$1); + if (!$rd->close) { + my $c = 'git patch-id --stable'; + $PATCHID_BROKEN = ($? >> 8) == 129; + $PATCHID_BROKEN ? warn("W: $c requires git v2.1.0+\n") + : warn("W: $c failed: \$?=$? (non-fatal)"); + } + } +} + sub index_xapian { # msg_iter callback my $part = $_[0]->[0]; # ignore $depth and $idx my ($self, $doc) = @{$_[1]}; @@ -369,37 +417,7 @@ sub index_xapian { # msg_iter callback my ($s, undef) = msg_part_text($part, $ct); defined $s or return; $_[0]->[0] = $part = undef; # free memory - - if ($s =~ /^(?:diff|---|\+\+\+) /ms) { - open(my $fh, '+>:utf8', undef) or die "open: $!"; - open(my $eh, '+>', undef) or die "open: $!"; - $fh->autoflush(1); - print $fh $s or die "print: $!"; - sysseek($fh, 0, SEEK_SET) or die "sysseek: $!"; - my $id = ($self->{ibx} // $self->{eidx})->git->qx( - [qw(patch-id --stable)], - {}, { 0 => $fh, 2 => $eh }); - $id =~ /\A([a-f0-9]{40,})/ and $doc->add_term('XDFID'.$1); - seek($eh, 0, SEEK_SET) or die "seek: $!"; - while (<$eh>) { warn $_ } - } - - # split off quoted and unquoted blocks: - my @sections = PublicInbox::MsgIter::split_quotes($s); - undef $s; # free memory - for my $txt (@sections) { - if ($txt =~ /\A>/) { - index_text($self, $txt, 0, 'XQUOT'); - } else { - # does it look like a diff? - if ($txt =~ /^(?:diff|---|\+\+\+) /ms) { - index_diff($self, \$txt, $doc); - } else { - index_text($self, $txt, 1, 'XNQ'); - } - } - undef $txt; # free memory - } + index_body_text($self, $doc, \$s); } sub index_list_id ($$$) { @@ -407,6 +425,7 @@ sub index_list_id ($$$) { for my $l ($hdr->header_raw('List-Id')) { $l =~ /<([^>]+)>/ or next; my $lid = lc $1; + $lid =~ tr/\n\t\r\0//d; # same rules as Message-ID $doc->add_boolean_term('G' . $lid); index_phrase($self, $lid, 1, 'XL'); # probabilistic } @@ -442,8 +461,7 @@ sub eml2doc ($$$;$) { add_val($doc, PublicInbox::Search::UID(), $smsg->{num}); add_val($doc, PublicInbox::Search::THREADID, $smsg->{tid}); - my $tg = term_generator($self); - $tg->set_document($doc); + term_generator($self)->set_document($doc); index_headers($self, $smsg); if (defined(my $eidx_key = $smsg->{eidx_key})) { @@ -453,7 +471,7 @@ sub eml2doc ($$$;$) { index_ids($self, $doc, $eml, $mids); # by default, we maintain compatibility with v1.5.0 and earlier - # by writing to docdata.glass, users who never exect to downgrade can + # by writing to docdata.glass, users who never expect to downgrade can # use --skip-docdata if (!$self->{-skip_docdata}) { # WWW doesn't need {to} or {cc}, only NNTP @@ -540,9 +558,7 @@ sub add_message { sub _get_doc ($$) { my ($self, $docid) = @_; - my $doc = eval { $self->{xdb}->get_document($docid) }; - $doc // do { - warn "E: $@\n" if $@; + $self->get_doc($docid) // do { warn "E: #$docid missing in Xapian\n"; undef; } @@ -600,17 +616,16 @@ sub set_vmd { my ($self, $docid, $vmd) = @_; begin_txn_lazy($self); my $doc = _get_doc($self, $docid) or return; - my ($end, @rm, @add); + my ($v, @rm, @add); my @x = @VMD_MAP; + my ($cur, $end) = ($doc->termlist_begin, $doc->termlist_end); while (my ($field, $pfx) = splice(@x, 0, 2)) { my $set = $vmd->{$field} // next; my %keep = map { $_ => 1 } @$set; my %add = %keep; - $end //= $doc->termlist_end; - for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) { - $cur->skip_to($pfx); - last if $cur == $end; - my $v = $cur->get_termname; + $cur->skip_to($pfx); # works due to @VMD_MAP order + for (; $cur != $end; $cur++) { + $v = $cur->get_termname; $v =~ s/\A$pfx//s or next; $keep{$v} ? delete($add{$v}) : push(@rm, $pfx.$v); } @@ -690,7 +705,7 @@ sub xdb_remove { my $xdb = $self->{xdb} // die 'BUG: missing {xdb}'; for my $docid (@docids) { eval { $xdb->delete_document($docid) }; - warn "E: #$docid not in in Xapian? $@\n" if $@; + warn "E: #$docid not in Xapian? $@\n" if $@; } } @@ -707,7 +722,6 @@ sub nr_quiet_rm { delete($_[0]->{-quiet_rm}) // 0 } sub index_git_blob_id { my ($doc, $pfx, $objid) = @_; - my $len = length($objid); for (my $len = length($objid); $len >= 7; ) { $doc->add_term($pfx.$objid); $objid = substr($objid, 0, --$len); @@ -801,7 +815,8 @@ sub unindex_both { # git->cat_async callback sub with_umask { my $self = shift; - ($self->{ibx} // $self->{eidx})->with_umask(@_); + my $owner = $self->{ibx} // $self->{eidx}; + $owner ? $owner->with_umask(@_) : $self->SUPER::with_umask(@_) } # called by public-inbox-index @@ -819,10 +834,10 @@ sub index_sync { } sub check_size { # check_async cb for -index --max-size=... - my ($oid, $type, $size, $arg, $git) = @_; - (($type // '') eq 'blob') or die "E: bad $oid in $git->{git_dir}"; + my (undef, $oid, $type, $size, $arg) = @_; + ($type // '') eq 'blob' or die "E: bad $oid in $arg->{git}->{git_dir}"; if ($size <= $arg->{max_size}) { - $git->cat_async($oid, $arg->{index_oid}, $arg); + $arg->{git}->cat_async($oid, $arg->{index_oid}, $arg); } else { warn "W: skipping $oid ($size > $arg->{max_size})\n"; } @@ -904,6 +919,7 @@ sub process_stack { $arg->{autime} = $at; $arg->{cotime} = $ct; if ($sync->{max_size}) { + $arg->{git} = $git; $git->check_async($oid, \&check_size, $arg); } else { $git->cat_async($oid, \&index_both, $arg); @@ -964,7 +980,7 @@ sub log2stack ($$$) { $stk->push_rec('m', $at, $ct, $oid, $cmt); } } - close $fh or die "git log failed: \$?=$?"; + $fh->close or die "git log failed: \$?=$?"; $stk //= PublicInbox::IdxStack->new; $stk->read_prepare; } @@ -989,9 +1005,7 @@ sub is_ancestor ($$$) { return 0 unless $git->check($cur); my $cmd = [ 'git', "--git-dir=$git->{git_dir}", qw(merge-base --is-ancestor), $cur, $tip ]; - my $pid = spawn($cmd); - waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish'; - $? == 0; + run_wait($cmd) == 0; } sub need_update ($$$$) { @@ -1052,7 +1066,11 @@ sub _index_sync { my $ibx = $self->{ibx}; local $self->{current_info} = "$ibx->{inboxdir}"; $self->{batch_bytes} = $opt->{batch_size} // $BATCH_BYTES; - $ibx->git->batch_prepare; + + if ($X->{CLOEXEC_UNSET}) { + $ibx->git->cat_file($tip); + $ibx->git->check($tip); + } my $pr = $opt->{-progress}; my $sync = { reindex => $opt->{reindex}, -opt => $opt, ibx => $ibx }; my $quit = quit_cb($sync); @@ -1088,8 +1106,10 @@ sub DESTROY { $_[0]->{lockfh} = undef; } -sub _begin_txn { +sub begin_txn_lazy { my ($self) = @_; + return if $self->{txn}; + my $restore = $self->with_umask; my $xdb = $self->{xdb} || idx_acquire($self); $self->{oidx}->begin_lazy if $self->{oidx}; $xdb->begin_transaction if $xdb; @@ -1097,13 +1117,8 @@ sub _begin_txn { $xdb; } -sub begin_txn_lazy { - my ($self) = @_; - $self->with_umask(\&_begin_txn, $self) if !$self->{txn}; -} - # store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard) -# This metadata is read by Admin::detect_indexlevel: +# This metadata is read by InboxWritable->detect_indexlevel: sub set_metadata_once { my ($self) = @_; @@ -1125,8 +1140,10 @@ sub set_metadata_once { } } -sub _commit_txn { +sub commit_txn_lazy { my ($self) = @_; + return unless delete($self->{txn}); + my $restore = $self->with_umask; if (my $eidx = $self->{eidx}) { $eidx->git->async_wait_all; $eidx->{transact_bytes} = 0; @@ -1138,12 +1155,6 @@ sub _commit_txn { $self->{oidx}->commit_lazy if $self->{oidx}; } -sub commit_txn_lazy { - my ($self) = @_; - delete($self->{txn}) and - $self->with_umask(\&_commit_txn, $self); -} - sub eidx_shard_new { my ($class, $eidx, $shard) = @_; my $self = bless { diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index 000abd94..ea261bda 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -1,13 +1,13 @@ -# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Internal interface for a single Xapian shard in V2 inboxes. # See L<public-inbox-v2-format(5)> for more info on how we shard Xapian package PublicInbox::SearchIdxShard; -use strict; -use v5.10.1; +use v5.12; use parent qw(PublicInbox::SearchIdx PublicInbox::IPC); use PublicInbox::OnDestroy; +use PublicInbox::Syscall qw($F_SETPIPE_SZ); sub new { my ($class, $v2w, $shard) = @_; # v2w may be ExtSearchIdx @@ -21,24 +21,21 @@ sub new { if ($v2w->{parallel}) { local $self->{-v2w_afc} = $v2w; $self->ipc_worker_spawn("shard[$shard]"); - # F_SETPIPE_SZ = 1031 on Linux; increasing the pipe size for - # inputs speeds V2Writable batch imports across 8 cores by - # nearly 20%. Since any of our responses are small, make - # the response pipe as small as possible - if ($^O eq 'linux') { - fcntl($self->{-ipc_req}, 1031, 1048576); - fcntl($self->{-ipc_res}, 1031, 4096); + # Increasing the pipe size for requests speeds V2 batch imports + # across 8 cores by nearly 20%. Since many of our responses + # are small, make the response pipe as small as possible + if ($F_SETPIPE_SZ) { + fcntl($self->{-ipc_req}, $F_SETPIPE_SZ, 1048576); + fcntl($self->{-ipc_res}, $F_SETPIPE_SZ, 4096); } } $self; } -sub _worker_done { +sub _worker_done { # OnDestroy cb my ($self) = @_; - if ($self->need_xapian) { - die "$$ $0 xdb not released\n" if $self->{xdb}; - } - die "$$ $0 still in transaction\n" if $self->{txn}; + die "BUG: $$ $0 xdb active" if $self->need_xapian && $self->{xdb}; + die "BUG: $$ $0 txn active" if $self->{txn}; } sub ipc_atfork_child { # called automatically before ipc_worker_loop @@ -47,8 +44,8 @@ sub ipc_atfork_child { # called automatically before ipc_worker_loop $v2w->atfork_child; # calls ipc_sibling_atfork_child on our siblings $v2w->{current_info} = "[$self->{shard}]"; # for $SIG{__WARN__} $self->begin_txn_lazy; - # caller must capture this: - PublicInbox::OnDestroy->new($$, \&_worker_done, $self); + # caller (ipc_worker_spawn) must capture this: + on_destroy \&_worker_done, $self; } sub index_eml { @@ -65,7 +62,7 @@ sub echo { sub idx_close { my ($self) = @_; - die "transaction in progress $self\n" if $self->{txn}; + die "BUG: $$ $0 txn active" if $self->{txn}; $self->idx_release if $self->{xdb}; } diff --git a/lib/PublicInbox/SearchQuery.pm b/lib/PublicInbox/SearchQuery.pm index a6b7d843..747e3249 100644 --- a/lib/PublicInbox/SearchQuery.pm +++ b/lib/PublicInbox/SearchQuery.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # used by PublicInbox::SearchView and PublicInbox::WwwListing @@ -6,7 +6,7 @@ package PublicInbox::SearchQuery; use strict; use v5.10.1; use URI::Escape qw(uri_escape); -use PublicInbox::MID qw(MID_ESC); +use PublicInbox::Hval qw(ascii_html); our $LIM = 200; sub new { @@ -16,10 +16,11 @@ sub new { my $t = $qp->{t}; # collapse threads my ($l) = (($qp->{l} || '') =~ /([0-9]+)/); $l = $LIM if !$l || $l > $LIM; + my ($o) = (($qp->{o} || '0') =~ /(-?[0-9]+)/); bless { q => $qp->{'q'}, x => $qp->{x} || '', - o => (($qp->{o} || '0') =~ /(-?[0-9]+)/), + o => $o, l => $l, r => (defined $r && $r ne '0'), t => (defined $t && $t ne '0'), @@ -34,9 +35,13 @@ sub qs_html { } my $qs = ''; if (defined(my $q = $self->{'q'})) { - $q = uri_escape($q, MID_ESC); + # not using MID_ESC since that's for the path component and + # this is for the query component. Unlike MID_ESC, + # this disallows [\&\'\+=] and allows slash [/] for + # nicer looking dfn: queries + $q = uri_escape($q, '^A-Za-z0-9\-\._~!\$\(\)\*,;:@/'); $q =~ s/%20/+/g; # improve URL readability - $qs .= "q=$q"; + $qs .= 'q='.ascii_html($q); } if (my $o = $self->{o}) { # ignore o == 0 $qs .= "&o=$o"; diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index b025ec96..9919e25c 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -30,59 +30,67 @@ sub mbox_results { sub sres_top_html { my ($ctx) = @_; - my $srch = $ctx->{ibx}->isrch or + my $srch = $ctx->{srch} = $ctx->{ibx}->isrch or return PublicInbox::WWW::need($ctx, 'Search'); my $q = PublicInbox::SearchQuery->new($ctx->{qp}); - my $x = $q->{x}; - my $o = $q->{o}; + my $o = $q->{o} // 0; my $asc; if ($o < 0) { $asc = 1; $o = -($o + 1); # so [-1] is the last element, like Perl lists } - my $code = 200; # double the limit for expanded views: - my $opts = { + my $opt = { limit => $q->{l}, offset => $o, relevance => $q->{r}, threads => $q->{t}, asc => $asc, }; - my ($mset, $total, $err, $html); -retry: - eval { - my $query = $q->{'q'}; - $srch->query_approxidate($ctx->{ibx}->git, $query); - $mset = $srch->mset($query, $opts); - $total = $mset->get_matches_estimated; - }; - $err = $@; + my $qs = $q->{'q'}; + $srch->query_approxidate($ctx->{ibx}->git, $qs); + sub { + $ctx->{wcb} = $_[0]; # PSGI server supplied write cb + $srch->async_mset($qs, $opt, \&sres_html_cb, $ctx, $opt, $q); + } +} + +sub sres_html_cb { # async_mset cb + my ($ctx, $opt, $q, $mset, $err) = @_; + my $code = 200; + my $total = $mset ? $mset->get_matches_estimated : undef; ctx_prepare($q, $ctx); + my ($res, $html); if ($err) { $code = 400; $html = '<pre>'.err_txt($ctx, $err).'</pre><hr>'; } elsif ($total == 0) { - if (defined($ctx->{-uxs_retried})) { - # undo retry damage: + if (defined($ctx->{-uxs_retried})) { # undo retry damage: $q->{'q'} = $ctx->{-uxs_retried}; - } elsif (index($q->{'q'}, '%') >= 0) { + } elsif (index($q->{'q'}, '%') >= 0) { # retry unescaped $ctx->{-uxs_retried} = $q->{'q'}; - $q->{'q'} = uri_unescape($q->{'q'}); - goto retry; + my $qs = $q->{'q'} = uri_unescape($q->{'q'}); + $ctx->{srch}->query_approxidate($ctx->{ibx}->git, $qs); + return $ctx->{srch}->async_mset($qs, $opt, + \&sres_html_cb, $ctx, $opt, $q); } $code = 404; $html = "<pre>\n[No results found]</pre><hr>"; + } elsif ($q->{x} eq 'A') { + $res = adump($mset, $q, $ctx); } else { - return adump($_[0], $mset, $q, $ctx) if $x eq 'A'; - $ctx->{-html_tip} = search_nav_top($mset, $q, $ctx); - return mset_thread($ctx, $mset, $q) if $x eq 't'; - mset_summary($ctx, $mset, $q); # appends to {-html_tip} - $html = ''; + if ($q->{x} eq 't') { + $res = mset_thread($ctx, $mset, $q); + } else { + mset_summary($ctx, $mset, $q); # appends to {-html_tip} + $html = ''; + } } - html_oneshot($ctx, $code); + $res //= html_oneshot($ctx, $code, $html); + my $wcb = delete $ctx->{wcb}; + ref($res) eq 'CODE' ? $res->($wcb) : $wcb->($res); } # display non-nested search results similar to what users expect from @@ -134,7 +142,7 @@ sub mset_summary { $q->{-min_pct} = $min; $q->{-max_pct} = $max; } - $$res .= search_nav_bot($mset, $q); + $$res .= search_nav_bot($ctx, $mset, $q); undef; } @@ -225,7 +233,7 @@ EOM } sub search_nav_bot { # also used by WwwListing for searching extindex miscidx - my ($mset, $q) = @_; + my ($ctx, $mset, $q) = @_; my $total = $mset->get_matches_estimated; my $l = $q->{l}; my $rv = '</pre><hr><pre id=t>'; @@ -274,9 +282,10 @@ sub search_nav_bot { # also used by WwwListing for searching extindex miscidx $rv .= qq{<a\nhref="?$prev"\nrel=prev>prev $pd</a>} if $prev; my $rev = $q->qs_html(o => $o < 0 ? 0 : -1); - $rv .= qq{ | <a\nhref="?$rev">reverse</a>} . - q{ | sort options + mbox downloads } . - q{<a href=#d>above</a></pre>}; + $rv .= qq{ | <a\nhref="?$rev">reverse</a>}; + exists($ctx->{ibx}) and + $rv .= q{ | options <a href=#d>above</a></pre>}; + $rv; } sub sort_relevance { @@ -301,9 +310,9 @@ sub mset_thread { my $rootset = PublicInbox::SearchThread::thread($msgs, $r ? \&sort_relevance : \&PublicInbox::View::sort_ds, $ctx); - my $skel = search_nav_bot($mset, $q). - "<pre>-- links below jump to the message on this page --\n"; - + my $skel = search_nav_bot($ctx, $mset, $q).'<pre>'. <<EOM; +-- pct% links below jump to the message on this page, permalinks otherwise -- +EOM $ctx->{-upfx} = ''; $ctx->{anchor_idx} = 1; $ctx->{cur_level} = 0; @@ -321,20 +330,20 @@ sub mset_thread { # link $INBOX_DIR/description text to "recent" view around # the newest message in this result set: - $ctx->{-t_max} = max(map { delete $_->{ts} } @$msgs); + $ctx->{-t_max} = max(map { $_->{ts} } @$msgs); @$msgs = reverse @$msgs if $r; $ctx->{msgs} = $msgs; - PublicInbox::WwwStream::aresponse($ctx, 200, \&mset_thread_i); + PublicInbox::WwwStream::aresponse($ctx, \&mset_thread_i); } # callback for PublicInbox::WwwStream::getline sub mset_thread_i { my ($ctx, $eml) = @_; - $ctx->zmore($ctx->html_top) if exists $ctx->{-html_tip}; + print { $ctx->zfh } $ctx->html_top if exists $ctx->{-html_tip}; $eml and return PublicInbox::View::eml_entry($ctx, $eml); my $smsg = shift @{$ctx->{msgs}} or - $ctx->zmore(${delete($ctx->{skel})}); + print { $ctx->zfh } ${delete($ctx->{skel})}; $smsg; } @@ -356,10 +365,10 @@ sub ctx_prepare { } sub adump { - my ($cb, $mset, $q, $ctx) = @_; + my ($mset, $q, $ctx) = @_; $ctx->{ids} = $ctx->{ibx}->isrch->mset_to_artnums($mset); $ctx->{search_query} = $q; # used by WwwAtomStream::atom_header - PublicInbox::WwwAtomStream->response($ctx, 200, \&adump_i); + PublicInbox::WwwAtomStream->response($ctx, \&adump_i); } # callback for PublicInbox::WwwAtomStream::getline diff --git a/lib/PublicInbox/Select.pm b/lib/PublicInbox/Select.pm new file mode 100644 index 00000000..face8edc --- /dev/null +++ b/lib/PublicInbox/Select.pm @@ -0,0 +1,43 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# This makes select(2) look like epoll to simplify the code in DS.pm. +# Unlike IO::Select, it does NOT hold references to IO handles. +# This is NOT meant to be an all encompassing emulation of epoll +# via select, but only to support cases we care about. +package PublicInbox::Select; +use v5.12; +use PublicInbox::Syscall qw(EPOLLONESHOT EPOLLIN EPOLLOUT); +use Errno; + +sub new { bless {}, __PACKAGE__ } # fd => events + +sub ep_wait { + my ($self, $msec, $events) = @_; + my ($rvec, $wvec) = ('', ''); # we don't use EPOLLERR + while (my ($fd, $ev) = each %$self) { + vec($rvec, $fd, 1) = 1 if $ev & EPOLLIN; + vec($wvec, $fd, 1) = 1 if $ev & EPOLLOUT; + } + @$events = (); + my $to = $msec < 0 ? undef : ($msec/1000); + my $n = select $rvec, $wvec, undef, $to or return; # timeout expired + return if $n < 0 && $! == Errno::EINTR; # caller recalculates timeout + die "select: $!" if $n < 0; + while (my ($fd, $ev) = each %$self) { + if (vec($rvec, $fd, 1) || vec($wvec, $fd, 1)) { + delete($self->{$fd}) if $ev & EPOLLONESHOT; + push @$events, $fd; + } + } + $n == scalar(@$events) or + warn "BUG? select() returned $n, but got ".scalar(@$events); +} + +sub ep_del { delete($_[0]->{fileno($_[1])}); 0 } +sub ep_add { $_[0]->{fileno($_[1])} = $_[2]; 0 } + +no warnings 'once'; +*ep_mod = \&ep_add; + +1; diff --git a/lib/PublicInbox/SharedKV.pm b/lib/PublicInbox/SharedKV.pm index 90ccf2b4..89ab3f74 100644 --- a/lib/PublicInbox/SharedKV.pm +++ b/lib/PublicInbox/SharedKV.pm @@ -11,7 +11,7 @@ use parent qw(PublicInbox::Lock); use File::Temp qw(tempdir); use DBI qw(:sql_types); # SQL_BLOB use PublicInbox::Spawn; -use File::Path qw(rmtree make_path); +use File::Path qw(rmtree); sub dbh { my ($self, $lock) = @_; @@ -43,7 +43,7 @@ CREATE TABLE IF NOT EXISTS kv ( sub new { my ($cls, $dir, $base, $opt) = @_; my $self = bless { opt => $opt }, $cls; - make_path($dir) if defined($dir) && !-d $dir; + File::Path::mkpath($dir) if defined($dir); $dir //= $self->{"tmp$$.$self"} = tempdir("skv.$$-XXXX", TMPDIR => 1); $base //= ''; my $f = $self->{filename} = "$dir/$base.sqlite3"; diff --git a/lib/PublicInbox/Sigfd.pm b/lib/PublicInbox/Sigfd.pm index 81e5a1b1..b8a1ddfb 100644 --- a/lib/PublicInbox/Sigfd.pm +++ b/lib/PublicInbox/Sigfd.pm @@ -1,43 +1,35 @@ -# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Wraps a signalfd (or similar) for PublicInbox::DS # fields: (sig: hashref similar to %SIG, but signal numbers as keys) package PublicInbox::Sigfd; -use strict; +use v5.12; use parent qw(PublicInbox::DS); -use PublicInbox::Syscall qw(signalfd EPOLLIN EPOLLET); +use PublicInbox::Syscall qw(signalfd EPOLLIN EPOLLET %SIGNUM); use POSIX (); # returns a coderef to unblock signals if neither signalfd or kqueue # are available. sub new { - my ($class, $sig, $nonblock) = @_; + my ($class, $sig) = @_; my %signo = map {; - my $cb = $sig->{$_}; - # SIGWINCH is 28 on FreeBSD, NetBSD, OpenBSD - my $num = ($_ eq 'WINCH' && $^O =~ /linux|bsd/i) ? 28 : do { - my $m = "SIG$_"; - POSIX->$m; - }; - $num => $cb; + # $num => [ $cb, $signame ]; + ($SIGNUM{$_} // POSIX->can("SIG$_")->()) => [ $sig->{$_}, $_ ] } keys %$sig; my $self = bless { sig => \%signo }, $class; my $io; - my $fd = signalfd([keys %signo], $nonblock); + my $fd = signalfd([keys %signo]); if (defined $fd && $fd >= 0) { open($io, '+<&=', $fd) or die "open: $!"; } elsif (eval { require PublicInbox::DSKQXS }) { - $io = PublicInbox::DSKQXS->signalfd([keys %signo], $nonblock); + $io = PublicInbox::DSKQXS->signalfd([keys %signo]); } else { return; # wake up every second to check for signals } - if ($nonblock) { # it can go into the event loop - $self->SUPER::new($io, EPOLLIN | EPOLLET); - } else { # master main loop - $self->{sock} = $io; - $self; - } + $self->SUPER::new($io, EPOLLIN | EPOLLET); + $self->{is_kq} = 1 if tied(*$io); + $self; } # PublicInbox::Daemon in master main loop (blocking) @@ -50,8 +42,8 @@ sub wait_once ($) { for my $off (0..$nr) { # the first uint32_t of signalfd_siginfo: ssi_signo my $signo = unpack('L', substr($buf, 128 * $off, 4)); - my $cb = $self->{sig}->{$signo}; - $cb->($signo) if $cb ne 'IGNORE'; + my ($cb, $signame) = @{$self->{sig}->{$signo}}; + $cb->($signame) if $cb ne 'IGNORE'; } } $r; diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm index 260ce6bb..b132381b 100644 --- a/lib/PublicInbox/Smsg.pm +++ b/lib/PublicInbox/Smsg.pm @@ -99,9 +99,6 @@ sub populate { # to protect git and NNTP clients $val =~ tr/\0\t\n/ /; - # rare: in case headers have wide chars (not RFC2047-encoded) - utf8::decode($val); - # lower-case fields for read-only stuff $self->{lc($f)} = $val; @@ -115,8 +112,10 @@ sub populate { $self->{$f} = $val if $val ne ''; } $sync //= {}; - $self->{-ds} = [ my @ds = msg_datestamp($hdr, $sync->{autime}) ]; - $self->{-ts} = [ my @ts = msg_timestamp($hdr, $sync->{cotime}) ]; + my @ds = msg_datestamp($hdr, $sync->{autime} // $self->{ds}); + my @ts = msg_timestamp($hdr, $sync->{cotime} // $self->{ts}); + $self->{-ds} = \@ds; + $self->{-ts} = \@ts; $self->{ds} //= $ds[0]; # no zone $self->{ts} //= $ts[0]; $self->{mid} //= mids($hdr)->[0]; diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm index d3567aa2..296e7d17 100644 --- a/lib/PublicInbox/SolverGit.pm +++ b/lib/PublicInbox/SolverGit.pm @@ -11,13 +11,16 @@ package PublicInbox::SolverGit; use strict; use v5.10.1; use File::Temp 0.19 (); # 0.19 for ->newdir +use autodie qw(mkdir); use Fcntl qw(SEEK_SET); use PublicInbox::Git qw(git_unquote git_quote); +use PublicInbox::IO qw(write_file); use PublicInbox::MsgIter qw(msg_part_text); use PublicInbox::Qspawn; use PublicInbox::Tmpfile; use PublicInbox::GitAsyncCat; use PublicInbox::Eml; +use PublicInbox::Compat qw(uniqstr); use URI::Escape qw(uri_escape_utf8); # POSIX requires _POSIX_ARG_MAX >= 4096, and xargs is required to @@ -79,11 +82,13 @@ sub solve_existing ($$) { my $try = $want->{try_gits} //= [ @{$self->{gits}} ]; # array copy my $git = shift @$try or die 'BUG {try_gits} empty'; my $oid_b = $want->{oid_b}; + + # can't use async_check due to last_check_err :< my ($oid_full, $type, $size) = $git->check($oid_b); + $git->schedule_cleanup if $self->{psgi_env}->{'pi-httpd.async'}; - # other than {oid_b, try_gits, try_ibxs} - my $have_hints = scalar keys %$want > 3; - if (defined($type) && (!$have_hints || $type eq 'blob')) { + if ($oid_b eq ($oid_full // '') || (defined($type) && + (!$self->{have_hints} || $type eq 'blob'))) { delete $want->{try_gits}; return [ $git, $oid_full, $type, int($size) ]; # done, success } @@ -106,6 +111,11 @@ sub solve_existing ($$) { scalar(@$try); } +sub _tmp { + $_[0]->{tmp} //= + File::Temp->newdir("solver.$_[0]->{oid_want}-XXXX", TMPDIR => 1); +} + sub extract_diff ($$) { my ($p, $arg) = @_; my ($self, $want, $smsg) = @$arg; @@ -193,10 +203,8 @@ sub extract_diff ($$) { my $path = ++$self->{tot}; $di->{n} = $path; - open(my $tmp, '>:utf8', $self->{tmp}->dirname . "/$path") or - die "open(tmp): $!"; - print $tmp $di->{hdr_lines}, $patch or die "print(tmp): $!"; - close $tmp or die "close(tmp): $!"; + my $f = _tmp($self)->dirname."/$path"; + write_file '>:utf8', $f, $di->{hdr_lines}, $patch; # for debugging/diagnostics: $di->{ibx} = $want->{cur_ibx}; @@ -242,14 +250,18 @@ sub find_smsgs ($$$) { sub update_index_result ($$) { my ($bref, $self) = @_; - my ($qsp, $msg) = delete @$self{qw(-qsp -msg)}; - if (my $err = $qsp->{err}) { - ERR($self, "git update-index error: $err"); - } + my ($qsp_err, $msg) = delete @$self{qw(-qsp_err -msg)}; + ERR($self, "git update-index error:$qsp_err") if $qsp_err; dbg($self, $msg); next_step($self); # onto do_git_apply } +sub qsp_qx ($$$) { + my ($self, $qsp, $cb) = @_; + $qsp->{qsp_err} = \($self->{-qsp_err} = ''); + $qsp->psgi_qx($self->{psgi_env}, $self->{limiter}, $cb, $self); +} + sub prepare_index ($) { my ($self) = @_; my $patches = $self->{patches}; @@ -278,47 +290,33 @@ sub prepare_index ($) { my $cmd = [ qw(git update-index -z --index-info) ]; my $qsp = PublicInbox::Qspawn->new($cmd, $self->{git_env}, $rdr); $path_a = git_quote($path_a); - $self->{-qsp} = $qsp; $self->{-msg} = "index prepared:\n$mode_a $oid_full\t$path_a"; - $qsp->psgi_qx($self->{psgi_env}, undef, \&update_index_result, $self); + qsp_qx $self, $qsp, \&update_index_result; } # pure Perl "git init" sub do_git_init ($) { my ($self) = @_; - my $dir = $self->{tmp}->dirname; - my $git_dir = "$dir/git"; + my $git_dir = _tmp($self)->dirname.'/git'; - foreach ('', qw(objects refs objects/info refs/heads)) { - mkdir("$git_dir/$_") or die "mkdir $_: $!"; - } - open my $fh, '>', "$git_dir/config" or die "open git/config: $!"; + mkdir("$git_dir/$_") for ('', qw(objects refs objects/info refs/heads)); my $first = $self->{gits}->[0]; my $fmt = $first->object_format; - my $v = defined($$fmt) ? 1 : 0; - print $fh <<EOF or die "print git/config $!"; + my ($v, @ext) = defined($$fmt) ? (1, <<EOM) : (0); +[extensions] + objectformat = $$fmt +EOM + write_file '>', "$git_dir/config", <<EOF, @ext; [core] repositoryFormatVersion = $v filemode = true bare = false logAllRefUpdates = false EOF - print $fh <<EOM if defined($$fmt); -[extensions] - objectformat = $$fmt -EOM - close $fh or die "close git/config: $!"; - - open $fh, '>', "$git_dir/HEAD" or die "open git/HEAD: $!"; - print $fh "ref: refs/heads/master\n" or die "print git/HEAD: $!"; - close $fh or die "close git/HEAD: $!"; - - my $f = 'objects/info/alternates'; - open $fh, '>', "$git_dir/$f" or die "open: $f: $!"; - foreach my $git (@{$self->{gits}}) { - print $fh $git->git_path('objects'),"\n" or die "print $f: $!"; - } - close $fh or die "close: $f: $!"; + write_file '>', "$git_dir/HEAD", "ref: refs/heads/master\n"; + write_file '>', "$git_dir/objects/info/alternates", map { + $_->git_path('objects')."\n" + } @{$self->{gits}}; my $tmp_git = $self->{tmp_git} = PublicInbox::Git->new($git_dir); $tmp_git->{-tmp} = $self->{tmp}; $self->{git_env} = { @@ -384,12 +382,9 @@ sub event_step ($) { } sub next_step ($) { - my ($self) = @_; # if outside of public-inbox-httpd, caller is expected to be # looping event_step, anyways - my $async = $self->{psgi_env}->{'pi-httpd.async'} or return; - # PublicInbox::HTTPD::Async->new - $async->(undef, undef, $self); + PublicInbox::DS::requeue($_[0]) if $_[0]->{psgi_env}->{'pi-httpd.async'} } sub mark_found ($$$) { @@ -405,21 +400,18 @@ sub mark_found ($$$) { sub parse_ls_files ($$) { my ($self, $bref) = @_; - my ($qsp, $di) = delete @$self{qw(-qsp -cur_di)}; - if (my $err = $qsp->{err}) { - die "git ls-files error: $err"; - } + my ($qsp_err, $di) = delete @$self{qw(-qsp_err -cur_di)}; + die "git ls-files -s -z error:$qsp_err" if $qsp_err; - my ($line, @extra) = split(/\0/, $$bref); + my @ls = split(/\0/, $$bref); + my ($line, @extra) = grep(/\t\Q$di->{path_b}\E\z/, @ls); scalar(@extra) and die "BUG: extra files in index: <", - join('> <', @extra), ">"; - + join('> <', $line, @extra), ">"; + $line // die "no \Q$di->{path_b}\E in <",join('> <', @ls), '>'; my ($info, $file) = split(/\t/, $line, 2); my ($mode_b, $oid_b_full, $stage) = split(/ /, $info); - if ($file ne $di->{path_b}) { - die + $file eq $di->{path_b} or die "BUG: index mismatch: file=$file != path_b=$di->{path_b}"; - } my $tmp_git = $self->{tmp_git} or die 'no git working tree'; my (undef, undef, $size) = $tmp_git->check($oid_b_full); @@ -454,50 +446,49 @@ sub skip_identical ($$$) { } } -sub apply_result ($$) { +sub apply_result ($$) { # qx_cb my ($bref, $self) = @_; - my ($qsp, $di) = delete @$self{qw(-qsp -cur_di)}; + my ($qsp_err, $di) = delete @$self{qw(-qsp_err -cur_di)}; dbg($self, $$bref); my $patches = $self->{patches}; - if (my $err = $qsp->{err}) { - my $msg = "git apply error: $err"; + if ($qsp_err) { + my $msg = "git apply error:$qsp_err"; my $nxt = $patches->[0]; if ($nxt && oids_same_ish($nxt->{oid_b}, $di->{oid_b})) { dbg($self, $msg); dbg($self, 'trying '.di_url($self, $nxt)); return do_git_apply($self); } else { - ERR($self, $msg); + $msg .= " (no patches left to try for $di->{oid_b})\n"; + dbg($self, $msg); + return done($self, undef); } } else { skip_identical($self, $patches, $di->{oid_b}); } my @cmd = qw(git ls-files -s -z); - $qsp = PublicInbox::Qspawn->new(\@cmd, $self->{git_env}); + my $qsp = PublicInbox::Qspawn->new(\@cmd, $self->{git_env}); $self->{-cur_di} = $di; - $self->{-qsp} = $qsp; - $qsp->psgi_qx($self->{psgi_env}, undef, \&ls_files_result, $self); + qsp_qx $self, $qsp, \&ls_files_result; } sub do_git_apply ($) { my ($self) = @_; - my $dn = $self->{tmp}->dirname; my $patches = $self->{patches}; # we need --ignore-whitespace because some patches are CRLF my @cmd = (qw(git apply --cached --ignore-whitespace --unidiff-zero --whitespace=warn --verbose)); my $len = length(join(' ', @cmd)); - my $total = $self->{tot}; my $di; # keep track of the last one for "git ls-files" my $prv_oid_b; do { my $i = ++$self->{nr}; $di = shift @$patches; - dbg($self, "\napplying [$i/$total] " . di_url($self, $di) . - "\n" . $di->{hdr_lines}); + dbg($self, "\napplying [$i/$self->{nr_p}] " . + di_url($self, $di) . "\n" . $di->{hdr_lines}); my $path = $di->{n}; $len += length($path) + 1; push @cmd, $path; @@ -505,11 +496,10 @@ sub do_git_apply ($) { } while (@$patches && $len < $ARG_SIZE_MAX && !oids_same_ish($patches->[0]->{oid_b}, $prv_oid_b)); - my $opt = { 2 => 1, -C => $dn, quiet => 1 }; + my $opt = { 2 => 1, -C => _tmp($self)->dirname, quiet => 1 }; my $qsp = PublicInbox::Qspawn->new(\@cmd, $self->{git_env}, $opt); $self->{-cur_di} = $di; - $self->{-qsp} = $qsp; - $qsp->psgi_qx($self->{psgi_env}, undef, \&apply_result, $self); + qsp_qx $self, $qsp, \&apply_result; } sub di_url ($$) { @@ -558,8 +548,9 @@ sub extract_diffs_done { my $diffs = delete $self->{tmp_diffs}; if (scalar @$diffs) { unshift @{$self->{patches}}, @$diffs; - dbg($self, "found $want->{oid_b} in " . join(" ||\n\t", - map { di_url($self, $_) } @$diffs)); + my @u = uniqstr(map { di_url($self, $_) } @$diffs); + dbg($self, "found $want->{oid_b} in " . join(" ||\n\t", @u)); + ++$self->{nr_p}; # good, we can find a path to the oid we $want, now # lets see if we need to apply more patches: @@ -641,7 +632,7 @@ sub resolve_patch ($$) { # scan through inboxes to look for emails which results in # the oid we want: - my $ibx = shift(@{$want->{try_ibxs}}) or die 'BUG: {try_ibxs} empty'; + my $ibx = shift(@{$want->{try_ibxs}}) or return done($self, undef); if (my $msgs = find_smsgs($self, $ibx, $want)) { $want->{try_smsgs} = $msgs; $want->{cur_ibx} = $ibx; @@ -655,15 +646,19 @@ sub resolve_patch ($$) { # so user_cb never references the SolverGit object sub new { my ($class, $ibx, $user_cb, $uarg) = @_; + my $gits = $ibx ? $ibx->{-repo_objs} : undef; + + # FIXME: cindex --join= is super-aggressive and may hit too many + $gits = [ @$gits[0..2] ] if $gits && @$gits > 3; - bless { - gits => $ibx->{-repo_objs}, + bless { # $ibx is undef if coderepo only (see WwwCoderepo) + gits => $gits, user_cb => $user_cb, uarg => $uarg, - # -cur_di, -qsp, -msg => temporary fields for Qspawn callbacks + # -cur_di, -qsp_err, -msg => temp fields for Qspawn callbacks # TODO: config option for searching related inboxes - inboxes => [ $ibx ], + inboxes => $ibx ? [ $ibx ] : [], }, $class; } @@ -682,17 +677,16 @@ sub solve ($$$$$) { $self->{oid_want} = $oid_want; $self->{out} = $out; $self->{seen_oid} = {}; - $self->{tot} = 0; + $self->{tot} = $self->{nr_p} = 0; $self->{psgi_env} = $env; + $self->{have_hints} = 1 if scalar keys %$hints; $self->{todo} = [ { %$hints, oid_b => $oid_want } ]; $self->{patches} = []; # [ $di, $di, ... ] $self->{found} = {}; # { abbr => [ ::Git, oid, type, size, $di ] } - $self->{tmp} = File::Temp->newdir("solver.$oid_want-XXXX", TMPDIR => 1); dbg($self, "solving $oid_want ..."); - if (my $async = $env->{'pi-httpd.async'}) { - # PublicInbox::HTTPD::Async->new - $async->(undef, undef, $self); + if ($env->{'pi-httpd.async'}) { + PublicInbox::DS::requeue($self); } else { event_step($self) while $self->{user_cb}; } diff --git a/lib/PublicInbox/Spamcheck.pm b/lib/PublicInbox/Spamcheck.pm index d8fa80c8..fbf9355d 100644 --- a/lib/PublicInbox/Spamcheck.pm +++ b/lib/PublicInbox/Spamcheck.pm @@ -1,21 +1,17 @@ -# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Spamchecking used by -watch and -mda tools package PublicInbox::Spamcheck; -use strict; -use warnings; +use v5.12; sub get { my ($cfg, $key, $default) = @_; - my $spamcheck = $cfg->{$key}; - $spamcheck = $default unless $spamcheck; + my $spamcheck = $cfg->{$key} || $default; return if !$spamcheck || $spamcheck eq 'none'; - if ($spamcheck eq 'spamc') { - $spamcheck = 'PublicInbox::Spamcheck::Spamc'; - } + $spamcheck = 'PublicInbox::Spamcheck::Spamc' if $spamcheck eq 'spamc'; if ($spamcheck =~ /::/) { eval "require $spamcheck"; return $spamcheck->new; diff --git a/lib/PublicInbox/Spamcheck/Spamc.pm b/lib/PublicInbox/Spamcheck/Spamc.pm index d2b6429c..b4f95e2b 100644 --- a/lib/PublicInbox/Spamcheck/Spamc.pm +++ b/lib/PublicInbox/Spamcheck/Spamc.pm @@ -1,18 +1,17 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Default spam filter class for wrapping spamc(1) package PublicInbox::Spamcheck::Spamc; -use strict; -use warnings; -use PublicInbox::Spawn qw(popen_rd spawn); +use v5.12; +use PublicInbox::Spawn qw(run_qx run_wait); use IO::Handle; use Fcntl qw(SEEK_SET); sub new { my ($class) = @_; bless { - checkcmd => [qw(spamc -E --headers)], + checkcmd => [qw(spamc -E)], hamcmd => [qw(spamc -L ham)], spamcmd => [qw(spamc -L spam)], }, $class; @@ -21,15 +20,9 @@ sub new { sub spamcheck { my ($self, $msg, $out) = @_; + $out = \(my $buf = '') unless ref($out); my $rdr = { 0 => _msg_to_fh($self, $msg) }; - my ($fh, $pid) = popen_rd($self->{checkcmd}, undef, $rdr); - unless (ref $out) { - my $buf = ''; - $out = \$buf; - } - $$out = do { local $/; <$fh> }; - close $fh or die "close failed: $!"; - waitpid($pid, 0); + $$out = run_qx($self->{checkcmd}, undef, $rdr); ($? || $$out eq '') ? 0 : 1; } @@ -49,9 +42,7 @@ sub _learn { $rdr->{0} = _msg_to_fh($self, $msg); $rdr->{1} ||= $self->_devnull; $rdr->{2} ||= $self->_devnull; - my $pid = spawn($self->{$field}, undef, $rdr); - waitpid($pid, 0); - !$?; + 0 == run_wait($self->{$field}, undef, $rdr); } sub _devnull { diff --git a/lib/PublicInbox/Spawn.pm b/lib/PublicInbox/Spawn.pm index 3f69108a..e9e81e88 100644 --- a/lib/PublicInbox/Spawn.pm +++ b/lib/PublicInbox/Spawn.pm @@ -6,10 +6,8 @@ # is explicitly defined in the environment (and writable). # Under Linux, vfork can make a big difference in spawning performance # as process size increases (fork still needs to mark pages for CoW use). -# Currently, we only use this for code intended for long running -# daemons (inside the PSGI code (-httpd) and -nntpd). The short-lived -# scripts (-mda, -index, -learn, -init) either use IPC::run or standard -# Perl routines. +# None of this is intended to be thread-safe since Perl5 maintainers +# officially discourage the use of threads. # # There'll probably be more OS-level C stuff here, down the line. # We don't want too many DSOs: https://udrepper.livejournal.com/8790.html @@ -17,14 +15,17 @@ package PublicInbox::Spawn; use v5.12; use parent qw(Exporter); -use Symbol qw(gensym); -use Fcntl qw(LOCK_EX SEEK_SET); +use PublicInbox::Lock; +use Fcntl qw(SEEK_SET); use IO::Handle (); -use PublicInbox::ProcessPipe; -our @EXPORT_OK = qw(which spawn popen_rd run_die); -our @RLIMITS = qw(RLIMIT_CPU RLIMIT_CORE RLIMIT_DATA); +use Carp qw(croak); +use PublicInbox::IO; +our @EXPORT_OK = qw(which spawn popen_rd popen_wr run_die run_wait run_qx); +our (@RLIMITS, %RLIMITS); +use autodie qw(close open pipe seek sysseek truncate); BEGIN { + @RLIMITS = qw(RLIMIT_CPU RLIMIT_CORE RLIMIT_DATA); my $all_libc = <<'ALL_LIBC'; # all *nix systems we support #include <sys/resource.h> #include <sys/socket.h> @@ -37,12 +38,6 @@ BEGIN { #include <time.h> #include <stdio.h> #include <string.h> - -/* some platforms need alloca.h, but some don't */ -#if defined(__GNUC__) && !defined(alloca) -# define alloca(sz) __builtin_alloca(sz) -#endif - #include <signal.h> #include <assert.h> @@ -54,11 +49,17 @@ BEGIN { * This is unlike "sv_len", which returns what you would expect. */ #define AV2C_COPY(dst, src) do { \ + static size_t dst##__capa; \ I32 i; \ I32 top_index = av_len(src); \ I32 real_len = top_index + 1; \ I32 capa = real_len + 1; \ - dst = alloca(capa * sizeof(char *)); \ + if (capa > dst##__capa) { \ + dst##__capa = 0; /* in case Newx croaks */ \ + Safefree(dst); \ + Newx(dst, capa, char *); \ + dst##__capa = capa; \ + } \ for (i = 0; i < real_len; i++) { \ SV **sv = av_fetch(src, i, 0); \ dst[i] = SvPV_nolen(*sv); \ @@ -87,23 +88,28 @@ int pi_fork_exec(SV *redirref, SV *file, SV *cmdref, SV *envref, SV *rlimref, AV *env = (AV *)SvRV(envref); AV *rlim = (AV *)SvRV(rlimref); const char *filename = SvPV_nolen(file); - pid_t pid; - char **argv, **envp; + pid_t pid = -1; + static char **argv, **envp; sigset_t set, old; int ret, perrnum; volatile int cerrnum = 0; /* shared due to vfork */ - int chld_is_member; + int chld_is_member; /* needed due to shared memory w/ vfork */ I32 max_fd = av_len(redir); AV2C_COPY(argv, cmd); AV2C_COPY(envp, env); - if (sigfillset(&set)) return -1; - if (sigprocmask(SIG_SETMASK, &set, &old)) return -1; + if (sigfillset(&set)) goto out; + if (sigdelset(&set, SIGABRT)) goto out; + if (sigdelset(&set, SIGBUS)) goto out; + if (sigdelset(&set, SIGFPE)) goto out; + if (sigdelset(&set, SIGILL)) goto out; + if (sigdelset(&set, SIGSEGV)) goto out; + /* no XCPU/XFSZ here */ + if (sigprocmask(SIG_SETMASK, &set, &old)) goto out; chld_is_member = sigismember(&old, SIGCHLD); - if (chld_is_member < 0) return -1; - if (chld_is_member > 0) - sigdelset(&old, SIGCHLD); + if (chld_is_member < 0) goto out; + if (chld_is_member > 0 && sigdelset(&old, SIGCHLD)) goto out; pid = vfork(); if (pid == 0) { @@ -122,8 +128,10 @@ int pi_fork_exec(SV *redirref, SV *file, SV *cmdref, SV *envref, SV *rlimref, exit_err("setpgid", &cerrnum); for (sig = 1; sig < NSIG; sig++) signal(sig, SIG_DFL); /* ignore errors on signals */ - if (*cd && chdir(cd) < 0) - exit_err("chdir", &cerrnum); + if (*cd && chdir(cd) < 0) { + write(2, "cd ", 3); + exit_err(cd, &cerrnum); + } max_rlim = av_len(rlim); for (i = 0; i < max_rlim; i += 3) { @@ -162,22 +170,26 @@ int pi_fork_exec(SV *redirref, SV *file, SV *cmdref, SV *envref, SV *rlimref, } else if (perrnum) { errno = perrnum; } +out: + if (pid < 0) + croak("E: fork_exec %s: %s\n", filename, strerror(errno)); return (int)pid; } -static int sleep_wait(unsigned *tries, int err) +static int sendmsg_retry(int *tries) { const struct timespec req = { 0, 100000000 }; /* 100ms */ + int err = errno; switch (err) { + case EINTR: PERL_ASYNC_CHECK(); return 1; case ENOBUFS: case ENOMEM: case ETOOMANYREFS: - if (++*tries < 50) { - fprintf(stderr, "sleeping on sendmsg: %s (#%u)\n", - strerror(err), *tries); - nanosleep(&req, NULL); - return 1; - } - default: - return 0; + if (--*tries < 0) return 0; + fprintf(stderr, "# sleeping on sendmsg: %s (%d tries left)\n", + strerror(err), *tries); + nanosleep(&req, NULL); + PERL_ASYNC_CHECK(); + return 1; + default: return 0; } } @@ -189,7 +201,7 @@ union my_cmsg { char pad[sizeof(struct cmsghdr) + 16 + SEND_FD_SPACE]; }; -SV *send_cmd4(PerlIO *s, SV *svfds, SV *data, int flags) +SV *send_cmd4_(PerlIO *s, SV *svfds, SV *data, int flags, int tries) { struct msghdr msg = { 0 }; union my_cmsg cmsg = { 0 }; @@ -199,7 +211,6 @@ SV *send_cmd4(PerlIO *s, SV *svfds, SV *data, int flags) AV *fds = (AV *)SvRV(svfds); I32 i, nfds = av_len(fds) + 1; int *fdp; - unsigned tries = 0; if (SvOK(data)) { iov.iov_base = SvPV(data, dlen); @@ -229,7 +240,7 @@ SV *send_cmd4(PerlIO *s, SV *svfds, SV *data, int flags) } do { sent = sendmsg(PerlIO_fileno(s), &msg, flags); - } while (sent < 0 && sleep_wait(&tries, errno)); + } while (sent < 0 && sendmsg_retry(&tries)); return sent >= 0 ? newSViv(sent) : &PL_sv_undef; } @@ -251,58 +262,79 @@ void recv_cmd4(PerlIO *s, SV *buf, STRLEN n) msg.msg_control = &cmsg.hdr; msg.msg_controllen = CMSG_SPACE(SEND_FD_SPACE); - i = recvmsg(PerlIO_fileno(s), &msg, 0); - if (i < 0) - Inline_Stack_Push(&PL_sv_undef); - else + for (;;) { + i = recvmsg(PerlIO_fileno(s), &msg, 0); + if (i >= 0 || errno != EINTR) break; + PERL_ASYNC_CHECK(); + } + if (i >= 0) { SvCUR_set(buf, i); - if (i > 0 && cmsg.hdr.cmsg_level == SOL_SOCKET && - cmsg.hdr.cmsg_type == SCM_RIGHTS) { - size_t len = cmsg.hdr.cmsg_len; - int *fdp = (int *)CMSG_DATA(&cmsg.hdr); - for (i = 0; CMSG_LEN((i + 1) * sizeof(int)) <= len; i++) - Inline_Stack_Push(sv_2mortal(newSViv(*fdp++))); + if (cmsg.hdr.cmsg_level == SOL_SOCKET && + cmsg.hdr.cmsg_type == SCM_RIGHTS) { + size_t len = cmsg.hdr.cmsg_len; + int *fdp = (int *)CMSG_DATA(&cmsg.hdr); + for (i = 0; CMSG_LEN((i + 1) * sizeof(int)) <= len; i++) + Inline_Stack_Push(sv_2mortal(newSViv(*fdp++))); + } + } else { + Inline_Stack_Push(&PL_sv_undef); + SvCUR_set(buf, 0); } Inline_Stack_Done; } #endif /* defined(CMSG_SPACE) && defined(CMSG_LEN) */ -ALL_LIBC - my $inline_dir = $ENV{PERL_INLINE_DIRECTORY} //= ( +void rlimit_map() +{ + Inline_Stack_Vars; + Inline_Stack_Reset; +ALL_LIBC + my $inline_dir = $ENV{PERL_INLINE_DIRECTORY} // ( $ENV{XDG_CACHE_HOME} // ( ($ENV{HOME} // '/nonexistent').'/.cache' ) ).'/public-inbox/inline-c'; - warn "$inline_dir exists, not writable\n" if -e $inline_dir && !-w _; - $all_libc = undef unless -d _ && -w _; + undef $all_libc unless -d $inline_dir; if (defined $all_libc) { - my $f = "$inline_dir/.public-inbox.lock"; - open my $oldout, '>&', \*STDOUT or die "dup(1): $!"; - open my $olderr, '>&', \*STDERR or die "dup(2): $!"; - open my $fh, '+>', $f or die "open($f): $!"; - open STDOUT, '>&', $fh or die "1>$f: $!"; - open STDERR, '>&', $fh or die "2>$f: $!"; + for (@RLIMITS, 'RLIM_INFINITY') { + $all_libc .= <<EOM; + Inline_Stack_Push(sv_2mortal(newSVpvs("$_"))); + Inline_Stack_Push(sv_2mortal(newSViv($_))); +EOM + } + $all_libc .= <<EOM; + Inline_Stack_Done; +} // rlimit_map +EOM + local $ENV{PERL_INLINE_DIRECTORY} = $inline_dir; + # CentOS 7.x ships Inline 0.53, 0.64+ has built-in locking + my $lk = PublicInbox::Lock->new($inline_dir. + '/.public-inbox.lock'); + my $fh = $lk->lock_acquire; + open my $oldout, '>&', \*STDOUT; + open my $olderr, '>&', \*STDERR; + open STDOUT, '>&', $fh; + open STDERR, '>&', $fh; STDERR->autoflush(1); STDOUT->autoflush(1); - - # CentOS 7.x ships Inline 0.53, 0.64+ has built-in locking - flock($fh, LOCK_EX) or die "LOCK_EX($f): $!"; - eval <<'EOM'; -use Inline C => $all_libc, BUILD_NOISY => 1; -EOM + eval 'use Inline C => $all_libc, BUILD_NOISY => 1'; my $err = $@; - my $ndc_err = ''; - $err = $@; - open(STDERR, '>&', $olderr) or warn "restore stderr: $!"; - open(STDOUT, '>&', $oldout) or warn "restore stdout: $!"; + open(STDERR, '>&', $olderr); + open(STDOUT, '>&', $oldout); if ($err) { seek($fh, 0, SEEK_SET); my @msg = <$fh>; - warn "Inline::C build failed:\n", - $ndc_err, $err, "\n", @msg; + truncate($fh, 0); + warn "Inline::C build failed:\n", $err, "\n", @msg; $all_libc = undef; } } - unless ($all_libc) { + if (defined $all_libc) { # set for Gcf2 + $ENV{PERL_INLINE_DIRECTORY} = $inline_dir; + %RLIMITS = rlimit_map(); + *send_cmd4 = sub ($$$$;$) { + send_cmd4_($_[0], $_[1], $_[2], $_[3], 50); + } + } else { require PublicInbox::SpawnPP; *pi_fork_exec = \&PublicInbox::SpawnPP::pi_fork_exec } @@ -319,59 +351,94 @@ sub which ($) { } sub spawn ($;$$) { - my ($cmd, $env, $opts) = @_; + my ($cmd, $env, $opt) = @_; my $f = which($cmd->[0]) // die "$cmd->[0]: command not found\n"; - my @env; - $opts ||= {}; + my (@env, @rdr); my %env = (%ENV, $env ? %$env : ()); while (my ($k, $v) = each %env) { push @env, "$k=$v" if defined($v); } - my $redir = []; for my $child_fd (0..2) { - my $parent_fd = $opts->{$child_fd}; - if (defined($parent_fd) && $parent_fd !~ /\A[0-9]+\z/) { - my $fd = fileno($parent_fd) // - die "$parent_fd not an IO GLOB? $!"; - $parent_fd = $fd; + my $pfd = $opt->{$child_fd}; + if ('SCALAR' eq ref($pfd)) { + open my $fh, '+>', undef; + $opt->{"fh.$child_fd"} = $fh; # for read_out_err + if ($child_fd == 0) { + print $fh $$pfd; + $fh->flush or die "flush: $!"; + sysseek($fh, 0, SEEK_SET); + } + $pfd = fileno($fh); + } elsif (defined($pfd) && $pfd !~ /\A[0-9]+\z/) { + my $fd = fileno($pfd) // + croak "BUG: $pfd not an IO GLOB? $!"; + $pfd = $fd; } - $redir->[$child_fd] = $parent_fd // $child_fd; + $rdr[$child_fd] = $pfd // $child_fd; } my $rlim = []; - foreach my $l (@RLIMITS) { - my $v = $opts->{$l} // next; - my $r = eval "require BSD::Resource; BSD::Resource::$l();"; - unless (defined $r) { - warn "$l undefined by BSD::Resource: $@\n"; - next; - } + my $v = $opt->{$l} // next; + my $r = $RLIMITS{$l} // + eval "require BSD::Resource; BSD::Resource::$l();" // + do { + warn "$l undefined by BSD::Resource: $@\n"; + next; + }; push @$rlim, $r, @$v; } - my $cd = $opts->{'-C'} // ''; # undef => NULL mapping doesn't work? - my $pgid = $opts->{pgid} // -1; - my $pid = pi_fork_exec($redir, $f, $cmd, \@env, $rlim, $cd, $pgid); - die "fork_exec @$cmd failed: $!\n" unless $pid > 0; - $pid; + my $cd = $opt->{'-C'} // ''; # undef => NULL mapping doesn't work? + my $pgid = $opt->{pgid} // -1; + pi_fork_exec(\@rdr, $f, $cmd, \@env, $rlim, $cd, $pgid); } sub popen_rd { + my ($cmd, $env, $opt, @cb_arg) = @_; + pipe(my $r, local $opt->{1}); + PublicInbox::IO::attach_pid($r, spawn($cmd, $env, $opt), @cb_arg); +} + +sub popen_wr { + my ($cmd, $env, $opt, @cb_arg) = @_; + pipe(local $opt->{0}, my $w); + $w->autoflush(1); + PublicInbox::IO::attach_pid($w, spawn($cmd, $env, $opt), @cb_arg); +} + +sub read_out_err ($) { + my ($opt) = @_; + for my $fd (1, 2) { # read stdout/stderr + my $fh = delete($opt->{"fh.$fd"}) // next; + seek($fh, 0, SEEK_SET); + PublicInbox::IO::read_all $fh, undef, $opt->{$fd}; + } +} + +sub run_wait ($;$$) { my ($cmd, $env, $opt) = @_; - pipe(my ($r, $w)) or die "pipe: $!\n"; - $opt ||= {}; - $opt->{1} = fileno($w); - my $pid = spawn($cmd, $env, $opt); - return ($r, $pid) if wantarray; - my $ret = gensym; - tie *$ret, 'PublicInbox::ProcessPipe', $pid, $r, @$opt{qw(cb arg)}; - $ret; + waitpid(spawn($cmd, $env, $opt), 0); + read_out_err($opt); + $? } sub run_die ($;$$) { my ($cmd, $env, $rdr) = @_; - my $pid = spawn($cmd, $env, $rdr); - waitpid($pid, 0) == $pid or die "@$cmd did not finish"; - $? == 0 or die "@$cmd failed: \$?=$?\n"; + run_wait($cmd, $env, $rdr) and croak "E: @$cmd failed: \$?=$?"; +} + +sub run_qx { + my ($cmd, $env, $opt) = @_; + my $fh = popen_rd($cmd, $env, $opt); + my @ret; + if (wantarray) { + @ret = <$fh>; + } else { + local $/; + $ret[0] = <$fh>; + } + $fh->close; # caller should check $? + read_out_err($opt); + wantarray ? @ret : $ret[0]; } 1; diff --git a/lib/PublicInbox/SpawnPP.pm b/lib/PublicInbox/SpawnPP.pm index 6d7e2c34..9ad4d0a1 100644 --- a/lib/PublicInbox/SpawnPP.pm +++ b/lib/PublicInbox/SpawnPP.pm @@ -1,27 +1,29 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Pure-Perl implementation of "spawn". This can't take advantage # of vfork, so no speedups under Linux for spawning from large processes. +# Do not require this directly, only use from PublicInbox::Spawn package PublicInbox::SpawnPP; -use strict; -use v5.10.1; +use v5.12; use POSIX qw(dup2 _exit setpgid :signal_h); +use autodie qw(chdir close pipe); +use PublicInbox::OnDestroy; +# this is loaded by PublicInbox::Spawn, so we can't use/require it, here # Pure Perl implementation for folks that do not use Inline::C sub pi_fork_exec ($$$$$$$) { my ($redir, $f, $cmd, $env, $rlim, $cd, $pgid) = @_; my $old = POSIX::SigSet->new(); my $set = POSIX::SigSet->new(); - $set->fillset or die "fillset failed: $!"; - sigprocmask(SIG_SETMASK, $set, $old) or die "can't block signals: $!"; - my $syserr; - pipe(my ($r, $w)); - my $pid = fork; - unless (defined $pid) { # compat with Inline::C version - $syserr = $!; - $pid = -1; + $set->fillset or die "sigfillset: $!"; + for (POSIX::SIGABRT, POSIX::SIGBUS, POSIX::SIGFPE, + POSIX::SIGILL, POSIX::SIGSEGV) { + $set->delset($_) or die "delset($_): $!"; } + sigprocmask(SIG_SETMASK, $set, $old) or die "SIG_SETMASK(set): $!"; + pipe(my $r, my $w); + my $pid = PublicInbox::OnDestroy::fork_tmp; if ($pid == 0) { close $r; $SIG{__DIE__} = sub { @@ -38,36 +40,30 @@ sub pi_fork_exec ($$$$$$$) { if ($pgid >= 0 && !defined(setpgid(0, $pgid))) { die "setpgid(0, $pgid): $!"; } - for (keys %SIG) { - $SIG{$_} = 'DEFAULT' if substr($_, 0, 1) ne '_'; - } - if ($cd ne '') { - chdir $cd or die "chdir $cd: $!"; - } + $SIG{$_} = 'DEFAULT' for grep(!/\A__/, keys %SIG); + chdir($cd) if $cd ne ''; while (@$rlim) { my ($r, $soft, $hard) = splice(@$rlim, 0, 3); BSD::Resource::setrlimit($r, $soft, $hard) or die "setrlimit($r=[$soft,$hard]: $!)"; } - $old->delset(POSIX::SIGCHLD) or die "delset SIGCHLD: $!"; - sigprocmask(SIG_SETMASK, $old) or die "SETMASK: ~SIGCHLD: $!"; + $old->delset(POSIX::SIGCHLD) or die "sigdelset CHLD: $!"; + sigprocmask(SIG_SETMASK, $old) or die "SIG_SETMASK ~CHLD: $!"; $cmd->[0] = $f; if ($ENV{MOD_PERL}) { - @$cmd = (which('env'), '-i', @$env, @$cmd); + $f = PublicInbox::Spawn::which('env'); + @$cmd = ('env', '-i', @$env, @$cmd); } else { %ENV = map { split(/=/, $_, 2) } @$env; } - undef $r; exec { $f } @$cmd; die "exec @$cmd failed: $!"; } close $w; - sigprocmask(SIG_SETMASK, $old) or die "can't unblock signals: $!"; + sigprocmask(SIG_SETMASK, $old) or die "SIG_SETMASK(old): $!"; if (my $cerrnum = do { local $/, <$r> }) { - $pid = -1; $! = $cerrnum; - } else { - $! = $syserr; + die "forked child $@: $!"; } $pid; } diff --git a/lib/PublicInbox/Syscall.pm b/lib/PublicInbox/Syscall.pm index 777c44d0..4cbe9623 100644 --- a/lib/PublicInbox/Syscall.pm +++ b/lib/PublicInbox/Syscall.pm @@ -2,9 +2,9 @@ # specifically the Debian libsys-syscall-perl 0.25-6 version to # fix upstream regressions in 0.25. # -# See devel/syscall-list in the public-inbox source tree for maintenance +# See devel/sysdefs-list in the public-inbox source tree for maintenance # <https://80x24.org/public-inbox.git>, and machines from the GCC Farm: -# <https://cfarm.tetaneutral.net/> +# <https://portal.cfarm.net/> # # This license differs from the rest of public-inbox # @@ -21,19 +21,15 @@ use parent qw(Exporter); use POSIX qw(ENOENT ENOSYS EINVAL O_NONBLOCK); use Socket qw(SOL_SOCKET SCM_RIGHTS); use Config; +our %SIGNUM = (WINCH => 28); # most Linux, {Free,Net,Open}BSD, *Darwin +our ($INOTIFY, %PACK); # $VERSION = '0.25'; # Sys::Syscall version our @EXPORT_OK = qw(epoll_ctl epoll_create epoll_wait - EPOLLIN EPOLLOUT EPOLLET - EPOLL_CTL_ADD EPOLL_CTL_DEL EPOLL_CTL_MOD - EPOLLONESHOT EPOLLEXCLUSIVE - signalfd rename_noreplace); -our %EXPORT_TAGS = (epoll => [qw(epoll_ctl epoll_create epoll_wait - EPOLLIN EPOLLOUT - EPOLL_CTL_ADD EPOLL_CTL_DEL EPOLL_CTL_MOD - EPOLLONESHOT EPOLLEXCLUSIVE)], - ); - + EPOLLIN EPOLLOUT EPOLLET + EPOLL_CTL_ADD EPOLL_CTL_DEL EPOLL_CTL_MOD + EPOLLONESHOT EPOLLEXCLUSIVE + signalfd rename_noreplace %SIGNUM $F_SETPIPE_SZ); use constant { EPOLLIN => 1, EPOLLOUT => 4, @@ -48,242 +44,287 @@ use constant { EPOLL_CTL_MOD => 3, SIZEOF_int => $Config{intsize}, SIZEOF_size_t => $Config{sizesize}, + SIZEOF_ptr => $Config{ptrsize}, NUL => "\0", }; -use constant { - TMPL_size_t => SIZEOF_size_t == 8 ? 'Q' : 'L', - BYTES_4_hole => SIZEOF_size_t == 8 ? 'L' : '', - # cmsg_len, cmsg_level, cmsg_type - SIZEOF_cmsghdr => SIZEOF_int * 2 + SIZEOF_size_t, -}; - -my @BYTES_4_hole = BYTES_4_hole ? (0) : (); -our $loaded_syscall = 0; - -sub _load_syscall { - # props to Gaal for this! - return if $loaded_syscall++; - my $clean = sub { - delete @INC{qw<syscall.ph asm/unistd.ph bits/syscall.ph - _h2ph_pre.ph sys/syscall.ph>}; - }; - $clean->(); # don't trust modules before us - my $rv = eval { require 'syscall.ph'; 1 } || eval { require 'sys/syscall.ph'; 1 }; - $clean->(); # don't require modules after us trust us - $rv; -} - +use constant TMPL_size_t => SIZEOF_size_t == 8 ? 'Q' : 'L'; -our ( - $SYS_epoll_create, - $SYS_epoll_ctl, - $SYS_epoll_wait, - $SYS_signalfd4, - $SYS_renameat2, - ); +our ($SYS_epoll_create, + $SYS_epoll_ctl, + $SYS_epoll_wait, + $SYS_signalfd4, + $SYS_renameat2, + $F_SETPIPE_SZ, + $SYS_sendmsg, + $SYS_recvmsg); -my ($SYS_sendmsg, $SYS_recvmsg); my $SYS_fstatfs; # don't need fstatfs64, just statfs.f_type my ($FS_IOC_GETFLAGS, $FS_IOC_SETFLAGS); my $SFD_CLOEXEC = 02000000; # Perl does not expose O_CLOEXEC our $no_deprecated = 0; if ($^O eq "linux") { - my (undef, undef, $release, undef, $machine) = POSIX::uname(); - my ($maj, $min) = ($release =~ /\A([0-9]+)\.([0-9]+)/); - $SYS_renameat2 = 0 if "$maj.$min" < 3.15; - # whether the machine requires 64-bit numbers to be on 8-byte - # boundaries. - my $u64_mod_8 = 0; - - # if we're running on an x86_64 kernel, but a 32-bit process, - # we need to use the x32 or i386 syscall numbers. - if ($machine eq "x86_64" && $Config{ptrsize} == 4) { - $machine = $Config{cppsymbols} =~ /\b__ILP32__=1\b/ ? 'x32' : 'i386'; - } - - # Similarly for mips64 vs mips - if ($machine eq "mips64" && $Config{ptrsize} == 4) { - $machine = "mips"; - } - - if ($machine =~ m/^i[3456]86$/) { - $SYS_epoll_create = 254; - $SYS_epoll_ctl = 255; - $SYS_epoll_wait = 256; - $SYS_signalfd4 = 327; - $SYS_renameat2 //= 353; - $SYS_fstatfs = 100; - $SYS_sendmsg = 370; - $SYS_recvmsg = 372; - $FS_IOC_GETFLAGS = 0x80046601; - $FS_IOC_SETFLAGS = 0x40046602; - } elsif ($machine eq "x86_64") { - $SYS_epoll_create = 213; - $SYS_epoll_ctl = 233; - $SYS_epoll_wait = 232; - $SYS_signalfd4 = 289; - $SYS_renameat2 //= 316; - $SYS_fstatfs = 138; - $SYS_sendmsg = 46; - $SYS_recvmsg = 47; - $FS_IOC_GETFLAGS = 0x80086601; - $FS_IOC_SETFLAGS = 0x40086602; - } elsif ($machine eq 'x32') { - $SYS_epoll_create = 1073742037; - $SYS_epoll_ctl = 1073742057; - $SYS_epoll_wait = 1073742056; - $SYS_signalfd4 = 1073742113; - $SYS_renameat2 //= 0x40000000 + 316; - $SYS_fstatfs = 138; - $SYS_sendmsg = 0x40000206; - $SYS_recvmsg = 0x40000207; - $FS_IOC_GETFLAGS = 0x80046601; - $FS_IOC_SETFLAGS = 0x40046602; - } elsif ($machine eq 'sparc64') { - $SYS_epoll_create = 193; - $SYS_epoll_ctl = 194; - $SYS_epoll_wait = 195; - $u64_mod_8 = 1; - $SYS_signalfd4 = 317; - $SYS_renameat2 //= 345; - $SFD_CLOEXEC = 020000000; - $SYS_fstatfs = 158; - $SYS_sendmsg = 114; - $SYS_recvmsg = 113; - $FS_IOC_GETFLAGS = 0x40086601; - $FS_IOC_SETFLAGS = 0x80086602; - } elsif ($machine =~ m/^parisc/) { - $SYS_epoll_create = 224; - $SYS_epoll_ctl = 225; - $SYS_epoll_wait = 226; - $u64_mod_8 = 1; - $SYS_signalfd4 = 309; - } elsif ($machine =~ m/^ppc64/) { - $SYS_epoll_create = 236; - $SYS_epoll_ctl = 237; - $SYS_epoll_wait = 238; - $u64_mod_8 = 1; - $SYS_signalfd4 = 313; - $SYS_renameat2 //= 357; - $SYS_fstatfs = 100; - $SYS_sendmsg = 341; - $SYS_recvmsg = 342; - $FS_IOC_GETFLAGS = 0x40086601; - $FS_IOC_SETFLAGS = 0x80086602; - } elsif ($machine eq "ppc") { - $SYS_epoll_create = 236; - $SYS_epoll_ctl = 237; - $SYS_epoll_wait = 238; - $u64_mod_8 = 1; - $SYS_signalfd4 = 313; - $SYS_renameat2 //= 357; - $SYS_fstatfs = 100; - $FS_IOC_GETFLAGS = 0x40086601; - $FS_IOC_SETFLAGS = 0x80086602; - } elsif ($machine =~ m/^s390/) { # untested, no machine on cfarm - $SYS_epoll_create = 249; - $SYS_epoll_ctl = 250; - $SYS_epoll_wait = 251; - $u64_mod_8 = 1; - $SYS_signalfd4 = 322; - $SYS_renameat2 //= 347; - $SYS_fstatfs = 100; - $SYS_sendmsg = 370; - $SYS_recvmsg = 372; - } elsif ($machine eq 'ia64') { # untested, no machine on cfarm - $SYS_epoll_create = 1243; - $SYS_epoll_ctl = 1244; - $SYS_epoll_wait = 1245; - $u64_mod_8 = 1; - $SYS_signalfd4 = 289; - } elsif ($machine eq "alpha") { # untested, no machine on cfarm - # natural alignment, ints are 32-bits - $SYS_epoll_create = 407; - $SYS_epoll_ctl = 408; - $SYS_epoll_wait = 409; - $u64_mod_8 = 1; - $SYS_signalfd4 = 484; - $SFD_CLOEXEC = 010000000; - } elsif ($machine eq 'aarch64' || $machine eq 'loongarch64') { - $SYS_epoll_create = 20; # (sys_epoll_create1) - $SYS_epoll_ctl = 21; - $SYS_epoll_wait = 22; # (sys_epoll_pwait) - $u64_mod_8 = 1; - $no_deprecated = 1; - $SYS_signalfd4 = 74; - $SYS_renameat2 //= 276; - $SYS_fstatfs = 44; - $SYS_sendmsg = 211; - $SYS_recvmsg = 212; - $FS_IOC_GETFLAGS = 0x80086601; - $FS_IOC_SETFLAGS = 0x40086602; - } elsif ($machine =~ m/arm(v\d+)?.*l/) { # ARM OABI (untested on cfarm) - $SYS_epoll_create = 250; - $SYS_epoll_ctl = 251; - $SYS_epoll_wait = 252; - $u64_mod_8 = 1; - $SYS_signalfd4 = 355; - $SYS_renameat2 //= 382; - $SYS_fstatfs = 100; - $SYS_sendmsg = 296; - $SYS_recvmsg = 297; - } elsif ($machine =~ m/^mips64/) { # cfarm only has 32-bit userspace - $SYS_epoll_create = 5207; - $SYS_epoll_ctl = 5208; - $SYS_epoll_wait = 5209; - $u64_mod_8 = 1; - $SYS_signalfd4 = 5283; - $SYS_renameat2 //= 5311; - $SYS_fstatfs = 5135; - $SYS_sendmsg = 5045; - $SYS_recvmsg = 5046; - $FS_IOC_GETFLAGS = 0x40046601; - $FS_IOC_SETFLAGS = 0x80046602; - } elsif ($machine =~ m/^mips/) { # 32-bit, tested on mips64 cfarm machine - $SYS_epoll_create = 4248; - $SYS_epoll_ctl = 4249; - $SYS_epoll_wait = 4250; - $u64_mod_8 = 1; - $SYS_signalfd4 = 4324; - $SYS_renameat2 //= 4351; - $SYS_fstatfs = 4100; - $SYS_sendmsg = 4179; - $SYS_recvmsg = 4177; - $FS_IOC_GETFLAGS = 0x40046601; - $FS_IOC_SETFLAGS = 0x80046602; - } else { - # as a last resort, try using the *.ph files which may not - # exist or may be wrong - _load_syscall(); - $SYS_epoll_create = eval { &SYS_epoll_create; } || 0; - $SYS_epoll_ctl = eval { &SYS_epoll_ctl; } || 0; - $SYS_epoll_wait = eval { &SYS_epoll_wait; } || 0; - - # Note: do NOT add new syscalls to depend on *.ph, here. - # Better to miss syscalls (so we can fallback to IO::Poll) - # than to use wrong ones, since the names are not stable - # (at least not on FreeBSD), if the actual numbers are. - } - - if ($u64_mod_8) { - *epoll_wait = \&epoll_wait_mod8; - *epoll_ctl = \&epoll_ctl_mod8; - } else { - *epoll_wait = \&epoll_wait_mod4; - *epoll_ctl = \&epoll_ctl_mod4; - } + $F_SETPIPE_SZ = 1031; + my (undef, undef, $release, undef, $machine) = POSIX::uname(); + my ($maj, $min) = ($release =~ /\A([0-9]+)\.([0-9]+)/); + $SYS_renameat2 = 0 if "$maj.$min" < 3.15; + # whether the machine requires 64-bit numbers to be on 8-byte + # boundaries. + my $u64_mod_8 = 0; + + if (SIZEOF_ptr == 4) { + # if we're running on an x86_64 kernel, but a 32-bit process, + # we need to use the x32 or i386 syscall numbers. + if ($machine eq 'x86_64') { + my $s = $Config{cppsymbols}; + $machine = ($s =~ /\b__ILP32__=1\b/ && + $s =~ /\b__x86_64__=1\b/) ? + 'x32' : 'i386' + } elsif ($machine eq 'mips64') { # similarly for mips64 vs mips + $machine = 'mips'; + } + } + if ($machine =~ m/^i[3456]86$/) { + $SYS_epoll_create = 254; + $SYS_epoll_ctl = 255; + $SYS_epoll_wait = 256; + $SYS_signalfd4 = 327; + $SYS_renameat2 //= 353; + $SYS_fstatfs = 100; + $SYS_sendmsg = 370; + $SYS_recvmsg = 372; + $INOTIFY = { # usage: `use constant $PublicInbox::Syscall::INOTIFY' + SYS_inotify_init1 => 332, + SYS_inotify_add_watch => 292, + SYS_inotify_rm_watch => 293, + }; + $FS_IOC_GETFLAGS = 0x80046601; + $FS_IOC_SETFLAGS = 0x40046602; + } elsif ($machine eq "x86_64") { + $SYS_epoll_create = 213; + $SYS_epoll_ctl = 233; + $SYS_epoll_wait = 232; + $SYS_signalfd4 = 289; + $SYS_renameat2 //= 316; + $SYS_fstatfs = 138; + $SYS_sendmsg = 46; + $SYS_recvmsg = 47; + $INOTIFY = { + SYS_inotify_init1 => 294, + SYS_inotify_add_watch => 254, + SYS_inotify_rm_watch => 255, + }; + $FS_IOC_GETFLAGS = 0x80086601; + $FS_IOC_SETFLAGS = 0x40086602; + } elsif ($machine eq 'x32') { + $SYS_epoll_create = 1073742037; + $SYS_epoll_ctl = 1073742057; + $SYS_epoll_wait = 1073742056; + $SYS_signalfd4 = 1073742113; + $SYS_renameat2 //= 0x40000000 + 316; + $SYS_fstatfs = 138; + $SYS_sendmsg = 0x40000206; + $SYS_recvmsg = 0x40000207; + $FS_IOC_GETFLAGS = 0x80046601; + $FS_IOC_SETFLAGS = 0x40046602; + $INOTIFY = { + SYS_inotify_init1 => 1073742118, + SYS_inotify_add_watch => 1073742078, + SYS_inotify_rm_watch => 1073742079, + }; + } elsif ($machine eq 'sparc64') { + $SYS_epoll_create = 193; + $SYS_epoll_ctl = 194; + $SYS_epoll_wait = 195; + $u64_mod_8 = 1; + $SYS_signalfd4 = 317; + $SYS_renameat2 //= 345; + $SFD_CLOEXEC = 020000000; + $SYS_fstatfs = 158; + $SYS_sendmsg = 114; + $SYS_recvmsg = 113; + $FS_IOC_GETFLAGS = 0x40086601; + $FS_IOC_SETFLAGS = 0x80086602; + } elsif ($machine =~ m/^parisc/) { # untested, no machine on cfarm + $SYS_epoll_create = 224; + $SYS_epoll_ctl = 225; + $SYS_epoll_wait = 226; + $u64_mod_8 = 1; + $SYS_signalfd4 = 309; + $SIGNUM{WINCH} = 23; + } elsif ($machine =~ m/^ppc64/) { + $SYS_epoll_create = 236; + $SYS_epoll_ctl = 237; + $SYS_epoll_wait = 238; + $u64_mod_8 = 1; + $SYS_signalfd4 = 313; + $SYS_renameat2 //= 357; + $SYS_fstatfs = 100; + $SYS_sendmsg = 341; + $SYS_recvmsg = 342; + $FS_IOC_GETFLAGS = 0x40086601; + $FS_IOC_SETFLAGS = 0x80086602; + $INOTIFY = { + SYS_inotify_init1 => 318, + SYS_inotify_add_watch => 276, + SYS_inotify_rm_watch => 277, + }; + } elsif ($machine eq "ppc") { # untested, no machine on cfarm + $SYS_epoll_create = 236; + $SYS_epoll_ctl = 237; + $SYS_epoll_wait = 238; + $u64_mod_8 = 1; + $SYS_signalfd4 = 313; + $SYS_renameat2 //= 357; + $SYS_fstatfs = 100; + $FS_IOC_GETFLAGS = 0x40086601; + $FS_IOC_SETFLAGS = 0x80086602; + } elsif ($machine =~ m/^s390/) { # untested, no machine on cfarm + $SYS_epoll_create = 249; + $SYS_epoll_ctl = 250; + $SYS_epoll_wait = 251; + $u64_mod_8 = 1; + $SYS_signalfd4 = 322; + $SYS_renameat2 //= 347; + $SYS_fstatfs = 100; + $SYS_sendmsg = 370; + $SYS_recvmsg = 372; + } elsif ($machine eq 'ia64') { # untested, no machine on cfarm + $SYS_epoll_create = 1243; + $SYS_epoll_ctl = 1244; + $SYS_epoll_wait = 1245; + $u64_mod_8 = 1; + $SYS_signalfd4 = 289; + } elsif ($machine eq "alpha") { # untested, no machine on cfarm + # natural alignment, ints are 32-bits + $SYS_epoll_create = 407; + $SYS_epoll_ctl = 408; + $SYS_epoll_wait = 409; + $u64_mod_8 = 1; + $SYS_signalfd4 = 484; + $SFD_CLOEXEC = 010000000; + } elsif ($machine =~ /\A(?:loong|a)arch64\z/ || $machine eq 'riscv64') { + $SYS_epoll_create = 20; # (sys_epoll_create1) + $SYS_epoll_ctl = 21; + $SYS_epoll_wait = 22; # (sys_epoll_pwait) + $u64_mod_8 = 1; + $no_deprecated = 1; + $SYS_signalfd4 = 74; + $SYS_renameat2 //= 276; + $SYS_fstatfs = 44; + $SYS_sendmsg = 211; + $SYS_recvmsg = 212; + $INOTIFY = { + SYS_inotify_init1 => 26, + SYS_inotify_add_watch => 27, + SYS_inotify_rm_watch => 28, + }; + $FS_IOC_GETFLAGS = 0x80086601; + $FS_IOC_SETFLAGS = 0x40086602; + } elsif ($machine =~ m/arm(v\d+)?.*l/) { # ARM OABI (untested on cfarm) + $SYS_epoll_create = 250; + $SYS_epoll_ctl = 251; + $SYS_epoll_wait = 252; + $u64_mod_8 = 1; + $SYS_signalfd4 = 355; + $SYS_renameat2 //= 382; + $SYS_fstatfs = 100; + $SYS_sendmsg = 296; + $SYS_recvmsg = 297; + } elsif ($machine =~ m/^mips64/) { # cfarm only has 32-bit userspace + $SYS_epoll_create = 5207; + $SYS_epoll_ctl = 5208; + $SYS_epoll_wait = 5209; + $u64_mod_8 = 1; + $SYS_signalfd4 = 5283; + $SYS_renameat2 //= 5311; + $SYS_fstatfs = 5135; + $SYS_sendmsg = 5045; + $SYS_recvmsg = 5046; + $FS_IOC_GETFLAGS = 0x40046601; + $FS_IOC_SETFLAGS = 0x80046602; + } elsif ($machine =~ m/^mips/) { # 32-bit, tested on mips64 cfarm host + $SYS_epoll_create = 4248; + $SYS_epoll_ctl = 4249; + $SYS_epoll_wait = 4250; + $u64_mod_8 = 1; + $SYS_signalfd4 = 4324; + $SYS_renameat2 //= 4351; + $SYS_fstatfs = 4100; + $SYS_sendmsg = 4179; + $SYS_recvmsg = 4177; + $FS_IOC_GETFLAGS = 0x40046601; + $FS_IOC_SETFLAGS = 0x80046602; + $SIGNUM{WINCH} = 20; + $INOTIFY = { + SYS_inotify_init1 => 4329, + SYS_inotify_add_watch => 4285, + SYS_inotify_rm_watch => 4286, + }; + } else { + warn <<EOM; +machine=$machine ptrsize=$Config{ptrsize} has no syscall definitions +git clone https://80x24.org/public-inbox.git and +Send the output of ./devel/sysdefs-list to meta\@public-inbox.org +EOM + } + if ($u64_mod_8) { + *epoll_wait = \&epoll_wait_mod8; + *epoll_ctl = \&epoll_ctl_mod8; + } else { + *epoll_wait = \&epoll_wait_mod4; + *epoll_ctl = \&epoll_ctl_mod4; + } +} elsif ($^O =~ /\A(?:freebsd|openbsd|netbsd|dragonfly)\z/) { +# don't use syscall.ph here, name => number mappings are not stable on *BSD +# but the actual numbers are. +# OpenBSD perl redirects syscall perlop to libc functions +# https://cvsweb.openbsd.org/src/gnu/usr.bin/perl/gen_syscall_emulator.pl +# https://www.netbsd.org/docs/internals/en/chap-processes.html#syscall_versioning +# https://wiki.freebsd.org/AddingSyscalls#Backward_compatibily +# (I'm assuming Dragonfly copies FreeBSD, here, too) + $SYS_recvmsg = 27; + $SYS_sendmsg = 28; } -# use Inline::C for *BSD-only or general POSIX stuff. -# Linux guarantees stable syscall numbering, BSDs only offer a stable libc -# use scripts/syscall-list on Linux to detect new syscall numbers -############################################################################ -# epoll functions -############################################################################ +BEGIN { + if ($^O eq 'linux') { + %PACK = ( + TMPL_cmsg_len => TMPL_size_t, + # cmsg_len, cmsg_level, cmsg_type + SIZEOF_cmsghdr => SIZEOF_int * 2 + SIZEOF_size_t, + CMSG_DATA_off => '', + TMPL_msghdr => 'PL' . # msg_name, msg_namelen + '@'.(2 * SIZEOF_ptr).'P'. # msg_iov + 'i'. # msg_iovlen + '@'.(4 * SIZEOF_ptr).'P'. # msg_control + 'L'. # msg_controllen (socklen_t) + 'i', # msg_flags + ); + } elsif ($^O =~ /\A(?:freebsd|openbsd|netbsd|dragonfly)\z/) { + %PACK = ( + TMPL_cmsg_len => 'L', # socklen_t + SIZEOF_cmsghdr => SIZEOF_int * 3, + CMSG_DATA_off => SIZEOF_ptr == 8 ? '@16' : '', + TMPL_msghdr => 'PL' . # msg_name, msg_namelen + '@'.(2 * SIZEOF_ptr).'P'. # msg_iov + TMPL_size_t. # msg_iovlen + '@'.(4 * SIZEOF_ptr).'P'. # msg_control + TMPL_size_t. # msg_controllen + 'i', # msg_flags + + ) + } + $PACK{CMSG_ALIGN_size} = SIZEOF_size_t; + $PACK{SIZEOF_cmsghdr} //= 0; + $PACK{TMPL_cmsg_len} //= undef; + $PACK{CMSG_DATA_off} //= undef; + $PACK{TMPL_msghdr} //= undef; +} -sub epoll_defined { $SYS_epoll_create ? 1 : 0; } +# SFD_CLOEXEC is arch-dependent, so IN_CLOEXEC may be, too +$INOTIFY->{IN_CLOEXEC} //= 0x80000 if $INOTIFY; sub epoll_create { syscall($SYS_epoll_create, $no_deprecated ? 0 : 100); @@ -292,10 +333,13 @@ sub epoll_create { # epoll_ctl wrapper # ARGS: (epfd, op, fd, events_mask) sub epoll_ctl_mod4 { - syscall($SYS_epoll_ctl, $_[0]+0, $_[1]+0, $_[2]+0, pack("LLL", $_[3], $_[2], 0)); + syscall($SYS_epoll_ctl, $_[0]+0, $_[1]+0, $_[2]+0, + pack("LLL", $_[3], $_[2], 0)); } + sub epoll_ctl_mod8 { - syscall($SYS_epoll_ctl, $_[0]+0, $_[1]+0, $_[2]+0, pack("LLLL", $_[3], 0, $_[2], 0)); + syscall($SYS_epoll_ctl, $_[0]+0, $_[1]+0, $_[2]+0, + pack("LLLL", $_[3], 0, $_[2], 0)); } # epoll_wait wrapper @@ -308,7 +352,7 @@ sub epoll_wait_mod4 { # resize our static buffer if maxevents bigger than we've ever done if ($maxevents > $epoll_wait_size) { $epoll_wait_size = $maxevents; - vec($epoll_wait_events, $maxevents * 12 * 8 - 1, 1) = 0; + vec($epoll_wait_events, $maxevents * 12 - 1, 8) = 0; } @$events = (); my $ct = syscall($SYS_epoll_wait, $epfd, $epoll_wait_events, @@ -329,7 +373,7 @@ sub epoll_wait_mod8 { # resize our static buffer if maxevents bigger than we've ever done if ($maxevents > $epoll_wait_size) { $epoll_wait_size = $maxevents; - vec($epoll_wait_events, $maxevents * 16 * 8 - 1, 1) = 0; + vec($epoll_wait_events, $maxevents * 16 - 1, 8) = 0; } @$events = (); my $ct = syscall($SYS_epoll_wait, $epfd, $epoll_wait_events, @@ -346,15 +390,15 @@ sub epoll_wait_mod8 { } } -sub signalfd ($$) { - my ($signos, $nonblock) = @_; +sub signalfd ($) { + my ($signos) = @_; if ($SYS_signalfd4) { my $set = POSIX::SigSet->new(@$signos); syscall($SYS_signalfd4, -1, "$$set", # $Config{sig_count} is NSIG, so this is NSIG/8: int($Config{sig_count}/8), # SFD_NONBLOCK == O_NONBLOCK for every architecture - ($nonblock ? O_NONBLOCK : 0) |$SFD_CLOEXEC); + O_NONBLOCK|$SFD_CLOEXEC); } else { $! = ENOSYS; undef; @@ -411,70 +455,70 @@ sub nodatacow_dir { if (open my $fh, '<', $_[0]) { nodatacow_fh($fh) } } -sub CMSG_ALIGN ($) { ($_[0] + SIZEOF_size_t - 1) & ~(SIZEOF_size_t - 1) } +use constant \%PACK; +sub CMSG_ALIGN ($) { ($_[0] + CMSG_ALIGN_size - 1) & ~(CMSG_ALIGN_size - 1) } use constant CMSG_ALIGN_SIZEOF_cmsghdr => CMSG_ALIGN(SIZEOF_cmsghdr); sub CMSG_SPACE ($) { CMSG_ALIGN($_[0]) + CMSG_ALIGN_SIZEOF_cmsghdr } sub CMSG_LEN ($) { CMSG_ALIGN_SIZEOF_cmsghdr + $_[0] } -use constant msg_controllen => CMSG_SPACE(10 * SIZEOF_int) + 16; # 10 FDs +use constant msg_controllen_max => + CMSG_SPACE(10 * SIZEOF_int) + SIZEOF_cmsghdr; # space for 10 FDs if (defined($SYS_sendmsg) && defined($SYS_recvmsg)) { no warnings 'once'; -*send_cmd4 = sub ($$$$) { - my ($sock, $fds, undef, $flags) = @_; +require PublicInbox::CmdIPC4; + +*send_cmd4 = sub ($$$$;$) { + my ($sock, $fds, undef, $flags, $tries) = @_; my $iov = pack('P'.TMPL_size_t, $_[2] // NUL, length($_[2] // NUL) || 1); - my $cmsghdr = pack(TMPL_size_t . # cmsg_len + my $fd_space = scalar(@$fds) * SIZEOF_int; + my $msg_controllen = CMSG_SPACE($fd_space); + my $cmsghdr = pack(TMPL_cmsg_len . 'LL' . # cmsg_level, cmsg_type, - ('i' x scalar(@$fds)), - CMSG_LEN(scalar(@$fds) * SIZEOF_int), # cmsg_len + CMSG_DATA_off.('i' x scalar(@$fds)). # CMSG_DATA + '@'.($msg_controllen - 1).'x1', # pad to space, not len + CMSG_LEN($fd_space), # cmsg_len SOL_SOCKET, SCM_RIGHTS, # cmsg_{level,type} @$fds); # CMSG_DATA - my $mh = pack('PL' . # msg_name, msg_namelen (socklen_t (U32)) - BYTES_4_hole . # 4-byte padding on 64-bit - 'P'.TMPL_size_t . # msg_iov, msg_iovlen, - 'P'.TMPL_size_t . # msg_control, msg_controllen, - 'i', # msg_flags - NUL, 0, # msg_name, msg_namelen (unused) - @BYTES_4_hole, + my $mh = pack(TMPL_msghdr, + undef, 0, # msg_name, msg_namelen (unused) $iov, 1, # msg_iov, msg_iovlen $cmsghdr, # msg_control - CMSG_SPACE(scalar(@$fds) * SIZEOF_int), # msg_controllen + $msg_controllen, 0); # msg_flags - my $sent; - my $try = 0; + my $s; + $tries //= 50; do { - $sent = syscall($SYS_sendmsg, fileno($sock), $mh, $flags); - } while ($sent < 0 && - ($!{ENOBUFS} || $!{ENOMEM} || $!{ETOOMANYREFS}) && - (++$try < 50) && - warn "sleeping on sendmsg: $! (#$try)\n" && - select(undef, undef, undef, 0.1) == 0); - $sent >= 0 ? $sent : undef; + $s = syscall($SYS_sendmsg, fileno($sock), $mh, $flags); + } while ($s < 0 && PublicInbox::CmdIPC4::sendmsg_retry($tries)); + $s >= 0 ? $s : undef; }; *recv_cmd4 = sub ($$$) { my ($sock, undef, $len) = @_; - vec($_[1], ($len + 1) * 8, 1) = 0; - my $cmsghdr = "\0" x msg_controllen; # 10 * sizeof(int) + vec($_[1] //= '', $len - 1, 8) = 0; + my $cmsghdr = "\0" x msg_controllen_max; # 10 * sizeof(int) my $iov = pack('P'.TMPL_size_t, $_[1], $len); - my $mh = pack('PL' . # msg_name, msg_namelen (socklen_t (U32)) - BYTES_4_hole . # 4-byte padding on 64-bit - 'P'.TMPL_size_t . # msg_iov, msg_iovlen, - 'P'.TMPL_size_t . # msg_control, msg_controllen, - 'i', # msg_flags - NUL, 0, # msg_name, msg_namelen (unused) - @BYTES_4_hole, + my $mh = pack(TMPL_msghdr, + undef, 0, # msg_name, msg_namelen (unused) $iov, 1, # msg_iov, msg_iovlen $cmsghdr, # msg_control - msg_controllen, + msg_controllen_max, 0); # msg_flags - my $r = syscall($SYS_recvmsg, fileno($sock), $mh, 0); - return (undef) if $r < 0; # $! set + my $r; + do { + $r = syscall($SYS_recvmsg, fileno($sock), $mh, 0); + } while ($r < 0 && $!{EINTR}); + if ($r < 0) { + $_[1] = ''; + return (undef); + } substr($_[1], $r, length($_[1]), ''); my @ret; if ($r > 0) { - my ($len, $lvl, $type, @fds) = unpack(TMPL_size_t . # cmsg_len - 'LLi*', # cmsg_level, cmsg_type, @fds + my ($len, $lvl, $type, @fds) = unpack(TMPL_cmsg_len. + 'LL'. # cmsg_level, cmsg_type + CMSG_DATA_off.'i*', # @fds $cmsghdr); if ($lvl == SOL_SOCKET && $type == SCM_RIGHTS) { $len -= CMSG_ALIGN_SIZEOF_cmsghdr; diff --git a/lib/PublicInbox/TailNotify.pm b/lib/PublicInbox/TailNotify.pm new file mode 100644 index 00000000..84340a35 --- /dev/null +++ b/lib/PublicInbox/TailNotify.pm @@ -0,0 +1,97 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# only used for tests at the moment... +package PublicInbox::TailNotify; +use v5.12; +use parent qw(PublicInbox::DirIdle); # not optimal, maybe.. +use PublicInbox::DS qw(now); + +my ($TAIL_MOD, $ino_cls); +if ($^O eq 'linux' && eval { require PublicInbox::Inotify; 1 }) { + $TAIL_MOD = PublicInbox::Inotify::IN_MOVED_TO() | + PublicInbox::Inotify::IN_CREATE() | + PublicInbox::Inotify::IN_MODIFY(); + $ino_cls = 'PublicInbox::Inotify'; +} elsif (eval { require PublicInbox::KQNotify }) { + $TAIL_MOD = PublicInbox::KQNotify::MOVED_TO_OR_CREATE() | + IO::KQueue::NOTE_DELETE() | IO::KQueue::NOTE_RENAME(); + $ino_cls = 'PublicInbox::KQNotify'; +} else { + require PublicInbox::FakeInotify; + $TAIL_MOD = PublicInbox::FakeInotify::MOVED_TO_OR_CREATE() | + PublicInbox::FakeInotify::IN_MODIFY() | + PublicInbox::FakeInotify::IN_DELETE(); +} +require IO::Poll if $ino_cls; + +sub reopen_file ($) { + my ($self) = @_; + + open my $fh, '<', $self->{fn} or return undef; + my @st = stat $fh or die "fstat($self->{fn}): $!"; + $self->{ino_dev} = "@st[0, 1]"; + $self->{inot}->watch($self->{fn}, $TAIL_MOD); + $self->{watch_fh} = $fh; # return value +} + +sub new { + my ($cls, $fn) = @_; + my $self = bless { fn => $fn }, $cls; + if ($ino_cls) { + $self->{inot} = $ino_cls->new or die "E: $ino_cls->new: $!"; + $self->{inot}->blocking(0); + my ($dn) = ($fn =~ m!\A(.+)/+[^/]+\z!); + $self->{inot}->watch($dn // '.', $TAIL_MOD); + } else { + $self->{inot} = PublicInbox::FakeInotify->new; + } + reopen_file($self); + $self->{inot}->watch($fn, $TAIL_MOD); + $self; +} + +sub delete_self { + for (@_) { return 1 if $_->IN_DELETE_SELF } + undef; +} + +sub getlines { + my ($self, $timeo) = @_; + my ($fh, $buf, $rfds, @ret, @events); + my $end = defined($timeo) ? now + $timeo : undef; +again: + while (1) { + @events = $self->{inot}->read; # Linux::Inotify2::read + last if @events; + return () if defined($timeo) && (!$timeo || (now > $end)); + my $wait = 0.1; + if ($ino_cls) { + vec($rfds = '', $self->{inot}->fileno, 1) = 1; + if (defined $end) { + $wait = $end - now; + $wait = 0 if $wait < 0; + } else { + undef $wait; + } + } + select($rfds, undef, undef, $wait); + } + if ($fh = $self->{watch_fh}) { + sysread($fh, $buf, -s $fh) and + push @ret, split(/^/sm, $buf); + my @st = stat($self->{fn}); + if (!@st || "@st[0, 1]" ne $self->{ino_dev} || + delete_self(@events)) { + delete @$self{qw(ino_dev watch_fh)}; + } + } + if ($fh = $self->{watch_fh} // reopen_file($self)) { + sysread($fh, $buf, -s $fh) and + push @ret, split(/^/sm, $buf); + } + goto again if (!@ret && (!defined($end) || now < $end)); + @ret; +} + +1; diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm index ecf7a261..aeff5d1d 100644 --- a/lib/PublicInbox/TestCommon.pm +++ b/lib/PublicInbox/TestCommon.pm @@ -6,21 +6,32 @@ package PublicInbox::TestCommon; use strict; use parent qw(Exporter); use v5.10.1; -use Fcntl qw(FD_CLOEXEC F_SETFD F_GETFD :seek); +use Fcntl qw(F_SETFD F_GETFD FD_CLOEXEC :seek); use POSIX qw(dup2); use IO::Socket::INET; use File::Spec; +use Scalar::Util qw(isvstring); +use Carp (); our @EXPORT; my $lei_loud = $ENV{TEST_LEI_ERR_LOUD}; -my $tail_cmd = $ENV{TAIL}; -our ($lei_opt, $lei_out, $lei_err, $lei_cwdfh); +our $tail_cmd = $ENV{TAIL}; +our ($lei_opt, $lei_out, $lei_err); +use autodie qw(chdir close fcntl mkdir open opendir seek unlink); +$ENV{XDG_CACHE_HOME} //= "$ENV{HOME}/.cache"; # reuse C++ xap_helper builds + +$_ = File::Spec->rel2abs($_) for (grep(!m!^/!, @INC)); +our $CURRENT_DAEMON; BEGIN { @EXPORT = qw(tmpdir tcp_server tcp_connect require_git require_mods run_script start_script key2sub xsys xsys_e xqx eml_load tick have_xapian_compact json_utf8 setup_public_inboxes create_inbox + create_dir + create_coderepo require_bsd kernel_version check_broken_tmpfs + quit_waiter_pipe wait_for_eof require_git_http_backend tcp_host_port test_lei lei lei_ok $lei_out $lei_err $lei_opt test_httpd xbail require_cmd is_xdeeply tail_f - ignore_inline_c_missing); + ignore_inline_c_missing no_pollerfd no_coredump cfg_new + strace strace_inject lsof_pid oct_is); require Test::More; my @methods = grep(!/\W/, @Test::More::EXPORT); eval(join('', map { "*$_=\\&Test::More::$_;" } @methods)); @@ -28,23 +39,56 @@ BEGIN { push @EXPORT, @methods; } +sub kernel_version () { + state $version = do { + require POSIX; + my @u = POSIX::uname(); + if ($u[2] =~ /\A([0-9]+(?:\.[0-9]+)+)/) { + eval "v$1"; + } else { + local $" = "', `"; + diag "Unable to get kernel version from: `@u'"; + undef; + } + }; +} + +sub check_broken_tmpfs () { + return if $^O ne 'dragonfly' || kernel_version ge v6.5; + diag 'EVFILT_VNODE + tmpfs is broken on dragonfly <= 6.4 (have: '. + sprintf('%vd', kernel_version).')'; + 1; +} + +sub require_bsd (;$) { + state $ok = ($^O =~ m!\A(?:free|net|open)bsd\z! || + $^O eq 'dragonfly'); + return 1 if $ok; + return if defined(wantarray); + my $m = "$0 is BSD-only (\$^O=$^O)"; + @_ ? skip($m, 1) : plan(skip_all => $m); +} + sub xbail (@) { BAIL_OUT join(' ', map { ref() ? (explain($_)) : ($_) } @_) } +sub read_all ($;$$$) { + require PublicInbox::IO; + PublicInbox::IO::read_all($_[0], $_[1], $_[2], $_[3]) +} + sub eml_load ($) { my ($path, $cb) = @_; - open(my $fh, '<', $path) or die "open $path: $!"; + open(my $fh, '<', $path); require PublicInbox::Eml; - PublicInbox::Eml->new(\(do { local $/; <$fh> })); + PublicInbox::Eml->new(\(scalar read_all $fh)); } sub tmpdir (;$) { my ($base) = @_; require File::Temp; - unless (defined $base) { - ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!); - } + ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!) unless defined $base; my $tmpdir = File::Temp->newdir("pi-$base-$$-XXXX", TMPDIR => 1); - ($tmpdir->dirname, $tmpdir); + wantarray ? ($tmpdir->dirname, $tmpdir) : $tmpdir; } sub tcp_server () { @@ -57,8 +101,12 @@ sub tcp_server () { ); eval { die 'IPv4-only' if $ENV{TEST_IPV4_ONLY}; - require IO::Socket::INET6; - IO::Socket::INET6->new(%opt, LocalAddr => '[::1]') + my $pkg; + for (qw(IO::Socket::IP IO::Socket::INET6)) { + eval "require $_" or next; + $pkg = $_ and last; + } + $pkg->new(%opt, LocalAddr => '[::1]'); } || eval { die 'IPv6-only' if $ENV{TEST_IPV6_ONLY}; IO::Socket::INET->new(%opt, LocalAddr => '127.0.0.1') @@ -90,31 +138,65 @@ sub tcp_connect { } sub require_cmd ($;$) { - my ($cmd, $maybe) = @_; + my ($cmd, $nr) = @_; require PublicInbox::Spawn; - my $bin = PublicInbox::Spawn::which($cmd); + state %CACHE; + my $bin = $CACHE{$cmd} //= PublicInbox::Spawn::which($cmd); return $bin if $bin; - $maybe ? 0 : plan(skip_all => "$cmd missing from PATH for $0"); + return plan(skip_all => "$cmd missing from PATH for $0") if !$nr; + defined(wantarray) ? undef : skip("$cmd missing", $nr); } -sub have_xapian_compact () { - require_cmd($ENV{XAPIAN_COMPACT} || 'xapian-compact', 1); +sub have_xapian_compact (;$) { + require_cmd($ENV{XAPIAN_COMPACT} || 'xapian-compact', @_ ? $_[0] : ()); } sub require_git ($;$) { - my ($req, $maybe) = @_; - my ($req_maj, $req_min, $req_sub) = split(/\./, $req); - my ($cur_maj, $cur_min, $cur_sub) = (xqx([qw(git --version)]) - =~ /version (\d+)\.(\d+)(?:\.(\d+))?/); - - my $req_int = ($req_maj << 24) | ($req_min << 16) | ($req_sub // 0); - my $cur_int = ($cur_maj << 24) | ($cur_min << 16) | ($cur_sub // 0); - if ($cur_int < $req_int) { - return 0 if $maybe; - plan skip_all => - "git $req+ required, have $cur_maj.$cur_min.$cur_sub"; + my ($req, $nr) = @_; + require PublicInbox::Git; + state $cur_vstr = PublicInbox::Git::git_version(); + $req = eval("v$req") unless isvstring($req); + + return 1 if $cur_vstr ge $req; + state $cur_ver = sprintf('%vd', $cur_vstr); + return plan skip_all => "git $req+ required, have $cur_ver" if !$nr; + defined(wantarray) ? undef : + skip("git $req+ required (have $cur_ver)", $nr) +} + +sub require_git_http_backend (;$) { + my ($nr) = @_; + state $ok = do { + require PublicInbox::Git; + my $git = PublicInbox::Git::check_git_exe() or plan + skip_all => 'nothing in public-inbox works w/o git'; + my $rdr = { 1 => \my $out, 2 => \my $err }; + xsys([$git, qw(http-backend)], undef, $rdr); + $out =~ /^Status:/ism; + }; + if (!$ok) { + my $msg = "`git http-backend' not available"; + defined($nr) ? skip $msg, $nr : plan skip_all => $msg; } - 1; + $ok; +} + +my %IPv6_VERSION = ( + 'Net::NNTP' => 3.00, + 'Mail::IMAPClient' => 3.40, + 'HTTP::Tiny' => 0.042, + 'Net::POP3' => 2.32, +); + +sub need_accept_filter ($) { + my ($af) = @_; + return if $^O eq 'netbsd'; # since NetBSD 5.0, no kldstat needed + $^O =~ /\A(?:freebsd|dragonfly)\z/ or + skip 'SO_ACCEPTFILTER is FreeBSD/NetBSD/Dragonfly-only so far', + 1; + state $tried = {}; + ($tried->{$af} //= system("kldstat -m $af >/dev/null")) and + skip "$af not loaded: kldload $af", 1; } sub require_mods { @@ -124,7 +206,7 @@ sub require_mods { while (my $mod = shift(@mods)) { if ($mod eq 'lei') { require_git(2.6, $maybe ? $maybe : ()); - push @mods, qw(DBD::SQLite Search::Xapian); + push @mods, qw(DBD::SQLite Xapian +SCM_RIGHTS); $mod = 'json'; # fall-through } if ($mod eq 'json') { @@ -133,18 +215,30 @@ sub require_mods { push @mods, qw(Plack::Builder Plack::Util); next; } elsif ($mod eq '-imapd') { - push @mods, qw(Parse::RecDescent DBD::SQLite - Email::Address::XS||Mail::Address); + push @mods, qw(Parse::RecDescent DBD::SQLite); next; - } elsif ($mod eq '-nntpd') { + } elsif ($mod eq '-nntpd' || $mod eq 'v2') { push @mods, qw(DBD::SQLite); next; } - if ($mod eq 'Search::Xapian') { + if ($mod eq 'Xapian') { if (eval { require PublicInbox::Search } && PublicInbox::Search::load_xapian()) { next; } + } elsif ($mod eq '+SCM_RIGHTS') { + if (my $msg = need_scm_rights()) { + push @need, $msg; + next; + } + } elsif ($mod eq ':fcntl_lock') { + next if $^O eq 'linux' || require_bsd; + diag "untested platform: $^O, ". + "requiring File::FcntlLock..."; + push @mods, 'File::FcntlLock'; + } elsif ($mod =~ /\A\+(accf_.*)\z/) { + need_accept_filter($1); + next } elsif (index($mod, '||') >= 0) { # "Foo||Bar" my $ok; for my $m (split(/\Q||\E/, $mod)) { @@ -167,9 +261,13 @@ sub require_mods { !eval{ IO::Socket::SSL->VERSION(2.007); 1 }) { push @need, $@; } + if (defined(my $v = $IPv6_VERSION{$mod})) { + $ENV{TEST_IPV4_ONLY} = 1 if !eval { $mod->VERSION($v) }; + } } return unless @need; my $m = join(', ', @need)." missing for $0"; + $m =~ s/\bEmail::MIME\b/Email::MIME (development purposes only)/; skip($m, $maybe) if $maybe; plan(skip_all => $m) } @@ -192,9 +290,9 @@ sub _prepare_redirects ($) { for (my $fd = 0; $fd <= $#io_mode; $fd++) { my $fh = $fhref->[$fd] or next; my ($oldfh, $mode) = @{$io_mode[$fd]}; - open my $orig, $mode, $oldfh or die "$oldfh $mode stash: $!"; + open(my $orig, $mode, $oldfh); $orig_io->[$fd] = $orig; - open $oldfh, $mode, $fh or die "$oldfh $mode redirect: $!"; + open $oldfh, $mode, $fh; } $orig_io; } @@ -204,7 +302,7 @@ sub _undo_redirects ($) { for (my $fd = 0; $fd <= $#io_mode; $fd++) { my $fh = $orig_io->[$fd] or next; my ($oldfh, $mode) = @{$io_mode[$fd]}; - open $oldfh, $mode, $fh or die "$$oldfh $mode redirect: $!"; + open $oldfh, $mode, $fh; } } @@ -230,8 +328,8 @@ sub key2sub ($) { my ($key) = @_; $cached_scripts{$key} //= do { my $f = key2script($key); - open my $fh, '<', $f or die "open $f: $!"; - my $str = do { local $/; <$fh> }; + open my $fh, '<', $f; + my $str = read_all($fh); my $pkg = (split(m!/!, $f))[-1]; $pkg =~ s/([a-z])([a-z0-9]+)(\.t)?\z/\U$1\E$2/; $pkg .= "_T" if $3; @@ -246,7 +344,7 @@ use subs qw(exit); sub main { # the below "line" directive is a magic comment, see perlsyn(1) manpage # line 1 "$f" -$str +{ $str } 0; } 1; @@ -275,20 +373,46 @@ sub _run_sub ($$$) { } } +sub no_coredump (@) { + my @dirs = @_; + my $cwdfh; + opendir($cwdfh, '.') if @dirs; + my @found; + for (@dirs, '.') { + chdir $_; + my @cores = glob('core.* *.core'); + push @cores, 'core' if -d 'core'; + push(@found, "@cores found in $_") if @cores; + chdir $cwdfh if $cwdfh; + } + return if !@found; # keep it quiet. + is(scalar(@found), 0, 'no core dumps found'); + diag(join("\n", @found) . Carp::longmess()); + if (-t STDIN) { + diag 'press ENTER to continue, (q) to quit'; + chomp(my $line = <STDIN>); + xbail 'user quit' if $line =~ /\Aq/; + } +} + sub run_script ($;$$) { my ($cmd, $env, $opt) = @_; + no_coredump($opt->{-C} ? ($opt->{-C}) : ()); my ($key, @argv) = @$cmd; my $run_mode = $ENV{TEST_RUN_MODE} // $opt->{run_mode} // 1; my $sub = $run_mode == 0 ? undef : key2sub($key); my $fhref = []; my $spawn_opt = {}; my @tail_paths; + local $tail_cmd = $tail_cmd; for my $fd (0..2) { my $redir = $opt->{$fd}; my $ref = ref($redir); if ($ref eq 'SCALAR') { my $fh; - if ($tail_cmd && $ENV{TAIL_ALL} && $fd > 0) { + if ($ENV{TAIL_ALL} && $fd > 0) { + # tail -F is better, but not portable :< + $tail_cmd //= 'tail -f'; require File::Temp; $fh = File::Temp->new("fd.$fd-XXXX", TMPDIR=>1); push @tail_paths, $fh->filename; @@ -301,15 +425,15 @@ sub run_script ($;$$) { next if $fd > 0; $fh->autoflush(1); print $fh $$redir or die "print: $!"; - seek($fh, 0, SEEK_SET) or die "seek: $!"; + seek($fh, 0, SEEK_SET); } elsif ($ref eq 'GLOB') { $spawn_opt->{$fd} = $fhref->[$fd] = $redir; } elsif ($ref) { die "unable to deal with $ref $redir"; } } - my $tail = @tail_paths ? tail_f(@tail_paths) : undef; - if ($key =~ /-(index|convert|extindex|convert|xcpdb)\z/) { + my $tail = @tail_paths ? tail_f(@tail_paths, $opt) : undef; + if ($key =~ /-(index|cindex|extindex|convert|xcpdb)\z/) { unshift @argv, '--no-fsync'; } if ($run_mode == 0) { @@ -320,11 +444,7 @@ sub run_script ($;$$) { $cmd->[0] = File::Spec->rel2abs($cmd->[0]); $spawn_opt->{'-C'} = $d; } - my $pid = PublicInbox::Spawn::spawn($cmd, $env, $spawn_opt); - if (defined $pid) { - my $r = waitpid($pid, 0) // die "waitpid: $!"; - $r == $pid or die "waitpid: expected $pid, got $r"; - } + PublicInbox::Spawn::run_wait($cmd, $env, $spawn_opt); } else { # localize and run everything in the same process: # note: "local *STDIN = *STDIN;" and so forth did not work in # old versions of perl @@ -334,16 +454,12 @@ sub run_script ($;$$) { local $SIG{FPE} = 'IGNORE'; # Perl default local $0 = join(' ', @$cmd); my $orig_io = _prepare_redirects($fhref); - my $cwdfh = $lei_cwdfh; - if (my $d = $opt->{'-C'}) { - unless ($cwdfh) { - opendir $cwdfh, '.' or die "opendir .: $!"; - } - chdir $d or die "chdir $d: $!"; - } + opendir(my $cwdfh, '.'); + chdir $opt->{-C} if defined $opt->{-C}; _run_sub($sub, $key, \@argv); - eval { PublicInbox::Inbox::cleanup_task() }; - die "fchdir(restore): $!" if $cwdfh && !chdir($cwdfh); + # n.b. all our uses of PublicInbox::DS should be fine + # with this and we can't Reset here. + chdir($cwdfh); _undo_redirects($orig_io); select STDOUT; umask($umask); @@ -354,11 +470,10 @@ sub run_script ($;$$) { for my $fd (1..2) { my $fh = $fhref->[$fd] or next; next unless -f $fh; - seek($fh, 0, SEEK_SET) or die "seek: $!"; - my $redir = $opt->{$fd}; - local $/; - $$redir = <$fh>; + seek($fh, 0, SEEK_SET); + ${$opt->{$fd}} = read_all($fh); } + no_coredump($opt->{-C} ? ($opt->{-C}) : ()); $? == 0; } @@ -386,7 +501,7 @@ sub wait_for_tail { $ino[0] =~ s!/fd/!/fdinfo/!; my @info; do { - if (open my $fh, '<', $ino[0]) { + if (CORE::open(my $fh, '<', $ino[0])) { local $/ = "\n"; @info = grep(/^inotify wd:/, <$fh>); } @@ -424,12 +539,23 @@ sub xqx { } sub tail_f (@) { + my @f = grep(defined, @_); $tail_cmd or return; # "tail -F" or "tail -f" - for (@_) { open(my $fh, '>>', $_) or die $! }; - my $cmd = [ split(/ /, $tail_cmd), @_ ]; + my $opt = (ref($f[-1]) eq 'HASH') ? pop(@f) : {}; + my $clofork = $opt->{-CLOFORK} // []; + my @cfmap = map { + my $fl = fcntl($_, F_GETFD, 0); + fcntl($_, F_SETFD, $fl | FD_CLOEXEC) unless $fl & FD_CLOEXEC; + ($_, $fl); + } @$clofork; + for (@f) { open(my $fh, '>>', $_) }; + my $cmd = [ split(/ /, $tail_cmd), @f ]; require PublicInbox::Spawn; my $pid = PublicInbox::Spawn::spawn($cmd, undef, { 1 => 2 }); - wait_for_tail($pid, scalar @_); + while (my ($io, $fl) = splice(@cfmap, 0, 2)) { + fcntl($io, F_SETFD, $fl); + } + wait_for_tail($pid, scalar @f); require PublicInbox::AutoReap; PublicInbox::AutoReap->new($pid, \&wait_for_tail); } @@ -440,6 +566,9 @@ sub start_script { my $run_mode = $ENV{TEST_RUN_MODE} // $opt->{run_mode} // 2; my $sub = $run_mode == 0 ? undef : key2sub($key); my $tail; + my @xh = split(/\s+/, $ENV{TEST_DAEMON_XH} // ''); + @xh = () if $key !~ /-(?:imapd|netd|httpd|pop3d|nntpd)\z/; + push @argv, @xh; if ($tail_cmd) { my @paths; for (@argv) { @@ -458,24 +587,27 @@ sub start_script { } } } - $tail = tail_f(@paths); + $tail = tail_f(@paths, $opt); } - my $pid = fork // die "fork: $!\n"; + require PublicInbox::DS; + my $oset = PublicInbox::DS::block_signals(); + require PublicInbox::OnDestroy; + my $tmp_mask = PublicInbox::OnDestroy::all( + \&PublicInbox::DS::sig_setmask, $oset); + my $pid = PublicInbox::DS::fork_persist(); if ($pid == 0) { - eval { PublicInbox::DS->Reset }; + close($_) for (@{delete($opt->{-CLOFORK}) // []}); # pretend to be systemd (cf. sd_listen_fds(3)) # 3 == SD_LISTEN_FDS_START my $fd; - for ($fd = 0; 1; $fd++) { - my $s = $opt->{$fd}; - last if $fd >= 3 && !defined($s); - next unless $s; - my $fl = fcntl($s, F_GETFD, 0); - if (($fl & FD_CLOEXEC) != FD_CLOEXEC) { - warn "got FD:".fileno($s)." w/o CLOEXEC\n"; + for ($fd = 0; $fd < 3 || defined($opt->{$fd}); $fd++) { + my $io = $opt->{$fd} // next; + my $old = fileno($io); + if ($old == $fd) { + fcntl($io, F_SETFD, 0); + } else { + dup2($old, $fd) // die "dup2($old, $fd): $!"; } - fcntl($s, F_SETFD, $fl &= ~FD_CLOEXEC); - dup2(fileno($s), $fd) or die "dup2 failed: $!\n"; } %ENV = (%ENV, %$env) if $env; my $fds = $fd - 3; @@ -483,10 +615,12 @@ sub start_script { $ENV{LISTEN_PID} = $$; $ENV{LISTEN_FDS} = $fds; } - if ($opt->{-C}) { chdir($opt->{-C}) or die "chdir: $!" } - $0 = join(' ', @$cmd); + if ($opt->{-C}) { chdir($opt->{-C}) } + $0 = join(' ', @$cmd, @xh); + local @SIG{keys %SIG} = map { undef } values %SIG; + local $SIG{FPE} = 'IGNORE'; # Perl default + undef $tmp_mask; if ($sub) { - eval { PublicInbox::DS->Reset }; _run_sub($sub, $key, \@argv); POSIX::_exit($? >> 8); } else { @@ -494,6 +628,7 @@ sub start_script { die "FAIL: ",join(' ', $key, @argv), ": $!\n"; } } + undef $tmp_mask; require PublicInbox::AutoReap; my $td = PublicInbox::AutoReap->new($pid); $td->{-extra} = $tail; @@ -528,7 +663,7 @@ sub lei_ok (@) { my @msg = ref($_[0]) eq 'ARRAY' ? @{$_[0]} : @_; if (!$lei_loud) { for (@msg) { - s!\A([a-z0-9]+://)[^/]+/!$1\$HOST_PORT/!; + s!(127\.0\.0\.1|\[::1\]):(?:\d+)!$1:\$PORT!g; s!$tmpdir\b/(?:[^/]+/)?!\$TMPDIR/!g; s!\Q$PWD\E\b!\$PWD!g; } @@ -553,13 +688,35 @@ sub ignore_inline_c_missing { grep(!/\bInline\b/, split(/^/m, $_[0]))))); } +sub need_scm_rights () { + state $ok = PublicInbox::Spawn->can('send_cmd4') || do { + require PublicInbox::Syscall; + PublicInbox::Syscall->can('send_cmd4'); # Linux only + } || eval { require Socket::MsgHdr; 1 }; + return if $ok; + 'need SCM_RIGHTS support: Inline::C unconfigured/missing '. + '(mkdir -p ~/.cache/public-inbox/inline-c) OR Socket::MsgHdr missing'; +} + +# returns a pipe with FD_CLOEXEC disabled on the write-end +sub quit_waiter_pipe () { + pipe(my $r, my $w); + fcntl($w, F_SETFD, fcntl($w, F_GETFD, 0) & ~FD_CLOEXEC); + ($r, $w); +} + +sub wait_for_eof ($$;$) { + my ($io, $msg, $sec) = @_; + vec(my $rset = '', fileno($io), 1) = 1; + ok(select($rset, undef, undef, $sec // 9), "$msg (select)"); + is(my $line = <$io>, undef, "$msg EOF"); +} + sub test_lei { SKIP: { my ($cb) = pop @_; my $test_opt = shift // {}; - local $lei_cwdfh; - opendir $lei_cwdfh, '.' or xbail "opendir .: $!"; - require_git(2.6, 1) or skip('git 2.6+ required for lei test', 2); + require_git(2.6, 1); my $mods = $test_opt->{mods} // [ 'lei' ]; require_mods(@$mods, 2); @@ -567,7 +724,10 @@ SKIP: { require PublicInbox::Spawn; require PublicInbox::Config; require File::Path; - + eval { # use XDG_CACHE_HOME, first: + require PublicInbox::XapHelperCxx; + PublicInbox::XapHelperCxx::check_build(); + }; local %ENV = %ENV; delete $ENV{XDG_DATA_HOME}; delete $ENV{XDG_CONFIG_HOME}; @@ -577,40 +737,35 @@ SKIP: { $ENV{LANG} = $ENV{LC_ALL} = 'C'; my (undef, $fn, $lineno) = caller(0); my $t = "$fn:$lineno"; - state $lei_daemon = PublicInbox::Spawn->can('send_cmd4') || do { - require PublicInbox::Syscall; - PublicInbox::Syscall->can('send_cmd4'); - } || eval { require Socket::MsgHdr; 1 }; - unless ($lei_daemon) { - skip('Inline::C unconfigured/missing '. -'(mkdir -p ~/.cache/public-inbox/inline-c) OR Socket::MsgHdr missing', - 1); - } $lei_opt = { 1 => \$lei_out, 2 => \$lei_err }; my ($daemon_pid, $for_destroy, $daemon_xrd); my $tmpdir = $test_opt->{tmpdir}; - File::Path::mkpath($tmpdir) if (defined $tmpdir && !-d $tmpdir); + File::Path::mkpath($tmpdir) if defined $tmpdir; ($tmpdir, $for_destroy) = tmpdir unless $tmpdir; + my ($dead_r, $dead_w); state $persist_xrd = $ENV{TEST_LEI_DAEMON_PERSIST_DIR}; SKIP: { $ENV{TEST_LEI_ONESHOT} and xbail 'TEST_LEI_ONESHOT no longer supported'; my $home = "$tmpdir/lei-daemon"; - mkdir($home, 0700) or BAIL_OUT "mkdir: $!"; + mkdir($home, 0700); local $ENV{HOME} = $home; my $persist; if ($persist_xrd && !$test_opt->{daemon_only}) { $persist = $daemon_xrd = $persist_xrd; } else { $daemon_xrd = "$home/xdg_run"; - mkdir($daemon_xrd, 0700) or BAIL_OUT "mkdir: $!"; + mkdir($daemon_xrd, 0700); + ($dead_r, $dead_w) = quit_waiter_pipe; } local $ENV{XDG_RUNTIME_DIR} = $daemon_xrd; - $cb->(); + $cb->(); # likely shares $dead_w with lei-daemon + undef $dead_w; # so select() wakes up when daemon dies if ($persist) { # remove before ~/.local gets removed File::Path::rmtree([glob("$home/*")]); File::Path::rmtree("$home/.config"); } else { + no_coredump $tmpdir; lei_ok(qw(daemon-pid), \"daemon-pid after $t"); chomp($daemon_pid = $lei_out); if (!$daemon_pid) { @@ -622,13 +777,10 @@ SKIP: { } }; # SKIP for lei_daemon if ($daemon_pid) { - for (0..10) { - kill(0, $daemon_pid) or last; - tick; - } - ok(!kill(0, $daemon_pid), "$t daemon stopped"); + wait_for_eof($dead_r, 'daemon quit pipe'); + no_coredump $tmpdir; my $f = "$daemon_xrd/lei/errors.log"; - open my $fh, '<', $f or BAIL_OUT "$f: $!"; + open my $fh, '<', $f; my @l = <$fh>; is_xdeeply(\@l, [], "$t daemon XDG_RUNTIME_DIR/lei/errors.log empty"); @@ -646,8 +798,7 @@ sub setup_public_inboxes () { return @ret if -f $stamp; require PublicInbox::Lock; - my $lk = bless { lock_path => "$test_home/setup.lock" }, - 'PublicInbox::Lock'; + my $lk = PublicInbox::Lock->new("$test_home/setup.lock"); my $end = $lk->lock_for_scope; return @ret if -f $stamp; @@ -657,7 +808,7 @@ sub setup_public_inboxes () { '--newsgroup', "t.v$V", "t$V", "$test_home/t$V", "http://example.com/t$V", "t$V\@example.com" ]) or xbail "init v$V"; - unlink "$test_home/t$V/description" or xbail "unlink $!"; + unlink "$test_home/t$V/description"; } require PublicInbox::Config; require PublicInbox::InboxWritable; @@ -677,11 +828,63 @@ sub setup_public_inboxes () { $im->done; }); $seen or BAIL_OUT 'no imports'; - open my $fh, '>', $stamp or BAIL_OUT "open $stamp: $!"; + open my $fh, '>', $stamp; @ret; } -sub create_inbox ($$;@) { +our %COMMIT_ENV = ( + GIT_AUTHOR_NAME => 'A U Thor', + GIT_COMMITTER_NAME => 'C O Mitter', + GIT_AUTHOR_EMAIL => 'a@example.com', + GIT_COMMITTER_EMAIL => 'c@example.com', +); + +# for memoizing based on coderefs and various create_* params +sub my_sum { + require PublicInbox::SHA; + require Data::Dumper; + my $d = Data::Dumper->new(\@_); + $d->$_(1) for qw(Deparse Sortkeys Terse); + my @l = split /\n/s, $d->Dump; + @l = grep !/\$\^H\{.+?[A-Z]+\(0x[0-9a-f]+\)/, @l; # autodie addresses + my @addr = grep /[A-Za-z]+\(0x[0-9a-f]+\)/, @l; + xbail 'undumpable addresses: ', \@addr if @addr; + substr PublicInbox::SHA::sha256_hex(join('', @l)), 0, 8; +} + +sub create_dir (@) { + my ($ident, $cb) = (shift, pop); + my %opt = @_; + require PublicInbox::Lock; + require PublicInbox::Import; + my $tmpdir = delete $opt{tmpdir}; + my ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!); + my $dir = "t/data-gen/$base.$ident-".my_sum($cb, \%opt); + require File::Path; + my $new = File::Path::make_path($dir); + my $lk = PublicInbox::Lock->new("$dir/creat.lock"); + my $scope = $lk->lock_for_scope; + if (!-f "$dir/creat.stamp") { + opendir(my $cwd, '.'); + chdir($dir); + local %ENV = (%ENV, %COMMIT_ENV); + $cb->($dir); + chdir($cwd); # some $cb chdir around + open my $s, '>', "$dir/creat.stamp"; + } + return $dir if !defined($tmpdir); + xsys_e([qw(/bin/cp -Rp), $dir, $tmpdir]); + $tmpdir; +} + +sub create_coderepo (@) { + my $ident = shift; + require PublicInbox::Import; + my ($db) = (PublicInbox::Import::default_branch() =~ m!([^/]+)\z!); + create_dir "$ident-$db", @_; +} + +sub create_inbox ($;@) { my $ident = shift; my $cb = pop; my %opt = @_; @@ -690,13 +893,11 @@ sub create_inbox ($$;@) { require PublicInbox::Import; my ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!); my ($db) = (PublicInbox::Import::default_branch() =~ m!([^/]+)\z!); - my $dir = "t/data-gen/$base.$ident-$db"; - my $new = !-d $dir; - if ($new) { - mkdir $dir; # may race - -d $dir or BAIL_OUT "$dir could not be created: $!"; - } - my $lk = bless { lock_path => "$dir/creat.lock" }, 'PublicInbox::Lock'; + my $tmpdir = delete $opt{tmpdir}; + my $dir = "t/data-gen/$base.$ident-".my_sum($db, $cb, \%opt); + require File::Path; + my $new = File::Path::make_path($dir); + my $lk = PublicInbox::Lock->new("$dir/creat.lock"); $opt{inboxdir} = File::Spec->rel2abs($dir); $opt{name} //= $ident; my $scope = $lk->lock_for_scope; @@ -704,7 +905,6 @@ sub create_inbox ($$;@) { $pre_cb->($dir) if $pre_cb && $new; $opt{-no_fsync} = 1; my $no_gc = delete $opt{-no_gc}; - my $tmpdir = delete $opt{tmpdir}; my $addr = $opt{address} // []; $opt{-primary_address} //= $addr->[0] // "$ident\@example.com"; my $parallel = delete($opt{importer_parallel}) // 0; @@ -721,8 +921,7 @@ sub create_inbox ($$;@) { xsys_e([ qw(git gc -q) ], { GIT_DIR => $dir }); } } - open my $s, '>', "$dir/creat.stamp" or - BAIL_OUT "error creating $dir/creat.stamp: $!"; + open my $s, '>', "$dir/creat.stamp"; } if ($tmpdir) { undef $ibx; @@ -734,23 +933,32 @@ sub create_inbox ($$;@) { $ibx; } -sub test_httpd ($$;$) { - my ($env, $client, $skip) = @_; - for (qw(PI_CONFIG TMPDIR)) { - $env->{$_} or BAIL_OUT "$_ unset"; - } +sub test_httpd ($$;$$) { + my ($env, $client, $skip, $cb) = @_; + my ($tmpdir, $for_destroy); + $env->{TMPDIR} //= do { + ($tmpdir, $for_destroy) = tmpdir(); + $tmpdir; + }; + for (qw(PI_CONFIG)) { $env->{$_} or BAIL_OUT "$_ unset" } SKIP: { - require_mods(qw(Plack::Test::ExternalServer), $skip // 1); + require_mods(qw(Plack::Test::ExternalServer LWP::UserAgent), + $skip // 1); my $sock = tcp_server() or die; my ($out, $err) = map { "$env->{TMPDIR}/std$_.log" } qw(out err); my $cmd = [ qw(-httpd -W0), "--stdout=$out", "--stderr=$err" ]; my $td = start_script($cmd, $env, { 3 => $sock }); my ($h, $p) = tcp_host_port($sock); local $ENV{PLACK_TEST_EXTERNALSERVER_URI} = "http://$h:$p"; - Plack::Test::ExternalServer::test_psgi(client => $client); + my $ua = LWP::UserAgent->new; + $ua->max_redirect(0); + local $CURRENT_DAEMON = $td; + Plack::Test::ExternalServer::test_psgi(client => $client, + ua => $ua); + $cb->() if $cb; $td->join('TERM'); - open my $fh, '<', $err or BAIL_OUT $!; - my $e = do { local $/; <$fh> }; + open my $fh, '<', $err; + my $e = read_all($fh); if ($e =~ s/^Plack::Middleware::ReverseProxy missing,\n//gms) { $e =~ s/^URL generation for redirects .*\n//gms; } @@ -758,6 +966,93 @@ sub test_httpd ($$;$) { } }; +# TODO: support fstat(1) on OpenBSD, lsof already works on FreeBSD + Linux +# don't use this for deleted file checks, we only check that on Linux atm +# and we can readlink /proc/PID/fd/* directly +sub lsof_pid ($;$) { + my ($pid, $rdr) = @_; + state $lsof = require_cmd('lsof', 1); + $lsof or skip 'lsof missing/broken', 1; + my @out = xqx([$lsof, '-p', $pid], undef, $rdr); + if ($?) { + undef $lsof; + skip "lsof -p PID broken \$?=$?", 1; + } + my @cols = split ' ', $out[0]; + if (($cols[7] // '') eq 'NODE') { # normal lsof + @out; + } else { # busybox lsof ignores -p, so we DIY it + grep /\b$pid\b/, @out; + } +} + +sub no_pollerfd ($) { + my ($pid) = @_; + my ($re, @cmd); + $^O eq 'linux' and + ($re, @cmd) = (qr/\Q[eventpoll]\E/, qw(lsof -p), $pid); + # n.b. *BSDs uses kqueue to emulate signalfd and/or inotify, + # and we can't distinguish which is which easily. + SKIP: { + (@cmd && $re) or + skip 'open poller test is Linux-only', 1; + my $bin = require_cmd($cmd[0], 1) or skip "$cmd[0] missing", 1; + $cmd[0] = $bin; + my @of = xqx(\@cmd, {}, {2 => \(my $e)}); + my $err = $?; + skip "$bin broken? (\$?=$err) ($e)", 1 if $err; + @of = grep /\b$pid\b/, @of; # busybox lsof ignores -p + is(grep(/$re/, @of), 0, "no $re FDs") or diag explain(\@of); + } +} + +sub cfg_new ($;@) { + my ($tmpdir, @body) = @_; + require PublicInbox::Config; + my $f = "$tmpdir/tmp_cfg"; + open my $fh, '>', $f; + print $fh @body; + close $fh; + PublicInbox::Config->new($f); +} + +our $strace_cmd; +sub strace (@) { + my ($for_daemon) = @_; + skip 'linux only test', 1 if $^O ne 'linux'; + if ($for_daemon) { + my $f = '/proc/sys/kernel/yama/ptrace_scope'; + # TODO: we could fiddle with prctl in the daemon to make + # things work, but I'm not sure it's worth it... + state $ps = do { + my $fh; + CORE::open($fh, '<', $f) ? readline($fh) : 0; + }; + chomp $ps; + skip "strace unusable on daemons\n$f is `$ps' (!= 0)", 1 if $ps; + } + require_cmd('strace', 1) or skip 'strace not available', 1; +} + +sub strace_inject (;$) { + my $cmd = strace(@_); + state $ver = do { + require PublicInbox::Spawn; + my $v = PublicInbox::Spawn::run_qx([$cmd, '-V']); + $v =~ m!version\s+([1-9]+\.[0-9]+)! or + xbail "no strace -V: $v"; + eval("v$1"); + }; + $ver ge v4.16 or skip "$cmd too old for syscall injection (". + sprintf('v%vd', $ver). ' < v4.16)', 1; + $cmd +} + +sub oct_is ($$$) { + my ($got, $exp, $msg) = @_; + @_ = (sprintf('0%03o', $got), sprintf('0%03o', $exp), $msg); + goto &is; # tail recursion to get lineno from callers on failure +} package PublicInbox::TestCommon::InboxWakeup; use strict; diff --git a/lib/PublicInbox/Tmpfile.pm b/lib/PublicInbox/Tmpfile.pm index 3040dd77..72dd9d24 100644 --- a/lib/PublicInbox/Tmpfile.pm +++ b/lib/PublicInbox/Tmpfile.pm @@ -1,9 +1,9 @@ -# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::Tmpfile; -use strict; -use v5.10.1; +use v5.12; use parent qw(Exporter); +use autodie qw(unlink); our @EXPORT = qw(tmpfile); use Fcntl qw(:DEFAULT); use Errno qw(EEXIST); @@ -21,7 +21,7 @@ sub tmpfile ($;$$) { if (defined $sock) { # add the socket inode number so we can figure out which # socket it belongs to - my @st = stat($sock); + my @st = stat($sock) or die "stat($sock): $!"; $id .= '-ino:'.$st[1]; } $id =~ tr!/!^!; @@ -31,7 +31,7 @@ sub tmpfile ($;$$) { do { my $fn = File::Spec->tmpdir . "/$id-".time.'-'.rand; if (sysopen(my $fh, $fn, $fl, 0600)) { # likely - unlink($fn) or warn "unlink($fn): $!"; # FS broken + unlink($fn); return $fh; # success } } while ($! == EEXIST); diff --git a/lib/PublicInbox/URIimap.pm b/lib/PublicInbox/URIimap.pm index 81644914..41c2842a 100644 --- a/lib/PublicInbox/URIimap.pm +++ b/lib/PublicInbox/URIimap.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # cf. RFC 5092, which the `URI' package doesn't support # @@ -11,8 +11,7 @@ # # RFC 2192 also describes ";TYPE=<list_type>" package PublicInbox::URIimap; -use strict; -use v5.10.1; +use v5.12; use URI::Split qw(uri_split uri_join); # part of URI use URI::Escape qw(uri_unescape uri_escape); use overload '""' => \&as_string; diff --git a/lib/PublicInbox/URInntps.pm b/lib/PublicInbox/URInntps.pm index 231b247b..88c8d641 100644 --- a/lib/PublicInbox/URInntps.pm +++ b/lib/PublicInbox/URInntps.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # deal with the lack of URI::nntps in upstream URI. @@ -6,7 +6,7 @@ # cf. https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=983419 # Fixed in URI 5.08, we can drop this by 2035 when LTS distros all have it package PublicInbox::URInntps; -use strict; +use v5.12; use parent qw(URI::snews); use URI; diff --git a/lib/PublicInbox/Umask.pm b/lib/PublicInbox/Umask.pm new file mode 100644 index 00000000..2c859e65 --- /dev/null +++ b/lib/PublicInbox/Umask.pm @@ -0,0 +1,70 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# base class to ensures Xapian||SQLite files respect core.sharedRepository +# of git repos +package PublicInbox::Umask; +use v5.12; +use PublicInbox::OnDestroy; + +use constant { + PERM_UMASK => 0, + OLD_PERM_GROUP => 1, + OLD_PERM_EVERYBODY => 2, + PERM_GROUP => 0660, + PERM_EVERYBODY => 0664, +}; + +sub _read_git_config_perm { + my ($self) = @_; + chomp(my $perm = $self->git->qx('config', 'core.sharedRepository')); + $perm; +} + +sub _git_config_perm { + my $self = shift; + my $perm = scalar @_ ? $_[0] : _read_git_config_perm($self); + $perm //= ''; + return PERM_UMASK if $perm eq '' || $perm eq 'umask'; + return PERM_GROUP if $perm eq 'group'; + return PERM_EVERYBODY if $perm =~ /\A(?:all|world|everybody)\z/; + return PERM_GROUP if ($perm =~ /\A(?:true|yes|on|1)\z/); + return PERM_UMASK if ($perm =~ /\A(?:false|no|off|0)\z/); + + my $i = oct($perm); + return PERM_UMASK if $i == PERM_UMASK; + return PERM_GROUP if $i == OLD_PERM_GROUP; + return PERM_EVERYBODY if $i == OLD_PERM_EVERYBODY; + + if (($i & 0600) != 0600) { + die "core.sharedRepository mode invalid: ". + sprintf('%.3o', $i) . "\nOwner must have permissions\n"; + } + ($i & 0666); +} + +sub _umask_for { + my ($perm) = @_; # _git_config_perm return value + my $rv = $perm; + return umask if $rv == 0; + + # set +x bit if +r or +w were set + $rv |= 0100 if ($rv & 0600); + $rv |= 0010 if ($rv & 0060); + $rv |= 0001 if ($rv & 0006); + (~$rv & 0777); +} + +sub with_umask { + my ($self, $cb, @arg) = @_; + my $old = umask($self->{umask} //= umask_prepare($self)); + my $restore = on_destroy \&CORE::umask, $old; + $cb ? $cb->(@arg) : $restore; +} + +sub umask_prepare { + my ($self) = @_; + _umask_for(_git_config_perm($self)); +} + +1; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index ed5182ae..43f37f60 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # This interface wraps and mimics PublicInbox::Import @@ -8,7 +8,7 @@ use strict; use v5.10.1; use parent qw(PublicInbox::Lock PublicInbox::IPC); use PublicInbox::SearchIdxShard; -use PublicInbox::IPC; +use PublicInbox::IPC qw(nproc_shards); use PublicInbox::Eml; use PublicInbox::Git; use PublicInbox::Import; @@ -22,37 +22,12 @@ use PublicInbox::Spawn qw(spawn popen_rd run_die); use PublicInbox::Search; use PublicInbox::SearchIdx qw(log2stack is_ancestor check_size is_bad_blob); use IO::Handle; # ->autoflush -use File::Temp (); use POSIX (); my $OID = qr/[a-f0-9]{40,}/; # an estimate of the post-packed size to the raw uncompressed size our $PACKING_FACTOR = 0.4; -# SATA storage lags behind what CPUs are capable of, so relying on -# nproc(1) can be misleading and having extra Xapian shards is a -# waste of FDs and space. It can also lead to excessive IO latency -# and slow things down. Users on NVME or other fast storage can -# use the NPROC env or switches in our script/public-inbox-* programs -# to increase Xapian shards -our $NPROC_MAX_DEFAULT = 4; - -sub nproc_shards ($) { - my ($creat_opt) = @_; - my $n = $creat_opt->{nproc} if ref($creat_opt) eq 'HASH'; - $n //= $ENV{NPROC}; - if (!$n) { - # assume 2 cores if not detectable or zero - state $NPROC_DETECTED = PublicInbox::IPC::detect_nproc() || 2; - $n = $NPROC_DETECTED; - $n = $NPROC_MAX_DEFAULT if $n > $NPROC_MAX_DEFAULT; - } - - # subtract for the main process and git-fast-import - $n -= 1; - $n < 1 ? 1 : $n; -} - sub count_shards ($) { my ($self) = @_; # always load existing shards in case core count changes: @@ -113,13 +88,6 @@ sub init_inbox { $self->done; } -# returns undef on duplicate or spam -# mimics Import::add and wraps it for v2 -sub add { - my ($self, $eml, $check_cb) = @_; - $self->{ibx}->with_umask(\&_add, $self, $eml, $check_cb); -} - sub idx_shard ($$) { my ($self, $num) = @_; $self->{idx_shards}->[$num % scalar(@{$self->{idx_shards}})]; @@ -137,8 +105,11 @@ sub do_idx ($$$) { $n >= $self->{batch_bytes}; } -sub _add { +# returns undef on duplicate or spam +# mimics Import::add and wraps it for v2 +sub add { my ($self, $mime, $check_cb) = @_; + my $restore = $self->{ibx}->with_umask; # spam check: if ($check_cb) { @@ -164,7 +135,6 @@ sub _add { if (do_idx($self, $mime, $smsg)) { $self->checkpoint; } - $cmt; } @@ -415,17 +385,16 @@ sub rewrite_internal ($$;$$$) { # (retval[2]) is not part of the stable API shared with Import->remove sub remove { my ($self, $eml, $cmt_msg) = @_; - my $r = $self->{ibx}->with_umask(\&rewrite_internal, - $self, $eml, $cmt_msg); + my $restore = $self->{ibx}->with_umask; + my $r = rewrite_internal($self, $eml, $cmt_msg); defined($r) && defined($r->[0]) ? @$r: undef; } sub _replace ($$;$$) { my ($self, $old_eml, $new_eml, $sref) = @_; - my $arg = [ $self, $old_eml, undef, $new_eml, $sref ]; - my $rewritten = $self->{ibx}->with_umask(\&rewrite_internal, - $self, $old_eml, undef, $new_eml, $sref) or return; - + my $restore = $self->{ibx}->with_umask; + my $rewritten = rewrite_internal($self, $old_eml, undef, + $new_eml, $sref) or return; my $rewrites = $rewritten->{rewrites}; # ->done is called if there are rewrites since we gc+prune from git $self->idx_init if @$rewrites; @@ -538,13 +507,7 @@ sub set_last_commits ($) { # this is NOT for ExtSearchIdx sub checkpoint ($;$) { my ($self, $wait) = @_; - if (my $im = $self->{im}) { - if ($wait) { - $im->barrier; - } else { - $im->checkpoint; - } - } + $self->{im}->barrier if $self->{im}; my $shards = $self->{idx_shards}; if ($shards) { my $dbh = $self->{mm}->{dbh} if $self->{mm}; @@ -689,23 +652,6 @@ sub import_init { $im; } -# XXX experimental -sub diff ($$$) { - my ($mid, $cur, $new) = @_; - - my $ah = File::Temp->new(TEMPLATE => 'email-cur-XXXX', TMPDIR => 1); - print $ah $cur->as_string or die "print: $!"; - $ah->flush or die "flush: $!"; - PublicInbox::Import::drop_unwanted_headers($new); - my $bh = File::Temp->new(TEMPLATE => 'email-new-XXXX', TMPDIR => 1); - print $bh $new->as_string or die "print: $!"; - $bh->flush or die "flush: $!"; - my $cmd = [ qw(diff -u), $ah->filename, $bh->filename ]; - print STDERR "# MID conflict <$mid>\n"; - my $pid = spawn($cmd, undef, { 1 => 2 }); - waitpid($pid, 0) == $pid or die "diff did not finish"; -} - sub get_blob ($$) { my ($self, $smsg) = @_; if (my $im = $self->{im}) { @@ -729,9 +675,6 @@ sub content_exists ($$$) { } my $cur = PublicInbox::Eml->new($msg); return 1 if content_matches($chashes, $cur); - - # XXX DEBUG_DIFF is experimental and may be removed - diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF}; } undef; } @@ -1120,7 +1063,7 @@ sub unindex_todo ($$$) { /\A:\d{6} 100644 $OID ($OID) [AM]\tm$/o or next; $self->git->cat_async($1, $unindex_oid, { %$sync, oid => $1 }); } - close $fh or die "git log failed: \$?=$?"; + $fh->close or die "git log failed: \$?=$?"; $self->git->async_wait_all; return unless $sync->{-opt}->{prune}; @@ -1210,6 +1153,7 @@ sub index_todo ($$$) { }; if ($f eq 'm') { if ($sync->{max_size}) { + $req->{git} = $all; $all->check_async($oid, \&check_size, $req); } else { $all->cat_async($oid, $index_oid, $req); diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 26094082..44e1f2a8 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -7,6 +7,7 @@ package PublicInbox::View; use strict; use v5.10.1; use List::Util qw(max); +use Text::Wrap qw(wrap); # stdlib, we need Perl 5.6+ for $huge use PublicInbox::MsgTime qw(msg_datestamp); use PublicInbox::Hval qw(ascii_html obfuscate_addrs prurl mid_href ts2str fmt_ts); @@ -19,6 +20,7 @@ use PublicInbox::WwwStream qw(html_oneshot); use PublicInbox::Reply; use PublicInbox::ViewDiff qw(flush_diff); use PublicInbox::Eml; +use POSIX qw(strftime); use Time::Local qw(timegm); use PublicInbox::Smsg qw(subject_normalized); use PublicInbox::ContentHash qw(content_hash); @@ -36,14 +38,12 @@ sub msg_page_i { : $ctx->gone('over'); $ctx->{mhref} = ($ctx->{nr} || $ctx->{smsg}) ? "../${\mid_href($smsg->{mid})}/" : ''; - my $obuf = $ctx->{obuf} = _msg_page_prepare_obuf($eml, $ctx); - if (length($$obuf)) { - multipart_text_as_html($eml, $ctx); - $$obuf .= '</pre><hr>'; + if (_msg_page_prepare($eml, $ctx, $smsg->{ts})) { + $eml->each_part(\&add_text_body, $ctx, 1); + print { $ctx->{zfh} } '</pre><hr>'; } - delete $ctx->{obuf}; - $$obuf .= html_footer($ctx, $ctx->{first_hdr}) if !$ctx->{smsg}; - $$obuf; + html_footer($ctx, $ctx->{first_hdr}) if !$ctx->{smsg}; + ''; # XXX TODO cleanup } else { # called by WwwStream::async_next or getline $ctx->{smsg}; # may be undef } @@ -56,14 +56,12 @@ sub no_over_html ($) { my $eml = PublicInbox::Eml->new($bref); $ctx->{mhref} = ''; PublicInbox::WwwStream::init($ctx); - my $obuf = $ctx->{obuf} = _msg_page_prepare_obuf($eml, $ctx); - if (length($$obuf)) { - multipart_text_as_html($eml, $ctx); - $$obuf .= '</pre><hr>'; + if (_msg_page_prepare($eml, $ctx)) { # sets {-title_html} + $eml->each_part(\&add_text_body, $ctx, 1); + print { $ctx->{zfh} } '</pre><hr>'; } - delete $ctx->{obuf}; - eval { $$obuf .= html_footer($ctx, $eml) }; - html_oneshot($ctx, 200, $obuf); + html_footer($ctx, $eml); + $ctx->html_done; } # public functions: (unstable) @@ -82,7 +80,8 @@ sub msg_page { # allow user to easily browse the range around this message if # they have ->over $ctx->{-t_max} = $smsg->{ts}; - PublicInbox::WwwStream::aresponse($ctx, 200, \&msg_page_i); + $ctx->{-spfx} = '../' if $ibx->{-repo_objs}; + PublicInbox::WwwStream::aresponse($ctx, \&msg_page_i); } # /$INBOX/$MESSAGE_ID/#R @@ -184,6 +183,59 @@ sub nr_to_s ($$$) { $nr == 1 ? "$nr $singular" : "$nr $plural"; } +sub addr2urlmap ($) { + my ($ctx) = @_; + # cache makes a huge difference with /[tT] and large threads + my $key = PublicInbox::Git::host_prefix_url($ctx->{env}, ''); + my $ent = $ctx->{www}->{pi_cfg}->{-addr2urlmap}->{$key} // do { + my $by_addr = $ctx->{www}->{pi_cfg}->{-by_addr}; + my (%addr2url, $url); + while (my ($addr, $ibx) = each %$by_addr) { + $url = $ibx->base_url // $ibx->base_url($ctx->{env}); + $addr2url{$addr} = ascii_html($url) if defined $url; + } + # don't allow attackers to randomly change Host: headers + # and OOM us if the server handles all hostnames: + my $tmp = $ctx->{www}->{pi_cfg}->{-addr2urlmap}; + my @k = keys %$tmp; # random order + delete @$tmp{@k[0..3]} if scalar(@k) > 7; + my $re = join('|', map { quotemeta } keys %addr2url); + $tmp->{$key} = [ qr/\b($re)\b/i, \%addr2url ]; + }; + @$ent; +} + +sub to_cc_html ($$$$) { + my ($ctx, $eml, $field, $t) = @_; + my @vals = $eml->header($field) or return ('', 0); + my (undef, $addr2url) = addr2urlmap($ctx); + my $pairs = PublicInbox::Address::pairs(join(', ', @vals)); + my ($len, $line_len, $html) = (0, 0, ''); + my ($pair, $url); + my ($cur_ibx, $env) = @$ctx{qw(ibx env)}; + # avoid excessive ascii_html calls (already hot in profiles): + my @html = split /\n/, ascii_html(join("\n", map { + $_->[0] // (split(/\@/, $_->[1]))[0]; # addr user if no name + } @$pairs)); + for my $n (@html) { + $pair = shift @$pairs; + if ($line_len) { # 9 = display width of ",\t": + if ($line_len + length($n) > COLS - 9) { + $html .= ",\n\t"; + $len += $line_len; + $line_len = 0; + } else { + $html .= ', '; + $line_len += 2; + } + } + $line_len += length($n); + $url = $addr2url->{lc($pair->[1] // '')}; + $html .= $url ? qq(<a\nhref="$url$t">$n</a>) : $n; + } + ($html, $len + $line_len); +} + # Displays the text of of the message for /$INBOX/$MSGID/[Tt]/ endpoint # this is already inside a <pre> sub eml_entry { @@ -208,7 +260,8 @@ sub eml_entry { my $ds = delete $smsg->{ds}; # for v1 non-Xapian/SQLite users # Deleting these fields saves about 400K as we iterate across 1K msgs - delete @$smsg{qw(ts blob)}; + my ($t, undef) = delete @$smsg{qw(ts blob)}; + $t = $t ? '?t='.ts2str($t) : ''; my $from = _hdr_names_html($eml, 'From'); obfuscate_addrs($obfs_ibx, $from) if $obfs_ibx; @@ -217,9 +270,8 @@ sub eml_entry { my $mhref = $upfx . mid_href($mid_raw) . '/'; $rv .= qq{ (<a\nhref="$mhref">permalink</a> / }; $rv .= qq{<a\nhref="${mhref}raw">raw</a>)\n}; - my $to = fold_addresses(_hdr_names_html($eml, 'To')); - my $cc = fold_addresses(_hdr_names_html($eml, 'Cc')); - my ($tlen, $clen) = (length($to), length($cc)); + my ($to, $tlen) = to_cc_html($ctx, $eml, 'To', $t); + my ($cc, $clen) = to_cc_html($ctx, $eml, 'Cc', $t); my $to_cc = ''; if (($tlen + $clen) > COLS) { $to_cc .= ' To: '.$to."\n" if $tlen; @@ -242,20 +294,22 @@ sub eml_entry { my $html = ascii_html($irt); $rv .= qq(In-Reply-To: <<a\nhref="$href">$html</a>>\n) } - $rv .= "\n"; + say { $ctx->zfh } $rv; # scan through all parts, looking for displayable text $ctx->{mhref} = $mhref; - $ctx->{obuf} = \$rv; - $eml->each_part(\&add_text_body, $ctx, 1); - delete $ctx->{obuf}; + $ctx->{changed_href} = "#e$id"; # for diffstat "files? changed," + $eml->each_part(\&add_text_body, $ctx, 1); # expensive # add the footer - $rv .= "\n<a\nhref=#$id_m\nid=e$id>^</a> ". + $rv = "\n<a\nhref=#$id_m\nid=e$id>^</a> ". "<a\nhref=\"$mhref\">permalink</a>" . " <a\nhref=\"${mhref}raw\">raw</a>" . " <a\nhref=\"${mhref}#R\">reply</a>"; + delete($ctx->{-qry}) and + $rv .= qq[ <a\nhref="${mhref}#related">related</a>]; + my $hr; if (defined(my $pct = $smsg->{pct})) { # used by SearchView.pm $rv .= "\t[relevance $pct%]"; @@ -300,8 +354,7 @@ sub _th_index_lite { my $rv = ''; my $mapping = $ctx->{mapping} or return $rv; my $pad = ' '; - my $mid_map = $mapping->{$mid_raw}; - defined $mid_map or + my $mid_map = $mapping->{$mid_raw} // return 'public-inbox BUG: '.ascii_html($mid_raw).' not mapped'; my ($attr, $node, $idx, $level) = @$mid_map; my $children = $node->{children}; @@ -333,10 +386,10 @@ sub _th_index_lite { } my $s_s = nr_to_s($nr_s, 'sibling', 'siblings'); my $s_c = nr_to_s($nr_c, 'reply', 'replies'); - $attr =~ s!\n\z!</b>\n!s; + chop $attr; # remove "\n" $attr =~ s!<a\nhref.*</a> (?:" )?!!s; # no point in dup subject $attr =~ s!<a\nhref=[^>]+>([^<]+)</a>!$1!s; # no point linking to self - $rv .= "<b>@ $attr"; + $rv .= "<b>@ $attr</b>\n"; if ($nr_c) { my $cmid = $children->[0] ? $children->[0]->{mid} : undef; $rv .= $pad . _skel_hdr($mapping, $cmid); @@ -386,7 +439,9 @@ sub pre_thread { # walk_thread callback sub thread_eml_entry { my ($ctx, $eml) = @_; my ($beg, $end) = thread_adj_level($ctx, $ctx->{level}); - $beg . '<pre>' . eml_entry($ctx, $eml) . '</pre>' . $end; + print { $ctx->zfh } $beg, '<pre>'; + print { $ctx->{zfh} } eml_entry($ctx, $eml), '</pre>'; + $end; } sub next_in_queue ($$) { @@ -413,15 +468,15 @@ sub stream_thread_i { # PublicInbox::WwwStream::getline callback if (!$ghost_ok) { # first non-ghost $ctx->{-title_html} = ascii_html($smsg->{subject}); - $ctx->zmore($ctx->html_top); + print { $ctx->zfh } $ctx->html_top; } return $smsg; } # buffer the ghost entry and loop - $ctx->zmore(ghost_index_entry($ctx, $lvl, $smsg)); + print { $ctx->zfh } ghost_index_entry($ctx, $lvl, $smsg) } else { # all done - $ctx->zmore(join('', thread_adj_level($ctx, 0))); - $ctx->zmore(${delete($ctx->{skel})}); + print { $ctx->zfh } thread_adj_level($ctx, 0), + ${delete($ctx->{skel})}; return; } } @@ -430,7 +485,7 @@ sub stream_thread_i { # PublicInbox::WwwStream::getline callback sub stream_thread ($$) { my ($rootset, $ctx) = @_; @{$ctx->{-queue}} = map { (0, $_) } @$rootset; - PublicInbox::WwwStream::aresponse($ctx, 200, \&stream_thread_i); + PublicInbox::WwwStream::aresponse($ctx, \&stream_thread_i); } # /$INBOX/$MSGID/t/ and /$INBOX/$MSGID/T/ @@ -441,10 +496,11 @@ sub thread_html { my $ibx = $ctx->{ibx}; my ($nr, $msgs) = $ibx->over->get_thread($mid); return missing_thread($ctx) if $nr == 0; + $ctx->{-spfx} = '../../' if $ibx->{-repo_objs}; # link $INBOX_DIR/description text to "index_topics" view around # the newest message in this thread - my $t = ts2str($ctx->{-t_max} = max(map { delete $_->{ts} } @$msgs)); + my $t = ts2str($ctx->{-t_max} = max(map { $_->{ts} } @$msgs)); my $t_fmt = fmt_ts($ctx->{-t_max}); my $skel = '<hr><pre>'; @@ -481,7 +537,7 @@ EOF # flat display: lazy load the full message from smsg $ctx->{msgs} = $msgs; $ctx->{-html_tip} = '<pre>'; - PublicInbox::WwwStream::aresponse($ctx, 200, \&thread_html_i); + PublicInbox::WwwStream::aresponse($ctx, \&thread_html_i); } sub thread_html_i { # PublicInbox::WwwStream::getline callback @@ -490,7 +546,7 @@ sub thread_html_i { # PublicInbox::WwwStream::getline callback my $smsg = $ctx->{smsg}; if (exists $ctx->{-html_tip}) { $ctx->{-title_html} = ascii_html($smsg->{subject}); - $ctx->zmore($ctx->html_top); + print { $ctx->zfh } $ctx->html_top; } return eml_entry($ctx, $eml); } else { @@ -498,31 +554,19 @@ sub thread_html_i { # PublicInbox::WwwStream::getline callback return $smsg if exists($smsg->{blob}); } my $skel = delete($ctx->{skel}) or return; # all done - $ctx->zmore($$skel); + print { $ctx->zfh } $$skel; undef; } } -sub multipart_text_as_html { - # ($mime, $ctx) = @_; # each_part may do "$_[0] = undef" - - # scan through all parts, looking for displayable text - $_[0]->each_part(\&add_text_body, $_[1], 1); -} - sub submsg_hdr ($$) { my ($ctx, $eml) = @_; - my $obfs_ibx = $ctx->{-obfs_ibx}; - my $rv = $ctx->{obuf}; - $$rv .= "\n"; + my $s = "\n"; for my $h (qw(From To Cc Subject Date Message-ID X-Alt-Message-ID)) { - my @v = $eml->header($h); - for my $v (@v) { - obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; - $v = ascii_html($v); - $$rv .= "$h: $v\n"; - } + $s .= "$h: $_\n" for $eml->header($h); } + obfuscate_addrs($ctx->{-obfs_ibx}, $s) if $ctx->{-obfs_ibx}; + ascii_html($s); } sub attach_link ($$$$;$) { @@ -533,7 +577,6 @@ sub attach_link ($$$$;$) { # downloads for 0-byte multipart attachments return unless $part->{bdy}; - my $nl = $idx eq '1' ? '' : "\n"; # like join("\n", ...) my $size = length($part->body); delete $part->{bdy}; # save memory @@ -549,23 +592,17 @@ sub attach_link ($$$$;$) { } else { $sfn = 'a.bin'; } - my $rv = $ctx->{obuf}; - $$rv .= qq($nl<a\nhref="$ctx->{mhref}$idx-$sfn">); - if ($err) { - $$rv .= <<EOF; + my $rv = $idx eq '1' ? '' : "\n"; # like join("\n", ...) + $rv .= qq(<a\nhref="$ctx->{mhref}$idx-$sfn">); + $rv .= <<EOF if $err; [-- Warning: decoded text below may be mangled, UTF-8 assumed --] EOF - } - $$rv .= "[-- Attachment #$idx: "; - my $ts = "Type: $ct, Size: $size bytes"; + $rv .= "[-- Attachment #$idx: "; my $desc = $part->header('Content-Description') // $fn // ''; - $desc = ascii_html($desc); - $$rv .= ($desc eq '') ? "$ts --]" : "$desc --]\n[-- $ts --]"; - $$rv .= "</a>\n"; - - submsg_hdr($ctx, $part) if $part->{is_submsg}; - - undef; + $rv .= ascii_html($desc)." --]\n[-- " if $desc ne ''; + $rv .= "Type: $ct, Size: $size bytes --]</a>\n"; + $rv .= submsg_hdr($ctx, $part) if $part->{is_submsg}; + $rv; } sub add_text_body { # callback for each_part @@ -578,13 +615,9 @@ sub add_text_body { # callback for each_part my $ct = $part->content_type || 'text/plain'; my $fn = $part->filename; my ($s, $err) = msg_part_text($part, $ct); - return attach_link($ctx, $ct, $p, $fn) unless defined $s; - - my $rv = $ctx->{obuf}; - if ($part->{is_submsg}) { - submsg_hdr($ctx, $part); - $$rv .= "\n"; - } + my $zfh = $ctx->zfh; + $s // return print $zfh (attach_link($ctx, $ct, $p, $fn) // ''); + say $zfh submsg_hdr($ctx, $part) if $part->{is_submsg}; # makes no difference to browsers, and don't screw up filename # link generation in diffs with the extra '%0D' @@ -607,24 +640,6 @@ sub add_text_body { # callback for each_part $ctx->{-anchors} = {} if $s =~ /^diff --git /sm; $diff = 1; delete $ctx->{-long_path}; - my $spfx; - # absolute URL (Atom feeds) - if ($ibx->{coderepo}) { - if (index($upfx, '//') >= 0) { - $spfx = $upfx; - $spfx =~ s!/([^/]*)/\z!/!; - } else { - my $n_slash = $upfx =~ tr!/!/!; - if ($n_slash == 0) { - $spfx = '../'; - } elsif ($n_slash == 1) { - $spfx = ''; - } else { # nslash == 2 - $spfx = '../../'; - } - } - } - $ctx->{-spfx} = $spfx; }; # split off quoted and unquoted blocks: @@ -632,110 +647,122 @@ sub add_text_body { # callback for each_part undef $s; # free memory if (defined($fn) || ($depth > 0 && !$part->{is_submsg}) || $err) { # badly-encoded message with $err? tell the world about it! - attach_link($ctx, $ct, $p, $fn, $err); - $$rv .= "\n"; + say $zfh attach_link($ctx, $ct, $p, $fn, $err); } delete $part->{bdy}; # save memory - foreach my $cur (@sections) { + for my $cur (@sections) { # $cur may be huge if ($cur =~ /\A>/) { # we use a <span> here to allow users to specify # their own color for quoted text - $$rv .= qq(<span\nclass="q">); - $$rv .= $l->to_html($cur); - $$rv .= '</span>'; + print $zfh qq(<span\nclass="q">), + $l->to_html($cur), '</span>'; } elsif ($diff) { flush_diff($ctx, \$cur); - } else { - # regular lines, OK - $$rv .= $l->to_html($cur); + } else { # regular lines, OK + print $zfh $l->to_html($cur); } undef $cur; # free memory } } -sub _msg_page_prepare_obuf { - my ($eml, $ctx) = @_; - my $over = $ctx->{ibx}->over; - my $obfs_ibx = $ctx->{-obfs_ibx}; - my $rv = ''; +sub _msg_page_prepare { + my ($eml, $ctx, $ts) = @_; + my $have_over = !!$ctx->{ibx}->over; my $mids = mids_for_index($eml); my $nr = $ctx->{nr}++; if ($nr) { # unlikely if ($ctx->{chash} eq content_hash($eml)) { warn "W: BUG? @$mids not deduplicated properly\n"; - return \$rv; + return; } - $rv .= -"<pre>WARNING: multiple messages have this Message-ID\n</pre>"; - $rv .= '<pre>'; + $ctx->{-html_tip} = +qq[<pre>WARNING: multiple messages have this Message-ID (<a +href="d/">diff</a>)</pre><pre>]; } else { $ctx->{first_hdr} = $eml->header_obj; $ctx->{chash} = content_hash($eml) if $ctx->{smsg}; # reused MID - $rv .= "<pre\nid=b>"; # anchor for body start + $ctx->{-html_tip} = "<pre\nid=b>"; # anchor for body start } - $ctx->{-upfx} = '../' if $over; + $ctx->{-upfx} = '../'; my @title; # (Subject[0], From[0]) + my $hbuf = ''; for my $v ($eml->header('From')) { my @n = PublicInbox::Address::names($v); - $v = ascii_html($v); - $title[1] //= ascii_html(join(', ', @n)); - if ($obfs_ibx) { - obfuscate_addrs($obfs_ibx, $v); - obfuscate_addrs($obfs_ibx, $title[1]); - } - $rv .= "From: $v\n" if $v ne ''; + $title[1] //= join(', ', @n); + $hbuf .= "From: $v\n" if $v ne ''; } - foreach my $h (qw(To Cc)) { + for my $h (qw(To Cc)) { for my $v ($eml->header($h)) { fold_addresses($v); - $v = ascii_html($v); - obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; - $rv .= "$h: $v\n" if $v ne ''; + $hbuf .= "$h: $v\n" if $v ne ''; } } my @subj = $eml->header('Subject'); - if (@subj) { - my $v = ascii_html(shift @subj); - obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; - $rv .= 'Subject: '; - $rv .= $over ? qq(<a\nhref="#r"\nid=t>$v</a>\n) : "$v\n"; - $title[0] = $v; - for $v (@subj) { # multi-Subject message :< - $v = ascii_html($v); - obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; - $rv .= "Subject: $v\n"; - } - } else { # dummy anchor for thread skeleton at bottom of page - $rv .= qq(<a\nhref="#r"\nid=t></a>) if $over; - $title[0] = '(no subject)'; + $hbuf .= "Subject: $_\n" for @subj; + $title[0] = $subj[0] // '(no subject)'; + $hbuf .= "Date: $_\n" for $eml->header('Date'); + $hbuf = ascii_html($hbuf); + my $t = $ts ? '?t='.ts2str($ts) : ''; + my ($re, $addr2url) = addr2urlmap($ctx); + $hbuf =~ s!$re!qq(<a\nhref=").$addr2url->{lc $1}.qq($t">$1</a>)!sge; + $ctx->{-title_html} = ascii_html(join(' - ', @title)); + if (my $obfs_ibx = $ctx->{-obfs_ibx}) { + obfuscate_addrs($obfs_ibx, $hbuf); + obfuscate_addrs($obfs_ibx, $ctx->{-title_html}); } - for my $v ($eml->header('Date')) { - $v = ascii_html($v); - obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; # possible :P - $rv .= qq{Date: $v\t<a\nhref="#r">[thread overview]</a>\n}; - } - if (!$nr) { # first (and only) message, common case - $ctx->{-title_html} = join(' - ', @title); - $rv = $ctx->html_top . $rv; + + # [thread overview] link is typically added after Date, + # but added after Subject, or even nothing. + if ($have_over) { + chop $hbuf; # drop "\n", or noop if $rv eq '' + $hbuf .= qq{\t<a\nhref="#r">[thread overview]</a>\n}; + $hbuf =~ s!^Subject:\x20(.*?)(\n[A-Z]|\z) + !Subject: <a\nhref="#r"\nid=t>$1</a>$2!msx or + $hbuf .= qq(<a\nhref="#r\nid=t></a>); } if (scalar(@$mids) == 1) { # common case - my $mhtml = ascii_html($mids->[0]); - $rv .= "Message-ID: <$mhtml> "; - $rv .= "(<a\nhref=\"raw\">raw</a>)\n"; + my $x = ascii_html($mids->[0]); + $hbuf .= qq[Message-ID: <$x> (<a href="raw">raw</a>)\n]; + } + if (!$nr) { # first (and only) message, common case + print { $ctx->zfh } $ctx->html_top, $hbuf; } else { + delete $ctx->{-title_html}; + print { $ctx->zfh } $ctx->{-html_tip}, $hbuf; + } + $ctx->{-linkify} //= PublicInbox::Linkify->new; + $hbuf = ''; + if (scalar(@$mids) != 1) { # unlikely, but it happens :< # X-Alt-Message-ID can happen if a message is injected from # public-inbox-nntpd because of multiple Message-ID headers. - my $lnk = PublicInbox::Linkify->new; - my $s = ''; for my $h (qw(Message-ID X-Alt-Message-ID)) { - $s .= "$h: $_\n" for ($eml->header_raw($h)); + $hbuf .= "$h: $_\n" for ($eml->header_raw($h)); } - $lnk->linkify_mids('..', \$s, 1); - $rv .= $s; + $ctx->{-linkify}->linkify_mids('..', \$hbuf, 1); # escapes HTML + print { $ctx->{zfh} } $hbuf; + $hbuf = ''; } - $rv .= _parent_headers($eml, $over); - $rv .= "\n"; - \$rv; + my @irt = $eml->header_raw('In-Reply-To'); + my $refs; + if (@irt) { # ("so-and-so's message of $DATE") added by some MUAs + for (grep(/=\?/, @irt)) { + s/(=\?.*)\z/PublicInbox::Eml::mhdr_decode $1/se; + } + } else { + $refs = references($eml); + $irt[0] = pop(@$refs) if scalar @$refs; + } + $hbuf .= "In-Reply-To: $_\n" for @irt; + + # do not display References: if search is present, + # we show the thread skeleton at the bottom, instead. + if (!$have_over) { + $refs //= references($eml); + $hbuf .= 'References: <'.join(">\n\t<", @$refs).">\n" if @$refs; + } + $ctx->{-linkify}->linkify_mids('..', \$hbuf); # escapes HTML + say { $ctx->{zfh} } $hbuf; + 1; } sub SKEL_EXPAND () { @@ -772,7 +799,6 @@ sub thread_skel ($$$) { # when multiple Subject: headers are present, so we follow suit: my $subj = $hdr->header('Subject') // ''; $subj = '(no subject)' if $subj eq ''; - $ctx->{prev_subj} = [ split(/ /, subject_normalized($subj)) ]; $ctx->{cur} = $mid; $ctx->{prev_attr} = ''; $ctx->{prev_level} = 0; @@ -785,54 +811,47 @@ sub thread_skel ($$$) { $ctx->{parent_msg} = $parent; } -sub _parent_headers { - my ($hdr, $over) = @_; - my $rv = ''; - my @irt = $hdr->header_raw('In-Reply-To'); - my $refs; - if (@irt) { - my $lnk = PublicInbox::Linkify->new; - $rv .= "In-Reply-To: $_\n" for @irt; - $lnk->linkify_mids('..', \$rv); - } else { - $refs = references($hdr); - my $irt = pop @$refs; - if (defined $irt) { - my $html = ascii_html($irt); - my $href = mid_href($irt); - $rv .= "In-Reply-To: <"; - $rv .= "<a\nhref=\"../$href/\">$html</a>>\n"; - } - } - - # do not display References: if search is present, - # we show the thread skeleton at the bottom, instead. - return $rv if $over; - - $refs //= references($hdr); - if (@$refs) { - @$refs = map { linkify_ref_no_over($_) } @$refs; - $rv .= 'References: '. join("\n\t", @$refs) . "\n"; - } - $rv; -} - -# returns a string buffer +# writes to zbuf sub html_footer { my ($ctx, $hdr) = @_; - my $ibx = $ctx->{ibx}; my $upfx = '../'; - my $skel; - my $rv = '<pre>'; - if ($ibx->over) { + my (@related, $skel); + my $foot = '<pre>'; + my $qry = delete $ctx->{-qry}; + if ($qry && $ctx->{ibx}->isrch) { + my $q = ''; # search for either ancestor or descendent patches + for (@{$qry->{dfpre}}, @{$qry->{dfpost}}) { + chop if length > 7; # include 1 abbrev "older" patches + $q .= "dfblob:$_ "; + } + chop $q; # omit trailing SP + local $Text::Wrap::columns = COLS; + local $Text::Wrap::huge = 'overflow'; + $q = wrap('', '', $q); + my $rows = ($q =~ tr/\n/\n/) + 1; + $q = ascii_html($q); + $related[0] = <<EOM; +<form id=related +action=$upfx +><pre>find likely ancestor, descendant, or conflicting patches for <a +href=#t>this message</a>: +<textarea name=q cols=${\COLS} rows=$rows>$q</textarea> +<input type=submit value=search +/>\t(<a href=${upfx}_/text/help/#search>help</a>)</pre></form> +EOM + # TODO: related codesearch + # my $csrchv = $ctx->{ibx}->{-csrch} // []; + # push @related, '<pre>'.ascii_html(Dumper($csrchv)).'</pre>'; + } + if ($ctx->{ibx}->over) { my $t = ts2str($ctx->{-t_max}); my $t_fmt = fmt_ts($ctx->{-t_max}); - $skel .= <<EOF; - other threads:[<a + my $fallback = @related ? "\t" : "<a id=related>\t</a>"; + $skel = <<EOF; +${fallback}other threads:[<a href="$upfx?t=$t">~$t_fmt UTC</a>|<a href="$upfx">newest</a>] EOF - thread_skel(\$skel, $ctx, $hdr); my ($next, $prev); my $parent = ' '; @@ -840,43 +859,32 @@ EOF if (my $n = $ctx->{next_msg}) { $n = mid_href($n); - $next = "<a\nhref=\"$upfx$n/\"\nrel=next>next</a>"; + $next = qq(<a\nhref="$upfx$n/"\nrel=next>next</a>); } - my $u; my $par = $ctx->{parent_msg}; - if ($par) { - $u = mid_href($par); - $u = "$upfx$u/"; - } + my $u = $par ? $upfx.mid_href($par).'/' : undef; if (my $p = $ctx->{prev_msg}) { $prev = mid_href($p); if ($p && $par && $p eq $par) { - $prev = "<a\nhref=\"$upfx$prev/\"\n" . + $prev = qq(<a\nhref="$upfx$prev/"\n) . 'rel=prev>prev parent</a>'; $parent = ''; } else { - $prev = "<a\nhref=\"$upfx$prev/\"\n" . + $prev = qq(<a\nhref="$upfx$prev/"\n) . 'rel=prev>prev</a>'; - $parent = " <a\nhref=\"$u\">parent</a>" if $u; + $parent = qq( <a\nhref="$u">parent</a>) if $u; } } elsif ($u) { # unlikely - $parent = " <a\nhref=\"$u\"\nrel=prev>parent</a>"; + $parent = qq( <a\nhref="$u"\nrel=prev>parent</a>); } - $rv .= "$next $prev$parent "; + $foot .= "$next $prev$parent "; } else { # unindexed inboxes w/o over $skel = qq( <a\nhref="$upfx">latest</a>); } - $rv .= qq(<a\nhref="#R">reply</a>); - $rv .= $skel; - $rv .= '</pre>'; - $rv .= msg_reply($ctx, $hdr); -} - -sub linkify_ref_no_over { - my ($mid) = @_; - my $href = mid_href($mid); - my $html = ascii_html($mid); - "<<a\nhref=\"../$href/\">$html</a>>"; + # $skel may be big for big threads, don't append it to $foot + print { $ctx->zfh } $foot, qq(<a\nhref="#R">reply</a>), + $skel, '</pre>', @related, + msg_reply($ctx, $hdr); } sub ghost_parent { @@ -937,8 +945,8 @@ sub thread_results { my $tip = splice(@$rootset, $idx, 1); @$rootset = reverse @$rootset; unshift @$rootset, $tip; - $ctx->{sl_note} = strict_loose_note($nr); } + $ctx->{sl_note} = strict_loose_note($nr); } $rootset } @@ -1074,6 +1082,8 @@ sub _skel_ghost { 1; } +# note: we favor Date: here because git-send-email increments it +# to preserve [PATCH $N/$M] ordering in series (it can't control Received:) sub sort_ds { @{$_[0]} = sort { (eval { $a->topmost->{ds} } || 0) <=> @@ -1095,9 +1105,10 @@ sub acc_topic { # walk_thread callback if ($has_blob) { my $subj = subject_normalized($smsg->{subject}); $subj = '(no subject)' if $subj eq ''; + my $ts = $smsg->{ts}; my $ds = $smsg->{ds}; if ($level == 0) { # new, top-level topic - my $topic = [ $ds, 1, { $subj => $mid }, $subj ]; + my $topic = [ $ts, $ds, 1, { $subj => $mid }, $subj ]; $ctx->{-cur_topic} = $topic; push @{$ctx->{order}}, $topic; return 1; @@ -1105,10 +1116,11 @@ sub acc_topic { # walk_thread callback # continue existing topic my $topic = $ctx->{-cur_topic}; # should never be undef - $topic->[0] = $ds if $ds > $topic->[0]; - $topic->[1]++; # bump N+ message counter - my $seen = $topic->[2]; - if (scalar(@$topic) == 3) { # parent was a ghost + $topic->[0] = $ts if $ts > $topic->[0]; + $topic->[1] = $ds if $ds > $topic->[1]; + $topic->[2]++; # bump N+ message counter + my $seen = $topic->[3]; + if (scalar(@$topic) == 4) { # parent was a ghost push @$topic, $subj; } elsif (!defined($seen->{$subj})) { push @$topic, $level, $subj; # @extra messages @@ -1116,7 +1128,7 @@ sub acc_topic { # walk_thread callback $seen->{$subj} = $mid; # latest for subject } else { # ghost message return 1 if $level != 0; # ignore child ghosts - my $topic = $ctx->{-cur_topic} = [ -666, 0, {} ]; + my $topic = $ctx->{-cur_topic} = [ -666, -666, 0, {} ]; push @{$ctx->{order}}, $topic; } 1; @@ -1131,12 +1143,13 @@ sub dump_topics { } my @out; - my $ibx = $ctx->{ibx}; - my $obfs_ibx = $ibx->{obfuscate} ? $ibx : undef; - + my $obfs_ibx = $ctx->{ibx}->{obfuscate} ? $ctx->{ibx} : undef; + if (my $note = delete $ctx->{t_note}) { + push @out, $note; # "messages from ... to ..." + } # sort by recency, this allows new posts to "bump" old topics... foreach my $topic (sort { $b->[0] <=> $a->[0] } @$order) { - my ($ds, $n, $seen, $top_subj, @extra) = @$topic; + my ($ts, $ds, $n, $seen, $top_subj, @extra) = @$topic; @$topic = (); next unless defined $top_subj; # ghost topic my $mid = delete $seen->{$top_subj}; @@ -1158,9 +1171,9 @@ sub dump_topics { my $s = "<a\nhref=\"$href/T/$anchor\">$top_subj</a>\n" . " $ds UTC $n\n"; - for (my $i = 0; $i < scalar(@extra); $i += 2) { - my $level = $extra[$i]; - my $subj = $extra[$i + 1]; # already normalized + while (@extra) { + my $level = shift @extra; + my $subj = shift @extra; # already normalized $mid = delete $seen->{$subj}; my @subj = split(/ /, $subj); my @next_prev = @subj; # full copy @@ -1192,7 +1205,11 @@ sub pagination_footer ($$) { $next = $next ? "$next | " : ' | '; $prev .= qq[ | <a\nhref="$latest">latest</a>]; } - "<hr><pre>page: $next$prev</pre>"; + my $rv = '<hr><pre id=nav>'; + $rv .= "page: $next$prev\n" if $next || $prev; + $rv .= q{- recent:[<b>subjects (threaded)</b>|<a +href="./topics_new.html">topics (new)</a>|<a +href="./topics_active.html">topics (active)</a>]</pre>}; } sub paginate_recent ($$) { @@ -1207,23 +1224,30 @@ sub paginate_recent ($$) { $t =~ s/\A([0-9]{8,14})-// and $after = str2ts($1); $t =~ /\A([0-9]{8,14})\z/ and $before = str2ts($1); - my $ibx = $ctx->{ibx}; - my $msgs = $ibx->recent($opts, $after, $before); - my $nr = scalar @$msgs; - if ($nr < $lim && defined($after)) { + my $msgs = $ctx->{ibx}->over->recent($opts, $after, $before); + if (defined($after) && scalar(@$msgs) < $lim) { $after = $before = undef; - $msgs = $ibx->recent($opts); - $nr = scalar @$msgs; + $msgs = $ctx->{ibx}->over->recent($opts); } - my $more = $nr == $lim; + my $more = scalar(@$msgs) == $lim; my ($newest, $oldest); - if ($nr) { + if (@$msgs) { $newest = $msgs->[0]->{ts}; $oldest = $msgs->[-1]->{ts}; # if we only had $after, our SQL query in ->recent ordered if ($newest < $oldest) { ($oldest, $newest) = ($newest, $oldest); - $more = 0 if defined($after) && $after < $oldest; + $more = undef if defined($after) && $after < $oldest; + } + if (defined($after // $before)) { + my $n = strftime('%Y-%m-%d %H:%M:%S', gmtime($newest)); + my $o = strftime('%Y-%m-%d %H:%M:%S', gmtime($oldest)); + $ctx->{t_note} = <<EOM; + messages from $o to $n UTC [<a href="#nav">more...</a>] +EOM + my $s = ts2str($newest); + $ctx->{prev_page} = qq[<a\nhref="?t=$s-"\nrel=prev>] . + 'prev (newer)</a>'; } } if (defined($oldest) && $more) { @@ -1231,11 +1255,6 @@ sub paginate_recent ($$) { $ctx->{next_page} = qq[<a\nhref="?t=$s"\nrel=next>] . 'next (older)</a>'; } - if (defined($newest) && (defined($before) || defined($after))) { - my $s = ts2str($newest); - $ctx->{prev_page} = qq[<a\nhref="?t=$s-"\nrel=prev>] . - 'prev (newer)</a>'; - } $msgs; } @@ -1243,11 +1262,8 @@ sub paginate_recent ($$) { sub index_topics { my ($ctx) = @_; my $msgs = paginate_recent($ctx, 200); # 200 is our window - if (@$msgs) { - walk_thread(thread_results($ctx, $msgs), $ctx, \&acc_topic); - } - html_oneshot($ctx, dump_topics($ctx), \pagination_footer($ctx, '.')); - + walk_thread(thread_results($ctx, $msgs), $ctx, \&acc_topic) if @$msgs; + html_oneshot($ctx, dump_topics($ctx), pagination_footer($ctx, '.')); } sub thread_adj_level { @@ -1281,4 +1297,30 @@ sub ghost_index_entry { . '</pre>' . $end; } +# /$INBOX/$MSGID/d/ endpoint +sub diff_msg { + my ($ctx) = @_; + require PublicInbox::MailDiff; + my $ibx = $ctx->{ibx}; + my $over = $ibx->over or return no_over_html($ctx); + my ($id, $prev); + my $md = bless { ctx => $ctx }, 'PublicInbox::MailDiff'; + my $next_arg = $md->{next_arg} = [ $ctx->{mid}, \$id, \$prev ]; + my $smsg = $md->{smsg} = $over->next_by_mid(@$next_arg) or + return; # undef == 404 + $ctx->{-t_max} = $smsg->{ts}; + $ctx->{-upfx} = '../../'; + $ctx->{-apfx} = '//'; # fail on to_attr() + $ctx->{-linkify} = PublicInbox::Linkify->new; + my $mid = ascii_html($smsg->{mid}); + $ctx->{-title_html} = "diff for duplicates of <$mid>"; + PublicInbox::WwwStream::html_init($ctx); + print { $ctx->{zfh} } '<pre>diff for duplicates of <<a href="../">', + $mid, "</a>>\n\n"; + sub { + $ctx->attach($_[0]->([200, delete $ctx->{-res_hdr}])); + $md->begin_mail_diff; + }; +} + 1; diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm index fb394b7c..d078c5f9 100644 --- a/lib/PublicInbox/ViewDiff.pm +++ b/lib/PublicInbox/ViewDiff.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # used by PublicInbox::View @@ -7,15 +7,13 @@ # (or reconstruct) blobs. package PublicInbox::ViewDiff; -use strict; -use v5.10.1; +use v5.12; use parent qw(Exporter); -our @EXPORT_OK = qw(flush_diff); +our @EXPORT_OK = qw(flush_diff uri_escape_path); use URI::Escape qw(uri_escape_utf8); -use PublicInbox::Hval qw(ascii_html to_attr); +use PublicInbox::Hval qw(ascii_html to_attr utf8_maybe); use PublicInbox::Git qw(git_unquote); -my $UNSAFE = "^A-Za-z0-9\-\._~/"; # '/' + $URI::Escape::Unsafe{RFC3986} my $OID_NULL = '0{7,}'; my $OID_BLOB = '[a-f0-9]{7,}'; my $LF = qr!\n!; @@ -41,22 +39,24 @@ our $EXTRACT_DIFFS = qr/( ^\+{3}\x20($FN)$LF)/msx; our $IS_OID = qr/\A$OID_BLOB\z/s; +sub uri_escape_path { + # '/' + $URI::Escape::Unsafe{RFC3986} + uri_escape_utf8($_[0], "^A-Za-z0-9\-\._~/"); +} + # link to line numbers in blobs -sub diff_hunk ($$$$) { - my ($dst, $dctx, $ca, $cb) = @_; +sub diff_hunk ($$$) { + my ($dctx, $ca, $cb) = @_; my ($oid_a, $oid_b, $spfx) = @$dctx{qw(oid_a oid_b spfx)}; if (defined($spfx) && defined($oid_a) && defined($oid_b)) { - my ($n) = ($ca =~ /^-([0-9]+)/); - $n = defined($n) ? "#n$n" : ''; - - $$dst .= qq(@@ <a\nhref="$spfx$oid_a/s/$dctx->{Q}$n">$ca</a>); + my $n = ($ca =~ /^-([0-9]+)/) ? "#n$1" : ''; + my $x = qq(@@ <a\nhref="$spfx$oid_a/s/$dctx->{Q}$n">$ca</a>); - ($n) = ($cb =~ /^\+([0-9]+)/); - $n = defined($n) ? "#n$n" : ''; - $$dst .= qq( <a\nhref="$spfx$oid_b/s/$dctx->{Q}$n">$cb</a> @@); + $n = ($cb =~ /^\+([0-9]+)/) ? "#n$1" : ''; + $x .= qq( <a\nhref="$spfx$oid_b/s/$dctx->{Q}$n">$cb</a> @@); } else { - $$dst .= "@@ $ca $cb @@"; + "@@ $ca $cb @@"; } } @@ -66,8 +66,8 @@ sub oid ($$$) { } # returns true if diffstat anchor written, false otherwise -sub anchor0 ($$$$) { - my ($dst, $ctx, $fn, $rest) = @_; +sub anchor0 ($$$) { + my ($ctx, $fn, $rest) = @_; my $orig = $fn; @@ -83,15 +83,12 @@ sub anchor0 ($$$$) { # long filenames will require us to check in anchor1() push(@{$ctx->{-long_path}}, $fn) if $fn =~ s!\A\.\.\./?!!; - if (defined(my $attr = to_attr($ctx->{-apfx}.$fn))) { - $ctx->{-anchors}->{$attr} = 1; - my $spaces = ($orig =~ s/( +)\z//) ? $1 : ''; - $$dst .= " <a\nid=i$attr\nhref=#$attr>" . - ascii_html($orig) . '</a>' . $spaces . + my $attr = to_attr($ctx->{-apfx}.$fn) // return; + $ctx->{-anchors}->{$attr} = 1; + my $spaces = ($orig =~ s/( +)\z//) ? $1 : ''; + print { $ctx->{zfh} } " <a\nid=i$attr\nhref=#$attr>", + ascii_html($orig), '</a>', $spaces, $ctx->{-linkify}->to_html($rest); - return 1; - } - undef; } # returns "diff --git" anchor destination, undef otherwise @@ -123,15 +120,11 @@ sub diff_header ($$$) { $pa = (split(m'/', git_unquote($pa), 2))[1] if $pa ne '/dev/null'; $pb = (split(m'/', git_unquote($pb), 2))[1] if $pb ne '/dev/null'; if ($pa eq $pb && $pb ne '/dev/null') { - $dctx->{Q} = "?b=".uri_escape_utf8($pb, $UNSAFE); + $dctx->{Q} = '?b='.uri_escape_path($pb); } else { my @q; - if ($pb ne '/dev/null') { - push @q, 'b='.uri_escape_utf8($pb, $UNSAFE); - } - if ($pa ne '/dev/null') { - push @q, 'a='.uri_escape_utf8($pa, $UNSAFE); - } + push @q, 'b='.uri_escape_path($pb) if $pb ne '/dev/null'; + push @q, 'a='.uri_escape_path($pa) if $pa ne '/dev/null'; $dctx->{Q} = '?'.join('&', @q); } @@ -141,44 +134,48 @@ sub diff_header ($$$) { # no need to capture oid_a and oid_b on add/delete, # we just linkify OIDs directly via s///e in conditional - if (($$x =~ s/$NULL_TO_BLOB/$1 . oid($dctx, $spfx, $2)/e) || - ($$x =~ s/$BLOB_TO_NULL/ - 'index ' . oid($dctx, $spfx, $1) . $2/e)) { + if ($$x =~ s/$NULL_TO_BLOB/$1 . oid($dctx, $spfx, $2)/e) { + push @{$ctx->{-qry}->{dfpost}}, $2; + } elsif ($$x =~ s/$BLOB_TO_NULL/'index '.oid($dctx, $spfx, $1).$2/e) { + push @{$ctx->{-qry}->{dfpre}}, $1; } elsif ($$x =~ $BLOB_TO_BLOB) { # modification-only, not add/delete: # linkify hunk headers later using oid_a and oid_b @$dctx{qw(oid_a oid_b)} = ($1, $2); + push @{$ctx->{-qry}->{dfpre}}, $1; + push @{$ctx->{-qry}->{dfpost}}, $2; } else { warn "BUG? <$$x> had no ^index line"; } $$x =~ s!^diff --git!anchor1($ctx, $pb) // 'diff --git'!ems; - my $dst = $ctx->{obuf}; - $$dst .= qq(<span\nclass="head">); - $$dst .= $$x; - $$dst .= '</span>'; + print { $ctx->{zfh} } qq(<span\nclass="head">), $$x, '</span>'; $dctx; } sub diff_before_or_after ($$) { my ($ctx, $x) = @_; - my $linkify = $ctx->{-linkify}; - my $dst = $ctx->{obuf}; - my $anchors = exists($ctx->{-anchors}) ? 1 : 0; - for my $y (split(/(^---\n)/sm, $$x)) { - if ($y =~ /\A---\n\z/s) { - $$dst .= "---\n"; # all HTML is "\r\n" => "\n" - $anchors |= 2; - } elsif ($anchors == 3 && $y =~ /^ [0-9]+ files? changed, /sm) { - # ok, looks like a diffstat, go line-by-line: - for my $l (split(/^/m, $y)) { - if ($l =~ /^ (.+)( +\| .*\z)/s) { - anchor0($dst, $ctx, $1, $2) and next; - } - $$dst .= $linkify->to_html($l); - } - } else { # commit message, notes, etc - $$dst .= $linkify->to_html($y); + if (exists $ctx->{-anchors} && $$x =~ # diffstat lines: + /((?:^\x20(?:[^\n]+?)(?:\x20+\|\x20[^\n]*\n))+) + (\x20[0-9]+\x20files?\x20)changed,/msx) { + my $pre = substr($$x, 0, $-[0]); # (likely) short prefix + substr($$x, 0, $+[0], ''); # sv_chop on $$x ($$x may be long) + my @x = ($2, $1); + my $lnk = $ctx->{-linkify}; + my $zfh = $ctx->{zfh}; + # uninteresting prefix + print $zfh $lnk->to_html($pre); + for my $l (split(/^/m, pop(@x))) { # $2 per-file stat lines + $l =~ /^ (.+)( +\| .*\z)/s and + anchor0($ctx, $1, $2) and next; + print $zfh $lnk->to_html($l); } + my $ch = $ctx->{changed_href} // '#related'; + print $zfh pop(@x), # $3 /^ \d+ files? / + qq(<a href="$ch">changed</a>,), + # insertions/deletions, notes, commit message, etc: + $lnk->to_html($$x); + } else { + print { $ctx->{zfh} } $ctx->{-linkify}->to_html($$x); } } @@ -189,9 +186,9 @@ sub flush_diff ($$) { my @top = split($EXTRACT_DIFFS, $$cur); undef $$cur; # free memory - my $linkify = $ctx->{-linkify}; - my $dst = $ctx->{obuf}; + my $lnk = $ctx->{-linkify}; my $dctx; # {}, keys: Q, oid_a, oid_b + my $zfh = $ctx->zfh; while (defined(my $x = shift @top)) { if (scalar(@top) >= 4 && @@ -199,7 +196,8 @@ sub flush_diff ($$) { $top[0] =~ $IS_OID) { $dctx = diff_header(\$x, $ctx, \@top); } elsif ($dctx) { - my $after = ''; + open(my $afh, '>>:utf8', \(my $after='')) or + die "open: $!"; # Quiet "Complex regular subexpression recursion limit" # warning. Perl will truncate matches upon hitting @@ -214,29 +212,33 @@ sub flush_diff ($$) { for my $s (split(/((?:(?:^\+[^\n]*\n)+)| (?:(?:^-[^\n]*\n)+)| (?:^@@ [^\n]+\n))/xsm, $x)) { + undef $x; if (!defined($dctx)) { - $after .= $s; + print $afh $s; } elsif ($s =~ s/\A@@ (\S+) (\S+) @@//) { - $$dst .= qq(<span\nclass="hunk">); - diff_hunk($dst, $dctx, $1, $2); - $$dst .= $linkify->to_html($s); - $$dst .= '</span>'; - } elsif ($s =~ /\A\+/) { - $$dst .= qq(<span\nclass="add">); - $$dst .= $linkify->to_html($s); - $$dst .= '</span>'; + print $zfh qq(<span\nclass="hunk">), + diff_hunk($dctx, $1, $2), + $lnk->to_html($s), + '</span>'; + } elsif ($s =~ /\A\+/) { # $s may be huge + print $zfh qq(<span\nclass="add">), + $lnk->to_html($s), + '</span>'; } elsif ($s =~ /\A-- $/sm) { # email sig starts $dctx = undef; - $after .= $s; - } elsif ($s =~ /\A-/) { - $$dst .= qq(<span\nclass="del">); - $$dst .= $linkify->to_html($s); - $$dst .= '</span>'; - } else { - $$dst .= $linkify->to_html($s); + print $afh $s; + } elsif ($s =~ /\A-/) { # $s may be huge + print $zfh qq(<span\nclass="del">), + $lnk->to_html($s), + '</span>'; + } else { # $s may be huge + print $zfh $lnk->to_html($s); } } - diff_before_or_after($ctx, \$after) unless $dctx; + if (!$dctx) { + utf8_maybe($after); + diff_before_or_after($ctx, \$after); + } } else { diff_before_or_after($ctx, \$x); } diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm index 3cbc363b..f47c2703 100644 --- a/lib/PublicInbox/ViewVCS.pm +++ b/lib/PublicInbox/ViewVCS.pm @@ -1,8 +1,7 @@ -# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # show any VCS object, similar to "git show" -# FIXME: we only show blobs for now # # This can use a "solver" to reconstruct blobs based on git # patches (with abbreviated OIDs in the header). However, the @@ -16,11 +15,22 @@ package PublicInbox::ViewVCS; use strict; use v5.10.1; +use File::Temp 0.19 (); # newdir use PublicInbox::SolverGit; +use PublicInbox::Git; +use PublicInbox::GitAsyncCat; use PublicInbox::WwwStream qw(html_oneshot); use PublicInbox::Linkify; use PublicInbox::Tmpfile; -use PublicInbox::Hval qw(ascii_html to_filename); +use PublicInbox::ViewDiff qw(flush_diff uri_escape_path); +use PublicInbox::View; +use PublicInbox::Eml; +use PublicInbox::OnDestroy; +use Text::Wrap qw(wrap); +use PublicInbox::Hval qw(ascii_html to_filename prurl utf8_maybe); +use POSIX qw(strftime); +use autodie qw(open seek truncate); +use Fcntl qw(SEEK_SET); my $hl = eval { require PublicInbox::HlMod; PublicInbox::HlMod->new; @@ -29,22 +39,52 @@ my $hl = eval { my %QP_MAP = ( A => 'oid_a', a => 'path_a', b => 'path_b' ); our $MAX_SIZE = 1024 * 1024; # TODO: configurable my $BIN_DETECT = 8000; # same as git +my $SHOW_FMT = '--pretty=format:'.join('%n', '%P', '%p', '%H', '%T', '%s', '%f', + '%an <%ae> %ai', '%cn <%ce> %ci', '%b%x00'); -sub html_page ($$$) { - my ($ctx, $code, $strref) = @_; +my %GIT_MODE = ( + '100644' => ' ', # blob + '100755' => 'x', # executable blob + '040000' => 'd', # tree + '120000' => 'l', # symlink + '160000' => 'g', # commit (gitlink) +); + +# TODO: not fork safe, but we don't fork w/o exec in PublicInbox::WWW +my (@solver_q, $solver_lim); +my $solver_nr = 0; + +sub html_page ($$;@) { + my ($ctx, $code) = @_[0, 1]; my $wcb = delete $ctx->{-wcb}; - $ctx->{-upfx} = '../../'; # from "/$INBOX/$OID/s/" - my $res = html_oneshot($ctx, $code, $strref); + $ctx->{-upfx} //= '../../'; # from "/$INBOX/$OID/s/" + my $res = html_oneshot($ctx, $code, @_[2..$#_]); $wcb ? $wcb->($res) : $res; } +sub dbg_log ($) { + my ($ctx) = @_; + my $log = delete $ctx->{lh} // die 'BUG: already captured debug log'; + if (!CORE::seek($log, 0, SEEK_SET)) { + warn "seek(log): $!"; + return '<pre>debug log seek error</pre>'; + } + $log = eval { PublicInbox::IO::read_all $log } // do { + warn "read(log): $@"; + return '<pre>debug log read error</pre>'; + }; + return '' if $log eq ''; + $ctx->{-linkify} //= PublicInbox::Linkify->new; + "<hr><pre>debug log:\n\n". + $ctx->{-linkify}->to_html($log).'</pre>'; +} + sub stream_blob_parse_hdr { # {parse_hdr} for Qspawn my ($r, $bref, $ctx) = @_; - my ($res, $logref) = delete @$ctx{qw(-res -logref)}; - my ($git, $oid, $type, $size, $di) = @$res; + my ($git, $oid, $type, $size, $di) = @{$ctx->{-res}}; my @cl = ('Content-Length', $size); - if (!defined $r) { # error - html_page($ctx, 500, $logref); + if (!defined $r) { # sysread error + html_page($ctx, 500, dbg_log($ctx)); } elsif (index($$bref, "\0") >= 0) { [200, [qw(Content-Type application/octet-stream), @cl] ]; } else { @@ -54,114 +94,512 @@ sub stream_blob_parse_hdr { # {parse_hdr} for Qspawn 'text/plain; charset=UTF-8', @cl ] ]; } if ($r == 0) { - warn "premature EOF on $oid $$logref"; - return html_page($ctx, 500, $logref); + my $log = dbg_log($ctx); + warn "premature EOF on $oid $log"; + return html_page($ctx, 500, $log); } - @$ctx{qw(-res -logref)} = ($res, $logref); undef; # bref keeps growing } } -sub stream_large_blob ($$$$) { - my ($ctx, $res, $logref, $fn) = @_; - $ctx->{-logref} = $logref; +sub stream_large_blob ($$) { + my ($ctx, $res) = @_; $ctx->{-res} = $res; my ($git, $oid, $type, $size, $di) = @$res; my $cmd = ['git', "--git-dir=$git->{git_dir}", 'cat-file', $type, $oid]; my $qsp = PublicInbox::Qspawn->new($cmd); - my $env = $ctx->{env}; - $env->{'qspawn.wcb'} = delete $ctx->{-wcb}; - $qsp->psgi_return($env, undef, \&stream_blob_parse_hdr, $ctx); + $ctx->{env}->{'qspawn.wcb'} = $ctx->{-wcb}; + $qsp->psgi_yield($ctx->{env}, undef, \&stream_blob_parse_hdr, $ctx); } -sub show_other_result ($$) { +sub show_other_result ($$) { # future-proofing my ($bref, $ctx) = @_; - my ($qsp, $logref) = delete @$ctx{qw(-qsp -logref)}; - if (my $err = $qsp->{err}) { - utf8::decode($$err); - $$logref .= "git show error: $err"; - return html_page($ctx, 500, $logref); + if (my $qsp_err = delete $ctx->{-qsp_err}) { + return html_page($ctx, 500, dbg_log($ctx) . + "git show error:$qsp_err"); } my $l = PublicInbox::Linkify->new; - utf8::decode($$bref); - $$bref = '<pre>'. $l->to_html($$bref); - $$bref .= '</pre><hr>' . $$logref; - html_page($ctx, 200, $bref); + utf8_maybe($$bref); + html_page($ctx, 200, '<pre>', $l->to_html($$bref), '</pre><hr>', + dbg_log($ctx)); } -sub show_other ($$$$) { - my ($ctx, $res, $logref, $fn) = @_; - my ($git, $oid, $type, $size) = @$res; - if ($size > $MAX_SIZE) { - $$logref = "$oid is too big to show\n" . $$logref; - return html_page($ctx, 200, $logref); +sub cmt_title { # git->cat_async callback + my ($bref, $oid, $type, $size, $ctx_cb) = @_; + utf8_maybe($$bref); + my $title = $$bref =~ /\r?\n\r?\n([^\r\n]+)\r?\n?/ ? $1 : ''; + # $ctx_cb is [ $ctx, $cmt_fin ] + push @{$ctx_cb->[0]->{-cmt_pt}}, ascii_html($title); +} + +sub do_cat_async { + my ($arg, $cb, @req) = @_; + # favor git(1) over Gcf2 (libgit2) for SHA-256 support + my $ctx = ref $arg eq 'ARRAY' ? $arg->[0] : $arg; + $ctx->{git}->cat_async($_, $cb, $arg) for @req; + if ($ctx->{env}->{'pi-httpd.async'}) { + $ctx->{git}->watch_async; + } else { # synchronous, generic PSGI + $ctx->{git}->cat_async_wait; + } +} + +sub do_check_async { + my ($ctx, $cb, @req) = @_; + if ($ctx->{env}->{'pi-httpd.async'}) { + async_check($ctx, $_, $cb, $ctx) for @req; + } else { # synchronous, generic PSGI + $ctx->{git}->check_async($_, $cb, $ctx) for @req; + $ctx->{git}->check_async_wait; + } +} + +sub cmt_hdr_prep { # psgi_qx cb + my ($fh, $ctx, $cmt_fin) = @_; + return if $ctx->{-qsp_err_h}; # let cmt_fin handle it + seek $fh, 0, SEEK_SET; + my $buf = do { local $/ = "\0"; <$fh> } // die "readline: $!"; + chop($buf) eq "\0" or die 'no NUL in git show -z output'; + utf8_maybe($buf); # non-UTF-8 commits exist + chomp $buf; + (my $P, my $p, @{$ctx->{cmt_info}}) = split(/\n/, $buf, 9); + truncate $fh, 0; + return unless $P; + seek $fh, 0, SEEK_SET; + my $qsp_p = PublicInbox::Qspawn->new($ctx->{git}->cmd(qw(show + --encoding=UTF-8 --pretty=format:%n -M --stat -p), $ctx->{oid}), + undef, { 1 => $fh }); + $qsp_p->{qsp_err} = \($ctx->{-qsp_err_p} = ''); + $qsp_p->psgi_qx($ctx->{env}, undef, \&cmt_patch_prep, $ctx, $cmt_fin); + @{$ctx->{-cmt_P}} = split / /, $P; + @{$ctx->{-cmt_p}} = split / /, $p; # abbreviated + do_cat_async([$ctx, $cmt_fin], \&cmt_title, @{$ctx->{-cmt_P}}); +} + +sub read_patchid { # psgi_qx cb + my ($bref, $ctx, $cmt_fin) = @_; + my ($patchid) = split(/ /, $$bref); # ignore commit + $ctx->{-q_value_html} = "patchid:$patchid" if defined $patchid; +} + +sub cmt_patch_prep { # psgi_qx cb + my ($fh, $ctx, $cmt_fin) = @_; + return if $ctx->{-qsp_err_p}; # let cmt_fin handle error + return if -s $fh > $MAX_SIZE; # too big to show, too big to patch-id + seek $fh, 0, SEEK_SET; + my $qsp = PublicInbox::Qspawn->new( + $ctx->{git}->cmd(qw(patch-id --stable)), + undef, { 0 => $fh }); + $qsp->{qsp_err} = \$ctx->{-qsp_err_p}; + $qsp->psgi_qx($ctx->{env}, undef, \&read_patchid, $ctx, $cmt_fin); +} + +sub ibx_url_for { + my ($ctx) = @_; + $ctx->{ibx} and return; # fall back to $upfx + $ctx->{git} or die 'BUG: no {git}'; + if (my $ALL = $ctx->{www}->{pi_cfg}->ALL) { + if (defined(my $u = $ALL->base_url($ctx->{env}))) { + return wantarray ? ($u) : $u; + } + } + my @ret; + if (my $ibx_names = $ctx->{git}->{ibx_names}) { + my $by_name = $ctx->{www}->{pi_cfg}->{-by_name}; + for my $name (@$ibx_names) { + my $ibx = $by_name->{$name} // do { + warn "inbox `$name' no longer exists\n"; + next; + }; + $ibx->isrch // next; + my $u = defined($ibx->{url}) ? + prurl($ctx->{env}, $ibx->{url}) : $name; + $u .= '/' if substr($u, -1) ne '/'; + push @ret, $u; + } } + wantarray ? (@ret) : $ret[0]; +} + +sub cmt_fin { # OnDestroy cb + my ($ctx) = @_; + my ($eh, $ep) = delete @$ctx{qw(-qsp_err_h -qsp_err_p)}; + if ($eh || $ep) { + my $e = join(' - ', grep defined, $eh, $ep); + return html_page($ctx, 500, dbg_log($ctx) . + "git show/patch-id error:$e"); + } + $ctx->{-linkify} //= PublicInbox::Linkify->new; + my $upfx = $ctx->{-upfx} = '../../'; # from "/$INBOX/$OID/s/" + my ($H, $T, $s, $f, $au, $co, $bdy) = @{delete $ctx->{cmt_info}}; + # try to keep author and committer dates lined up + my $x = length($au) - length($co); + if ($x > 0) { + $x = ' ' x $x; + $co =~ s/>/>$x/; + } elsif ($x < 0) { + $x = ' ' x (-$x); + $au =~ s/>/>$x/; + } + $_ = ascii_html($_) for ($au, $co); + my $ibx_url = ibx_url_for($ctx) // $upfx; + $au =~ s!(> +)([0-9]{4,}-\S+ \S+)! + my ($gt, $t) = ($1, $2); + $t =~ tr/ :-//d; + qq($gt<a +href="$ibx_url?t=$t" +title="list contemporary emails">$2</a>) + !e; + + $ctx->{-title_html} = $s = $ctx->{-linkify}->to_html($s); + my ($P, $p, $pt) = delete @$ctx{qw(-cmt_P -cmt_p -cmt_pt)}; + $_ = qq(<a href="$upfx$_/s/">).shift(@$p).'</a> '.shift(@$pt) for @$P; + if (@$P == 1) { + $x = qq{ (<a +href="$f.patch">patch</a>)\n <a href=#parent>parent</a> $P->[0]}; + } elsif (@$P > 1) { + $x = qq(\n <a href=#parents>parents</a> $P->[0]\n); + shift @$P; + $x .= qq( $_\n) for @$P; + chop $x; + } else { + $x = ' (<a href=#root_commit>root commit</a>)'; + } + PublicInbox::WwwStream::html_init($ctx); + my $zfh = $ctx->zfh; + print $zfh <<EOM; +<pre> <a href=#commit>commit</a> $H$x + <a href=#tree>tree</a> <a href="$upfx$T/s/?b=">$T</a> + author $au +committer $co + +<b>$s</b> +EOM + print $zfh "\n", $ctx->{-linkify}->to_html($bdy) if length($bdy); + undef $bdy; # free memory + my $fh = delete $ctx->{patch_fh}; + if (-s $fh > $MAX_SIZE) { + print $zfh '</pre><hr><pre>patch is too large to show</pre>'; + } else { # prepare flush_diff: + seek $fh, 0, SEEK_SET; + PublicInbox::IO::read_all $fh, -s _, \$x; + utf8_maybe($x); + $ctx->{-apfx} = $ctx->{-spfx} = $upfx; + $x =~ s/\r?\n/\n/gs; + $ctx->{-anchors} = {} if $x =~ /^diff --git /sm; + flush_diff($ctx, \$x); # undefs $x + # TODO: should there be another textarea which attempts to + # search for the exact email which was applied to make this + # commit? + if (my $qry = delete $ctx->{-qry}) { + my $q = ''; + for (@{$qry->{dfpost}}, @{$qry->{dfpre}}) { + # keep blobs as short as reasonable, emails + # are going to be older than what's in git + substr($_, 7, 64, ''); + $q .= "dfblob:$_ "; + } + chop $q; # no trailing SP + local $Text::Wrap::columns = PublicInbox::View::COLS; + local $Text::Wrap::huge = 'overflow'; + $q = wrap('', '', $q); + my $rows = ($q =~ tr/\n/\n/) + 1; + $q = ascii_html($q); + my $ibx_url = ibx_url_for($ctx); + my $alt; + if (defined $ibx_url) { + $alt = " `$ibx_url'"; + $ibx_url =~ m!://! or + substr($ibx_url, 0, 0, '../../../'); + $ibx_url = ascii_html($ibx_url); + } else { + $ibx_url = $upfx; + $alt = ''; + } + print $zfh <<EOM; +</pre><hr><form action="$ibx_url" +id=related><pre>find related emails, including ancestors/descendants/conflicts +<textarea name=q cols=${\PublicInbox::View::COLS} rows=$rows>$q</textarea> +<input type=submit value="search$alt" +/>\t(<a href="${ibx_url}_/text/help/">help</a>)</pre></form> +EOM + } + } + chop($x = <<EOM); +<hr><pre>glossary +-------- +<dfn +id=commit>Commit</dfn> objects reference one tree, and zero or more parents. + +Single <dfn +id=parent>parent</dfn> commits can typically generate a patch in +unified diff format via `git format-patch'. + +Multiple <dfn id=parents>parents</dfn> means the commit is a merge. + +<dfn id=root_commit>Root commits</dfn> have no ancestor. Note that it is +possible to have multiple root commits when merging independent histories. + +Every commit references one top-level <dfn id=tree>tree</dfn> object.</pre> +EOM + delete($ctx->{-wcb})->($ctx->html_done($x)); +} + +sub stream_patch_parse_hdr { # {parse_hdr} for Qspawn + my ($r, $bref, $ctx) = @_; + if (!defined $r) { # sysread error + html_page($ctx, 500, dbg_log($ctx)); + } elsif (index($$bref, "\n\n") >= 0) { + my $eml = bless { hdr => $bref }, 'PublicInbox::Eml'; + my $fn = to_filename($eml->header('Subject') // ''); + $fn = substr($fn // 'PATCH-no-subject', 6); # drop "PATCH-" + return [ 200, [ 'Content-Type', 'text/plain; charset=UTF-8', + 'Content-Disposition', + qq(inline; filename=$fn.patch) ] ]; + } elsif ($r == 0) { + my $log = dbg_log($ctx); + warn "premature EOF on $ctx->{patch_oid} $log"; + return html_page($ctx, 500, $log); + } else { + undef; # bref keeps growing until "\n\n" + } +} + +sub show_patch ($$) { + my ($ctx, $res) = @_; + my ($git, $oid) = @$res; + my @cmd = ('git', "--git-dir=$git->{git_dir}", + qw(format-patch -1 --stdout -C), + "--signature=git format-patch -1 --stdout -C $oid", $oid); + my $qsp = PublicInbox::Qspawn->new(\@cmd); + $ctx->{env}->{'qspawn.wcb'} = $ctx->{-wcb}; + $ctx->{patch_oid} = $oid; + $qsp->psgi_yield($ctx->{env}, undef, \&stream_patch_parse_hdr, $ctx); +} + +sub show_commit ($$) { + my ($ctx, $res) = @_; + return show_patch($ctx, $res) if ($ctx->{fn} // '') =~ /\.patch\z/; + my ($git, $oid) = @$res; + # patch-id needs two passes, and we use the initial show to ensure + # a patch embedded inside the commit message body doesn't get fed + # to patch-id: + open $ctx->{patch_fh}, '+>', "$ctx->{-tmp}/show"; + my $qsp_h = PublicInbox::Qspawn->new($git->cmd('show', $SHOW_FMT, + qw(--encoding=UTF-8 -z --no-notes --no-patch), $oid), + undef, { 1 => $ctx->{patch_fh} }); + $qsp_h->{qsp_err} = \($ctx->{-qsp_err_h} = ''); + my $cmt_fin = on_destroy \&cmt_fin, $ctx; + $ctx->{git} = $git; + $ctx->{oid} = $oid; + $qsp_h->psgi_qx($ctx->{env}, undef, \&cmt_hdr_prep, $ctx, $cmt_fin); +} + +sub show_other ($$) { # just in case... + my ($ctx, $res) = @_; + my ($git, $oid, $type, $size) = @$res; + $size > $MAX_SIZE and return html_page($ctx, 200, + ascii_html($type)." $oid is too big to show\n". dbg_log($ctx)); my $cmd = ['git', "--git-dir=$git->{git_dir}", qw(show --encoding=UTF-8 --no-color --no-abbrev), $oid ]; my $qsp = PublicInbox::Qspawn->new($cmd); - my $env = $ctx->{env}; - $ctx->{-qsp} = $qsp; - $ctx->{-logref} = $logref; - $qsp->psgi_qx($env, undef, \&show_other_result, $ctx); + $qsp->{qsp_err} = \($ctx->{-qsp_err} = ''); + $qsp->psgi_qx($ctx->{env}, undef, \&show_other_result, $ctx); } -# user_cb for SolverGit, called as: user_cb->($result_or_error, $uarg) -sub solve_result { - my ($res, $ctx) = @_; - my ($log, $hints, $fn) = delete @$ctx{qw(log hints fn)}; - - unless (seek($log, 0, 0)) { - warn "seek(log): $!"; - return html_page($ctx, 500, \'seek error'); +sub show_tree_result ($$) { + my ($bref, $ctx) = @_; + if (my $qsp_err = delete $ctx->{-qsp_err}) { + return html_page($ctx, 500, dbg_log($ctx) . + "git ls-tree -z error:$qsp_err"); } - $log = do { local $/; <$log> }; + my @ent = split(/\0/, $$bref); + my $qp = delete $ctx->{qp}; + my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new; + my $pfx = $ctx->{-path} // $qp->{b}; # {-path} is from RepoTree + $$bref = "<pre><a href=#tree>tree</a> $ctx->{tree_oid}"; + # $REPO/tree/$path already sets {-upfx} + my $upfx = $ctx->{-upfx} //= '../../'; + if (defined $pfx) { + $pfx =~ s!/+\z!!s; + if (my $t = $ctx->{-obj}) { + my $t = ascii_html($t); + $$bref .= <<EOM +\n\$ git ls-tree -l $t # shows similar output on the CLI +EOM + } elsif ($pfx eq '') { + $$bref .= " (root)\n"; + } else { + my $x = ascii_html($pfx); + $pfx .= '/'; + $$bref .= qq( <a href=#path>path</a>: $x</a>\n); + } + } else { + $pfx = ''; + $$bref .= qq[ (<a href=#path>path</a> unknown)\n]; + } + my ($x, $m, $t, $oid, $sz, $f, $n, $gitlink); + $$bref .= "\n size name"; + for (@ent) { + ($x, $f) = split(/\t/, $_, 2); + undef $_; + ($m, $t, $oid, $sz) = split(/ +/, $x, 4); + $m = $GIT_MODE{$m} // '?'; + utf8_maybe($f); + $n = ascii_html($f); + if ($m eq 'g') { # gitlink submodule commit + $$bref .= "\ng\t\t$n @ <a\nhref=#g>commit</a>$oid"; + $gitlink = 1; + next; + } + my $q = 'b='.ascii_html(uri_escape_path($pfx.$f)); + if ($m eq 'd') { $n .= '/' } + elsif ($m eq 'x') { $n = "<b>$n</b>" } + elsif ($m eq 'l') { $n = "<i>$n</i>" } + $$bref .= qq(\n$m\t$sz\t<a\nhref="$upfx$oid/s/?$q">$n</a>); + } + $$bref .= dbg_log($ctx); + $$bref .= <<EOM; +<hr><pre>glossary +-------- +<dfn +id=tree>Tree</dfn> objects belong to commits or other tree objects. Trees may +reference blobs, sub-trees, or (rarely) commits of submodules. + +<dfn +id=path>Path</dfn> names are stored in tree objects, but trees do not know +their own path name. A tree's path name comes from their parent tree, +or it is the root tree referenced by a commit object. Thus, this web UI +relies on the `b=' URI parameter as a hint to display the path name. +EOM + + $$bref .= <<EOM if $gitlink; - my $ref = ref($res); +<dfn title="submodule commit" +id=g>Commit</dfn> objects may be stored in trees to reference submodules.</pre> +EOM + chop $$bref; + html_page($ctx, 200, $$bref); +} + +sub show_tree ($$) { # also used by RepoTree + my ($ctx, $res) = @_; + my ($git, $oid, undef, $size) = @$res; + $size > $MAX_SIZE and return html_page($ctx, 200, + "tree $oid is too big to show\n". dbg_log($ctx)); + my $cmd = [ 'git', "--git-dir=$git->{git_dir}", + qw(ls-tree -z -l --no-abbrev), $oid ]; + my $qsp = PublicInbox::Qspawn->new($cmd); + $ctx->{tree_oid} = $oid; + $qsp->{qsp_err} = \($ctx->{-qsp_err} = ''); + $qsp->psgi_qx($ctx->{env}, undef, \&show_tree_result, $ctx); +} + +# returns seconds offset from git TZ offset +sub tz_adj ($) { + my ($tz) = @_; # e.g "-0700" + $tz = int($tz); + my $mm = $tz < 0 ? -$tz : $tz; + $mm = int($mm / 100) * 60 + ($mm % 100); + $mm = $tz < 0 ? -$mm : $mm; + ($mm * 60); +} + +sub show_tag_result { # git->cat_async callback + my ($bref, $oid, $type, $size, $ctx) = @_; + utf8_maybe($$bref); my $l = PublicInbox::Linkify->new; - $log = '<pre>debug log:</pre><hr /><pre>' . - $l->to_html($log) . '</pre>'; + $$bref = $l->to_html($$bref); + $$bref =~ s!^object ([a-f0-9]+)!object <a +href=../../$1/s/>$1</a>!; - $res or return html_page($ctx, 404, \$log); - $ref eq 'ARRAY' or return html_page($ctx, 500, \$log); + $$bref =~ s/^(tagger .*> )([0-9]+) ([\-+]?[0-9]+)/$1.strftime( + '%Y-%m-%d %H:%M:%S', gmtime($2 + tz_adj($3)))." $3"/sme; + # TODO: download link + html_page($ctx, 200, '<pre>', $$bref, '</pre>', dbg_log($ctx)); +} + +sub show_tag ($$) { + my ($ctx, $res) = @_; + my ($git, $oid) = @$res; + $ctx->{git} = $git; + do_cat_async($ctx, \&show_tag_result, $oid); +} + +# user_cb for SolverGit, called as: user_cb->($result_or_error, $uarg) +sub solve_result { + my ($res, $ctx) = @_; + my $hints = delete $ctx->{hints}; + $res or return html_page($ctx, 404, 'Not found', dbg_log($ctx)); + ref($res) eq 'ARRAY' or + return html_page($ctx, 500, 'Internal error', dbg_log($ctx)); my ($git, $oid, $type, $size, $di) = @$res; - return show_other($ctx, $res, \$log, $fn) if $type ne 'blob'; - my $path = to_filename($di->{path_b} // $hints->{path_b} // 'blob'); - my $raw_link = "(<a\nhref=$path>raw</a>)"; + return show_commit($ctx, $res) if $type eq 'commit'; + return show_tree($ctx, $res) if $type eq 'tree'; + return show_tag($ctx, $res) if $type eq 'tag'; + return show_other($ctx, $res) if $type ne 'blob'; + my $fn = $di->{path_b} // $hints->{path_b}; + my $paths = $ctx->{-paths} //= do { + my $path = to_filename($fn // 'blob') // 'blob'; + my $raw_more = qq[(<a\nhref="$path">raw</a>)]; + my @def; + + # XXX not sure if this is the correct wording + if (defined($fn)) { + $raw_more .= qq( +name: ${\ascii_html($fn)} \t # note: path name is non-authoritative<a +href="#pathdef" id=top>(*)</a>); + $def[0] = "<hr><pre\nid=pathdef>" . +'(*) Git path names are given by the tree(s) the blob belongs to. + Blobs themselves have no identifier aside from the hash of its contents.'. +qq(<a\nhref="#top">^</a></pre>); + } + [ $path, $raw_more, @def ]; + }; + $ctx->{-q_value_html} //= do { + my $s = defined($fn) ? 'dfn:'.ascii_html($fn).' ' : ''; + $s.'dfpost:'.substr($oid, 0, 7); + }; + if ($size > $MAX_SIZE) { - return stream_large_blob($ctx, $res, \$log, $fn) if defined $fn; - $log = "<pre><b>Too big to show, download available</b>\n" . - "$oid $type $size bytes $raw_link</pre>" . $log; - return html_page($ctx, 200, \$log); + return stream_large_blob($ctx, $res) if defined $ctx->{fn}; + return html_page($ctx, 200, <<EOM . dbg_log($ctx)); +<pre><b>Too big to show, download available</b> +blob $oid $size bytes $paths->[1]</pre> +EOM } + bless $ctx, 'PublicInbox::WwwStream'; # for DESTROY + $ctx->{git} = $git; + do_cat_async($ctx, \&show_blob, $oid); +} - my $blob = $git->cat_file($oid); - if (!$blob) { # WTF? +sub show_blob { # git->cat_async callback + my ($blob, $oid, $type, $size, $ctx) = @_; + if (!$blob) { my $e = "Failed to retrieve generated blob ($oid)"; - warn "$e ($git->{git_dir})"; - $log = "<pre><b>$e</b></pre>" . $log; - return html_page($ctx, 500, \$log); + warn "$e ($ctx->{git}->{git_dir}) type=$type"; + return html_page($ctx, 500, "<pre><b>$e</b></pre>".dbg_log($ctx)) } my $bin = index(substr($$blob, 0, $BIN_DETECT), "\0") >= 0; - if (defined $fn) { + if (defined $ctx->{fn}) { my $h = [ 'Content-Length', $size, 'Content-Type' ]; push(@$h, ($bin ? 'application/octet-stream' : 'text/plain')); return delete($ctx->{-wcb})->([200, $h, [ $$blob ]]); } - if ($bin) { - $log = "<pre>$oid $type $size bytes (binary)" . - " $raw_link</pre>" . $log; - return html_page($ctx, 200, \$log); - } + my ($path, $raw_more, @def) = @{delete $ctx->{-paths}}; + $bin and return html_page($ctx, 200, + "<pre>blob $oid $size bytes (binary)" . + " $raw_more</pre>".dbg_log($ctx)); # TODO: detect + convert to ensure validity - utf8::decode($$blob); + utf8_maybe($$blob); my $nl = ($$blob =~ s/\r?\n/\n/sg); my $pad = length($nl); - $l->linkify_1($$blob); + ($ctx->{-linkify} //= PublicInbox::Linkify->new)->linkify_1($$blob); my $ok = $hl->do_hl($blob, $path) if $hl; if ($ok) { $blob = $ok; @@ -170,38 +608,63 @@ sub solve_result { } # using some of the same CSS class names and ids as cgit - $log = "<pre>$oid $type $size bytes $raw_link</pre>" . + my $x = "<pre>blob $oid $size bytes $raw_more</pre>" . "<hr /><table\nclass=blob>". - "<tr><td\nclass=linenumbers><pre>" . join('', map { - sprintf("<a id=n$_ href=#n$_>% ${pad}u</a>\n", $_) - } (1..$nl)) . '</pre></td>' . - '<td><pre> </pre></td>'. # pad for non-CSS users - "<td\nclass=lines><pre\nstyle='white-space:pre'><code>" . - $l->linkify_2($$blob) . - '</code></pre></td></tr></table>' . $log; - - html_page($ctx, 200, \$log); + "<tr><td\nclass=linenumbers><pre>"; + # scratchpad in this loop is faster here than `printf $zfh': + $x .= sprintf("<a id=n$_ href=#n$_>% ${pad}u</a>\n", $_) for (1..$nl); + $x .= '</pre></td><td><pre> </pre></td>'. # pad for non-CSS users + "<td\nclass=lines><pre\nstyle='white-space:pre'><code>"; + html_page($ctx, 200, $x, $ctx->{-linkify}->linkify_2($$blob), + '</code></pre></td></tr></table>'.dbg_log($ctx), @def); } -# GET /$INBOX/$GIT_OBJECT_ID/s/ -# GET /$INBOX/$GIT_OBJECT_ID/s/$FILENAME -sub show ($$;$) { - my ($ctx, $oid_b, $fn) = @_; - my $qp = $ctx->{qp}; - my $hints = $ctx->{hints} = {}; +sub start_solver ($) { + my ($ctx) = @_; while (my ($from, $to) = each %QP_MAP) { - defined(my $v = $qp->{$from}) or next; - $hints->{$to} = $v if $v ne ''; + my $v = $ctx->{qp}->{$from} // next; + $ctx->{hints}->{$to} = $v if $v ne ''; } - - $ctx->{'log'} = tmpfile("solve.$oid_b") // die "tmpfile: $!"; - $ctx->{fn} = $fn; + $ctx->{-next_solver} = on_destroy \&next_solver; + ++$solver_nr; + $ctx->{-tmp} = File::Temp->newdir("solver.$ctx->{oid_b}-XXXX", + TMPDIR => 1); + $ctx->{lh} or open $ctx->{lh}, '+>>', "$ctx->{-tmp}/solve.log"; my $solver = PublicInbox::SolverGit->new($ctx->{ibx}, \&solve_result, $ctx); + $solver->{limiter} = $solver_lim; + $solver->{gits} //= [ $ctx->{git} ]; + $solver->{tmp} = $ctx->{-tmp}; # share tmpdir # PSGI server will call this immediately and give us a callback (-wcb) + $solver->solve(@$ctx{qw(env lh oid_b hints)}); +} + +# run the next solver job when done and DESTROY-ed +sub next_solver { + --$solver_nr; + # XXX FIXME: client may've disconnected if it waited a long while + start_solver(shift(@solver_q) // return); +} + +sub may_start_solver ($) { + my ($ctx) = @_; + $solver_lim //= $ctx->{www}->{pi_cfg}->limiter('codeblob'); + if ($solver_nr >= $solver_lim->{max}) { + @solver_q > 128 ? html_page($ctx, 503, 'too busy') + : push(@solver_q, $ctx); + } else { + start_solver($ctx); + } +} + +# GET /$INBOX/$GIT_OBJECT_ID/s/ +# GET /$INBOX/$GIT_OBJECT_ID/s/$FILENAME +sub show ($$;$) { + my ($ctx, $oid_b, $fn) = @_; + @$ctx{qw(oid_b fn)} = ($oid_b, $fn); sub { $ctx->{-wcb} = $_[0]; # HTTP write callback - $solver->solve($ctx->{env}, $ctx->{log}, $oid_b, $hints); + may_start_solver $ctx; }; } diff --git a/lib/PublicInbox/WQBlocked.pm b/lib/PublicInbox/WQBlocked.pm index fbb43600..8d931fa9 100644 --- a/lib/PublicInbox/WQBlocked.pm +++ b/lib/PublicInbox/WQBlocked.pm @@ -8,7 +8,6 @@ use parent qw(PublicInbox::DS); use PublicInbox::Syscall qw(EPOLLOUT EPOLLONESHOT); use PublicInbox::IPC; use Carp (); -use Socket qw(MSG_EOR); sub new { my ($cls, $wq, $buf) = @_; @@ -25,7 +24,7 @@ sub flush_send { } else { my $wq_s1 = $self->{sock}; my $n = $PublicInbox::IPC::send_cmd->($wq_s1, [], $buf, - MSG_EOR); + 0); next if defined($n); Carp::croak("sendmsg: $!") unless $!{EAGAIN}; PublicInbox::DS::epwait($wq_s1, EPOLLOUT|EPOLLONESHOT); diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 755d7558..289599b8 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -14,6 +14,7 @@ package PublicInbox::WWW; use strict; use v5.10.1; use PublicInbox::Config; +use PublicInbox::Git; use PublicInbox::Hval; use URI::Escape qw(uri_unescape); use PublicInbox::MID qw(mid_escape); @@ -23,9 +24,9 @@ use PublicInbox::WwwStatic qw(r path_info_raw); use PublicInbox::Eml; # TODO: consider a routing tree now that we have more endpoints: -our $INBOX_RE = qr!\A/([\w\-][\w\.\-]*)!; +our $INBOX_RE = qr!\A/([\w\-][\w\.\-\+]*)!; our $MID_RE = qr!([^/]+)!; -our $END_RE = qr!(T/|t/|t\.mbox(?:\.gz)?|t\.atom|raw|)!; +our $END_RE = qr!(T/|t/|d/|t\.mbox(?:\.gz)?|t\.atom|raw|)!; our $ATTACH_RE = qr!([0-9][0-9\.]*)-($PublicInbox::Hval::FN)!; our $OID_RE = qr![a-f0-9]{7,}!; @@ -45,14 +46,21 @@ sub call { my $ctx = { env => $env, www => $self }; # we don't care about multi-value - %{$ctx->{qp}} = map { - utf8::decode($_); - tr/+/ /; - my ($k, $v) = split(/=/, $_, 2); - # none of the keys we care about will need escaping - ($k // '', uri_unescape($v // '')) - } split(/[&;]+/, $env->{QUERY_STRING}); - + # '0' isn't a QUERY_STRING we care about + if (my $qs = $env->{QUERY_STRING}) { + utf8::decode($qs); + $qs =~ tr/+/ /; + %{$ctx->{qp}} = map { + # we only use single-char query param keys + if (s/\A([A-Za-z])=//) { + $1 => uri_unescape($_) + } elsif (/\A[a-z]\z/) { # some boolean options + $_ => '' + } else { + () # ignored + } + } split(/[&;]+/, $qs); + } my $path_info = path_info_raw($env); my $method = $env->{REQUEST_METHOD}; @@ -68,7 +76,9 @@ sub call { my ($idx, $fn) = ($3, $4); return invalid_inbox_mid($ctx, $1, $2) || get_attach($ctx, $idx, $fn); - } elsif ($path_info =~ m!$INBOX_RE/!o) { + } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/\z!o) { + return invalid_inbox_mid($ctx, $1, $2) || mbox_results($ctx); + } elsif ($path_info =~ m!$INBOX_RE/\z!o) { return invalid_inbox($ctx, $1) || mbox_results($ctx); } } @@ -91,6 +101,9 @@ sub call { invalid_inbox($ctx, $1) || get_atom($ctx); } elsif ($path_info =~ m!$INBOX_RE/new\.html\z!o) { invalid_inbox($ctx, $1) || get_new($ctx); + } elsif ($path_info =~ + m!$INBOX_RE/topics_(new|active)\.(atom|html)\z!o) { + get_topics($ctx, $1, $2, $3); } elsif ($path_info =~ m!$INBOX_RE/description\z!o) { get_description($ctx, $1); } elsif ($path_info =~ m!$INBOX_RE/(?:(?:git/)?([0-9]+)(?:\.git)?/)? @@ -176,6 +189,7 @@ sub preload { } $pi_cfg->ALL and require PublicInbox::Isearch; $self->cgit; + $self->coderepo; $self->stylesheets_prepare($_) for ('', '../', '../../'); $self->news_www; } @@ -194,10 +208,20 @@ sub r404 { sub news_cgit_fallback ($) { my ($ctx) = @_; - my $www = $ctx->{www}; - my $env = $ctx->{env}; - my $res = $www->news_www->call($env); - $res->[0] == 404 ? $www->cgit->call($env) : $res; + my $res = $ctx->{www}->news_www->call($ctx->{env}); + + $res->[0] == 404 and ($ctx->{www}->{cgit_fallback} //= do { + my $c = $ctx->{www}->{pi_cfg}->{'publicinbox.cgit'} // 'first'; + $c ne 'first' # `fallback' and `rewrite' => true + } // 0) and $res = $ctx->{www}->coderepo->srv($ctx); + + ref($res) eq 'ARRAY' && $res->[0] == 404 and + $res = $ctx->{www}->cgit->call($ctx->{env}, $ctx); + + ref($res) eq 'ARRAY' && $res->[0] == 404 && + !$ctx->{www}->{cgit_fallback} and + $res = $ctx->{www}->coderepo->srv($ctx); + $res; } # returns undef if valid, array ref response if invalid @@ -250,6 +274,13 @@ sub get_new { PublicInbox::Feed::new_html($ctx); } +# /$INBOX/topics_(new|active).(html|atom) +sub get_topics { + my ($ctx, $ibx_name, $category, $type) = @_; + require PublicInbox::WwwTopics; + PublicInbox::WwwTopics::response($ctx, $ibx_name, $category, $type); +} + # /$INBOX/?r=$GIT_COMMIT -> HTML only sub get_index { my ($ctx) = @_; @@ -303,7 +334,8 @@ sub get_text { sub get_vcs_object ($$$;$) { my ($ctx, $inbox, $oid, $filename) = @_; my $r404 = invalid_inbox($ctx, $inbox); - return $r404 if $r404 || !$ctx->{www}->{pi_cfg}->repo_objs($ctx->{ibx}); + return $r404 if $r404; + return r(404) if !$ctx->{www}->{pi_cfg}->repo_objs($ctx->{ibx}); require PublicInbox::ViewVCS; PublicInbox::ViewVCS::show($ctx, $oid, $filename); } @@ -317,11 +349,12 @@ sub get_altid_dump { } sub need { - my ($ctx, $extra) = @_; + my ($ctx, $extra, $upref) = @_; require PublicInbox::WwwStream; - PublicInbox::WwwStream::html_oneshot($ctx, 501, \<<EOF); + $upref //= '../'; + PublicInbox::WwwStream::html_oneshot($ctx, 501, <<EOF); <pre>$extra is not available for this public-inbox -<a\nhref="../">Return to index</a></pre> +<a\nhref="$upref">Return to index</a></pre> EOF } @@ -441,6 +474,10 @@ sub msg_page { # legacy, but no redirect for compatibility: 'f/' eq $e and return get_mid_html($ctx); + if ($e eq 'd/') { + require PublicInbox::View; + return PublicInbox::View::diff_msg($ctx); + } r404($ctx); } @@ -480,16 +517,21 @@ sub news_www { sub cgit { my ($self) = @_; - $self->{cgit} //= do { - my $pi_cfg = $self->{pi_cfg}; - - if (defined($pi_cfg->{'publicinbox.cgitrc'})) { + $self->{cgit} //= + (defined($self->{pi_cfg}->{'publicinbox.cgitrc'}) ? do { require PublicInbox::Cgit; - PublicInbox::Cgit->new($pi_cfg); - } else { + PublicInbox::Cgit->new($self->{pi_cfg}); + } : undef) // do { require Plack::Util; Plack::Util::inline_object(call => sub { r404() }); - } + }; +} + +sub coderepo { + my ($self) = @_; + $self->{coderepo} //= do { + require PublicInbox::WwwCoderepo; + PublicInbox::WwwCoderepo->new($self->{pi_cfg}); } } @@ -558,9 +600,9 @@ sub stylesheets_prepare ($$) { next; }; my $ctime = 0; - my $local = do { local $/; <$fh> }; + my $local = PublicInbox::IO::read_all $fh; # sets _ if ($local =~ /\S/) { - $ctime = sprintf('%x',(stat($fh))[10]); + $ctime = sprintf('%x',(stat(_))[10]); $local = $mini->($local); } diff --git a/lib/PublicInbox/WWW.pod b/lib/PublicInbox/WWW.pod index 9f6ba466..b55f010e 100644 --- a/lib/PublicInbox/WWW.pod +++ b/lib/PublicInbox/WWW.pod @@ -20,13 +20,14 @@ The PSGI web interface for public-inbox. Using this directly is not needed unless you wish to customize your public-inbox PSGI deployment or are using a PSGI server -other than L<public-inbox-httpd(1)>. +other than L<public-inbox-netd(1)> (C<-netd>) / +L<public-inbox-httpd(1)> (C<-httpd>) -While this PSGI application works with all PSGI/Plack web +While this PSGI application should work with all PSGI/Plack web servers such as L<starman(1)>, L<starlet(1)> or L<twiggy(1)>; -PublicInbox::WWW takes advantage of currently-undocumented APIs -of L<public-inbox-httpd(1)> to improve fairness when serving -large responses for thread views and git clones. +PublicInbox::WWW takes advantage of internal APIs of C<-netd> +and C<-httpd> to improve fairness when serving large responses +for thread views, mbox downloads, and git clones. =head1 ENVIRONMENT @@ -47,10 +48,11 @@ and L<http://4uok3hntl7oi7b4uf4rtfwefqeexfzil2w6kgk2jn5z2f764irre7byd.onion/meta =head1 COPYRIGHT -Copyright (C) 2016-2021 all contributors L<mailto:meta@public-inbox.org> +Copyright (C) all contributors L<mailto:meta@public-inbox.org> License: AGPL-3.0+ L<http://www.gnu.org/licenses/agpl-3.0.txt> =head1 SEE ALSO -L<http://plackperl.org/>, L<Plack>, L<public-inbox-httpd(1)> +L<http://plackperl.org/>, L<Plack>, L<public-inbox-netd(1)>, +L<public-inbox-httpd(1)> diff --git a/lib/PublicInbox/Watch.pm b/lib/PublicInbox/Watch.pm index 3f6fe21b..eb90d353 100644 --- a/lib/PublicInbox/Watch.pm +++ b/lib/PublicInbox/Watch.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # ref: https://cr.yp.to/proto/maildir.html @@ -12,11 +12,11 @@ use PublicInbox::MdirReader; use PublicInbox::NetReader; use PublicInbox::Filter::Base qw(REJECT); use PublicInbox::Spamcheck; -use PublicInbox::DS qw(now add_timer); +use PublicInbox::DS qw(now add_timer awaitpid); use PublicInbox::MID qw(mids); use PublicInbox::ContentHash qw(content_hash); -use PublicInbox::EOFpipe; use POSIX qw(_exit WNOHANG); +use constant { D_MAILDIR => 1, D_MH => 2 }; sub compile_watchheaders ($) { my ($ibx) = @_; @@ -41,11 +41,25 @@ sub compile_watchheaders ($) { $ibx->{-watchheaders} = $watch_hdrs if scalar @$watch_hdrs; } +sub d_type_set ($$$) { + my ($d_type, $dir, $is) = @_; + my $isnt = D_MAILDIR; + if ($is == D_MAILDIR) { + $isnt = D_MH; + $d_type->{"$dir/cur"} |= $is; + $d_type->{"$dir/new"} |= $is; + } + warn <<EOM if ($d_type->{$dir} |= $is) & $isnt; +W: `$dir' is both Maildir and MH (non-fatal) +EOM +} + sub new { my ($class, $cfg) = @_; - my (%mdmap, $spamc); + my (%d_map, %d_type); my (%imap, %nntp); # url => [inbox objects] or 'watchspam' my (@imap, @nntp); + PublicInbox::Import::load_config($cfg); # "publicinboxwatch" is the documented namespace # "publicinboxlearn" is legacy but may be supported @@ -57,7 +71,11 @@ sub new { my $uri; if (is_maildir($dir)) { # skip "new", no MUA has seen it, yet. - $mdmap{"$dir/cur"} = 'watchspam'; + $d_map{"$dir/cur"} = 'watchspam'; + d_type_set \%d_type, $dir, D_MAILDIR; + } elsif (is_mh($dir)) { + $d_map{$dir} = 'watchspam'; + d_type_set \%d_type, $dir, D_MH; } elsif ($uri = imap_uri($dir)) { $imap{$$uri} = 'watchspam'; push @imap, $uri; @@ -69,7 +87,6 @@ sub new { } } } - my $k = 'publicinboxwatch.spamcheck'; my $default = undef; my $spamcheck = PublicInbox::Spamcheck::get($cfg, $k, $default); @@ -80,16 +97,28 @@ sub new { my $ibx = $_[0] = PublicInbox::InboxWritable->new($_[0]); my $watches = $ibx->{watch} or return; + + $ibx->{indexlevel} //= $ibx->detect_indexlevel; $watches = PublicInbox::Config::_array($watches); for my $watch (@$watches) { my $uri; - if (is_maildir($watch)) { + my $bool = $cfg->git_bool($watch); + if (defined $bool && !$bool) { + $ibx->{-watch_disabled} = 1; + } elsif (is_maildir($watch)) { compile_watchheaders($ibx); my ($new, $cur) = ("$watch/new", "$watch/cur"); - my $cur_dst = $mdmap{$cur} //= []; + my $cur_dst = $d_map{$cur} //= []; return if is_watchspam($cur, $cur_dst, $ibx); - push @{$mdmap{$new} //= []}, $ibx; + push @{$d_map{$new} //= []}, $ibx; push @$cur_dst, $ibx; + d_type_set \%d_type, $watch, D_MAILDIR; + } elsif (is_mh($watch)) { + my $cur_dst = $d_map{$watch} //= []; + return if is_watchspam($watch, $cur_dst, $ibx); + compile_watchheaders($ibx); + push(@$cur_dst, $ibx); + d_type_set \%d_type, $watch, D_MH; } elsif ($uri = imap_uri($watch)) { my $cur_dst = $imap{$$uri} //= []; return if is_watchspam($uri, $cur_dst, $ibx); @@ -106,18 +135,19 @@ sub new { } }); - my $mdre; - if (scalar keys %mdmap) { - $mdre = join('|', map { quotemeta($_) } keys %mdmap); - $mdre = qr!\A($mdre)/!; + my $d_re; + if (scalar keys %d_map) { + $d_re = join('|', map quotemeta, keys %d_map); + $d_re = qr!\A($d_re)/!; } - return unless $mdre || scalar(keys %imap) || scalar(keys %nntp); + return unless $d_re || scalar(keys %imap) || scalar(keys %nntp); bless { max_batch => 10, # avoid hogging locks for too long spamcheck => $spamcheck, - mdmap => \%mdmap, - mdre => $mdre, + d_map => \%d_map, + d_re => $d_re, + d_type => \%d_type, pi_cfg => $cfg, imap => scalar keys %imap ? \%imap : undef, nntp => scalar keys %nntp? \%nntp : undef, @@ -141,6 +171,7 @@ sub _done_for_now { sub remove_eml_i { # each_inbox callback my ($ibx, $self, $eml, $loc) = @_; + return if $ibx->{-watch_disabled}; eval { # try to avoid taking a lock or unnecessary spawning @@ -214,17 +245,23 @@ sub import_eml ($$$) { sub _try_path { my ($self, $path) = @_; - my $fl = PublicInbox::MdirReader::maildir_path_flags($path) // return; - return if $fl =~ /[DT]/; # no Drafts or Trash - if ($path !~ $self->{mdre}) { - warn "unrecognized path: $path\n"; - return; - } - my $inboxes = $self->{mdmap}->{$1}; - unless ($inboxes) { - warn "unmappable dir: $1\n"; - return; - } + $path =~ $self->{d_re} or + return warn("BUG? unrecognized path: $path\n"); + my $dir = $1; + my $inboxes = $self->{d_map}->{$dir} // + return warn("W: unmappable dir: $dir\n"); + my ($md_fl, $mh_seq); + if ($self->{d_type}->{$dir} & D_MH) { + $path =~ m!/([0-9]+)\z! ? ($mh_seq = $1) : return; + } + $self->{d_type}->{$dir} & D_MAILDIR and + $md_fl = PublicInbox::MdirReader::maildir_path_flags($path); + $md_fl // $mh_seq // return; + return if ($md_fl // '') =~ /[DT]/; # no Drafts or Trash + # n.b. none of the MH keywords are relevant for public mail, + # mh_seq is only used to validate we're reading an email + # and not treating .mh_sequences as an email + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; local $SIG{__WARN__} = sub { my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : ''; @@ -244,25 +281,24 @@ sub quit_done ($) { return unless $self->{quit}; # don't have reliable wakeups, keep signalling - my $done = 1; - for (qw(idle_pids poll_pids)) { - my $pids = $self->{$_} or next; - for (keys %$pids) { - $done = undef if kill('QUIT', $_); - } - } - $done; + my $live = grep { kill('QUIT', $_) } keys %{$self->{pids}}; + add_timer(0.01, \&quit_done, $self) if $live; + $live == 0; } -sub quit { +sub quit { # may be called in IMAP/NNTP children my ($self) = @_; $self->{quit} = 1; %{$self->{opendirs}} = (); _done_for_now($self); quit_done($self); - if (my $idle_mic = $self->{idle_mic}) { + if (my $dir_idle = delete $self->{dir_idle}) { + $dir_idle->close if $dir_idle; + } + if (my $idle_mic = delete $self->{idle_mic}) { # IMAP child + return unless $idle_mic->IsConnected && $idle_mic->Socket; eval { $idle_mic->done }; - if ($@) { + if ($@ && $idle_mic->IsConnected && $idle_mic->Socket) { warn "IDLE DONE error: $@\n"; eval { $idle_mic->disconnect }; warn "IDLE LOGOUT error: $@\n" if $@; @@ -282,8 +318,8 @@ sub watch_fs_init ($) { }; require PublicInbox::DirIdle; # inotify_create + EPOLL_CTL_ADD - my $dir_idle = PublicInbox::DirIdle->new($cb); - $dir_idle->add_watches([keys %{$self->{mdmap}}]); + my $dir_idle = $self->{dir_idle} = PublicInbox::DirIdle->new($cb); + $dir_idle->add_watches([keys %{$self->{d_map}}]); } sub net_cb { # NetReader::(nntp|imap)_each callback @@ -318,7 +354,7 @@ sub imap_fetch_all ($$) { local $SIG{__WARN__} = sub { my $pfx = ($_[0] // '') =~ /^([A-Z]: |# )/g ? $1 : ''; my $uid = $self->{cur_uid}; - $warn_cb->("$pfx$uri", $uid ? ("UID:$uid") : (), "\n", @_); + $warn_cb->("$pfx$uri", $uid ? (" UID:$uid") : (), "\n", @_); }; PublicInbox::NetReader::imap_each($self, $uri, \&net_cb, $self, $self->{imap}->{$$uri}); @@ -328,7 +364,7 @@ sub imap_idle_once ($$$$) { my ($self, $mic, $intvl, $uri) = @_; my $i = $intvl //= (29 * 60); my $end = now() + $intvl; - warn "I: $uri idling for ${intvl}s\n"; + warn "# $uri idling for ${intvl}s\n"; local $0 = "IDLE $0"; return if $self->{quit}; unless ($mic->idle) { @@ -381,63 +417,42 @@ sub watch_imap_idle_1 ($$$) { sub watch_atfork_child ($) { my ($self) = @_; - delete $self->{idle_pids}; - delete $self->{poll_pids}; - delete $self->{opendirs}; - PublicInbox::DS->Reset; + delete @$self{qw(dir_idle pids opendirs)}; my $sig = delete $self->{sig}; - $sig->{CHLD} = 'DEFAULT'; + $sig->{CHLD} = $sig->{HUP} = $sig->{USR1} = 'DEFAULT'; + # TERM/QUIT/INT call ->quit, which works in both parent+child @SIG{keys %$sig} = values %$sig; - PublicInbox::DS::sig_setmask($self->{oldset}); + PublicInbox::DS::sig_setmask(PublicInbox::DS::allowset($sig)); } sub watch_atfork_parent ($) { _done_for_now($_[0]) } sub imap_idle_requeue { # DS::add_timer callback - my ($self, $uri_intvl) = @_; + my ($self, $uri, $intvl) = @_; return if $self->{quit}; - push @{$self->{idle_todo}}, $uri_intvl; + push @{$self->{idle_todo}}, $uri, $intvl; event_step($self); } -sub imap_idle_reap { # PublicInbox::DS::dwaitpid callback - my ($self, $pid) = @_; - my $uri_intvl = delete $self->{idle_pids}->{$pid} or - die "BUG: PID=$pid (unknown) reaped: \$?=$?\n"; - - my ($uri, $intvl) = @$uri_intvl; +sub imap_idle_reap { # awaitpid callback + my ($pid, $self, $uri, $intvl) = @_; + delete $self->{pids}->{$pid}; return if $self->{quit}; warn "W: PID=$pid on $uri died: \$?=$?\n" if $?; - add_timer(60, \&imap_idle_requeue, $self, $uri_intvl); + add_timer(60, \&imap_idle_requeue, $self, $uri, $intvl); } -sub reap { # callback for EOFpipe - my ($pid, $cb, $self) = @{$_[0]}; - my $ret = waitpid($pid, 0); - if ($ret == $pid) { - $cb->($self, $pid); # poll_fetch_reap || imap_idle_reap - } else { - warn "W: waitpid($pid) => ", $ret // "($!)", "\n"; - } -} - -sub imap_idle_fork ($$) { - my ($self, $uri_intvl) = @_; - my ($uri, $intvl) = @$uri_intvl; - pipe(my ($r, $w)) or die "pipe: $!"; - my $seed = rand(0xffffffff); - my $pid = fork // die "fork: $!"; +sub imap_idle_fork { + my ($self, $uri, $intvl) = @_; + return if $self->{quit}; + my $pid = PublicInbox::DS::fork_persist; if ($pid == 0) { - srand($seed); - eval { Net::SSLeay::randomize() }; - close $r; watch_atfork_child($self); watch_imap_idle_1($self, $uri, $intvl); - close $w; _exit(0); } - $self->{idle_pids}->{$pid} = $uri_intvl; - PublicInbox::EOFpipe->new($r, \&reap, [$pid, \&imap_idle_reap, $self]); + $self->{pids}->{$pid} = undef; + awaitpid($pid, \&imap_idle_reap, $self, $uri, $intvl); } sub event_step { @@ -447,13 +462,13 @@ sub event_step { if ($idle_todo && @$idle_todo) { watch_atfork_parent($self); eval { - while (my $uri_intvl = shift(@$idle_todo)) { - imap_idle_fork($self, $uri_intvl); + while (my ($uri, $intvl) = splice(@$idle_todo, 0, 2)) { + imap_idle_fork($self, $uri, $intvl); } }; die $@ if $@; } - fs_scan_step($self) if $self->{mdre}; + fs_scan_step($self) if $self->{d_re}; } sub watch_imap_fetch_all ($$) { @@ -474,7 +489,7 @@ sub watch_nntp_fetch_all ($$) { local $SIG{__WARN__} = sub { my $pfx = ($_[0] // '') =~ /^([A-Z]: |# )/g ? $1 : ''; my $art = $self->{cur_uid}; - $warn_cb->("$pfx$uri", $art ? ("ARTICLE $art") : (), "\n", @_); + $warn_cb->("$pfx$uri", $art ? (" ARTICLE $art") : (), "\n", @_); }; for $uri (@$uris) { PublicInbox::NetReader::nntp_each($self, $uri, \&net_cb, $self, @@ -486,52 +501,44 @@ sub watch_nntp_fetch_all ($$) { sub poll_fetch_fork { # DS::add_timer callback my ($self, $intvl, $uris) = @_; return if $self->{quit}; - pipe(my ($r, $w)) or die "pipe: $!"; watch_atfork_parent($self); - my $seed = rand(0xffffffff); - my $pid = fork; - if (defined($pid) && $pid == 0) { - srand($seed); - eval { Net::SSLeay::randomize() }; - close $r; + my @nntp; + my @imap = grep { # push() always returns > 0 + $_->scheme =~ m!\Aimaps?!i ? 1 : (push(@nntp, $_) < 0) + } @$uris; + my $pid = PublicInbox::DS::fork_persist; + if ($pid == 0) { watch_atfork_child($self); - if ($uris->[0]->scheme =~ m!\Aimaps?!i) { - watch_imap_fetch_all($self, $uris); - } else { - watch_nntp_fetch_all($self, $uris); - } - close $w; + watch_imap_fetch_all($self, \@imap) if @imap; + watch_nntp_fetch_all($self, \@nntp) if @nntp; _exit(0); } - die "fork: $!" unless defined $pid; - $self->{poll_pids}->{$pid} = [ $intvl, $uris ]; - PublicInbox::EOFpipe->new($r, \&reap, [$pid, \&poll_fetch_reap, $self]); + $self->{pids}->{$pid} = undef; + awaitpid($pid, \&poll_fetch_reap, $self, $intvl, $uris); } -sub poll_fetch_reap { - my ($self, $pid) = @_; - my $intvl_uris = delete $self->{poll_pids}->{$pid} or - die "BUG: PID=$pid (unknown) reaped: \$?=$?\n"; +sub poll_fetch_reap { # awaitpid callback + my ($pid, $self, $intvl, $uris) = @_; + delete $self->{pids}->{$pid}; return if $self->{quit}; - my ($intvl, $uris) = @$intvl_uris; if ($?) { warn "W: PID=$pid died: \$?=$?\n", map { "$_\n" } @$uris; } - warn("I: will check $_ in ${intvl}s\n") for @$uris; + warn("# will check $_ in ${intvl}s\n") for @$uris; add_timer($intvl, \&poll_fetch_fork, $self, $intvl, $uris); } sub watch_imap_init ($$) { my ($self, $poll) = @_; - my $mics = PublicInbox::NetReader::imap_common_init($self); - my $idle = []; # [ [ uri1, intvl1 ], [uri2, intvl2] ] + my $mics = PublicInbox::NetReader::imap_common_init($self) or return; + my $idle = []; # [ uri1, intvl1, uri2, intvl2 ] for my $uri (@{$self->{imap_order}}) { my $sec = uri_section($uri); my $mic = $mics->{$sec}; my $intvl = $self->{cfg_opt}->{$sec}->{pollInterval}; if ($mic->has_capability('IDLE') && !$intvl) { $intvl = $self->{cfg_opt}->{$sec}->{idleInterval}; - push @$idle, [ $uri, $intvl // () ]; + push @$idle, $uri, $intvl; } else { push @{$poll->{$intvl || 120}}, $uri; } @@ -552,10 +559,12 @@ sub watch_nntp_init ($$) { } } +sub quit_inprogress { !$_[0]->quit_done } # post_loop_do CB + sub watch { # main entry point - my ($self, $sig, $oldset) = @_; - $self->{oldset} = $oldset; - $self->{sig} = $sig; + my ($self, $sig) = @_; + my $first_sig; + $self->{sig} //= ($first_sig = $sig); my $poll = {}; # intvl_seconds => [ uri1, uri2 ] watch_imap_init($self, $poll) if $self->{imap}; watch_nntp_init($self, $poll) if $self->{nntp}; @@ -563,9 +572,9 @@ sub watch { # main entry point # poll all URIs for a given interval sequentially add_timer(0, \&poll_fetch_fork, $self, $intvl, $uris); } - watch_fs_init($self) if $self->{mdre}; - PublicInbox::DS->SetPostLoopCallback(sub { !$self->quit_done }); - PublicInbox::DS::event_loop($sig, $oldset); # calls ->event_step + watch_fs_init($self) if $self->{d_re}; + local @PublicInbox::DS::post_loop_do = (\&quit_inprogress, $self); + PublicInbox::DS::event_loop($first_sig); # calls ->event_step _done_for_now($self); } @@ -594,7 +603,7 @@ sub fs_scan_step { $opendirs->{$dir} = $dh if $n < 0; } if ($op && $op eq 'full') { - foreach my $dir (keys %{$self->{mdmap}}) { + foreach my $dir (keys %{$self->{d_map}}) { next if $opendirs->{$dir}; # already in progress my $ok = opendir(my $dh, $dir); unless ($ok) { @@ -669,6 +678,13 @@ sub is_maildir { $_[0]; } +sub is_mh { + $_[0] =~ s!\Amh:!!i or return; + $_[0] =~ tr!/!/!s; + $_[0] =~ s!/\z!!; + $_[0]; +} + sub is_watchspam { my ($cur, $ws, $ibx) = @_; if ($ws && !ref($ws) && $ws eq 'watchspam') { diff --git a/lib/PublicInbox/WwwAltId.pm b/lib/PublicInbox/WwwAltId.pm index e107dfe0..31d9b607 100644 --- a/lib/PublicInbox/WwwAltId.pm +++ b/lib/PublicInbox/WwwAltId.pm @@ -1,9 +1,9 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # dumps using the ".dump" command of sqlite3(1) package PublicInbox::WwwAltId; -use strict; +use v5.12; use PublicInbox::Qspawn; use PublicInbox::WwwStream qw(html_oneshot); use PublicInbox::AltId; @@ -33,14 +33,14 @@ sub sqldump ($$) { my $altid_map = $ibx->altid_map; my $fn = $altid_map->{$altid_pfx}; unless (defined $fn) { - return html_oneshot($ctx, 404, \<<EOF); + return html_oneshot($ctx, 404, <<EOF); <pre>`$altid_pfx' is not a valid altid for this inbox</pre> EOF } if ($env->{REQUEST_METHOD} ne 'POST') { my $url = $ibx->base_url($ctx->{env}) . "$altid_pfx.sql.gz"; - return html_oneshot($ctx, 405, \<<EOF); + return html_oneshot($ctx, 405, <<EOF); <pre>A POST request is required to retrieve $altid_pfx.sql.gz curl -d '' -O $url @@ -54,24 +54,19 @@ or EOF } - $sqlite3 //= which('sqlite3') // return html_oneshot($ctx, 501, \<<EOF); + $sqlite3 //= which('sqlite3') // return html_oneshot($ctx, 501, <<EOF); <pre>sqlite3 not available The administrator needs to install the sqlite3(1) binary to support gzipped sqlite3 dumps.</pre> EOF - # setup stdin, POSIX requires writes <= 512 bytes to succeed so - # we can close the pipe right away. - pipe(my ($r, $w)) or die "pipe: $!"; - syswrite($w, ".dump\n") == 6 or die "write: $!"; - close($w) or die "close: $!"; - # TODO: use -readonly if available with newer sqlite3(1) - my $qsp = PublicInbox::Qspawn->new([$sqlite3, $fn], undef, { 0 => $r }); + my $qsp = PublicInbox::Qspawn->new([$sqlite3, $fn], undef, + { 0 => \".dump\n" }); $ctx->{altid_pfx} = $altid_pfx; $env->{'qspawn.filter'} = PublicInbox::GzipFilter->new; - $qsp->psgi_return($env, undef, \&check_output, $ctx); + $qsp->psgi_yield($env, undef, \&check_output, $ctx); } 1; diff --git a/lib/PublicInbox/WwwAtomStream.pm b/lib/PublicInbox/WwwAtomStream.pm index 82895db6..26b366f5 100644 --- a/lib/PublicInbox/WwwAtomStream.pm +++ b/lib/PublicInbox/WwwAtomStream.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Atom body stream for HTTP responses @@ -8,7 +8,7 @@ use strict; use parent 'PublicInbox::GzipFilter'; use POSIX qw(strftime); -use Digest::SHA qw(sha1_hex); +use PublicInbox::SHA qw(sha1_hex); use PublicInbox::Address; use PublicInbox::Hval qw(ascii_html mid_href); use PublicInbox::MsgTime qw(msg_timestamp); @@ -16,6 +16,7 @@ use PublicInbox::MsgTime qw(msg_timestamp); sub new { my ($class, $ctx, $cb) = @_; $ctx->{feed_base_url} = $ctx->{ibx}->base_url($ctx->{env}); + $ctx->{-spfx} = $ctx->{feed_base_url} if $ctx->{ibx}->{coderepo}; $ctx->{cb} = $cb || \&PublicInbox::GzipFilter::close; $ctx->{emit_header} = 1; bless $ctx, $class; @@ -38,14 +39,15 @@ sub async_next ($) { sub async_eml { # for async_blob_cb my ($ctx, $eml) = @_; my $smsg = delete $ctx->{smsg}; + $smsg->{mid} // $smsg->populate($eml); $ctx->write(feed_entry($ctx, $smsg, $eml)); } sub response { - my ($class, $ctx, $code, $cb) = @_; + my ($class, $ctx, $cb) = @_; my $res_hdr = [ 'Content-Type' => 'application/atom+xml' ]; $class->new($ctx, $cb); - $ctx->psgi_response($code, $res_hdr); + $ctx->psgi_response(200, $res_hdr); } # called once for each message by PSGI server @@ -97,15 +99,16 @@ sub atom_header { $base_url .= '?' . $search_q->qs_html(x => undef); $self_url .= '?' . $search_q->qs_html; $page_id = to_uuid("q\n".$query); + } elsif (defined(my $cat = $ctx->{topic_category})) { + $title = title_tag("$cat topics - ".$ibx->description); + $self_url .= "topics_$cat.atom"; } else { $title = title_tag($ibx->description); $self_url .= 'new.atom'; - if (defined(my $addr = $ibx->{-primary_address})) { - $page_id = "mailto:$addr"; - } else { - $page_id = to_uuid($self_url); - } + my $addr = $ibx->{-primary_address}; + $page_id = "mailto:$addr" if defined $addr; } + $page_id //= to_uuid($self_url); qq(<?xml version="1.0" encoding="us-ascii"?>\n) . qq(<feed\nxmlns="http://www.w3.org/2005/Atom"\n) . qq(xmlns:thr="http://purl.org/syndication/thread/1.0">) . @@ -145,19 +148,19 @@ sub feed_entry { my $name = ascii_html(join(', ', PublicInbox::Address::names($from))); $email = ascii_html($email // $ctx->{ibx}->{-primary_address}); - my $s = delete($ctx->{emit_header}) ? atom_header($ctx, $title) : ''; - $s .= "<entry><author><name>$name</name><email>$email</email>" . + print { $ctx->zfh } + (delete($ctx->{emit_header}) ? atom_header($ctx, $title) : ''), + "<entry><author><name>$name</name><email>$email</email>" . "</author>$title$updated" . - qq(<link\nhref="$href"/>). + qq(<link\nhref="$href"/>) . "<id>$uuid</id>$irt" . qq{<content\ntype="xhtml">} . qq{<div\nxmlns="http://www.w3.org/1999/xhtml">} . qq(<pre\nstyle="white-space:pre-wrap">); - $ctx->{obuf} = \$s; $ctx->{mhref} = $href; - PublicInbox::View::multipart_text_as_html($eml, $ctx); - delete $ctx->{obuf}; - $s .= '</pre></div></content></entry>'; + $ctx->{changed_href} = "${href}#related"; + $eml->each_part(\&PublicInbox::View::add_text_body, $ctx, 1); + '</pre></div></content></entry>'; } sub feed_updated { diff --git a/lib/PublicInbox/WwwCoderepo.pm b/lib/PublicInbox/WwwCoderepo.pm new file mode 100644 index 00000000..a5e2dc4a --- /dev/null +++ b/lib/PublicInbox/WwwCoderepo.pm @@ -0,0 +1,377 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Standalone code repository viewer for users w/o cgit. +# This isn't intended to replicate all of cgit, but merely to be a +# "good enough" viewer with search support and some UI hints to encourage +# cloning + command-line usage. +package PublicInbox::WwwCoderepo; +use v5.12; +use parent qw(PublicInbox::WwwStream); +use File::Temp 0.19 (); # newdir +use POSIX qw(O_RDWR F_GETFL); +use PublicInbox::ViewVCS; +use PublicInbox::WwwStatic qw(r); +use PublicInbox::GitHTTPBackend; +use PublicInbox::WwwStream; +use PublicInbox::Hval qw(prurl ascii_html utf8_maybe); +use PublicInbox::ViewDiff qw(uri_escape_path); +use PublicInbox::RepoSnapshot; +use PublicInbox::RepoAtom; +use PublicInbox::RepoTree; +use PublicInbox::RepoList; +use PublicInbox::OnDestroy; +use URI::Escape qw(uri_escape_utf8); +use File::Spec; +use autodie qw(fcntl open); + +my @EACH_REF = (qw(git for-each-ref --sort=-creatordate), + "--format=%(HEAD)%00".join('%00', map { "%($_)" } + qw(objectname refname:short subject creatordate:short))); +my $HEADS_CMD = <<''; +# heads (aka `branches'): +$ git for-each-ref --sort=-creatordate refs/heads \ + --format='%(HEAD) %(refname:short) %(subject) (%(creatordate:short))' + +my $TAGS_CMD = <<''; +# tags: +$ git for-each-ref --sort=-creatordate refs/tags \ + --format='%(refname:short) %(subject) (%(creatordate:short))' + +my $NO_HEADS = "# no heads (branches), yet...\n"; +my $NO_TAGS = "# no tags, yet...\n"; + +# shared with PublicInbox::Cgit +sub prepare_coderepos { + my ($self) = @_; + my $pi_cfg = $self->{pi_cfg}; + + # TODO: support gitweb and other repository viewers? + $pi_cfg->parse_cgitrc(undef, 0); + + my $coderepos = $pi_cfg->{-coderepos}; + for my $k (grep(/\Acoderepo\.(?:.+)\.dir\z/, keys %$pi_cfg)) { + $k = substr($k, length('coderepo.'), -length('.dir')); + $coderepos->{$k} //= $pi_cfg->fill_coderepo($k); + } + + # associate inboxes and extindices with coderepos for search: + for my $k (grep(/\Apublicinbox\.(?:.+)\.coderepo\z/, keys %$pi_cfg)) { + $k = substr($k, length('publicinbox.'), -length('.coderepo')); + my $ibx = $pi_cfg->lookup_name($k) // next; + $pi_cfg->repo_objs($ibx); + } + for my $k (grep(/\Aextindex\.(?:.+)\.coderepo\z/, keys %$pi_cfg)) { + $k = substr($k, length('extindex.'), -length('.coderepo')); + my $eidx = $pi_cfg->lookup_ei($k) // next; + $pi_cfg->repo_objs($eidx); + } + $pi_cfg->each_cindex('load_coderepos', $pi_cfg); +} + +sub new { + my ($cls, $pi_cfg) = @_; + my $self = bless { pi_cfg => $pi_cfg }, $cls; + prepare_coderepos($self); + $self->{snapshots} = do { + my $s = $pi_cfg->{'coderepo.snapshots'} // ''; + $s eq 'all' ? \%PublicInbox::RepoSnapshot::FMT_TYPES : + +{ map { $_ => 1 } split(/\s+/, $s) }; + }; + $self->{$_} = 10 for qw(summary_branches summary_tags); + $self->{$_} = 10 for qw(summary_log); + + # try reuse STDIN if it's already /dev/null + open $self->{log_fh}, '+>', '/dev/null'; + my @l = stat($self->{log_fh}) or die "stat: $!"; + my @s = stat(STDIN) or die "stat(STDIN): $!"; + if ("@l[0, 1]" eq "@s[0, 1]") { + my $f = fcntl(STDIN, F_GETFL, 0); + $self->{log_fh} = *STDIN{IO} if $f & O_RDWR; + } + $self; +} + +sub _snapshot_link_prep { + my ($ctx) = @_; + my @s = sort keys %{$ctx->{wcr}->{snapshots}} or return (); + my $n = $ctx->{git}->local_nick // die "BUG: $ctx->{git_dir} nick"; + $n =~ s!\.git/*\z!!; + ($n) = ($n =~ m!([^/]+)/*\z!); + (ascii_html($n).'-', @s); +} + +sub _refs_heads_link { + my ($line, $upfx) = @_; + my ($pfx, $oid, $ref, $s, $cd) = split(/\0/, $line); + my $align = length($ref) < 12 ? ' ' x (12 - length($ref)) : ''; + ("$pfx <a\nhref=$upfx$oid/s/>", ascii_html($ref), + "</a>$align ", ascii_html($s), " ($cd)\n") +} + +sub _refs_tags_link { + my ($line, $upfx, $snap_pfx, @snap_fmt) = @_; + my (undef, $oid, $ref, $s, $cd) = split(/\0/, $line); + my $align = length($ref) < 12 ? ' ' x (12 - length($ref)) : ''; + if (@snap_fmt) { + my $v = $ref; + $v =~ s/\A[vV]//; + @snap_fmt = map { + qq{ <a href="${upfx}snapshot/$snap_pfx$v.$_">$_</a>} + } @snap_fmt; + } + ("<a\nhref=$upfx$oid/s/>", ascii_html($ref), + "</a>$align ", ascii_html($s), " ($cd)", @snap_fmt, "\n"); +} + +sub emit_joined_inboxes ($) { + my ($ctx) = @_; + my $names = $ctx->{git}->{ibx_names}; # coderepo directives in config + my $score = $ctx->{git}->{ibx_score}; # generated w/ cindex --join + ($names || $score) or return; + my $pi_cfg = $ctx->{wcr}->{pi_cfg}; + my ($u, $h); + my $zfh = $ctx->zfh; + print $zfh "\n# associated public inboxes:", + "\n# (number on the left is used for dev purposes)"; + my @ns = map { [ 0, $_ ] } @$names; + my $env = $ctx->{env}; + for (@ns, @$score) { + my ($nr, $name) = @$_; + my $ibx = $pi_cfg->lookup_name($name) // do { + warn "W: inbox `$name' gone for $ctx->{git}->{git_dir}"; + say $zfh '# ', ascii_html($name), ' (missing inbox?)'; + next; + }; + if (scalar(@{$ibx->{url} // []})) { + $u = $h = ascii_html(prurl($env, $ibx->{url})); + } else { + $h = ascii_html(prurl($env, uri_escape_utf8($name))); + $h .= '/'; + $u = ascii_html($name); + } + if ($nr) { + printf $zfh "\n% 11u", $nr; + } else { + print $zfh "\n", ' 'x11; + } + print $zfh qq{ <a\nhref="$h">$u</a>}; + } +} + +sub summary_END { # called via OnDestroy + my ($ctx) = @_; + my $wcb = delete($ctx->{-wcb}) or return; # already done + PublicInbox::WwwStream::html_init($ctx); + my $zfh = $ctx->zfh; + + my @r = split(/\n/s, delete($ctx->{qx_res}->{'log'}) // ''); + my $last = scalar(@r) > $ctx->{wcr}->{summary_log} ? pop(@r) : undef; + my $tip_html = ''; + my $tip = $ctx->{qp}->{h}; + $tip_html .= ' '.ascii_html($tip).' --' if defined $tip; + print $zfh <<EOM; +<pre><a id=log>\$</a> git log --pretty=format:'%h %s (%cs)%d'$tip_html +EOM + for (@r) { + my $d; # decorations + s/^ \(([^\)]+)\)// and $d = $1; + substr($_, 0, 1, ''); + my ($H, $h, $cs, $s) = split(/ /, $_, 4); + print $zfh "<a\nhref=./$H/s/>$h</a> ", ascii_html($s), + " (", $cs, ")\n"; + print $zfh "\t(", ascii_html($d), ")\n" if $d; + } + print $zfh '# no commits in `', ($tip//'HEAD'),"', yet\n\n" if !@r; + print $zfh "...\n" if $last; + + # README + my ($bref, $oid, $ref_path) = @{delete $ctx->{qx_res}->{readme}}; + if ($bref) { + my $l = PublicInbox::Linkify->new; + $$bref =~ s/\s*\z//sm; + my (undef, $path) = split(/:/, $ref_path, 2); # HEAD:README + print $zfh "\n<a id=readme>\$</a> " . + qq(git cat-file blob <a href="./$oid/s/?b=) . + ascii_html(uri_escape_path($path)) . q(">). + ascii_html($ref_path), "</a>\n", + $l->to_html($$bref), '</pre><hr><pre>'; + } + + # refs/heads + print $zfh '<a id=heads>', $HEADS_CMD , '</a>'; + @r = split(/^/sm, delete($ctx->{qx_res}->{heads}) // ''); + $last = scalar(@r) > $ctx->{wcr}->{summary_branches} ? pop(@r) : undef; + chomp(@r); + for (@r) { print $zfh _refs_heads_link($_, './') } + print $zfh $NO_HEADS if !@r; + print $zfh qq(<a href="refs/heads/">...</a>\n) if $last; + print $zfh "\n<a id=tags>", $TAGS_CMD, '</a>'; + @r = split(/^/sm, delete($ctx->{qx_res}->{tags}) // ''); + $last = scalar(@r) > $ctx->{wcr}->{summary_tags} ? pop(@r) : undef; + my ($snap_pfx, @snap_fmt) = _snapshot_link_prep($ctx); + chomp @r; + for (@r) { print $zfh _refs_tags_link($_, './', $snap_pfx, @snap_fmt) } + print $zfh $NO_TAGS if !@r; + print $zfh qq(<a href="refs/tags/">...</a>\n) if $last; + emit_joined_inboxes $ctx; + $wcb->($ctx->html_done('</pre>')); +} + +sub capture { # psgi_qx callback to capture git-for-each-ref + my ($bref, $ctx, $key) = @_; # $_[3] = OnDestroy(summary_END) + $ctx->{qx_res}->{$key} = $$bref; + # summary_END may be called via OnDestroy $arg->[2] +} + +sub set_readme { # git->cat_async callback + my ($bref, $oid, $type, $size, $ctx) = @_; + my $ref_path = shift @{$ctx->{-readme_tries}}; # e.g. HEAD:README + if ($type eq 'blob' && !$ctx->{qx_res}->{readme}) { + $ctx->{qx_res}->{readme} = [ $bref, $oid, $ref_path ]; + } elsif (scalar @{$ctx->{-readme_tries}} == 0) { + $ctx->{qx_res}->{readme} //= []; # nothing left to try + } # or try another README... + # summary_END may be called via OnDestroy ($ctx->{-END}) +} + +sub summary ($$) { + my ($ctx, $wcb) = @_; + $ctx->{-wcb} = $wcb; # PublicInbox::HTTP::{Identity,Chunked} + my $tip = $ctx->{qp}->{h}; # same as cgit + if (defined $tip && $tip eq '') { + delete $ctx->{qp}->{h}; + undef($tip); + } + my ($nb, $nt, $nl) = map { $_ + 1 } @{$ctx->{wcr}}{qw( + summary_branches summary_tags summary_log)}; + $ctx->{qx_res} = {}; + my $qsp_err = \($ctx->{-qsp_err} = ''); + my %opt = (quiet => 1, 2 => $ctx->{wcr}->{log_fh}); + my %env = (GIT_DIR => $ctx->{git}->{git_dir}); + my @log = (qw(git log), "-$nl", '--pretty=format:%d %H %h %cs %s'); + push(@log, $tip) if defined $tip; + + # limit scope for MockHTTP test (t/solver_git.t) + my $END = on_destroy \&summary_END, $ctx; + for (['log', \@log], + [ 'heads', [@EACH_REF, "--count=$nb", 'refs/heads'] ], + [ 'tags', [@EACH_REF, "--count=$nt", 'refs/tags'] ]) { + my ($k, $cmd) = @$_; + my $qsp = PublicInbox::Qspawn->new($cmd, \%env, \%opt); + $qsp->{qsp_err} = $qsp_err; + $qsp->psgi_qx($ctx->{env}, undef, \&capture, $ctx, $k, $END); + } + $tip //= 'HEAD'; + my @try = ("$tip:README", "$tip:README.md"); # TODO: configurable + my %ctx = (%$ctx, -END => $END, -readme_tries => [ @try ]); + PublicInbox::ViewVCS::do_cat_async(\%ctx, \&set_readme, @try); +} + +# called by GzipFilter->close after translate +sub zflush { $_[0]->SUPER::zflush('</pre>', $_[0]->_html_end) } + +# called by GzipFilter->write or GetlineResponse->getline +sub translate { + my $ctx = shift; + $_[0] // return zflush($ctx); # getline caller + my @out; + my $fbuf = delete($ctx->{fbuf}) // shift; + $fbuf .= shift while @_; + if ($ctx->{-heads}) { + while ($fbuf =~ s/\A([^\n]+)\n//s) { + utf8_maybe(my $x = $1); + push @out, _refs_heads_link($x, '../../'); + } + } else { + my ($snap_pfx, @snap_fmt) = _snapshot_link_prep($ctx); + while ($fbuf =~ s/\A([^\n]+)\n//s) { + utf8_maybe(my $x = $1); + push @out, _refs_tags_link($x, '../../', + $snap_pfx, @snap_fmt); + } + } + $ctx->{fbuf} = $fbuf; # may be incomplete + @out ? $ctx->SUPER::translate(@out) : ''; # not EOF, yet +} + +sub _refs_parse_hdr { # {parse_hdr} for Qspawn + my ($r, $bref, $ctx) = @_; + my ($code, $top); + if ($r == 0) { + $code = 404; + $top = $ctx->{-heads} ? $NO_HEADS : $NO_TAGS; + } else { + $code = 200; + $top = $ctx->{-heads} ? $HEADS_CMD : $TAGS_CMD; + } + PublicInbox::WwwStream::html_init($ctx); + bless $ctx, __PACKAGE__; # re-bless for ->translate + print { $ctx->{zfh} } '<pre>', $top; + [ $code, delete($ctx->{-res_hdr}), $ctx ]; # [2] is qspawn.filter +} + +sub refs_foo { # /$REPO/refs/{heads,tags} endpoints + my ($self, $ctx, $pfx) = @_; + $ctx->{wcr} = $self; + $ctx->{-upfx} = '../../'; + $ctx->{-heads} = 1 if $pfx eq 'refs/heads'; + my $qsp = PublicInbox::Qspawn->new([@EACH_REF, $pfx ], + { GIT_DIR => $ctx->{git}->{git_dir} }); + $qsp->psgi_yield($ctx->{env}, undef, \&_refs_parse_hdr, $ctx); +} + +sub srv { # endpoint called by PublicInbox::WWW + my ($self, $ctx) = @_; + my $path_info = $ctx->{env}->{PATH_INFO}; + my $git; + # handle clone requests + my $pi_cfg = $self->{pi_cfg}; + if ($path_info =~ m!\A/(.+?)/($PublicInbox::GitHTTPBackend::ANY)\z!x and + ($git = $pi_cfg->get_coderepo($1))) { + PublicInbox::GitHTTPBackend::serve($ctx->{env},$git,$2); + } elsif ($path_info =~ m!\A/(.+?)/\z! and + ($ctx->{git} = $pi_cfg->get_coderepo($1))) { + $ctx->{wcr} = $self; + sub { summary($ctx, $_[0]) }; # $_[0] = wcb + } elsif ($path_info =~ m!\A/(.+?)/([a-f0-9]+)/s/([^/]+)?\z! and + ($ctx->{git} = $pi_cfg->get_coderepo($1))) { + $ctx->{lh} = $self->{log_fh}; + PublicInbox::ViewVCS::show($ctx, $2, $3); + } elsif ($path_info =~ m!\A/(.+?)/tree/(.*)\z! and + ($ctx->{git} = $pi_cfg->get_coderepo($1))) { + $ctx->{lh} = $self->{log_fh}; + PublicInbox::RepoTree::srv_tree($ctx, $2) // r(404); + } elsif ($path_info =~ m!\A/(.+?)/snapshot/([^/]+)\z! and + ($ctx->{git} = $pi_cfg->get_coderepo($1))) { + $ctx->{wcr} = $self; + PublicInbox::RepoSnapshot::srv($ctx, $2) // r(404); + } elsif ($path_info =~ m!\A/(.+?)/atom/(.*)\z! and + ($ctx->{git} = $pi_cfg->get_coderepo($1))) { + $ctx->{lh} = $self->{log_fh}; + PublicInbox::RepoAtom::srv_atom($ctx, $2) // r(404); + } elsif ($path_info =~ m!\A/(.+?)/tags\.atom\z! and + ($ctx->{git} = $pi_cfg->get_coderepo($1))) { + PublicInbox::RepoAtom::srv_tags_atom($ctx); + } elsif ($path_info =~ m!\A/(.+?)/(refs/(?:heads|tags))/\z! and + ($ctx->{git} = $pi_cfg->get_coderepo($1))) { + refs_foo($self, $ctx, $2); + } elsif ($path_info =~ m!\A/(.*?\*.*?)/*\z!) { + my $re = PublicInbox::Config::glob2re($1); + PublicInbox::RepoList::html($self, $ctx, qr!$re\z!) // r(404); + } elsif ($path_info =~ m!\A/(.+?)/\z!) { + my $re = qr!\A\Q$1\E/!; + PublicInbox::RepoList::html($self, $ctx, $re) // r(404); + } elsif ($path_info =~ m!\A/(.+?)\z! and + ($git = $pi_cfg->get_coderepo($1))) { + my $qs = $ctx->{env}->{QUERY_STRING}; + my $url = $git->base_url($ctx->{env}); + $url .= "?$qs" if $qs ne ''; + [ 301, [ Location => $url, 'Content-Type' => 'text/plain' ], + [ "Redirecting to $url\n" ] ]; + } else { + r(404); + } +} + +1; diff --git a/lib/PublicInbox/WwwListing.pm b/lib/PublicInbox/WwwListing.pm index 79c0a8ec..2d6c74da 100644 --- a/lib/PublicInbox/WwwListing.pm +++ b/lib/PublicInbox/WwwListing.pm @@ -41,10 +41,7 @@ sub list_match_i { # ConfigIter callback if (defined($section)) { return if $section !~ m!\Apublicinbox\.([^/]+)\z!; my $ibx = $cfg->lookup_name($1) or return; - if (!$ibx->{-hide}->{$ctx->hide_key} && - grep(/$re/, @{$ibx->{url} // []})) { - $ctx->ibx_entry($ibx); - } + $ctx->ibx_entry($ibx) unless $ctx->hide_inbox($ibx, $re); } else { # undef == "EOF" $ctx->{-wcb}->($ctx->psgi_triple); } @@ -54,13 +51,17 @@ sub url_filter { my ($ctx, $key, $default) = @_; $key //= 'publicInbox.wwwListing'; $default //= '404'; - my $v = $ctx->{www}->{pi_cfg}->{lc $key} // $default; + my $cfg = $ctx->{www}->{pi_cfg}; + my $v = $cfg->{lc $key} // $default; again: if ($v eq 'match=domain') { my $h = $ctx->{env}->{HTTP_HOST} // $ctx->{env}->{SERVER_NAME}; $h =~ s/:[0-9]+\z//; (qr!\A(?:https?:)?//\Q$h\E(?::[0-9]+)?/!i, "url:$h"); } elsif ($v eq 'all') { + my $niu = $cfg->{lc 'publicinbox.nameIsUrl'}; + defined($niu) && $cfg->git_bool($niu) and + $ctx->{-name_is_url} = [ '.' ]; (qr/./, undef); } elsif ($v eq '404') { (undef, undef); @@ -76,6 +77,12 @@ EOF sub hide_key { 'www' } +sub hide_inbox { + my ($ctx, $ibx, $re) = @_; + $ibx->{'-hide_'.$ctx->hide_key} || + !grep(/$re/, @{$ibx->{url} // $ctx->{-name_is_url} // []}) +} + sub add_misc_ibx { # MiscSearch->retry_reopen callback my ($misc, $ctx, $re, $qs) = @_; require PublicInbox::SearchQuery; @@ -104,15 +111,13 @@ sub add_misc_ibx { # MiscSearch->retry_reopen callback $ctx->ibx_entry($pi_cfg->ALL // die('BUG: ->ALL expected'), {}); } my $mset = $misc->mset($qs, $opt); # sorts by $MODIFIED (mtime) - my $hide_key = $ctx->hide_key; for my $mi ($mset->items) { my $doc = $mi->get_document; my ($eidx_key) = PublicInbox::Search::xap_terms('Q', $doc); $eidx_key // next; my $ibx = $pi_cfg->lookup_eidx_key($eidx_key) // next; - next if $ibx->{-hide}->{$hide_key}; - grep(/$re/, @{$ibx->{url} // []}) or next; + next if $ctx->hide_inbox($ibx, $re); $ctx->ibx_entry($ibx, $misc->doc2ibx_cache_ent($doc)); if ($r) { # for descriptions in search_nav_bot my $pct = PublicInbox::Search::get_pct($mi); @@ -162,24 +167,22 @@ sub mset_footer ($$) { # no footer if too few matches return '' if $mset->get_matches_estimated == $mset->size; require PublicInbox::SearchView; - PublicInbox::SearchView::search_nav_bot($mset, $ctx->{-sq}); + PublicInbox::SearchView::search_nav_bot($ctx, $mset, $ctx->{-sq}); } sub mset_nav_top { my ($ctx, $mset) = @_; my $q = $ctx->{-sq}; my $qh = $q->{'q'} // ''; - utf8::decode($qh); - $qh = ascii_html($qh); - $qh = qq[\nvalue="$qh"] if $qh ne ''; - my $rv = <<EOM; -<form -action="./"><pre><input name=q type=text$qh -/><input type=submit value="locate inbox" -/><input type=submit name=a value="search all inboxes" -/></pre></form><pre> + if ($qh ne '') { + utf8::decode($qh); + $qh = qq[\nvalue="].ascii_html($qh).'"'; + } + chop(my $rv = <<EOM); +<form action="./"><pre><input name=q type=text$qh/><input +type=submit value="locate inbox"/><input type=submit name=a +value="search all inboxes"/></pre></form><pre> EOM - chomp $rv; if (defined($q->{'q'})) { my $initial_q = $ctx->{-uxs_retried}; if (defined $initial_q) { @@ -210,28 +213,28 @@ sub psgi_triple { my $h = [ 'Content-Type', 'text/html; charset=UTF-8', 'Content-Length', undef ]; my $gzf = gzf_maybe($h, $ctx->{env}); - $gzf->zmore('<html><head><title>public-inbox listing</title>' . - $ctx->{www}->style('+/') . - '</head><body>'); + my $zfh = $gzf->zfh; + print $zfh '<html><head><title>public-inbox listing</title>', + $ctx->{www}->style('+/'), + '</head><body>'; my $code = 404; if (my $list = delete $ctx->{-list}) { my $mset = delete $ctx->{-mset}; $code = 200; if ($mset) { # already sorted, so search bar: - $gzf->zmore(mset_nav_top($ctx, $mset)); + print $zfh mset_nav_top($ctx, $mset); } else { # sort config dump by ->modified @$list = map { $_->[1] } sort { $b->[0] <=> $a->[0] } @$list; } - $gzf->zmore('<pre>'); - $gzf->zmore(join("\n", @$list)); - $gzf->zmore(mset_footer($ctx, $mset)) if $mset; + print $zfh '<pre>', join("\n", @$list); # big + print $zfh mset_footer($ctx, $mset) if $mset; } elsif (my $mset = delete $ctx->{-mset}) { - $gzf->zmore(mset_nav_top($ctx, $mset)); - $gzf->zmore('<pre>no matching inboxes'); - $gzf->zmore(mset_footer($ctx, $mset)); + print $zfh mset_nav_top($ctx, $mset), + '<pre>no matching inboxes', + mset_footer($ctx, $mset); } else { - $gzf->zmore('<pre>no inboxes, yet'); + print $zfh '<pre>no inboxes, yet'; } my $out = $gzf->zflush('</pre><hr><pre>'. qq(This is a listing of public inboxes, see the `mirror' link of each inbox diff --git a/lib/PublicInbox/WwwStatic.pm b/lib/PublicInbox/WwwStatic.pm index eeb5e565..d8902193 100644 --- a/lib/PublicInbox/WwwStatic.pm +++ b/lib/PublicInbox/WwwStatic.pm @@ -12,13 +12,12 @@ use strict; use v5.10.1; use parent qw(Exporter); use Fcntl qw(SEEK_SET O_RDONLY O_NONBLOCK); -use POSIX qw(strftime); use HTTP::Date qw(time2str); use HTTP::Status qw(status_message); use Errno qw(EACCES ENOTDIR ENOENT); use URI::Escape qw(uri_escape_utf8); use PublicInbox::GzipFilter qw(gzf_maybe); -use PublicInbox::Hval qw(ascii_html); +use PublicInbox::Hval qw(ascii_html fmt_ts); use Plack::MIME; our @EXPORT_OK = qw(@NO_CACHE r path_info_raw); @@ -275,12 +274,11 @@ sub dir_response ($$$) { my $path_info = $env->{PATH_INFO}; push @entries, '..' if $path_info ne '/'; for my $base (@entries) { + my @st = stat($fs_path . $base) or next; # unlikely my $href = ascii_html(uri_escape_utf8($base)); my $name = ascii_html($base); - my @st = stat($fs_path . $base) or next; # unlikely - my ($gzipped, $uncompressed, $hsize); - my $entry = ''; my $mtime = $st[9]; + my ($entry, $hsize); if (-d _) { $href .= '/'; $name .= '/'; @@ -296,12 +294,12 @@ sub dir_response ($$$) { next; } # 54 = 80 - (SP length(strftime(%Y-%m-%d %k:%M)) SP human_size) - $hsize = sprintf('% 8s', $hsize); my $pad = 54 - length($name); $pad = 1 if $pad <= 0; - $entry .= qq(<a\nhref="$href">$name</a>) . (' ' x $pad); - $mtime = strftime('%Y-%m-%d %k:%M', gmtime($mtime)); - $entry .= $mtime . $hsize; + $entry = qq(\n<a\nhref="$href">$name</a>) . + (' ' x $pad) . + fmt_ts($mtime) . + sprintf('% 8s', $hsize); } # filter out '.gz' files as long as the mtime matches the @@ -309,17 +307,16 @@ sub dir_response ($$$) { delete(@other{keys %want_gz}); @entries = ((map { ${$dirs{$_}} } sort keys %dirs), (map { ${$other{$_}} } sort keys %other)); - my $path_info_html = ascii_html($path_info); - my $h = [qw(Content-Type text/html Content-Length), undef]; - my $gzf = gzf_maybe($h, $env); - $gzf->zmore("<html><head><title>Index of $path_info_html</title>" . - ${$self->{style}} . - "</head><body><pre>Index of $path_info_html</pre><hr><pre>\n"); - $gzf->zmore(join("\n", @entries)); - my $out = $gzf->zflush("</pre><hr></body></html>\n"); - $h->[3] = length($out); - [ 200, $h, [ $out ] ] + my @h = qw(Content-Type text/html); + my $gzf = gzf_maybe(\@h, $env); + print { $gzf->zfh } '<html><head><title>Index of ', $path_info_html, + '</title>', ${$self->{style}}, '</head><body><pre>Index of ', + $path_info_html, '</pre><hr><pre>', @entries, + '</pre><hr></body></html>'; + my $out = $gzf->zflush; + push @h, 'Content-Length', length($out); + [ 200, \@h, [ $out ] ] } sub call { # PSGI app endpoint diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm index aee78170..8d32074f 100644 --- a/lib/PublicInbox/WwwStream.pm +++ b/lib/PublicInbox/WwwStream.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # HTML body stream for which yields getline+close methods for @@ -17,8 +17,9 @@ http://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/public-inb https://public-inbox.org/public-inbox.git) ]; sub base_url ($) { - my $ctx = shift; - my $base_url = $ctx->{ibx}->base_url($ctx->{env}); + my ($ctx) = @_; + my $thing = $ctx->{ibx} // $ctx->{git} // return; + my $base_url = $thing->base_url($ctx->{env}); chop $base_url; # no trailing slash for clone $base_url; } @@ -27,6 +28,9 @@ sub init { my ($ctx, $cb) = @_; $ctx->{cb} = $cb; $ctx->{base_url} = base_url($ctx); + $ctx->{-res_hdr} = [ 'Content-Type' => 'text/html; charset=UTF-8' ]; + $ctx->{gz} = PublicInbox::GzipFilter::gz_or_noop($ctx->{-res_hdr}, + $ctx->{env}); bless $ctx, __PACKAGE__; } @@ -35,9 +39,60 @@ sub async_eml { # for async_blob_cb $ctx->write($ctx->{cb}->($ctx, $eml)); } +sub html_repo_top ($) { + my ($ctx) = @_; + my $git = $ctx->{git} // return $ctx->html_top_fallback; + my $desc = ascii_html($git->description); + my $title = delete($ctx->{-title_html}) // $desc; + my $upfx = $ctx->{-upfx} // ''; + my $atom = $ctx->{-atom} // (substr($upfx, -1) eq '/' ? + "${upfx}atom/" : "$upfx/atom/"); + my $top = ascii_html($git->{nick}); + $top = qq(<a\nhref="$upfx">$top</a>) if length($upfx); + $top .= <<EOM; + <a href='$upfx#readme'>about</a> / <a +href='$upfx#heads'>heads</a> / <a +href='$upfx#tags'>tags</a> +<b>$desc</b> +EOM + my @url = PublicInbox::ViewVCS::ibx_url_for($ctx); + if (@url) { + $ctx->{-has_srch} = 1; + my $base_url = base_url($ctx); + my ($pfx, $sfx) = ($base_url =~ m!\A(https?://[^/]+/)(.*)\z!i); + my $iupfx = '../' x (($sfx =~ tr!/!/!) + 1); + $pfx = ascii_html($pfx); + $pfx = qr/\A\Q$pfx\E/i; + my $tmp = $top; + $top = ''; + my ($s, $u); + my $q_val = delete($ctx->{-q_value_html}) // ''; + $q_val = qq(\nvalue="$q_val") if $q_val ne ''; + for (@url) { + $u = $s = ascii_html($_); + substr($u, 0, 0, $iupfx) if $u !~ m!://!; + $s =~ s!$pfx!!; + $s =~ s!/\z!!; + $top .= qq{<form\naction="$u"><pre>$tmp} . + qq{<input\nname=q type=text$q_val />} . + qq{<input type=submit\n} . + qq{value="search mail in `$s'"/>} . + q{</pre></form>}; + $tmp = ''; + } + } else { + $top = "<pre>$top</pre>"; + } + "<html><head><title>$title</title>" . + qq(<link\nrel=alternate\ntitle="Atom feed"\n). + qq(href="$atom"\ntype="application/atom+xml"/>) . + $ctx->{www}->style($upfx) . + '</head><body>'.$top; +} + sub html_top ($) { my ($ctx) = @_; - my $ibx = $ctx->{ibx}; + my $ibx = $ctx->{ibx} // return html_repo_top($ctx); my $desc = ascii_html($ibx->description); my $title = delete($ctx->{-title_html}) // $desc; my $upfx = $ctx->{-upfx} || ''; @@ -59,6 +114,7 @@ sub html_top ($) { qq(<a\nid=mirror) . qq(\nhref="${upfx}_/text/mirror/">mirror</a>$code / ). qq(<a\nhref="$atom">Atom feed</a>); + $links .= delete($ctx->{-html_more_links}) if $ctx->{-html_more_links}; if ($ibx->isrch) { my $q_val = delete($ctx->{-q_value_html}) // ''; $q_val = qq(\nvalue="$q_val") if $q_val ne ''; @@ -81,38 +137,36 @@ sub html_top ($) { '</head><body>'. $top . (delete($ctx->{-html_tip}) // ''); } +sub inboxes { () } # TODO + sub coderepos ($) { my ($ctx) = @_; + $ctx->{ibx} // return inboxes($ctx); my $cr = $ctx->{ibx}->{coderepo} // return (); - my $cfg = $ctx->{www}->{pi_cfg}; my $upfx = ($ctx->{-upfx} // ''). '../'; - my @ret; - for my $cr_name (@$cr) { - $ret[0] //= <<EOF; -<a id=code>Code repositories for project(s) associated with this inbox: -EOF - my $urls = $cfg->get_all("coderepo.$cr_name.cgiturl"); - if ($urls) { - for (@$urls) { - # relative or absolute URL?, prefix relative - # "foo.git" with appropriate number of "../" - my $u = m!\A(?:[a-z\+]+:)?//! ? $_ : $upfx.$_; - $u = ascii_html(prurl($ctx->{env}, $u)); - $ret[0] .= qq(\n\t<a\nhref="$u">$u</a>); - } - } else { - $ret[0] .= qq[\n\t$cr_name.git (no URL configured)]; + my $pfx = $ctx->{base_url} //= $ctx->base_url; + my $up = $upfx =~ tr!/!/!; + $pfx =~ s!/[^/]+\z!/! for (1..$up); + $pfx .= '/' if substr($pfx, -1, 1) ne '/'; + my $buf = '<a id=code>' . + 'Code repositories for project(s) associated with this '. + $ctx->{ibx}->thing_type . "\n"; + for my $git (@{$ctx->{www}->{pi_cfg}->repo_objs($ctx->{ibx})}) { + for ($git->pub_urls($ctx->{env})) { + my $u = m!\A(?:[a-z\+]+:)?//!i ? $_ : $pfx.$_; + $u = ascii_html(prurl($ctx->{env}, $u)); + $buf .= qq(\n\t<a\nhref="$u">$u</a>); } } - @ret; # may be empty, this sub is called as an arg for join() + ($buf); } sub _html_end { my ($ctx) = @_; my $upfx = $ctx->{-upfx} || ''; my $m = "${upfx}_/text/mirror/"; - my $x; - if ($ctx->{ibx}->can('cloneurl')) { + my $x = ''; + if ($ctx->{ibx} && $ctx->{ibx}->can('cloneurl')) { $x = <<EOF; This is a public inbox, see <a href="$m">mirroring instructions</a> @@ -136,12 +190,15 @@ as well as URLs for IMAP folder(s). EOM } } - } else { + } elsif ($ctx->{ibx}) { # extindex $x = <<EOF; This is an external index of several public inboxes, see <a href="$m">mirroring instructions</a> on how to clone and mirror all data and code used by this external index. EOF + } elsif ($ctx->{git}) { # coderepo + $x = join('', map { "git clone $_\n" } + @{$ctx->{git}->cloneurl($ctx->{env})}); } chomp $x; '<hr><pre>'.join("\n\n", coderepos($ctx), $x).'</pre></body></html>' @@ -164,18 +221,26 @@ sub getline { $ctx->zflush(_html_end($ctx)); } -sub html_oneshot ($$;$) { - my ($ctx, $code, $sref) = @_; +sub html_done ($;@) { + my $ctx = $_[0]; + my $bdy = $ctx->zflush(@_[1..$#_], _html_end($ctx)); + my $res_hdr = delete $ctx->{-res_hdr}; + push @$res_hdr, 'Content-Length', length($bdy); + [ 200, $res_hdr, [ $bdy ] ] +} + +sub html_oneshot ($$;@) { + my ($ctx, $code) = @_[0, 1]; my $res_hdr = [ 'Content-Type' => 'text/html; charset=UTF-8', 'Content-Length' => undef ]; bless $ctx, __PACKAGE__; $ctx->{gz} = PublicInbox::GzipFilter::gz_or_noop($res_hdr, $ctx->{env}); + my @top; $ctx->{base_url} // do { - $ctx->zmore(html_top($ctx)); + @top = html_top($ctx); $ctx->{base_url} = base_url($ctx); }; - $ctx->zmore($$sref) if $sref; - my $bdy = $ctx->zflush(_html_end($ctx)); + my $bdy = $ctx->zflush(@top, @_[2..$#_], _html_end($ctx)); $res_hdr->[3] = length($bdy); [ $code, $res_hdr, [ $bdy ] ] } @@ -195,10 +260,23 @@ sub async_next ($) { } sub aresponse { - my ($ctx, $code, $cb) = @_; - my $res_hdr = [ 'Content-Type' => 'text/html; charset=UTF-8' ]; + my ($ctx, $cb) = @_; init($ctx, $cb); - $ctx->psgi_response($code, $res_hdr); + $ctx->psgi_response(200, delete $ctx->{-res_hdr}); +} + +sub html_init { + my $ctx = $_[-1]; + $ctx->{base_url} = base_url($ctx); + my $h = $ctx->{-res_hdr} = ['Content-Type', 'text/html; charset=UTF-8']; + $ctx->{gz} = PublicInbox::GzipFilter::gz_or_noop($h, $ctx->{env}); + bless $ctx, @_ > 1 ? $_[0] : __PACKAGE__; + print { $ctx->zfh } html_top($ctx); +} + +sub DESTROY { + my ($ctx) = @_; + $ctx->{git}->cleanup if $ctx->{git} && $ctx->{git}->{-tmp}; } 1; diff --git a/lib/PublicInbox/WwwText.pm b/lib/PublicInbox/WwwText.pm index 71b04561..5e23005e 100644 --- a/lib/PublicInbox/WwwText.pm +++ b/lib/PublicInbox/WwwText.pm @@ -7,7 +7,7 @@ use strict; use v5.10.1; use PublicInbox::Linkify; use PublicInbox::WwwStream; -use PublicInbox::Hval qw(ascii_html prurl); +use PublicInbox::Hval qw(ascii_html prurl fmt_ts); use HTTP::Date qw(time2str); use URI::Escape qw(uri_escape_utf8); use PublicInbox::GzipFilter qw(gzf_maybe); @@ -31,16 +31,17 @@ sub get_text { my $have_tslash = ($key =~ s!/\z!!) if !$raw; my $txt = ''; - my $hdr = [ 'Content-Type', 'text/plain', 'Content-Length', undef ]; - if (!_default_text($ctx, $key, $hdr, \$txt)) { + if (!_default_text($ctx, $key, \$txt)) { $code = 404; $txt = "404 Not Found ($key)\n"; } my $env = $ctx->{env}; if ($raw) { - $txt = gzf_maybe($hdr, $env)->zflush($txt) if $code == 200; - $hdr->[3] = length($txt); - return [ $code, $hdr, [ $txt ] ] + my $h = delete $ctx->{-res_hdr}; + $txt = gzf_maybe($h, $env)->zflush($txt) if $code == 200; + push @$h, 'Content-Type', 'text/plain', + 'Content-Length', length($txt); + return [ $code, $h, [ $txt ] ] } # enforce trailing slash for "wget -r" compatibility @@ -67,10 +68,11 @@ sub get_text { $txt = ascii_html($txt); } $txt = '<pre>' . $l->linkify_2($txt) . '</pre>'; + $txt =~ s!^search$!<a\nid=search>search</a>!sm; $txt =~ s!\bPOP3\b!<a\nid=pop3>POP3</a>!; - $txt =~ s!\bNewsgroups\b!<a\nid=nntp>Newsgroups</a>!; + $txt =~ s!\b(Newsgroups?)\b!<a\nid=nntp>$1</a>!; $txt =~ s!\bIMAP\b!<a\nid=imap>IMAP</a>!; - PublicInbox::WwwStream::html_oneshot($ctx, $code, \$txt); + PublicInbox::WwwStream::html_oneshot($ctx, $code, $txt); } sub _srch_prefix ($$) { @@ -166,12 +168,13 @@ EOF } # n.b. this is a perfect candidate for memoization -sub inbox_config ($$$) { - my ($ctx, $hdr, $txt) = @_; +sub inbox_config ($$) { + my ($ctx, $txt) = @_; my $ibx = $ctx->{ibx}; - push @$hdr, 'Content-Disposition', 'inline; filename=inbox.config'; + push @{$ctx->{-res_hdr}}, + 'Content-Disposition', 'inline; filename=inbox.config'; my $t = eval { $ibx->mm->created_at }; - push(@$hdr, 'Last-Modified', time2str($t)) if $t; + push(@{$ctx->{-res_hdr}}, 'Last-Modified', time2str($t)) if $t; my $name = dq_escape($ibx->{name}); my $inboxdir = '/path/to/top-level-inbox'; my $base_url = $ibx->base_url($ctx->{env}); @@ -218,10 +221,11 @@ EOF } # n.b. this is a perfect candidate for memoization -sub extindex_config ($$$) { - my ($ctx, $hdr, $txt) = @_; +sub extindex_config ($$) { + my ($ctx, $txt) = @_; my $ibx = $ctx->{ibx}; - push @$hdr, 'Content-Disposition', 'inline; filename=extindex.config'; + push @{$ctx->{-res_hdr}}, + 'Content-Disposition', 'inline; filename=extindex.config'; my $name = dq_escape($ibx->{name}); my $base_url = $ibx->base_url($ctx->{env}); $$txt .= <<EOS; @@ -244,31 +248,24 @@ EOS sub coderepos_raw ($$) { my ($ctx, $top_url) = @_; - my $cr = $ctx->{ibx}->{coderepo} // return (); my $cfg = $ctx->{www}->{pi_cfg}; - my @ret; - for my $cr_name (@$cr) { - $ret[0] //= do { - my $thing = $ctx->{ibx}->can('cloneurl') ? - 'public inbox' : 'external index'; - <<EOF; -Code repositories for project(s) associated with this $thing -EOF - }; - my $urls = $cfg->get_all("coderepo.$cr_name.cgiturl"); - if ($urls) { - for (@$urls) { - # relative or absolute URL?, prefix relative - # "foo.git" with appropriate number of "../" - my $u = m!\A(?:[a-z\+]+:)?//!i ? $_ : - $top_url.$_; - $ret[0] .= "\n\t" . prurl($ctx->{env}, $u); - } - } else { - $ret[0] .= qq[\n\t$cr_name.git (no URL configured)]; + my $cr = $cfg->repo_objs($ctx->{ibx}) or return (); + my $buf = 'Code repositories for project(s) associated with this '. + $ctx->{ibx}->thing_type . ":\n"; + my @recs = PublicInbox::CodeSearch::repos_sorted($cfg, @$cr); + my $cr_score = $ctx->{ibx}->{-cr_score}; + my $env = $ctx->{env}; + for (@recs) { + my ($t, $git) = @$_; + for ($git->pub_urls($env)) { + my $u = m!\A(?:[a-z\+]+:)?//!i ? $_ : $top_url.$_; + my $nr = $cr_score->{$git->{nick}}; + $buf .= "\n"; + $buf .= $nr ? sprintf('% 9u', $nr) : (' 'x9); + $buf .= ' '.fmt_ts($t).' '.prurl($env, $u); } } - @ret; # may be empty, this sub is called as an arg for join() + ($buf); } sub _add_non_http_urls ($$) { @@ -276,10 +273,12 @@ sub _add_non_http_urls ($$) { $ctx->{ibx}->can('nntp_url') or return; # TODO extindex can have IMAP my $urls = $ctx->{ibx}->imap_url($ctx); if (@$urls) { - $$txt .= "\nIMAP subfolder(s) are available under:"; - $$txt .= "\n " . join("\n ", @$urls); + $urls = join("\n ", @$urls); + $urls =~ s!://([^/@]+)/!://;AUTH=ANONYMOUS\@$1/!sg; $$txt .= <<EOM +IMAP subfolder(s) are available under: + $urls # each subfolder (starting with `0') holds 50K messages at most EOM } @@ -301,6 +300,9 @@ The POP3 password is: anonymous The POP3 username is: \$(uuidgen)\@$ctx->{ibx}->{newsgroup} where \$(uuidgen) in the output of the `uuidgen' command on your system. The UUID in the username functions as a private cookie (don't share it). +By default, only 1000 messages are retrieved. You may download more +by appending `?limit=NUM' (without quotes) to the username, where +`NUM' is an integer between 1 and 50000. Idle accounts will expire periodically. EOM } @@ -402,16 +404,16 @@ EOF 1; } -sub _default_text ($$$$) { - my ($ctx, $key, $hdr, $txt) = @_; +sub _default_text ($$$) { + my ($ctx, $key, $txt) = @_; if ($key eq 'mirror') { return _mirror_help($ctx, $txt); } elsif ($key eq 'color') { return _colors_help($ctx, $txt); } elsif ($key eq 'config') { return $ctx->{ibx}->can('cloneurl') ? - inbox_config($ctx, $hdr, $txt) : - extindex_config($ctx, $hdr, $txt); + inbox_config($ctx, $txt) : + extindex_config($ctx, $txt); } return if $key ne 'help'; # TODO more keys? diff --git a/lib/PublicInbox/WwwTopics.pm b/lib/PublicInbox/WwwTopics.pm new file mode 100644 index 00000000..9d270732 --- /dev/null +++ b/lib/PublicInbox/WwwTopics.pm @@ -0,0 +1,85 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +package PublicInbox::WwwTopics; +use v5.12; +use PublicInbox::Hval qw(ascii_html mid_href fmt_ts); + +sub add_topic_html ($$) { + my (undef, $smsg) = @_; + my $s = ascii_html($smsg->{subject}); + $s = '(no subject)' if $s eq ''; + $_[0] .= "\n".fmt_ts($smsg->{ds}) . + qq{ <a\nhref="}.mid_href($smsg->{mid}).qq{/#r">$s</a>}; + my $nr = $smsg->{'COUNT(num)'}; + $_[0] .= " $nr+ messages" if $nr > 1; +} + +# n.b. the `SELECT DISTINCT(tid)' subquery is critical for performance +# with giant inboxes and extindices +sub topics_new ($) { + $_[0]->do_get(<<EOS); +SELECT ds,ddd,COUNT(num) FROM over WHERE tid IN +(SELECT DISTINCT(tid) FROM over WHERE tid > 0 ORDER BY ts DESC LIMIT 200) +AND +num > 0 +GROUP BY tid +ORDER BY ds ASC +EOS +} + +sub topics_active ($) { + $_[0]->do_get(<<EOS); +SELECT ddd,MAX(ds) as ds,COUNT(num) FROM over WHERE tid IN +(SELECT DISTINCT(tid) FROM over WHERE tid > 0 ORDER BY ts DESC LIMIT 200) +AND +num > 0 +GROUP BY tid +ORDER BY ds ASC +EOS +} + +sub topics_i { pop @{$_[0]->{msgs}} } + +sub topics_atom { # GET /$INBOX_NAME/topics_(new|active).atom + my ($ctx) = @_; + require PublicInbox::WwwAtomStream; + my ($hdr, $smsg, $val); + PublicInbox::WwwAtomStream->response($ctx, \&topics_i); +} + +sub topics_html { # GET /$INBOX_NAME/topics_(new|active).html + my ($ctx) = @_; + require PublicInbox::WwwStream; + my $buf = '<pre>'; + $ctx->{-html_more_links} = qq{\n- recent:[<a +href="./">subjects (threaded)</a>|}; + + if ($ctx->{topic_category} eq 'new') { + $ctx->{-html_more_links} .= qq{<b>topics (new)</b>|<a +href="./topics_active.html">topics (active)</a>]}; + } else { # topic_category eq "active" - topics with recent replies + $ctx->{-html_more_links} .= qq{<a +href="./topics_new.html">topics (new)</a>|<b>topics (active)</b>]}; + } + # can't use SQL to filter references since our schema wasn't designed + # for it, but our SQL sorts by ascending time to favor top-level + # messages while our final result (post-references filter) favors + # recent messages + my $msgs = delete $ctx->{msgs}; + add_topic_html($buf, pop @$msgs) while scalar(@$msgs); + $buf .= '</pre>'; + PublicInbox::WwwStream::html_oneshot($ctx, 200, $buf); +} + +sub response { + my ($ctx, $ibx_name, $category, $type) = @_; + my ($ret, $over); + $ret = PublicInbox::WWW::invalid_inbox($ctx, $ibx_name) and return $ret; + $over = $ctx->{ibx}->over or + return PublicInbox::WWW::need($ctx, 'Overview', './'); + $ctx->{msgs} = $category eq 'new' ? topics_new($over) : + topics_active($over); + $ctx->{topic_category} = $category; + $type eq 'atom' ? topics_atom($ctx) : topics_html($ctx); +} + +1; diff --git a/lib/PublicInbox/XapClient.pm b/lib/PublicInbox/XapClient.pm new file mode 100644 index 00000000..24b3f45e --- /dev/null +++ b/lib/PublicInbox/XapClient.pm @@ -0,0 +1,50 @@ +#!perl -w +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# This talks to (XapHelperCxx.pm + xap_helper.h) or XapHelper.pm +# and will eventually allow users with neither XS nor SWIG Perl +# bindings to use Xapian as long as they have Xapian development +# headers/libs and a C++ compiler +package PublicInbox::XapClient; +use v5.12; +use PublicInbox::Spawn qw(spawn); +use Socket qw(AF_UNIX SOCK_SEQPACKET); +use PublicInbox::IPC; +use autodie qw(pipe socketpair); +our $tries = 50; + +sub mkreq { + my ($self, $ios, @arg) = @_; + my ($r, $n); + pipe($r, $ios->[0]) if !defined($ios->[0]); + my @fds = map fileno($_), @$ios; + my $buf = join("\0", @arg, ''); + $n = $PublicInbox::IPC::send_cmd->($self->{io}, \@fds, $buf, 0, $tries) + // die "send_cmd: $!"; + $n == length($buf) or die "send_cmd: $n != ".length($buf); + $r; +} + +sub start_helper (@) { + $PublicInbox::IPC::send_cmd or return; # can't work w/o SCM_RIGHTS + my @argv = @_; + socketpair(my $sock, my $in, AF_UNIX, SOCK_SEQPACKET, 0); + my $cls = 'PublicInbox::XapHelperCxx'; + my $env; + my $cmd = eval "require $cls; ${cls}::cmd()"; + if ($@) { # fall back to Perl + XS|SWIG + $cls = 'PublicInbox::XapHelper'; + # ensure the child process has the same @INC we do: + $env = { PERL5LIB => join(':', @INC) }; + $cmd = [$^X, ($^W ? ('-w') : ()), "-M$cls", '-e', + $cls.'::start(@ARGV)', '--' ]; + } + push @$cmd, @argv; + my $pid = spawn($cmd, $env, { 0 => $in }); + my $self = bless { io => $sock, impl => $cls }, __PACKAGE__; + PublicInbox::IO::attach_pid($sock, $pid); + $self; +} + +1; diff --git a/lib/PublicInbox/XapHelper.pm b/lib/PublicInbox/XapHelper.pm new file mode 100644 index 00000000..2e20660e --- /dev/null +++ b/lib/PublicInbox/XapHelper.pm @@ -0,0 +1,324 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Perl + SWIG||XS implementation if XapHelperCxx / xap_helper.h isn't usable. +package PublicInbox::XapHelper; +use v5.12; +use Getopt::Long (); # good API even if we only use short options +our $GLP = Getopt::Long::Parser->new; +$GLP->configure(qw(require_order bundling no_ignore_case no_auto_abbrev)); +use PublicInbox::Search qw(xap_terms); +use PublicInbox::CodeSearch; +use PublicInbox::IPC; +use PublicInbox::IO qw(read_all); +use Socket qw(SOL_SOCKET SO_TYPE SOCK_SEQPACKET AF_UNIX); +use PublicInbox::DS qw(awaitpid); +use autodie qw(open getsockopt); +use POSIX qw(:signal_h); +use Fcntl qw(LOCK_UN LOCK_EX); +use Carp qw(croak); +my $X = \%PublicInbox::Search::X; +our (%SRCH, %WORKERS, $nworker, $workerset, $in); +our $stderr = \*STDERR; + +sub cmd_test_inspect { + my ($req) = @_; + print { $req->{0} } "pid=$$ has_threadid=", + ($req->{srch}->has_threadid ? 1 : 0) +} + +sub cmd_test_sleep { select(undef, undef, undef, 0.01) while 1 } + +sub iter_retry_check ($) { + if (ref($@) =~ /\bDatabaseModifiedError\b/) { + $_[0]->{srch}->reopen; + undef; # retries + } elsif (ref($@) =~ /\bDocNotFoundError\b/) { + warn "doc not found: $@"; + 0; # continue to next doc + } else { + die; + } +} + +sub term_length_extract ($) { + my ($req) = @_; + @{$req->{A_len}} = map { + my $len = s/([0-9]+)\z// ? ($1 + 0) : undef; + [ $_, $len ]; + } @{$req->{A}}; +} + +sub dump_ibx_iter ($$$) { + my ($req, $ibx_id, $it) = @_; + my $out = $req->{0}; + eval { + my $doc = $it->get_document; + for my $pair (@{$req->{A_len}}) { + my ($pfx, $len) = @$pair; + my @t = xap_terms($pfx, $doc); + @t = grep { length == $len } @t if defined($len); + for (@t) { + print $out "$_ $ibx_id\n" or die "print: $!"; + ++$req->{nr_out}; + } + } + }; + $@ ? iter_retry_check($req) : 0; +} + +sub emit_mset_stats ($$) { + my ($req, $mset) = @_; + my $err = $req->{1} or croak "BUG: caller only passed 1 FD"; + say $err 'mset.size='.$mset->size.' nr_out='.$req->{nr_out} +} + +sub cmd_dump_ibx { + my ($req, $ibx_id, $qry_str) = @_; + $qry_str // die 'usage: dump_ibx [OPTIONS] IBX_ID QRY_STR'; + $req->{A} or die 'dump_ibx requires -A PREFIX'; + term_length_extract $req; + my $max = $req->{'m'} // $req->{srch}->{xdb}->get_doccount; + my $opt = { relevance => -1, limit => $max, offset => $req->{o} // 0 }; + $opt->{eidx_key} = $req->{O} if defined $req->{O}; + my $mset = $req->{srch}->mset($qry_str, $opt); + $req->{0}->autoflush(1); + for my $it ($mset->items) { + for (my $t = 10; $t > 0; --$t) { + $t = dump_ibx_iter($req, $ibx_id, $it) // $t; + } + } + emit_mset_stats($req, $mset); +} + +sub dump_roots_iter ($$$) { + my ($req, $root2off, $it) = @_; + eval { + my $doc = $it->get_document; + my $G = join(' ', map { $root2off->{$_} } xap_terms('G', $doc)); + for my $pair (@{$req->{A_len}}) { + my ($pfx, $len) = @$pair; + my @t = xap_terms($pfx, $doc); + @t = grep { length == $len } @t if defined($len); + for (@t) { + $req->{wbuf} .= "$_ $G\n"; + ++$req->{nr_out}; + } + } + }; + $@ ? iter_retry_check($req) : 0; +} + +sub dump_roots_flush ($$) { + my ($req, $fh) = @_; + if ($req->{wbuf} ne '') { + until (flock($fh, LOCK_EX)) { die "LOCK_EX: $!" if !$!{EINTR} } + print { $req->{0} } $req->{wbuf} or die "print: $!"; + until (flock($fh, LOCK_UN)) { die "LOCK_UN: $!" if !$!{EINTR} } + $req->{wbuf} = ''; + } +} + +sub cmd_dump_roots { + my ($req, $root2off_file, $qry_str) = @_; + $qry_str // die 'usage: dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR'; + $req->{A} or die 'dump_roots requires -A PREFIX'; + term_length_extract $req; + open my $fh, '<', $root2off_file; + my $root2off; # record format: $OIDHEX "\0" uint32_t + my @x = split(/\0/, read_all $fh); + while (defined(my $oidhex = shift @x)) { + $root2off->{$oidhex} = shift @x; + } + my $opt = { relevance => -1, limit => $req->{'m'}, + offset => $req->{o} // 0 }; + my $mset = $req->{srch}->mset($qry_str, $opt); + $req->{0}->autoflush(1); + $req->{wbuf} = ''; + for my $it ($mset->items) { + for (my $t = 10; $t > 0; --$t) { + $t = dump_roots_iter($req, $root2off, $it) // $t; + } + if (!($req->{nr_out} & 0x3fff)) { + dump_roots_flush($req, $fh); + } + } + dump_roots_flush($req, $fh); + emit_mset_stats($req, $mset); +} + +sub mset_iter ($$) { + my ($req, $it) = @_; + say { $req->{0} } $it->get_docid, "\0", + $it->get_percent, "\0", $it->get_rank; +} + +sub cmd_mset { # to be used by WWW + IMAP + my ($req, $qry_str) = @_; + $qry_str // die 'usage: mset [OPTIONS] QRY_STR'; + my $opt = { limit => $req->{'m'}, offset => $req->{o} // 0 }; + $opt->{relevance} = 1 if $req->{r}; + $opt->{threads} = 1 if defined $req->{t}; + $opt->{git_dir} = $req->{g} if defined $req->{g}; + $opt->{eidx_key} = $req->{O} if defined $req->{O}; + $opt->{threadid} = $req->{T} if defined $req->{T}; + my $mset = $req->{srch}->mset($qry_str, $opt); + say { $req->{0} } 'mset.size=', $mset->size, + ' .get_matches_estimated=', $mset->get_matches_estimated; + for my $it ($mset->items) { + for (my $t = 10; $t > 0; --$t) { + $t = mset_iter($req, $it) // $t; + } + } +} + +sub dispatch { + my ($req, $cmd, @argv) = @_; + my $fn = $req->can("cmd_$cmd") or return; + $GLP->getoptionsfromarray(\@argv, $req, @PublicInbox::Search::XH_SPEC) + or return; + my $dirs = delete $req->{d} or die 'no -d args'; + my $key = join("\0", @$dirs); + $req->{srch} = $SRCH{$key} //= do { + my $new = { qp_flags => $PublicInbox::Search::QP_FLAGS }; + my $first = shift @$dirs; + my $slow_phrase = -f "$first/iamchert"; + $new->{xdb} = $X->{Database}->new($first); + for (@$dirs) { + $slow_phrase ||= -f "$_/iamchert"; + $new->{xdb}->add_database($X->{Database}->new($_)); + } + $slow_phrase or + $new->{qp_flags} |= PublicInbox::Search::FLAG_PHRASE(); + bless $new, $req->{c} ? 'PublicInbox::CodeSearch' : + 'PublicInbox::Search'; + $new->{qp} = $new->qparse_new; + $new; + }; + my $timeo = $req->{K}; + alarm($timeo) if $timeo; + $fn->($req, @argv); + alarm(0) if $timeo; +} + +sub recv_loop { + local $SIG{__WARN__} = sub { print $stderr @_ }; + my $rbuf; + local $SIG{TERM} = sub { undef $in }; + local $SIG{USR1} = \&reopen_logs; + while (defined($in)) { + PublicInbox::DS::sig_setmask($workerset); + my @fds = eval { # we undef $in in SIG{TERM} + $PublicInbox::IPC::recv_cmd->($in, $rbuf, 4096*33) + }; + if ($@) { + exit if !$in; # hit by SIGTERM + die; + } + scalar(@fds) or exit(66); # EX_NOINPUT + die "recvmsg: $!" if !defined($fds[0]); + PublicInbox::DS::block_signals(POSIX::SIGALRM); + my $req = bless {}, __PACKAGE__; + my $i = 0; + open($req->{$i++}, '+<&=', $_) for @fds; + local $stderr = $req->{1} // \*STDERR; + die "not NUL-terminated" if chop($rbuf) ne "\0"; + my @argv = split(/\0/, $rbuf); + $req->{nr_out} = 0; + $req->dispatch(@argv) if @argv; + } +} + +sub reap_worker { # awaitpid CB + my ($pid, $nr) = @_; + delete $WORKERS{$nr}; + if (($? >> 8) == 66) { # EX_NOINPUT + undef $in; + } elsif ($?) { + warn "worker[$nr] died \$?=$?\n"; + } + PublicInbox::DS::requeue(\&start_workers) if $in; +} + +sub start_worker ($) { + my ($nr) = @_; + my $pid = eval { PublicInbox::DS::fork_persist } // return(warn($@)); + if ($pid == 0) { + undef %WORKERS; + $SIG{TTIN} = $SIG{TTOU} = 'IGNORE'; + $SIG{CHLD} = 'DEFAULT'; # Xapian may use this + recv_loop(); + exit(0); + } else { + $WORKERS{$nr} = $pid; + awaitpid($pid, \&reap_worker, $nr); + } +} + +sub start_workers { + for my $nr (grep { !defined($WORKERS{$_}) } (0..($nworker - 1))) { + start_worker($nr) if $in; + } +} + +sub do_sigttou { + if ($in && $nworker > 1) { + --$nworker; + my @nr = grep { $_ >= $nworker } keys %WORKERS; + kill('TERM', @WORKERS{@nr}); + } +} + +sub reopen_logs { + my $p = $ENV{STDOUT_PATH}; + defined($p) && open(STDOUT, '>>', $p) and STDOUT->autoflush(1); + $p = $ENV{STDERR_PATH}; + defined($p) && open(STDERR, '>>', $p) and STDERR->autoflush(1); +} + +sub parent_reopen_logs { + reopen_logs(); + kill('USR1', values %WORKERS); +} + +sub xh_alive { $in || scalar(keys %WORKERS) } + +sub start (@) { + my (@argv) = @_; + my $c = getsockopt(local $in = \*STDIN, SOL_SOCKET, SO_TYPE); + unpack('i', $c) == SOCK_SEQPACKET or die 'stdin is not SOCK_SEQPACKET'; + + local (%SRCH, %WORKERS); + PublicInbox::Search::load_xapian(); + $GLP->getoptionsfromarray(\@argv, my $opt = { j => 1 }, 'j=i') or + die 'bad args'; + local $workerset = POSIX::SigSet->new; + $workerset->fillset or die "fillset: $!"; + for (@PublicInbox::DS::UNBLOCKABLE, POSIX::SIGUSR1) { + $workerset->delset($_) or die "delset($_): $!"; + } + + local $nworker = $opt->{j}; + return recv_loop() if $nworker == 0; + die '-j must be >= 0' if $nworker < 0; + for (POSIX::SIGTERM, POSIX::SIGCHLD) { + $workerset->delset($_) or die "delset($_): $!"; + } + my $sig = { + TTIN => sub { + if ($in) { + ++$nworker; + PublicInbox::DS::requeue(\&start_workers) + } + }, + TTOU => \&do_sigttou, + CHLD => \&PublicInbox::DS::enqueue_reap, + USR1 => \&parent_reopen_logs, + }; + PublicInbox::DS::block_signals(); + start_workers(); + @PublicInbox::DS::post_loop_do = \&xh_alive; + PublicInbox::DS::event_loop($sig); +} + +1; diff --git a/lib/PublicInbox/XapHelperCxx.pm b/lib/PublicInbox/XapHelperCxx.pm new file mode 100644 index 00000000..74852ad1 --- /dev/null +++ b/lib/PublicInbox/XapHelperCxx.pm @@ -0,0 +1,141 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Just-ahead-of-time builder for the lib/PublicInbox/xap_helper.h shim. +# I never want users to be without source code for repairs, so this +# aims to replicate the feel of a scripting language using C++. +# The resulting executable is not linked to Perl in any way. +package PublicInbox::XapHelperCxx; +use v5.12; +use PublicInbox::Spawn qw(run_die run_qx run_wait which); +use PublicInbox::IO qw(try_cat write_file); +use PublicInbox::Search; +use Fcntl qw(SEEK_SET); +use Config; +use autodie; +my $cxx = which($ENV{CXX} // 'c++') // which('clang') // die 'no C++ compiler'; +my $dir = substr("$cxx-$Config{archname}", 1); # drop leading '/' +$dir =~ tr!/!-!; +my $idir; +if ((defined($ENV{XDG_CACHE_HOME}) && -d $ENV{XDG_CACHE_HOME}) || + (defined($ENV{HOME}) && -d $ENV{HOME})) { + $idir = ($ENV{XDG_CACHE_HOME} // + (($ENV{HOME} // die('HOME unset')).'/.cache') + ).'/public-inbox/jaot'; +} +$idir //= $ENV{PERL_INLINE_DIRECTORY} // + die 'HOME and PERL_INLINE_DIRECTORY unset'; +substr($dir, 0, 0) = "$idir/"; +my $bin = "$dir/xap_helper"; +my ($srcpfx) = (__FILE__ =~ m!\A(.+/)[^/]+\z!); +my @srcs = map { $srcpfx.$_ } qw(xh_mset.h xh_cidx.h xap_helper.h); +my @pm_dep = map { $srcpfx.$_ } qw(Search.pm CodeSearch.pm); +my $ldflags = '-Wl,-O1'; +$ldflags .= ' -Wl,--compress-debug-sections=zlib' if $^O ne 'openbsd'; +my $xflags = ($ENV{CXXFLAGS} // '-Wall -ggdb3 -pipe') . ' ' . + ' -DTHREADID=' . PublicInbox::Search::THREADID . + ' -DXH_SPEC="'.join('', + map { s/=.*/:/; $_ } @PublicInbox::Search::XH_SPEC) . '" ' . + ($ENV{LDFLAGS} // $ldflags); +substr($xflags, 0, 0, '-O2 ') if !defined($ENV{CXXFLAGS}) && !-w __FILE__; +my $xap_modversion; + +sub xap_cfg (@) { + my $cmd = [ $ENV{PKG_CONFIG} // 'pkg-config', @_, 'xapian-core' ]; + chomp(my $ret = run_qx($cmd, undef, { 2 => \(my $err) })); + return $ret if !$?; + die <<EOM; +@$cmd failed: Xapian development files missing? (\$?=$?) +$err +EOM +} + +sub needs_rebuild () { + my $prev = try_cat("$dir/XFLAGS") or return 1; + chomp $prev; + return 1 if $prev ne $xflags; + + $prev = try_cat("$dir/xap_modversion") or return 1; + chomp $prev; + + $xap_modversion = xap_cfg('--modversion'); + $xap_modversion ne $prev; +} + +sub build () { + if (!-d $dir) { + require File::Path; + eval { File::Path::make_path($dir) }; + if (!-d $dir && defined($ENV{PERL_INLINE_DIRECTORY})) { + $dir = $ENV{PERL_INLINE_DIRECTORY}; + File::Path::make_path($dir); + } + } + require PublicInbox::CodeSearch; + require PublicInbox::Lock; + my ($prog) = ($bin =~ m!/([^/]+)\z!); + my $lk = PublicInbox::Lock->new("$dir/$prog.lock")->lock_for_scope; + write_file '>', "$dir/$prog.cpp", qq{#include "xap_helper.h"\n}, + PublicInbox::Search::generate_cxx(), + PublicInbox::CodeSearch::generate_cxx(); + + # xap_modversion may be set by needs_rebuild + $xap_modversion //= xap_cfg('--modversion'); + my $fl = xap_cfg(qw(--libs --cflags)); + + # Using rpath seems acceptable/encouraged in the NetBSD packaging world + # since /usr/pkg/lib isn't searched by the dynamic loader by default. + # Not sure if other OSes need this, but rpath seems fine for JAOT + # binaries (like this one) even if other distros discourage it for + # distributed packages. + $^O eq 'netbsd' and $fl =~ s/(\A|[ \t])\-L([^ \t]+)([ \t]|\z)/ + "$1-L$2 -Wl,-rpath=$2$3"/egsx; + my @xflags = split(' ', "$fl $xflags"); # ' ' awk-mode eats leading WS + my @cflags = ('-I', $srcpfx, grep(!/\A-(?:Wl|l|L)/, @xflags)); + run_die([$cxx, '-o', "$dir/$prog.o", '-c', "$dir/$prog.cpp", @cflags]); + + # xapian on Alpine Linux (tested 3.19.0) is linked against libstdc++, + # and clang needs to be told to use it (rather than libc++): + my @try = rindex($cxx, 'clang') >= 0 ? qw(-lstdc++) : (); + my @cmd = ($cxx, '-o', "$dir/$prog.tmp", "$dir/$prog.o", @xflags); + while (run_wait(\@cmd) and @try) { + warn("# attempting to link again with $try[0]...\n"); + push(@cmd, shift(@try)); + } + die "# @cmd failed: \$?=$?" if $?; + unlink "$dir/$prog.cpp", "$dir/$prog.o"; + write_file '>', "$dir/XFLAGS.tmp", $xflags, "\n"; + write_file '>', "$dir/xap_modversion.tmp", $xap_modversion, "\n"; + undef $xap_modversion; # do we ever build() twice? + # not quite atomic, but close enough :P + rename("$dir/$_.tmp", "$dir/$_") for ($prog, qw(XFLAGS xap_modversion)); +} + +sub check_build () { + use Time::HiRes qw(stat); + my $ctime = 0; + my @bin = stat($bin) or return build(); + for (@srcs, @pm_dep) { + my @st = stat($_) or die "stat $_: $!"; + if ($st[10] > $ctime) { + $ctime = $st[10]; + return build() if $ctime > $bin[10]; + } + } + needs_rebuild() ? build() : 0; +} + +# returns spawn arg +sub cmd { + die 'PI_NO_CXX set' if $ENV{PI_NO_CXX}; + check_build(); + my @cmd; + if (my $v = $ENV{VALGRIND}) { + $v = 'valgrind -v' if $v eq '1'; + @cmd = split(/\s+/, $v); + } + push @cmd, $bin; + \@cmd; +} + +1; diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm index 10685636..9a148ae4 100644 --- a/lib/PublicInbox/Xapcmd.pm +++ b/lib/PublicInbox/Xapcmd.pm @@ -1,15 +1,17 @@ # Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::Xapcmd; -use strict; +use v5.12; use PublicInbox::Spawn qw(which popen_rd); use PublicInbox::Syscall; use PublicInbox::Admin qw(setup_signals); use PublicInbox::Over; +use PublicInbox::Search qw(xap_terms); use PublicInbox::SearchIdx; use File::Temp 0.19 (); # ->newdir use File::Path qw(remove_tree); use POSIX qw(WNOHANG _exit); +use PublicInbox::DS; # support testing with dev versions of Xapian which installs # commands with a version number suffix (e.g. "xapian-compact-1.5") @@ -75,15 +77,15 @@ sub commit_changes ($$$$) { $tmp = undef; if (!$opt->{-coarse_lock}) { $opt->{-skip_lock} = 1; - $im //= $ibx if $ibx->can('eidx_sync'); - if ($im->can('count_shards')) { # v2w or eidx + $im //= $ibx if $ibx->can('eidx_sync') || $ibx->can('cidx_run'); + if ($im->can('count_shards')) { # v2w, eidx, cidx my $pr = $opt->{-progress}; my $n = $im->count_shards; if (defined $reshard && $n != $reshard) { die "BUG: counted $n shards after resharding to $reshard"; } - my $prev = $im->{shards}; + my $prev = $im->{shards} // $ibx->{nshard}; if ($pr && $prev != $n) { $pr->("shard count changed: $prev => $n\n"); $im->{shards} = $n; @@ -93,7 +95,7 @@ sub commit_changes ($$$$) { local %ENV = (%ENV, %$env) if $env; if ($ibx->can('eidx_sync')) { $ibx->eidx_sync($opt); - } else { + } elsif (!$ibx->can('cidx_run')) { PublicInbox::Admin::index_inbox($ibx, $im, $opt); } } @@ -101,10 +103,8 @@ sub commit_changes ($$$$) { sub cb_spawn { my ($cb, $args, $opt) = @_; # $cb = cpdb() or compact() - my $seed = rand(0xffffffff); - my $pid = fork // die "fork: $!"; + my $pid = PublicInbox::DS::fork_persist; return $pid if $pid > 0; - srand($seed); $SIG{__DIE__} = sub { warn @_; _exit(1) }; # don't jump up stack $cb->($args, $opt); _exit(0); @@ -117,7 +117,8 @@ sub runnable_or_die ($) { sub prepare_reindex ($$) { my ($ibx, $opt) = @_; - if ($ibx->can('eidx_sync')) { # no prep needed for ExtSearchIdx + if ($ibx->can('eidx_sync') || $ibx->can('cidx_run')) { + # no prep needed for ExtSearchIdx nor CodeSearchIdx } elsif ($ibx->version == 1) { my $dir = $ibx->search->xdir(1); my $xdb = $PublicInbox::Search::X{Database}->new($dir); @@ -147,8 +148,9 @@ sub kill_pids { } sub process_queue { - my ($queue, $cb, $opt) = @_; + my ($queue, $task, $opt) = @_; my $max = $opt->{jobs} // scalar(@$queue); + my $cb = \&$task; if ($max <= 1) { while (defined(my $args = shift @$queue)) { $cb->($args, $opt); @@ -186,7 +188,9 @@ sub prepare_run { my $tmp = {}; # old shard dir => File::Temp->newdir object or undef my @queue; # ([old//src,newdir]) - list of args for cpdb() or compact() my ($old, $misc_ok); - if ($ibx->can('eidx_sync')) { + if ($ibx->can('cidx_run')) { + $old = $ibx->xdir(1); + } elsif ($ibx->can('eidx_sync')) { $misc_ok = 1; $old = $ibx->xdir(1); } elsif (my $srch = $ibx->search) { @@ -219,7 +223,7 @@ sub prepare_run { my @old_shards; while (defined(my $dn = readdir($dh))) { if ($dn =~ /\A[0-9]+\z/) { - push @old_shards, $dn; + push(@old_shards, $dn + 0); } elsif ($dn eq '.' || $dn eq '..') { } elsif ($dn =~ /\Aover\.sqlite3/) { } elsif ($dn eq 'misc' && $misc_ok) { @@ -228,7 +232,7 @@ sub prepare_run { } } die "No Xapian shards found in $old\n" unless @old_shards; - + @old_shards = sort { $a <=> $b } @old_shards; my ($src, $max_shard); if (!defined($reshard) || $reshard == scalar(@old_shards)) { # 1:1 copy @@ -256,38 +260,21 @@ sub prepare_run { sub check_compact () { runnable_or_die($XAPIAN_COMPACT) } -sub _run { # with_umask callback - my ($ibx, $cb, $opt) = @_; - my $im = $ibx->can('importer') ? $ibx->importer(0) : undef; - ($im // $ibx)->lock_acquire; - my ($tmp, $queue) = prepare_run($ibx, $opt); - - # fine-grained locking if we prepare for reindex - if (!$opt->{-coarse_lock}) { - prepare_reindex($ibx, $opt); - ($im // $ibx)->lock_release; - } - - $ibx->cleanup if $ibx->can('cleanup'); - process_queue($queue, $cb, $opt); - ($im // $ibx)->lock_acquire if !$opt->{-coarse_lock}; - commit_changes($ibx, $im, $tmp, $opt); -} - sub run { my ($ibx, $task, $opt) = @_; # task = 'cpdb' or 'compact' - my $cb = \&$task; PublicInbox::Admin::progress_prepare($opt ||= {}); my $dir; - for my $fld (qw(inboxdir topdir)) { + for my $fld (qw(inboxdir topdir cidx_dir)) { my $d = $ibx->{$fld} // next; -d $d or die "$fld=$d does not exist\n"; $dir = $d; last; } - check_compact() if $opt->{compact} && $ibx->search; + check_compact() if $opt->{compact} && + ($ibx->can('cidx_run') || $ibx->search); - if (!$ibx->can('eidx_sync') && !$opt->{-coarse_lock}) { + if (!$ibx->can('eidx_sync') && $ibx->can('version') && + !$opt->{-coarse_lock}) { # per-epoch ranges for v2 # v1:{ from => $OID }, v2:{ from => [ $OID, $OID, $OID ] } } $opt->{reindex} = { from => $ibx->version == 1 ? '' : [] }; @@ -296,7 +283,26 @@ sub run { local @SIG{keys %SIG} = values %SIG; setup_signals(); - $ibx->with_umask(\&_run, $ibx, $cb, $opt); + my $restore = $ibx->with_umask; + + my $im = $ibx->can('importer') ? $ibx->importer(0) : undef; + ($im // $ibx)->lock_acquire; + my ($tmp, $queue) = prepare_run($ibx, $opt); + + # fine-grained locking if we prepare for reindex + if (!$opt->{-coarse_lock}) { + prepare_reindex($ibx, $opt); + ($im // $ibx)->lock_release; + } + + $ibx->cleanup if $ibx->can('cleanup'); + if ($task eq 'cpdb' && $opt->{reshard} && $ibx->can('cidx_run')) { + cidx_reshard($ibx, $queue, $opt); + } else { + process_queue($queue, $task, $opt); + } + ($im // $ibx)->lock_acquire if !$opt->{-coarse_lock}; + commit_changes($ibx, $im, $tmp, $opt); } sub cpdb_retryable ($$) { @@ -315,15 +321,16 @@ sub cpdb_retryable ($$) { sub progress_pfx ($) { my ($wip) = @_; # tempdir v2: ([0-9])+-XXXX - my @p = split('/', $wip); + my @p = split(m'/', $wip); - # return "xap15/0" for v2, or "xapian15" for v1: - ($p[-1] =~ /\A([0-9]+)/) ? "$p[-2]/$1" : $p[-1]; + # "basename(inboxdir)/xap15/0" for v2, + # "basename(inboxdir)/xapian15" for v1: + ($p[-1] =~ /\A([0-9]+)/) ? "$p[-3]/$p[-2]/$1" : "$p[-2]/$p[-1]"; } sub kill_compact { # setup_signals callback - my ($sig, $pidref) = @_; - kill($sig, $$pidref) if defined($$pidref); + my ($sig, $ioref) = @_; + kill($sig, $$ioref->attached_pid // return) if defined($$ioref); } # xapian-compact wrapper @@ -351,18 +358,16 @@ sub compact ($$) { # cb_spawn callback } $pr->("$pfx `".join(' ', @$cmd)."'\n") if $pr; push @$cmd, $src, $dst; - my ($rd, $pid); local @SIG{keys %SIG} = values %SIG; - setup_signals(\&kill_compact, \$pid); - ($rd, $pid) = popen_rd($cmd, undef, $rdr); + setup_signals(\&kill_compact, \my $rd); + $rd = popen_rd($cmd, undef, $rdr); while (<$rd>) { if ($pr) { s/\r/\r$pfx /g; $pr->("$pfx $_"); } } - waitpid($pid, 0); - die "@$cmd failed: \$?=$?\n" if $?; + $rd->close or die "@$cmd failed: \$?=$?\n"; } sub cpdb_loop ($$$;$$) { @@ -406,17 +411,95 @@ sub cpdb_loop ($$$;$$) { } while (cpdb_retryable($src, $pfx)); } +sub xapian_write_prep ($) { + my ($opt) = @_; + PublicInbox::SearchIdx::load_xapian_writable(); + my $flag = eval($PublicInbox::Search::Xap.'::DB_CREATE()'); + die if $@; + $flag |= $PublicInbox::SearchIdx::DB_NO_SYNC if !$opt->{fsync}; + (\%PublicInbox::Search::X, $flag); +} + +sub compact_tmp_shard ($) { + my ($wip) = @_; + my $new = $wip->dirname; + my ($dir) = ($new =~ m!(.*?/)[^/]+/*\z!); + same_fs_or_die($dir, $new); + my $ft = File::Temp->newdir("$new.compact-XXXX", DIR => $dir); + PublicInbox::Syscall::nodatacow_dir($ft->dirname); + $ft; +} + +sub cidx_reshard { # not docid based + my ($cidx, $queue, $opt) = @_; + my ($X, $flag) = xapian_write_prep($opt); + my $src = $cidx->xdb; + delete($cidx->{xdb}) == $src or die "BUG: xdb != $src"; + my $pfx = $opt->{-progress_pfx} = progress_pfx($cidx->xdir.'/0'); + my $pr = $opt->{-progress}; + my $pr_data = { pr => $pr, pfx => $pfx, nr => 0 } if $pr; + local @SIG{keys %SIG} = values %SIG; + + # like copydatabase(1), be sure we don't overwrite anything in case + # of other bugs: + setup_signals() if $opt->{compact}; + my @tmp; + my @dst = map { + my $wip = $_->[1]; + my $tmp = $opt->{compact} ? compact_tmp_shard($wip) : $wip; + push @tmp, $tmp; + $X->{WritableDatabase}->new($tmp->dirname, $flag); + } @$queue; + my $l = $src->get_metadata('indexlevel'); + $dst[0]->set_metadata('indexlevel', $l) if $l eq 'medium'; + my $fmt; + if ($pr_data) { + my $tot = $src->get_doccount; + $fmt = "$pfx % ".length($tot)."u/$tot\n"; + $pr->("$pfx copying $tot documents\n"); + } + my $cur = $src->postlist_begin(''); + my $end = $src->postlist_end(''); + my $git_dir_hash = $cidx->can('git_dir_hash'); + my ($n, $nr); + for (; $cur != $end; $cur++) { + my $doc = $src->get_document($cur->get_docid); + if (my @cmt = xap_terms('Q', $doc)) { + $n = hex(substr($cmt[0], 0, 8)) % scalar(@dst); + warn "W: multi-commit: @cmt" if scalar(@cmt) != 1; + } elsif (my @P = xap_terms('P', $doc)) { + $n = $git_dir_hash->($P[0]) % scalar(@dst); + warn "W: multi-path @P " if scalar(@P) != 1; + } else { + warn "W: skipped, no terms in ".$cur->get_docid; + next; + } + $dst[$n]->add_document($doc); + $pr->(sprintf($fmt, $nr)) if $pr_data && !(++$nr & 1023); + } + return if !$opt->{compact}; + $src = undef; + @dst = (); # flushes and closes + my @q; + for my $tmp (@tmp) { + my $arg = shift @$queue // die 'BUG: $queue empty'; + my $wip = $arg->[1] // die 'BUG: no $wip'; + push @q, [ "$tmp", $wip ]; + } + delete $opt->{-progress_pfx}; + process_queue(\@q, 'compact', $opt); +} + # Like copydatabase(1), this is horribly slow; and it doesn't seem due # to the overhead of Perl. sub cpdb ($$) { # cb_spawn callback my ($args, $opt) = @_; - my ($old, $newdir) = @$args; - my $new = $newdir->dirname; + my ($old, $wip) = @$args; my ($src, $cur_shard); my $reshard; - PublicInbox::SearchIdx::load_xapian_writable(); - my $XapianDatabase = $PublicInbox::Search::X{Database}; + my ($X, $flag) = xapian_write_prep($opt); if (ref($old) eq 'ARRAY') { + my $new = $wip->dirname; ($cur_shard) = ($new =~ m!(?:xap|ei)[0-9]+/([0-9]+)\b!); defined $cur_shard or die "BUG: could not extract shard # from $new"; @@ -426,36 +509,27 @@ sub cpdb ($$) { # cb_spawn callback # resharding, M:N copy means have full read access foreach (@$old) { if ($src) { - my $sub = $XapianDatabase->new($_); + my $sub = $X->{Database}->new($_); $src->add_database($sub); } else { - $src = $XapianDatabase->new($_); + $src = $X->{Database}->new($_); } } } else { - $src = $XapianDatabase->new($old); + $src = $X->{Database}->new($old); } - my ($tmp, $ft); + my $tmp = $wip; local @SIG{keys %SIG} = values %SIG; if ($opt->{compact}) { - my ($dir) = ($new =~ m!(.*?/)[^/]+/*\z!); - same_fs_or_die($dir, $new); - $ft = File::Temp->newdir("$new.compact-XXXX", DIR => $dir); + $tmp = compact_tmp_shard($wip); setup_signals(); - $tmp = $ft->dirname; - PublicInbox::Syscall::nodatacow_dir($tmp); - } else { - $tmp = $new; } # like copydatabase(1), be sure we don't overwrite anything in case # of other bugs: - my $flag = eval($PublicInbox::Search::Xap.'::DB_CREATE()'); - die if $@; - my $XapianWritableDatabase = $PublicInbox::Search::X{WritableDatabase}; - $flag |= $PublicInbox::SearchIdx::DB_NO_SYNC if !$opt->{fsync}; - my $dst = $XapianWritableDatabase->new($tmp, $flag); + my $new = $wip->dirname; + my $dst = $X->{WritableDatabase}->new($tmp->dirname, $flag); my $pr = $opt->{-progress}; my $pfx = $opt->{-progress_pfx} = progress_pfx($new); my $pr_data = { pr => $pr, pfx => $pfx, nr => 0 } if $pr; @@ -467,11 +541,10 @@ sub cpdb ($$) { # cb_spawn callback $dst->set_metadata('last_commit', $lc) if $lc; # only the first xapian shard (0) gets 'indexlevel' - if ($new =~ m!(?:xapian[0-9]+|xap[0-9]+/0)\b!) { + if ($new =~ m!/(?:xapian[0-9]+|(?:ei|xap)[0-9]+/0)\b!) { my $l = $src->get_metadata('indexlevel'); - if ($l eq 'medium') { + $l eq 'medium' and $dst->set_metadata('indexlevel', $l); - } } if ($pr_data) { my $tot = $src->get_doccount; @@ -498,7 +571,7 @@ sub cpdb ($$) { # cb_spawn callback # individually. $src = undef; foreach (@$old) { - my $old = $XapianDatabase->new($_); + my $old = $X->{Database}->new($_); cpdb_loop($old, $dst, $pr_data, $cur_shard, $reshard); } } else { @@ -513,7 +586,6 @@ sub cpdb ($$) { # cb_spawn callback # this is probably the best place to do xapian-compact # since $dst isn't readable by HTTP or NNTP clients, yet: compact([ $tmp, $new ], $opt); - remove_tree($tmp) or die "failed to remove $tmp: $!\n"; } 1; diff --git a/lib/PublicInbox/XhcMset.pm b/lib/PublicInbox/XhcMset.pm new file mode 100644 index 00000000..ac25eece --- /dev/null +++ b/lib/PublicInbox/XhcMset.pm @@ -0,0 +1,51 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# mocks Xapian::Mset and allows slow queries from blocking the event loop +package PublicInbox::XhcMset; +use v5.12; +use parent qw(PublicInbox::DS); +use PublicInbox::XhcMsetIterator; +use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); + +sub event_step { + my ($self) = @_; + my ($cb, @args) = @{delete $self->{cb_args} // return}; + my $rd = $self->{sock}; + eval { + my $hdr = <$rd> // die "E: reading mset header: $!"; + for (split /\s+/, $hdr) { # read mset.size + estimated_matches + my ($k, $v) = split /=/, $_, 2; + $k =~ s/\A[^\.]*\.//; # s/(mset)?\./ + $self->{$k} = $v; + } + my $size = $self->{size} // die "E: bad xhc header: `$hdr'"; + my @it = map { PublicInbox::XhcMsetIterator::make($_) } <$rd>; + $self->{items} = \@it; + scalar(@it) == $size or die + 'E: got ',scalar(@it),", expected mset.size=$size"; + }; + my $err = $@; + $self->close; + eval { $cb->(@args, $self, $err) }; + warn "E: $@\n" if $@; +} + +sub maybe_new { + my (undef, $rd, $srch, @cb_args) = @_; + my $self = bless { cb_args => \@cb_args, srch => $srch }, __PACKAGE__; + if ($PublicInbox::DS::in_loop) { # async + $self->SUPER::new($rd, EPOLLIN|EPOLLONESHOT); + } else { # synchronous + $self->{sock} = $rd; + event_step($self); + undef; + } +} + +eval(join('', map { "sub $_ { \$_[0]->{$_} }\n" } qw(size + get_matches_estimated))); + +sub items { @{$_[0]->{items}} } + +1; diff --git a/lib/PublicInbox/XhcMsetIterator.pm b/lib/PublicInbox/XhcMsetIterator.pm new file mode 100644 index 00000000..dcfc61e4 --- /dev/null +++ b/lib/PublicInbox/XhcMsetIterator.pm @@ -0,0 +1,20 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# mocks Xapian::MsetIterator, there's many of these allocated at once +package PublicInbox::XhcMsetIterator; +use v5.12; + +sub make ($) { + chomp($_[0]); + my @self = map { $_ + 0 } split /\0/, $_[0]; # docid, pct, rank + # we don't store $xdb in self[4] since we avoid $it->get_document + # in favor of $xdb->get_document($it->get_docid) + bless \@self, __PACKAGE__; +} + +sub get_docid { $_[0]->[0] } +sub get_percent { $_[0]->[1] } +sub get_rank { $_[0]->[2] } + +1; diff --git a/lib/PublicInbox/xap_helper.h b/lib/PublicInbox/xap_helper.h new file mode 100644 index 00000000..3df3ce91 --- /dev/null +++ b/lib/PublicInbox/xap_helper.h @@ -0,0 +1,1098 @@ +/* + * Copyright (C) all contributors <meta@public-inbox.org> + * License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt> + * Note: GPL-2+ since it'll incorporate approxidate from git someday + * + * Standalone helper process using C and minimal C++ for Xapian, + * this is not linked to Perl in any way. + * C (not C++) is used as much as possible to lower the contribution + * barrier for hackers who mainly know C (this includes the maintainer). + * Yes, that means we use C stdlib stuff like hsearch and open_memstream + * instead their equivalents in the C++ stdlib :P + * Everything here is an unstable internal API of public-inbox and + * NOT intended for ordinary users; only public-inbox hackers + */ +#ifndef _ALL_SOURCE +# define _ALL_SOURCE +#endif +#if defined(__NetBSD__) && !defined(_OPENBSD_SOURCE) // for reallocarray(3) +# define _OPENBSD_SOURCE +#endif +#include <sys/file.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/uio.h> +#include <sys/wait.h> +#include <poll.h> + +#include <assert.h> +#include <err.h> // BSD, glibc, and musl all have this +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <search.h> +#include <signal.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> +#include <xapian.h> // our only reason for using C++ + +#define MY_VER(maj,min,rev) ((maj) << 16 | (min) << 8 | (rev)) +#define XAP_VER \ + MY_VER(XAPIAN_MAJOR_VERSION,XAPIAN_MINOR_VERSION,XAPIAN_REVISION) + +#if XAP_VER >= MY_VER(1,3,6) +# define NRP Xapian::NumberRangeProcessor +# define ADD_RP add_rangeprocessor +# define SET_MAX_EXPANSION set_max_expansion // technically 1.3.3 +#else +# define NRP Xapian::NumberValueRangeProcessor +# define ADD_RP add_valuerangeprocessor +# define SET_MAX_EXPANSION set_max_wildcard_expansion +#endif + +#if defined(__GLIBC__) +# define MY_DO_OPTRESET() do { optind = 0; } while (0) +#else /* FreeBSD, musl, dfly, NetBSD, OpenBSD */ +# define MY_DO_OPTRESET() do { optind = optreset = 1; } while (0) +#endif + +#if defined(__DragonFly__) || defined(__FreeBSD__) || defined(__GLIBC__) +# define STDERR_ASSIGNABLE (1) +#else +# define STDERR_ASSIGNABLE (0) +#endif + +// assert functions are used correctly (e.g. ensure hackers don't +// cause EINVAL/EFAULT). Not for stuff that can fail due to HW +// failures. +# define CHECK(type, expect, expr) do { \ + type ckvar______ = (expr); \ + assert(ckvar______ == (expect) && "BUG" && __FILE__ && __LINE__); \ +} while (0) + +// coredump on most usage errors since our only users are internal +#define ABORT(...) do { warnx(__VA_ARGS__); abort(); } while (0) +#define EABORT(...) do { warn(__VA_ARGS__); abort(); } while (0) + +// sock_fd is modified in signal handler, yes, it's SOCK_SEQPACKET +static volatile int sock_fd = STDIN_FILENO; +static sigset_t fullset, workerset; +static bool alive = true; +#if STDERR_ASSIGNABLE +static FILE *orig_err = stderr; +#endif +static int orig_err_fd = -1; +static void *srch_tree; // tsearch + tdelete + twalk +static pid_t *worker_pids; // nr => pid +#define WORKER_MAX USHRT_MAX +static unsigned long nworker, nworker_hwm; +static int pipefds[2]; +static const char *stdout_path, *stderr_path; // for SIGUSR1 +static sig_atomic_t worker_needs_reopen; + +// PublicInbox::Search and PublicInbox::CodeSearch generate these: +static void mail_nrp_init(void); +static void code_nrp_init(void); +static void qp_init_mail_search(Xapian::QueryParser *); +static void qp_init_code_search(Xapian::QueryParser *); + +enum exc_iter { + ITER_OK = 0, + ITER_RETRY, + ITER_ABORT +}; + +struct srch { + int paths_len; // int for comparisons + unsigned qp_flags; + Xapian::Database *db; + Xapian::QueryParser *qp; + char paths[]; // $shard_path0\0$shard_path1\0... +}; + +#define MY_ARG_MAX 256 +typedef bool (*cmd)(struct req *); + +// only one request per-process since we have RLIMIT_CPU timeout +struct req { // argv and pfxv point into global rbuf + char *argv[MY_ARG_MAX]; + char *pfxv[MY_ARG_MAX]; // -A <prefix> + size_t *lenv; // -A <prefix>LENGTH + struct srch *srch; + char *Pgit_dir; + char *Oeidx_key; + cmd fn; + unsigned long long max; + unsigned long long off; + unsigned long long threadid; + unsigned long timeout_sec; + size_t nr_out; + long sort_col; // value column, negative means BoolWeight + int argc; + int pfxc; + FILE *fp[2]; // [0] response pipe or sock, [1] status/errors (optional) + bool has_input; // fp[0] is bidirectional + bool collapse_threads; + bool code_search; + bool relevance; // sort by relevance before column + bool asc; // ascending sort +}; + +struct worker { + pid_t pid; + unsigned nr; +}; + +struct fbuf { + FILE *fp; + char *ptr; + size_t len; +}; + +#define SPLIT2ARGV(dst,buf,len) split2argv(dst,buf,len,MY_ARRAY_SIZE(dst)) +static size_t split2argv(char **dst, char *buf, size_t len, size_t limit) +{ + if (buf[0] == 0 || len == 0 || buf[len - 1] != 0) + ABORT("bogus argument given"); + size_t nr = 0; + char *c = buf; + for (size_t i = 1; i < len; i++) { + if (!buf[i]) { + dst[nr++] = c; + c = buf + i + 1; + } + if (nr == limit) + ABORT("too many args: %zu == %zu", nr, limit); + } + if (nr == 0) ABORT("no argument given"); + if ((long)nr < 0) ABORT("too many args: %zu", nr); + return (long)nr; +} + +static bool has_threadid(const struct srch *srch) +{ + return srch->db->get_metadata("has_threadid") == "1"; +} + +static Xapian::Enquire prep_enquire(const struct req *req) +{ + Xapian::Enquire enq(*req->srch->db); + if (req->sort_col < 0) { + enq.set_weighting_scheme(Xapian::BoolWeight()); + enq.set_docid_order(req->asc ? Xapian::Enquire::ASCENDING + : Xapian::Enquire::DESCENDING); + } else if (req->relevance) { + enq.set_sort_by_relevance_then_value(req->sort_col, !req->asc); + } else { + enq.set_sort_by_value_then_relevance(req->sort_col, !req->asc); + } + return enq; +} + +static Xapian::MSet enquire_mset(struct req *req, Xapian::Enquire *enq) +{ + if (!req->max) { + switch (sizeof(Xapian::doccount)) { + case 4: req->max = UINT_MAX; break; + default: req->max = ULLONG_MAX; + } + } + for (int i = 0; i < 9; i++) { + try { + Xapian::MSet mset = enq->get_mset(req->off, req->max); + return mset; + } catch (const Xapian::DatabaseModifiedError & e) { + req->srch->db->reopen(); + } + } + return enq->get_mset(req->off, req->max); +} + +// for v1, v2, and extindex +static Xapian::MSet mail_mset(struct req *req, const char *qry_str) +{ + struct srch *srch = req->srch; + Xapian::Query qry = srch->qp->parse_query(qry_str, srch->qp_flags); + if (req->Oeidx_key) { + req->Oeidx_key[0] = 'O'; // modifies static rbuf + qry = Xapian::Query(Xapian::Query::OP_FILTER, qry, + Xapian::Query(req->Oeidx_key)); + } + // TODO: uid_range + if (req->threadid != ULLONG_MAX) { + std::string tid = Xapian::sortable_serialise(req->threadid); + qry = Xapian::Query(Xapian::Query::OP_FILTER, qry, + Xapian::Query(Xapian::Query::OP_VALUE_RANGE, THREADID, + tid, tid)); + } + Xapian::Enquire enq = prep_enquire(req); + enq.set_query(qry); + // THREADID is a CPP macro defined on CLI (see) XapHelperCxx.pm + if (req->collapse_threads && has_threadid(srch)) + enq.set_collapse_key(THREADID); + + return enquire_mset(req, &enq); +} + +static bool starts_with(const std::string *s, const char *pfx, size_t pfx_len) +{ + return s->size() >= pfx_len && !memcmp(pfx, s->c_str(), pfx_len); +} + +static void apply_roots_filter(struct req *req, Xapian::Query *qry) +{ + if (!req->Pgit_dir) return; + req->Pgit_dir[0] = 'P'; // modifies static rbuf + Xapian::Database *xdb = req->srch->db; + for (int i = 0; i < 9; i++) { + try { + std::string P = req->Pgit_dir; + Xapian::PostingIterator p = xdb->postlist_begin(P); + if (p == xdb->postlist_end(P)) { + warnx("W: %s not indexed?", req->Pgit_dir + 1); + return; + } + Xapian::TermIterator cur = xdb->termlist_begin(*p); + Xapian::TermIterator end = xdb->termlist_end(*p); + cur.skip_to("G"); + if (cur == end) { + warnx("W: %s has no root commits?", + req->Pgit_dir + 1); + return; + } + Xapian::Query f = Xapian::Query(*cur); + for (++cur; cur != end; ++cur) { + std::string tn = *cur; + if (!starts_with(&tn, "G", 1)) + continue; + f = Xapian::Query(Xapian::Query::OP_OR, f, tn); + } + *qry = Xapian::Query(Xapian::Query::OP_FILTER, *qry, f); + return; + } catch (const Xapian::DatabaseModifiedError & e) { + xdb->reopen(); + } + } +} + +// for cindex +static Xapian::MSet commit_mset(struct req *req, const char *qry_str) +{ + struct srch *srch = req->srch; + Xapian::Query qry = srch->qp->parse_query(qry_str, srch->qp_flags); + apply_roots_filter(req, &qry); + + // we only want commits: + qry = Xapian::Query(Xapian::Query::OP_FILTER, qry, + Xapian::Query("T" "c")); + Xapian::Enquire enq = prep_enquire(req); + enq.set_query(qry); + return enquire_mset(req, &enq); +} + +static void emit_mset_stats(struct req *req, const Xapian::MSet *mset) +{ + if (req->fp[1]) + fprintf(req->fp[1], "mset.size=%llu nr_out=%zu\n", + (unsigned long long)mset->size(), req->nr_out); + else + ABORT("BUG: %s caller only passed 1 FD", req->argv[0]); +} + +static int my_setlinebuf(FILE *fp) // glibc setlinebuf(3) can't report errors +{ + return setvbuf(fp, NULL, _IOLBF, 0); +} + +// n.b. __cleanup__ works fine with C++ exceptions, but not longjmp +// Only clang and g++ are supported, as AFAIK there's no other +// relevant Free(-as-in-speech) C++ compilers. +#define CLEANUP_FBUF __attribute__((__cleanup__(fbuf_ensure))) +static void fbuf_ensure(void *ptr) +{ + struct fbuf *fbuf = (struct fbuf *)ptr; + if (fbuf->fp && fclose(fbuf->fp)) + err(EXIT_FAILURE, "fclose(fbuf->fp)"); // ENOMEM? + fbuf->fp = NULL; + free(fbuf->ptr); +} + +static void fbuf_init(struct fbuf *fbuf) +{ + assert(!fbuf->ptr); + fbuf->fp = open_memstream(&fbuf->ptr, &fbuf->len); + if (!fbuf->fp) err(EXIT_FAILURE, "open_memstream(fbuf)"); +} + +static bool write_all(int fd, const struct fbuf *wbuf, size_t len) +{ + const char *p = wbuf->ptr; + assert(wbuf->len >= len); + do { // write to client FD + ssize_t n = write(fd, p, len); + if (n > 0) { + len -= n; + p += n; + } else { + perror(n ? "write" : "write (zero bytes)"); + return false; + } + } while (len); + return true; +} + +#define ERR_FLUSH(f) do { \ + if (ferror(f) | fflush(f)) err(EXIT_FAILURE, "ferror|fflush "#f); \ +} while (0) + +#define ERR_CLOSE(f, e) do { \ + if (ferror(f) | fclose(f)) \ + e ? err(e, "ferror|fclose "#f) : perror("ferror|fclose "#f); \ +} while (0) + +static void xclose(int fd) +{ + if (close(fd) < 0 && errno != EINTR) + EABORT("BUG: close"); +} + +static size_t off2size(off_t n) +{ + if (n < 0 || (uintmax_t)n > SIZE_MAX) + ABORT("off_t out of size_t range: %lld\n", (long long)n); + return (size_t)n; +} + +static char *hsearch_enter_key(char *s) +{ +#if defined(__OpenBSD__) || defined(__DragonFly__) + // hdestroy frees each key on some platforms, + // so give it something to free: + char *ret = strdup(s); + if (!ret) err(EXIT_FAILURE, "strdup"); + return ret; +// AFAIK there's no way to detect musl, assume non-glibc Linux is musl: +#elif defined(__GLIBC__) || defined(__linux__) || \ + defined(__FreeBSD__) || defined(__NetBSD__) + // do nothing on these platforms +#else +#warning untested platform detected, unsure if hdestroy(3) frees keys +#warning contact us at meta@public-inbox.org if you get segfaults +#endif + return s; +} + +// for test usage only, we need to ensure the compiler supports +// __cleanup__ when exceptions are thrown +struct inspect { struct req *req; }; + +static void inspect_ensure(struct inspect *x) +{ + fprintf(x->req->fp[0], "pid=%d has_threadid=%d", + (int)getpid(), has_threadid(x->req->srch) ? 1 : 0); +} + +static bool cmd_test_inspect(struct req *req) +{ + __attribute__((__cleanup__(inspect_ensure))) struct inspect x; + x.req = req; + try { + throw Xapian::InvalidArgumentError("test"); + } catch (Xapian::InvalidArgumentError) { + return true; + } + fputs("this should not be printed", req->fp[0]); + return false; +} + +static bool cmd_test_sleep(struct req *req) +{ + for (;;) poll(NULL, 0, 10); + return false; +} +#include "xh_mset.h" // read-only (WWW, IMAP, lei) stuff +#include "xh_cidx.h" // CodeSearchIdx.pm stuff + +#define CMD(n) { .fn_len = sizeof(#n) - 1, .fn_name = #n, .fn = cmd_##n } +static const struct cmd_entry { + size_t fn_len; + const char *fn_name; + cmd fn; +} cmds[] = { // should be small enough to not need bsearch || gperf + // most common commands first + CMD(mset), // WWW and IMAP requests + CMD(dump_ibx), // many inboxes + CMD(dump_roots), // per-cidx shard + CMD(test_inspect), // least common commands last + CMD(test_sleep), // least common commands last +}; + +#define MY_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) +#define RECV_FD_CAPA 2 +#define RECV_FD_SPACE (RECV_FD_CAPA * sizeof(int)) +union my_cmsg { + struct cmsghdr hdr; + char pad[sizeof(struct cmsghdr) + 16 + RECV_FD_SPACE]; +}; + +static bool recv_req(struct req *req, char *rbuf, size_t *len) +{ + union my_cmsg cmsg = {}; + struct msghdr msg = {}; + struct iovec iov; + ssize_t r; + iov.iov_base = rbuf; + iov.iov_len = *len; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = &cmsg.hdr; + msg.msg_controllen = CMSG_SPACE(RECV_FD_SPACE); + + // allow SIGTERM to hit + CHECK(int, 0, sigprocmask(SIG_SETMASK, &workerset, NULL)); + +again: + r = recvmsg(sock_fd, &msg, 0); + if (r == 0) { + exit(EX_NOINPUT); /* grandparent went away */ + } else if (r < 0) { + switch (errno) { + case EINTR: goto again; + case EBADF: if (sock_fd < 0) exit(0); + // fall-through + default: err(EXIT_FAILURE, "recvmsg"); + } + } + + // success! no signals for the rest of the request/response cycle + CHECK(int, 0, sigprocmask(SIG_SETMASK, &fullset, NULL)); + if (r > 0 && msg.msg_flags) + ABORT("unexpected msg_flags"); + + *len = r; + if (cmsg.hdr.cmsg_level == SOL_SOCKET && + cmsg.hdr.cmsg_type == SCM_RIGHTS) { + size_t clen = cmsg.hdr.cmsg_len; + int *fdp = (int *)CMSG_DATA(&cmsg.hdr); + size_t i; + for (i = 0; CMSG_LEN((i + 1) * sizeof(int)) <= clen; i++) { + int fd = *fdp++; + const char *mode = NULL; + int fl = fcntl(fd, F_GETFL); + if (fl == -1) { + errx(EXIT_FAILURE, "invalid fd=%d", fd); + } else if (fl & O_WRONLY) { + mode = "w"; + } else if (fl & O_RDWR) { + mode = "r+"; + if (i == 0) req->has_input = true; + } else { + errx(EXIT_FAILURE, + "invalid mode from F_GETFL: 0x%x", fl); + } + req->fp[i] = fdopen(fd, mode); + if (!req->fp[i]) + err(EXIT_FAILURE, "fdopen(fd=%d)", fd); + } + return true; + } + errx(EXIT_FAILURE, "no FD received in %zd-byte request", r); + return false; +} + +static int srch_cmp(const void *pa, const void *pb) // for tfind|tsearch +{ + const struct srch *a = (const struct srch *)pa; + const struct srch *b = (const struct srch *)pb; + int diff = a->paths_len - b->paths_len; + + return diff ? diff : memcmp(a->paths, b->paths, (size_t)a->paths_len); +} + +static bool is_chert(const char *dir) +{ + char iamchert[PATH_MAX]; + struct stat sb; + int rc = snprintf(iamchert, sizeof(iamchert), "%s/iamchert", dir); + + if (rc <= 0 || rc >= (int)sizeof(iamchert)) + err(EXIT_FAILURE, "BUG: snprintf(%s/iamchert)", dir); + if (stat(iamchert, &sb) == 0 && S_ISREG(sb.st_mode)) + return true; + return false; +} + +static bool srch_init(struct req *req) +{ + char *dirv[MY_ARG_MAX]; + int i; + struct srch *srch = req->srch; + int dirc = (int)SPLIT2ARGV(dirv, srch->paths, (size_t)srch->paths_len); + const unsigned FLAG_PHRASE = Xapian::QueryParser::FLAG_PHRASE; + srch->qp_flags = FLAG_PHRASE | + Xapian::QueryParser::FLAG_BOOLEAN | + Xapian::QueryParser::FLAG_LOVEHATE | + Xapian::QueryParser::FLAG_WILDCARD; + if (is_chert(dirv[0])) + srch->qp_flags &= ~FLAG_PHRASE; + try { + srch->db = new Xapian::Database(dirv[0]); + } catch (...) { + warn("E: Xapian::Database(%s)", dirv[0]); + return false; + } + try { + for (i = 1; i < dirc; i++) { + if (srch->qp_flags & FLAG_PHRASE && is_chert(dirv[i])) + srch->qp_flags &= ~FLAG_PHRASE; + srch->db->add_database(Xapian::Database(dirv[i])); + } + } catch (...) { + warn("E: add_database(%s)", dirv[i]); + return false; + } + try { + srch->qp = new Xapian::QueryParser; + } catch (...) { + perror("E: Xapian::QueryParser"); + return false; + } + srch->qp->set_default_op(Xapian::Query::OP_AND); + srch->qp->set_database(*srch->db); + try { + srch->qp->set_stemmer(Xapian::Stem("english")); + } catch (...) { + perror("E: Xapian::Stem"); + return false; + } + srch->qp->set_stemming_strategy(Xapian::QueryParser::STEM_SOME); + srch->qp->SET_MAX_EXPANSION(100); + + if (req->code_search) + qp_init_code_search(srch->qp); // CodeSearch.pm + else + qp_init_mail_search(srch->qp); // Search.pm + return true; +} + +static void free_srch(void *p) // tdestroy +{ + struct srch *srch = (struct srch *)p; + delete srch->qp; + delete srch->db; + free(srch); +} + +static void dispatch(struct req *req) +{ + int c; + size_t size = strlen(req->argv[0]); + union { + struct srch *srch; + char *ptr; + } kbuf; + char *end; + FILE *kfp; + struct srch **s; + req->threadid = ULLONG_MAX; + for (c = 0; c < (int)MY_ARRAY_SIZE(cmds); c++) { + if (cmds[c].fn_len == size && + !memcmp(cmds[c].fn_name, req->argv[0], size)) { + req->fn = cmds[c].fn; + break; + } + } + if (!req->fn) ABORT("not handled: `%s'", req->argv[0]); + + kfp = open_memstream(&kbuf.ptr, &size); + if (!kfp) err(EXIT_FAILURE, "open_memstream(kbuf)"); + // write padding, first (contents don't matter) + fwrite(&req->argv[0], offsetof(struct srch, paths), 1, kfp); + + // global getopt variables: + optopt = 0; + optarg = NULL; + MY_DO_OPTRESET(); + + // XH_SPEC is generated from @PublicInbox::Search::XH_SPEC + while ((c = getopt(req->argc, req->argv, XH_SPEC)) != -1) { + switch (c) { + case 'a': req->asc = true; break; + case 'c': req->code_search = true; break; + case 'd': fwrite(optarg, strlen(optarg) + 1, 1, kfp); break; + case 'g': req->Pgit_dir = optarg - 1; break; // pad "P" prefix + case 'k': + req->sort_col = strtol(optarg, &end, 10); + if (*end) ABORT("-k %s", optarg); + switch (req->sort_col) { + case LONG_MAX: case LONG_MIN: ABORT("-k %s", optarg); + } + break; + case 'm': + req->max = strtoull(optarg, &end, 10); + if (*end || req->max == ULLONG_MAX) + ABORT("-m %s", optarg); + break; + case 'o': + req->off = strtoull(optarg, &end, 10); + if (*end || req->off == ULLONG_MAX) + ABORT("-o %s", optarg); + break; + case 'r': req->relevance = true; break; + case 't': req->collapse_threads = true; break; + case 'A': + req->pfxv[req->pfxc++] = optarg; + if (MY_ARG_MAX == req->pfxc) + ABORT("too many -A"); + break; + case 'K': + req->timeout_sec = strtoul(optarg, &end, 10); + if (*end || req->timeout_sec == ULONG_MAX) + ABORT("-K %s", optarg); + break; + case 'O': req->Oeidx_key = optarg - 1; break; // pad "O" prefix + case 'T': + req->threadid = strtoull(optarg, &end, 10); + if (*end || req->threadid == ULLONG_MAX) + ABORT("-T %s", optarg); + break; + default: ABORT("bad switch `-%c'", c); + } + } + ERR_CLOSE(kfp, EXIT_FAILURE); // may ENOMEM, sets kbuf.srch + kbuf.srch->db = NULL; + kbuf.srch->qp = NULL; + kbuf.srch->paths_len = size - offsetof(struct srch, paths); + if (kbuf.srch->paths_len <= 0) + ABORT("no -d args"); + s = (struct srch **)tsearch(kbuf.srch, &srch_tree, srch_cmp); + if (!s) err(EXIT_FAILURE, "tsearch"); // likely ENOMEM + req->srch = *s; + if (req->srch != kbuf.srch) { // reuse existing + free_srch(kbuf.srch); + } else if (!srch_init(req)) { + assert(kbuf.srch == *((struct srch **)tfind( + kbuf.srch, &srch_tree, srch_cmp))); + void *del = tdelete(kbuf.srch, &srch_tree, srch_cmp); + assert(del); + free_srch(kbuf.srch); + goto cmd_err; // srch_init already warned + } + if (req->timeout_sec) + alarm(req->timeout_sec > UINT_MAX ? + UINT_MAX : (unsigned)req->timeout_sec); + try { + if (!req->fn(req)) + warnx("`%s' failed", req->argv[0]); + } catch (const Xapian::Error & e) { + warnx("Xapian::Error: %s", e.get_description().c_str()); + } catch (...) { + warn("unhandled exception"); + } + if (req->timeout_sec) + alarm(0); +cmd_err: + return; // just be silent on errors, for now +} + +static void cleanup_pids(void) +{ + free(worker_pids); + worker_pids = NULL; +} + +static void stderr_set(FILE *tmp_err) +{ +#if STDERR_ASSIGNABLE + if (my_setlinebuf(tmp_err)) + perror("W: setlinebuf(tmp_err)"); + stderr = tmp_err; + return; +#endif + int fd = fileno(tmp_err); + if (fd < 0) err(EXIT_FAILURE, "BUG: fileno(tmp_err)"); + while (dup2(fd, STDERR_FILENO) < 0) { + if (errno != EINTR) + err(EXIT_FAILURE, "dup2(%d => 2)", fd); + } +} + +static void stderr_restore(FILE *tmp_err) +{ +#if STDERR_ASSIGNABLE + stderr = orig_err; + return; +#endif + ERR_FLUSH(stderr); + while (dup2(orig_err_fd, STDERR_FILENO) < 0) { + if (errno != EINTR) + err(EXIT_FAILURE, "dup2(%d => 2)", orig_err_fd); + } + clearerr(stderr); +} + +static void sigw(int sig) // SIGTERM+SIGUSR1 handler for worker +{ + switch (sig) { + case SIGUSR1: worker_needs_reopen = 1; break; + default: sock_fd = -1; // break out of recv_loop + } +} + +#define CLEANUP_REQ __attribute__((__cleanup__(req_cleanup))) +static void req_cleanup(void *ptr) +{ + struct req *req = (struct req *)ptr; + free(req->lenv); +} + +static void reopen_logs(void) +{ + if (stdout_path && *stdout_path && !freopen(stdout_path, "a", stdout)) + err(EXIT_FAILURE, "freopen %s", stdout_path); + if (stderr_path && *stderr_path) { + if (!freopen(stderr_path, "a", stderr)) + err(EXIT_FAILURE, "freopen %s", stderr_path); + if (my_setlinebuf(stderr)) + err(EXIT_FAILURE, "setlinebuf(stderr)"); + } +} + +static void recv_loop(void) // worker process loop +{ + static char rbuf[4096 * 33]; // per-process + struct sigaction sa = {}; + sa.sa_handler = sigw; + + CHECK(int, 0, sigaction(SIGTERM, &sa, NULL)); + CHECK(int, 0, sigaction(SIGUSR1, &sa, NULL)); + + while (sock_fd == 0) { + size_t len = sizeof(rbuf); + CLEANUP_REQ struct req req = {}; + + if (!recv_req(&req, rbuf, &len)) + continue; + if (req.fp[1]) + stderr_set(req.fp[1]); + req.argc = (int)SPLIT2ARGV(req.argv, rbuf, len); + dispatch(&req); + ERR_CLOSE(req.fp[0], 0); + if (req.fp[1]) { + stderr_restore(req.fp[1]); + ERR_CLOSE(req.fp[1], 0); + } + if (worker_needs_reopen) { + worker_needs_reopen = 0; + reopen_logs(); + } + } +} + +static void insert_pid(pid_t pid, unsigned nr) +{ + assert(!worker_pids[nr]); + worker_pids[nr] = pid; +} + +static void start_worker(unsigned nr) +{ + pid_t pid = fork(); + if (pid < 0) { + warn("E: fork(worker=%u)", nr); + } else if (pid > 0) { + insert_pid(pid, nr); + } else { + cleanup_pids(); + xclose(pipefds[0]); + xclose(pipefds[1]); + if (signal(SIGCHLD, SIG_DFL) == SIG_ERR) + err(EXIT_FAILURE, "signal CHLD"); + if (signal(SIGTTIN, SIG_IGN) == SIG_ERR) + err(EXIT_FAILURE, "signal TTIN"); + if (signal(SIGTTOU, SIG_IGN) == SIG_ERR) + err(EXIT_FAILURE, "signal TTIN"); + recv_loop(); + exit(0); + } +} + +static void start_workers(void) +{ + sigset_t old; + + CHECK(int, 0, sigprocmask(SIG_SETMASK, &fullset, &old)); + for (unsigned long nr = 0; nr < nworker; nr++) { + if (!worker_pids[nr]) + start_worker(nr); + } + CHECK(int, 0, sigprocmask(SIG_SETMASK, &old, NULL)); +} + +static void cleanup_all(void) +{ + cleanup_pids(); +#ifdef __GLIBC__ + tdestroy(srch_tree, free_srch); + srch_tree = NULL; +#endif +} + +static void parent_reopen_logs(void) +{ + reopen_logs(); + for (unsigned long nr = nworker; nr < nworker_hwm; nr++) { + pid_t pid = worker_pids[nr]; + if (pid != 0 && kill(pid, SIGUSR1)) + warn("BUG?: kill(%d, SIGUSR1)", (int)pid); + } +} + +static void sigp(int sig) // parent signal handler +{ + static const char eagain[] = "signals coming in too fast"; + static const char bad_sig[] = "BUG: bad sig\n"; + static const char write_errno[] = "BUG: sigp write (errno)"; + static const char write_zero[] = "BUG: sigp write wrote zero bytes"; + char c = 0; + + switch (sig) { + case SIGCHLD: c = '.'; break; + case SIGTTOU: c = '-'; break; + case SIGTTIN: c = '+'; break; + case SIGUSR1: c = '#'; break; + default: + write(STDERR_FILENO, bad_sig, sizeof(bad_sig) - 1); + _exit(EXIT_FAILURE); + } + ssize_t w = write(pipefds[1], &c, 1); + if (w > 0) return; + if (w < 0 && errno == EAGAIN) { + write(STDERR_FILENO, eagain, sizeof(eagain) - 1); + return; + } else if (w == 0) { + write(STDERR_FILENO, write_zero, sizeof(write_zero) - 1); + } else { + // strerror isn't technically async-signal-safe, and + // strerrordesc_np+strerrorname_np isn't portable + write(STDERR_FILENO, write_errno, sizeof(write_errno) - 1); + } + _exit(EXIT_FAILURE); +} + +static void reaped_worker(pid_t pid, int st) +{ + unsigned long nr = 0; + for (; nr < nworker_hwm; nr++) { + if (worker_pids[nr] == pid) { + worker_pids[nr] = 0; + break; + } + } + if (nr >= nworker_hwm) { + warnx("W: unknown pid=%d reaped $?=%d", (int)pid, st); + return; + } + if (WIFEXITED(st) && WEXITSTATUS(st) == EX_NOINPUT) + alive = false; + else if (st) + warnx("worker[%lu] died $?=%d alive=%d", nr, st, (int)alive); + if (alive) + start_workers(); +} + +static void do_sigchld(void) +{ + while (1) { + int st; + pid_t pid = waitpid(-1, &st, WNOHANG); + if (pid > 0) { + reaped_worker(pid, st); + } else if (pid == 0) { + return; + } else { + switch (errno) { + case ECHILD: return; + case EINTR: break; // can it happen w/ WNOHANG? + default: err(EXIT_FAILURE, "BUG: waitpid"); + } + } + } +} + +static void do_sigttin(void) +{ + if (!alive) return; + if (nworker >= WORKER_MAX) { + warnx("workers cannot exceed %zu", (size_t)WORKER_MAX); + return; + } + void *p = realloc(worker_pids, (nworker + 1) * sizeof(pid_t)); + if (!p) { + warn("realloc worker_pids"); + } else { + worker_pids = (pid_t *)p; + worker_pids[nworker++] = 0; + if (nworker_hwm < nworker) + nworker_hwm = nworker; + start_workers(); + } +} + +static void do_sigttou(void) +{ + if (!alive || nworker <= 1) return; + + // worker_pids array does not shrink + --nworker; + for (unsigned long nr = nworker; nr < nworker_hwm; nr++) { + pid_t pid = worker_pids[nr]; + if (pid != 0 && kill(pid, SIGTERM)) + warn("BUG?: kill(%d, SIGTERM)", (int)pid); + } +} + +static size_t living_workers(void) +{ + size_t ret = 0; + + for (unsigned long nr = 0; nr < nworker_hwm; nr++) { + if (worker_pids[nr]) + ret++; + } + return ret; +} + +int main(int argc, char *argv[]) +{ + int c; + socklen_t slen = (socklen_t)sizeof(c); + stdout_path = getenv("STDOUT_PATH"); + stderr_path = getenv("STDERR_PATH"); + + if (getsockopt(sock_fd, SOL_SOCKET, SO_TYPE, &c, &slen)) + err(EXIT_FAILURE, "getsockopt"); + if (c != SOCK_SEQPACKET) + errx(EXIT_FAILURE, "stdin is not SOCK_SEQPACKET"); + + mail_nrp_init(); + code_nrp_init(); + atexit(cleanup_all); + + if (!STDERR_ASSIGNABLE) { + orig_err_fd = dup(STDERR_FILENO); + if (orig_err_fd < 0) + err(EXIT_FAILURE, "dup(2)"); + } + + nworker = 1; + // make warn/warnx/err multi-process friendly: + if (my_setlinebuf(stderr)) + err(EXIT_FAILURE, "setlinebuf(stderr)"); + // not using -W<workers> like Daemon.pm, since -W is reserved (glibc) + while ((c = getopt(argc, argv, "j:")) != -1) { + char *end; + + switch (c) { + case 'j': + nworker = strtoul(optarg, &end, 10); + if (*end != 0 || nworker > WORKER_MAX) + errx(EXIT_FAILURE, "-j %s invalid", optarg); + break; + case ':': + errx(EXIT_FAILURE, "missing argument: `-%c'", optopt); + case '?': + errx(EXIT_FAILURE, "unrecognized: `-%c'", optopt); + default: + errx(EXIT_FAILURE, "BUG: `-%c'", c); + } + } + sigset_t pset; // parent-only + CHECK(int, 0, sigfillset(&pset)); + + // global sigsets: + CHECK(int, 0, sigfillset(&fullset)); + CHECK(int, 0, sigfillset(&workerset)); + +#define DELSET(sig) do { \ + CHECK(int, 0, sigdelset(&fullset, sig)); \ + CHECK(int, 0, sigdelset(&workerset, sig)); \ + CHECK(int, 0, sigdelset(&pset, sig)); \ +} while (0) + DELSET(SIGABRT); + DELSET(SIGBUS); + DELSET(SIGFPE); + DELSET(SIGILL); + DELSET(SIGSEGV); + DELSET(SIGXCPU); + DELSET(SIGXFSZ); +#undef DELSET + CHECK(int, 0, sigdelset(&workerset, SIGUSR1)); + CHECK(int, 0, sigdelset(&fullset, SIGALRM)); + + if (nworker == 0) { // no SIGTERM handling w/o workers + recv_loop(); + return 0; + } + CHECK(int, 0, sigdelset(&workerset, SIGTERM)); + CHECK(int, 0, sigdelset(&workerset, SIGCHLD)); + nworker_hwm = nworker; + worker_pids = (pid_t *)calloc(nworker, sizeof(pid_t)); + if (!worker_pids) err(EXIT_FAILURE, "calloc"); + + if (pipe(pipefds)) err(EXIT_FAILURE, "pipe"); + int fl = fcntl(pipefds[1], F_GETFL); + if (fl == -1) err(EXIT_FAILURE, "F_GETFL"); + if (fcntl(pipefds[1], F_SETFL, fl | O_NONBLOCK)) + err(EXIT_FAILURE, "F_SETFL"); + + CHECK(int, 0, sigdelset(&pset, SIGCHLD)); + CHECK(int, 0, sigdelset(&pset, SIGTTIN)); + CHECK(int, 0, sigdelset(&pset, SIGTTOU)); + CHECK(int, 0, sigdelset(&pset, SIGUSR1)); + + struct sigaction sa = {}; + sa.sa_handler = sigp; + + CHECK(int, 0, sigaction(SIGUSR1, &sa, NULL)); + CHECK(int, 0, sigaction(SIGTTIN, &sa, NULL)); + CHECK(int, 0, sigaction(SIGTTOU, &sa, NULL)); + sa.sa_flags = SA_NOCLDSTOP; + CHECK(int, 0, sigaction(SIGCHLD, &sa, NULL)); + + CHECK(int, 0, sigprocmask(SIG_SETMASK, &pset, NULL)); + + start_workers(); + + char sbuf[64]; + while (alive || living_workers()) { + ssize_t n = read(pipefds[0], &sbuf, sizeof(sbuf)); + if (n < 0) { + if (errno == EINTR) continue; + err(EXIT_FAILURE, "read"); + } else if (n == 0) { + errx(EXIT_FAILURE, "read EOF"); + } + do_sigchld(); + for (ssize_t i = 0; i < n; i++) { + switch (sbuf[i]) { + case '.': break; // do_sigchld already called + case '-': do_sigttou(); break; + case '+': do_sigttin(); break; + case '#': parent_reopen_logs(); break; + default: errx(EXIT_FAILURE, "BUG: c=%c", sbuf[i]); + } + } + } + + return 0; +} diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h new file mode 100644 index 00000000..311ca05f --- /dev/null +++ b/lib/PublicInbox/xh_cidx.h @@ -0,0 +1,277 @@ +// Copyright (C) all contributors <meta@public-inbox.org> +// License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt> +// This file is only intended to be included by xap_helper.h +// it implements pieces used by CodeSearchIdx.pm + +static void term_length_extract(struct req *req) +{ + req->lenv = (size_t *)calloc(req->pfxc, sizeof(size_t)); + if (!req->lenv) + EABORT("lenv = calloc(%d %zu)", req->pfxc, sizeof(size_t)); + for (int i = 0; i < req->pfxc; i++) { + char *pfx = req->pfxv[i]; + // extract trailing digits as length: + // $len = s/([0-9]+)\z// ? ($1+0) : 0 + for (size_t j = 0; pfx[j]; j++) { + if (pfx[j] < '0' || pfx[j] > '9') + continue; + if (j == 0) { + warnx("W: `%s' not a valid prefix", pfx); + continue; + } + char *end; + unsigned long long tmp = strtoull(pfx + j, &end, 10); + if (*end || tmp >= (unsigned long long)SIZE_MAX) { + warnx("W: `%s' not recognized", pfx); + } else { + req->lenv[i] = (size_t)tmp; + pfx[j] = 0; + break; + } + } + } +} + +static void dump_ibx_term(struct req *req, int p, + Xapian::Document *doc, const char *ibx_id) +{ + Xapian::TermIterator cur = doc->termlist_begin(); + Xapian::TermIterator end = doc->termlist_end(); + const char *pfx = req->pfxv[p]; + size_t pfx_len = strlen(pfx); + size_t term_len = req->lenv[p]; + + for (cur.skip_to(pfx); cur != end; cur++) { + std::string tn = *cur; + if (!starts_with(&tn, pfx, pfx_len)) break; + if (term_len > 0 && (tn.length() - pfx_len) != term_len) + continue; + fprintf(req->fp[0], "%s %s\n", tn.c_str() + pfx_len, ibx_id); + ++req->nr_out; + } +} + +static enum exc_iter dump_ibx_iter(struct req *req, const char *ibx_id, + Xapian::MSetIterator *i) +{ + try { + Xapian::Document doc = i->get_document(); + for (int p = 0; p < req->pfxc; p++) + dump_ibx_term(req, p, &doc, ibx_id); + } catch (const Xapian::DatabaseModifiedError & e) { + req->srch->db->reopen(); + return ITER_RETRY; + } catch (const Xapian::DocNotFoundError & e) { // oh well... + warnx("doc not found: %s", e.get_description().c_str()); + } + return ITER_OK; +} + +static bool cmd_dump_ibx(struct req *req) +{ + if ((optind + 1) >= req->argc) + ABORT("usage: dump_ibx [OPTIONS] IBX_ID QRY_STR"); + if (!req->pfxc) + ABORT("dump_ibx requires -A PREFIX"); + + const char *ibx_id = req->argv[optind]; + if (my_setlinebuf(req->fp[0])) // for sort(1) pipe + EABORT("setlinebuf(fp[0])"); // WTF? + req->asc = true; + req->sort_col = -1; + term_length_extract(req); + Xapian::MSet mset = mail_mset(req, req->argv[optind + 1]); + + // @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine + // in case we need to retry on DB reopens + for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { + for (int t = 10; t > 0; --t) + switch (dump_ibx_iter(req, ibx_id, &i)) { + case ITER_OK: t = 0; break; // leave inner loop + case ITER_RETRY: break; // continue for-loop + case ITER_ABORT: return false; // error + } + } + emit_mset_stats(req, &mset); + return true; +} + +struct dump_roots_tmp { + struct stat sb; + void *mm_ptr; + char **entries; + struct fbuf wbuf; + int root2off_fd; +}; + +#define CLEANUP_DUMP_ROOTS __attribute__((__cleanup__(dump_roots_ensure))) +static void dump_roots_ensure(void *ptr) +{ + struct dump_roots_tmp *drt = (struct dump_roots_tmp *)ptr; + if (drt->root2off_fd >= 0) + xclose(drt->root2off_fd); + hdestroy(); // idempotent + size_t size = off2size(drt->sb.st_size); + if (drt->mm_ptr && munmap(drt->mm_ptr, size)) + EABORT("BUG: munmap(%p, %zu)", drt->mm_ptr, size); + free(drt->entries); + fbuf_ensure(&drt->wbuf); +} + +static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc) +{ + Xapian::TermIterator cur = doc->termlist_begin(); + Xapian::TermIterator end = doc->termlist_end(); + ENTRY e, *ep; + fbuf_init(root_offs); + for (cur.skip_to("G"); cur != end; cur++) { + std::string tn = *cur; + if (!starts_with(&tn, "G", 1)) break; + union { const char *in; char *out; } u; + u.in = tn.c_str() + 1; + e.key = u.out; + ep = hsearch(e, FIND); + if (!ep) ABORT("hsearch miss `%s'", e.key); + // ep->data is a NUL-terminated string matching /[0-9]+/ + fputc(' ', root_offs->fp); + fputs((const char *)ep->data, root_offs->fp); + } + fputc('\n', root_offs->fp); + ERR_CLOSE(root_offs->fp, EXIT_FAILURE); // ENOMEM + root_offs->fp = NULL; + return true; +} + +// writes term values matching @pfx for a given @doc, ending the line +// with the contents of @root_offs +static void dump_roots_term(struct req *req, int p, + struct dump_roots_tmp *drt, + struct fbuf *root_offs, + Xapian::Document *doc) +{ + Xapian::TermIterator cur = doc->termlist_begin(); + Xapian::TermIterator end = doc->termlist_end(); + const char *pfx = req->pfxv[p]; + size_t pfx_len = strlen(pfx); + size_t term_len = req->lenv[p]; + + for (cur.skip_to(pfx); cur != end; cur++) { + std::string tn = *cur; + if (!starts_with(&tn, pfx, pfx_len)) break; + if (term_len > 0 && (tn.length() - pfx_len) != term_len) + continue; + fputs(tn.c_str() + pfx_len, drt->wbuf.fp); + fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp); + ++req->nr_out; + } +} + +// we may have lines which exceed PIPE_BUF, so we do our own +// buffering and rely on flock(2), here +static bool dump_roots_flush(struct req *req, struct dump_roots_tmp *drt) +{ + bool ok = true; + off_t off = ftello(drt->wbuf.fp); + if (off < 0) EABORT("ftello"); + if (!off) return ok; + + ERR_FLUSH(drt->wbuf.fp); // ENOMEM + int fd = fileno(req->fp[0]); + + while (flock(drt->root2off_fd, LOCK_EX)) { + if (errno == EINTR) continue; + err(EXIT_FAILURE, "LOCK_EX"); // ENOLCK? + } + ok = write_all(fd, &drt->wbuf, (size_t)off); + while (flock(drt->root2off_fd, LOCK_UN)) { + if (errno == EINTR) continue; + err(EXIT_FAILURE, "LOCK_UN"); // ENOLCK? + } + if (fseeko(drt->wbuf.fp, 0, SEEK_SET)) EABORT("fseeko"); + return ok; +} + +static enum exc_iter dump_roots_iter(struct req *req, + struct dump_roots_tmp *drt, + Xapian::MSetIterator *i) +{ + CLEANUP_FBUF struct fbuf root_offs = {}; // " $ID0 $ID1 $IDx..\n" + try { + Xapian::Document doc = i->get_document(); + if (!root2offs_str(&root_offs, &doc)) + return ITER_ABORT; // bad request, abort + for (int p = 0; p < req->pfxc; p++) + dump_roots_term(req, p, drt, &root_offs, &doc); + } catch (const Xapian::DatabaseModifiedError & e) { + req->srch->db->reopen(); + return ITER_RETRY; + } catch (const Xapian::DocNotFoundError & e) { // oh well... + warnx("doc not found: %s", e.get_description().c_str()); + } + return ITER_OK; +} + +static bool cmd_dump_roots(struct req *req) +{ + CLEANUP_DUMP_ROOTS struct dump_roots_tmp drt = {}; + drt.root2off_fd = -1; + if ((optind + 1) >= req->argc) + ABORT("usage: dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR"); + if (!req->pfxc) + ABORT("dump_roots requires -A PREFIX"); + const char *root2off_file = req->argv[optind]; + drt.root2off_fd = open(root2off_file, O_RDONLY); + if (drt.root2off_fd < 0) + EABORT("open(%s)", root2off_file); + if (fstat(drt.root2off_fd, &drt.sb)) // ENOMEM? + err(EXIT_FAILURE, "fstat(%s)", root2off_file); + // each entry is at least 43 bytes ({OIDHEX}\0{INT}\0), + // so /32 overestimates the number of expected entries by + // ~%25 (as recommended by Linux hcreate(3) manpage) + size_t size = off2size(drt.sb.st_size); + size_t est = (size / 32) + 1; //+1 for "\0" termination + drt.mm_ptr = mmap(NULL, size, PROT_READ, + MAP_PRIVATE, drt.root2off_fd, 0); + if (drt.mm_ptr == MAP_FAILED) + err(EXIT_FAILURE, "mmap(%zu, %s)", size, root2off_file); + size_t asize = est * 2; + if (asize < est) ABORT("too many entries: %zu", est); + drt.entries = (char **)calloc(asize, sizeof(char *)); + if (!drt.entries) + err(EXIT_FAILURE, "calloc(%zu * 2, %zu)", est, sizeof(char *)); + size_t tot = split2argv(drt.entries, (char *)drt.mm_ptr, size, asize); + if (tot <= 0) return false; // split2argv already warned on error + if (!hcreate(est)) + err(EXIT_FAILURE, "hcreate(%zu)", est); + for (size_t i = 0; i < tot; ) { + ENTRY e; + e.key = hsearch_enter_key(drt.entries[i++]); // dies on ENOMEM + e.data = drt.entries[i++]; + if (!hsearch(e, ENTER)) + err(EXIT_FAILURE, "hsearch(%s => %s, ENTER)", e.key, + (const char *)e.data); + } + req->asc = true; + req->sort_col = -1; + Xapian::MSet mset = commit_mset(req, req->argv[optind + 1]); + term_length_extract(req); + + fbuf_init(&drt.wbuf); + + // @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine + // in case we need to retry on DB reopens + for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { + for (int t = 10; t > 0; --t) + switch (dump_roots_iter(req, &drt, &i)) { + case ITER_OK: t = 0; break; // leave inner loop + case ITER_RETRY: break; // continue for-loop + case ITER_ABORT: return false; // error + } + if (!(req->nr_out & 0x3fff) && !dump_roots_flush(req, &drt)) + return false; + } + if (!dump_roots_flush(req, &drt)) + return false; + emit_mset_stats(req, &mset); + return true; +} diff --git a/lib/PublicInbox/xh_mset.h b/lib/PublicInbox/xh_mset.h new file mode 100644 index 00000000..db2692c9 --- /dev/null +++ b/lib/PublicInbox/xh_mset.h @@ -0,0 +1,53 @@ +// Copyright (C) all contributors <meta@public-inbox.org> +// License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt> +// This file is only intended to be included by xap_helper.h +// it implements pieces used by WWW, IMAP and lei + +#ifndef WBUF_FLUSH_THRESHOLD +# define WBUF_FLUSH_THRESHOLD (BUFSIZ - 1000) +#endif +#if WBUF_FLUSH_THRESHOLD < 0 +# undef WBUF_FLUSH_THRESHOLD +# define WBUF_FLUSH_THRESHOLD BUFSIZ +#endif + +static bool cmd_mset(struct req *req) +{ + if (optind >= req->argc) ABORT("usage: mset [OPTIONS] WANT QRY_STR"); + if (req->fp[1]) ABORT("mset only accepts 1 FD"); + const char *qry_str = req->argv[optind]; + CLEANUP_FBUF struct fbuf wbuf = {}; + Xapian::MSet mset = req->code_search ? commit_mset(req, qry_str) : + mail_mset(req, qry_str); + fbuf_init(&wbuf); + fprintf(wbuf.fp, "mset.size=%llu .get_matches_estimated=%llu\n", + (unsigned long long)mset.size(), + (unsigned long long)mset.get_matches_estimated()); + int fd = fileno(req->fp[0]); + for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { + off_t off = ftello(wbuf.fp); + if (off < 0) EABORT("ftello"); + /* + * TODO verify our fflush + fseeko use isn't affected by a + * glibc <2.25 bug: + * https://sourceware.org/bugzilla/show_bug.cgi?id=20181 + * CentOS 7.x only has glibc 2.17. In any case, bug #20181 + * shouldn't affect us since our use of fseeko is used to + * effectively discard data. + */ + if (off > WBUF_FLUSH_THRESHOLD) { + ERR_FLUSH(wbuf.fp); + if (!write_all(fd, &wbuf, (size_t)off)) return false; + if (fseeko(wbuf.fp, 0, SEEK_SET)) EABORT("fseeko"); + off = 0; + } + fprintf(wbuf.fp, "%llu" "%c" "%d" "%c" "%llu\n", + (unsigned long long)(*i), // get_docid + 0, i.get_percent(), + 0, (unsigned long long)i.get_rank()); + } + off_t off = ftello(wbuf.fp); + if (off < 0) EABORT("ftello"); + ERR_FLUSH(wbuf.fp); + return off > 0 ? write_all(fd, &wbuf, (size_t)off) : true; +} |