diff options
Diffstat (limited to 'lib/PublicInbox')
236 files changed, 34032 insertions, 8053 deletions
diff --git a/lib/PublicInbox/Address.pm b/lib/PublicInbox/Address.pm index f413c2f6..3a59945c 100644 --- a/lib/PublicInbox/Address.pm +++ b/lib/PublicInbox/Address.pm @@ -1,8 +1,9 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::Address; -use strict; -use warnings; +use v5.12; +use parent qw(Exporter); +our @EXPORT_OK = qw(pairs); sub xs_emails { grep { defined } map { $_->address() } parse_email_addresses($_[0]) @@ -17,17 +18,30 @@ sub xs_names { } parse_email_addresses($_[0]); } +sub xs_pairs { # for JMAP, RFC 8621 section 4.1.2.3 + [ map { # LHS (name) may be undef if there's an address + my @p = ($_->phrase // $_->comment, $_->address); + # show original if totally bogus: + $p[0] = $_->original unless defined $p[1]; + \@p; + } parse_email_addresses($_[0]) ]; +} + eval { require Email::Address::XS; Email::Address::XS->import(qw(parse_email_addresses)); *emails = \&xs_emails; *names = \&xs_names; + *pairs = \&xs_pairs; + *objects = sub { Email::Address::XS->parse(@_) }; }; if ($@) { require PublicInbox::AddressPP; *emails = \&PublicInbox::AddressPP::emails; *names = \&PublicInbox::AddressPP::names; + *pairs = \&PublicInbox::AddressPP::pairs; + *objects = \&PublicInbox::AddressPP::objects; } 1; diff --git a/lib/PublicInbox/AddressPP.pm b/lib/PublicInbox/AddressPP.pm index 74a82843..65ba36a9 100644 --- a/lib/PublicInbox/AddressPP.pm +++ b/lib/PublicInbox/AddressPP.pm @@ -1,7 +1,8 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::AddressPP; use strict; +use v5.10.1; # TODO check regexps for unicode_strings compat # very loose regexes, here. We don't need RFC-compliance, # just enough to make thing sanely displayable and pass to git @@ -13,6 +14,7 @@ sub emails { } sub names { + # split by address and post-address comment my @p = split(/<?([^@<>]+)\@[\w\.\-]+>?\s*(\(.*?\))?(?:,\s*|\z)/, $_[0]); my @ret; @@ -35,4 +37,33 @@ sub names { @ret; } +sub pairs { # for JMAP, RFC 8621 section 4.1.2.3 + my ($s) = @_; + [ map { + my $addr = $_; + if ($s =~ s/\A\s*(.*?)\s*<\Q$addr\E>\s*(.*?)\s*(?:,|\z)// || + $s =~ s/\A\s*(.*?)\s*\Q$addr\E\s*(.*?)\s*(?:,|\z)//) { + my ($phrase, $comment) = ($1, $2); + $phrase =~ tr/\r\n\t / /s; + $phrase =~ s/\A['"\s]*//; + $phrase =~ s/['"\s]*\z//; + $phrase =~ s/\s*<*\s*\z//; + $phrase = undef if $phrase !~ /\S/; + $comment = ($comment =~ /\((.*?)\)/) ? $1 : undef; + [ $phrase // $comment, $addr ] + } else { + (); + } + } emails($s) ]; +} + +# Mail::Address->name is inconsistent with Email::Address::XS, so we're +# doing our own thing, here: +sub objects { map { bless $_, __PACKAGE__ } @{pairs($_[0])} } + +# OO API for objects() results +sub user { (split(/@/, $_[0]->[1]))[0] } +sub host { (split(/@/, $_[0]->[1]))[1] } +sub name { $_[0]->[0] // user($_[0]) } + 1; diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index fb88e621..a1b1fc07 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -1,20 +1,21 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # common stuff for administrative command-line tools # Unstable internal API package PublicInbox::Admin; -use strict; +use v5.12; use parent qw(Exporter); -use Cwd qw(abs_path); -use POSIX (); -our @EXPORT_OK = qw(resolve_repo_dir setup_signals); +our @EXPORT_OK = qw(setup_signals fmt_localtime); use PublicInbox::Config; use PublicInbox::Inbox; -use PublicInbox::Spawn qw(popen_rd); +use PublicInbox::Spawn qw(run_qx); +use PublicInbox::Eml; +*rel2abs_collapsed = \&PublicInbox::Config::rel2abs_collapsed; sub setup_signals { my ($cb, $arg) = @_; # optional + require POSIX; # we call exit() here instead of _exit() so DESTROY methods # get called (e.g. File::Temp::Dir and PublicInbox::Msgmap) @@ -27,48 +28,68 @@ sub setup_signals { }; } -sub resolve_repo_dir { - my ($cd, $ver) = @_; - my $prefix = defined $cd ? $cd : './'; - if (-d $prefix && -f "$prefix/inbox.lock") { # v2 - $$ver = 2 if $ver; - return abs_path($prefix); +sub resolve_any_idxdir ($$) { + my ($cd, $lock_bn) = @_; + my $try = $cd // '.'; + my $root_dev_ino; + while (1) { + if (-f "$try/$lock_bn") { # inbox.lock, ei.lock, cidx.lock + return rel2abs_collapsed($try); + } elsif (-d $try) { + my @try = stat _; + $root_dev_ino //= do { + my @root = stat('/') or die "stat /: $!\n"; + "$root[0]\0$root[1]"; + }; + return undef if "$try[0]\0$try[1]" eq $root_dev_ino; + $try .= '/..'; # continue, cd up + } else { + die "`$try' is not a directory\n"; + } } - my $cmd = [ qw(git rev-parse --git-dir) ]; - my $fh = popen_rd($cmd, undef, {-C => $cd}); - my $dir = do { local $/; <$fh> }; - close $fh or die "error in ".join(' ', @$cmd)." (cwd:$cd): $!\n"; - chomp $dir; - $$ver = 1 if $ver; - return abs_path($cd) if ($dir eq '.' && defined $cd); - abs_path($dir); } -# for unconfigured inboxes -sub detect_indexlevel ($) { - my ($ibx) = @_; +sub resolve_eidxdir ($) { resolve_any_idxdir($_[0], 'ei.lock') } +sub resolve_cidxdir ($) { resolve_any_idxdir($_[0], 'cidx.lock') } - my $over = $ibx->over; - my $srch = $ibx->search; - delete @$ibx{qw(over search)}; # don't leave open FDs lying around +sub resolve_inboxdir { + my ($cd, $ver) = @_; + my $dir; + if (defined($dir = resolve_any_idxdir($cd, 'inbox.lock'))) { # try v2 + $$ver = 2 if $ver; + } elsif (defined($dir = resolve_git_dir($cd))) { # try v1 + $$ver = 1 if $ver; + } # else: not an inbox at all + $dir; +} - # brand new or never before indexed inboxes default to full - return 'full' unless $over; - my $l = 'basic'; - return $l unless $srch; - if (my $xdb = $srch->xdb) { - $l = 'full'; - my $m = $xdb->get_metadata('indexlevel'); - if ($m eq 'medium') { - $l = $m; - } elsif ($m ne '') { - warn <<""; -$ibx->{inboxdir} has unexpected indexlevel in Xapian: $m +sub valid_pwd { + my $pwd = $ENV{PWD} // return; + my @st_pwd = stat $pwd or return; + my @st_cwd = stat '.' or die "stat(.): $!"; + "@st_pwd[1,0]" eq "@st_cwd[1,0]" ? $pwd : undef; +} - } - $ibx->{-skip_docdata} = 1 if $xdb->get_metadata('skip_docdata'); +sub resolve_git_dir { + my ($cd) = @_; # cd may be `undef' for cwd + # try v1 bare git dirs + my $pwd = valid_pwd(); + my $env; + defined($pwd) && substr($cd // '/', 0, 1) ne '/' and + $env->{PWD} = "$pwd/$cd"; + my $cmd = [ qw(git rev-parse --git-dir) ]; + my $dir = run_qx($cmd, $env, { -C => $cd }); + die "error in @$cmd (cwd:${\($cd // '.')}): $?\n" if $?; + chomp $dir; + # --absolute-git-dir requires git v2.13.0+, and we want to + # respect symlinks when $ENV{PWD} if $ENV{PWD} ne abs_path('.') + # since we store absolute GIT_DIR paths in cindex. + if (substr($dir, 0, 1) ne '/') { + substr($cd // '/', 0, 1) eq '/' or + $cd = File::Spec->rel2abs($cd, $pwd); + $dir = rel2abs_collapsed($dir, $cd); } - $l; + $dir; } sub unconfigured_ibx ($$) { @@ -78,8 +99,8 @@ sub unconfigured_ibx ($$) { name => $name, address => [ "$name\@example.com" ], inboxdir => $dir, - # TODO: consumers may want to warn on this: - #-unconfigured => 1, + # consumers (-convert) warn on this: + -unconfigured => 1, }); } @@ -93,63 +114,100 @@ sub resolve_inboxes ($;$$) { $cfg or die "--all specified, but $cfgfile not readable\n"; @$argv and die "--all specified, but directories specified\n"; } - + my (@old, @ibxs, @eidx, @cidx); + if ($opt->{-cidx_ok}) { + require PublicInbox::CodeSearchIdx; + @$argv = grep { + if (defined(my $d = resolve_cidxdir($_))) { + push @cidx, PublicInbox::CodeSearchIdx->new( + $d, $opt); + undef; + } else { + 1; + } + } @$argv; + } + if ($opt->{-eidx_ok}) { + require PublicInbox::ExtSearchIdx; + @$argv = grep { + if (defined(my $ei = resolve_eidxdir($_))) { + $ei = PublicInbox::ExtSearchIdx->new($ei, $opt); + push @eidx, $ei; + undef; + } else { + 1; + } + } @$argv; + } my $min_ver = $opt->{-min_inbox_version} || 0; - my (@old, @ibxs); - my %dir2ibx; - if ($cfg) { + # lookup inboxes by st_dev + st_ino instead of {inboxdir} pathnames, + # pathnames are not unique due to symlinks and bind mounts + if ($opt->{all}) { $cfg->each_inbox(sub { my ($ibx) = @_; - my $path = abs_path($ibx->{inboxdir}); - if (defined($path)) { - $dir2ibx{$path} = $ibx; + if (-e $ibx->{inboxdir}) { + push(@ibxs, $ibx) if $ibx->version >= $min_ver; } else { - warn <<EOF; -W: $ibx->{name} $ibx->{inboxdir}: $! -EOF + warn "W: $ibx->{name} $ibx->{inboxdir}: $!\n"; } }); - } - if ($opt->{all}) { - my @all = values %dir2ibx; - @all = grep { $_->version >= $min_ver } @all; - push @ibxs, @all; + # TODO: no way to configure cindex in config file, yet } else { # directories specified on the command-line - my $i = 0; my @dirs = @$argv; - push @dirs, '.' unless @dirs; - foreach (@dirs) { - my $v; - my $dir = resolve_repo_dir($_, \$v); - if ($v < $min_ver) { + push @dirs, '.' if !@dirs && $opt->{-use_cwd}; + my %s2i; # "st_dev\0st_ino" => array index + for (my $i = 0; $i <= $#dirs; $i++) { + my $dir = $dirs[$i]; + my @st = stat($dir) or die "stat($dir): $!\n"; + $dir = $dirs[$i] = resolve_inboxdir($dir, \(my $ver)); + if ($ver >= $min_ver) { + $s2i{"$st[0]\0$st[1]"} //= $i; + } else { push @old, $dir; - next; } - my $ibx = $dir2ibx{$dir} ||= unconfigured_ibx($dir, $i); - $i++; - push @ibxs, $ibx; } + my $done = \'done'; + eval { + $cfg->each_inbox(sub { + my ($ibx) = @_; + return if $ibx->version < $min_ver; + my $dir = $ibx->{inboxdir}; + if (my @s = stat $dir) { + my $i = delete($s2i{"$s[0]\0$s[1]"}) + // return; + $ibxs[$i] = $ibx; + die $done if !keys(%s2i); + } else { + warn "W: $ibx->{name} $dir: $!\n"; + } + }); + }; + die $@ if $@ && $@ ne $done; + for my $i (sort { $a <=> $b } values %s2i) { + $ibxs[$i] = unconfigured_ibx($dirs[$i], $i); + } + @ibxs = grep { defined } @ibxs; # duplicates are undef } if (@old) { die "-V$min_ver inboxes not supported by $0\n\t", join("\n\t", @old), "\n"; } - @ibxs; + ($opt->{-eidx_ok} || $opt->{-cidx_ok}) ? (\@ibxs, \@eidx, \@cidx) + : @ibxs; } -# TODO: make Devel::Peek optional, only used for daemon -my @base_mod = qw(Devel::Peek); +my @base_mod = (); my @over_mod = qw(DBD::SQLite DBI); my %mod_groups = ( -index => [ @base_mod, @over_mod ], -base => \@base_mod, - -search => [ @base_mod, @over_mod, 'Search::Xapian' ], + -search => [ @base_mod, @over_mod, 'Xapian' ], ); sub scan_ibx_modules ($$) { my ($mods, $ibx) = @_; if (!$ibx->{indexlevel} || $ibx->{indexlevel} ne 'basic') { - $mods->{'Search::Xapian'} = 1; + $mods->{'Xapian'} = 1; } else { $mods->{$_} = 1 foreach @over_mod; } @@ -161,10 +219,10 @@ sub check_require { while (my $mod = shift @mods) { if (my $groups = $mod_groups{$mod}) { push @mods, @$groups; - } elsif ($mod eq 'Search::Xapian') { + } elsif ($mod eq 'Xapian') { require PublicInbox::Search; PublicInbox::Search::load_xapian() or - $err->{'Search::Xapian || Xapian'} = $@; + $err->{'Xapian || Search::Xapian'} = $@; } else { eval "require $mod"; $err->{$mod} = $@ if $@; @@ -208,13 +266,19 @@ sub index_terminate { sub index_inbox { my ($ibx, $im, $opt) = @_; + require PublicInbox::InboxWritable; my $jobs = delete $opt->{jobs} if $opt; if (my $pr = $opt->{-progress}) { $pr->("indexing $ibx->{inboxdir} ...\n"); } - local %SIG = %SIG; + local @SIG{keys %SIG} = values %SIG; setup_signals(\&index_terminate, $ibx); - if (ref($ibx) && $ibx->version == 2) { + my $idx = { current_info => $ibx->{inboxdir} }; + local $SIG{__WARN__} = sub { + return if PublicInbox::Eml::warn_ignore(@_); + warn($idx->{current_info}, ': ', @_); + }; + if ($ibx->version == 2) { eval { require PublicInbox::V2Writable }; die "v2 requirements not met: $@\n" if $@; $ibx->{-creat_opt}->{nproc} = $jobs; @@ -225,25 +289,23 @@ sub index_inbox { } else { my $n = $v2w->{shards}; if ($jobs < ($n + 1) && !$opt->{reshard}) { - warn -"Unable to respect --jobs=$jobs on index, inbox was created with $n shards\n"; + warn <<EOM; +Unable to respect --jobs=$jobs on index, inbox was created with $n shards +EOM } } } - my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; - local $SIG{__WARN__} = sub { - $warn_cb->($v2w->{current_info}, ': ', @_); - }; - $v2w->index_sync($opt); + $idx = $v2w; } else { require PublicInbox::SearchIdx; - my $s = PublicInbox::SearchIdx->new($ibx, 1); - $s->index_sync($opt); + $idx = PublicInbox::SearchIdx->new($ibx, 1); } + $idx->index_sync($opt); + $idx->{nidx} // 0; # returns number processed } -sub progress_prepare ($) { - my ($opt) = @_; +sub progress_prepare ($;$) { + my ($opt, $dst) = @_; # public-inbox-index defaults to quiet, -xcpdb and -compact do not if (defined($opt->{quiet}) && $opt->{quiet} < 0) { @@ -255,7 +317,8 @@ sub progress_prepare ($) { $opt->{1} = $null; # suitable for spawn() redirect } else { $opt->{verbose} ||= 1; - $opt->{-progress} = sub { print STDERR @_ }; + $dst //= *STDERR{GLOB}; + $opt->{-progress} = sub { print $dst '# ', @_ }; } } @@ -295,15 +358,35 @@ sub index_prepare ($$) { $opt->{batch_size} and $env = { XAPIAN_FLUSH_THRESHOLD => '4294967295' }; - for my $k (qw(sequential_shard)) { + for my $k (qw(sequential-shard)) { my $git_key = "publicInbox.index".ucfirst($k); - $git_key =~ s/_([a-z])/\U$1/g; + $git_key =~ s/-([a-z])/\U$1/g; defined(my $s = $opt->{$k} // $cfg->{lc($git_key)}) or next; defined(my $v = $cfg->git_bool($s)) or die "`$git_key=$s' not boolean\n"; $opt->{$k} = $v; } + for my $k (qw(since until)) { + my $v = $opt->{$k} // next; + $opt->{reindex} or die "--$k=$v requires --reindex\n"; + } $env; } +sub do_chdir ($) { + my $chdir = $_[0] // return; + for my $d (@$chdir) { + next if $d eq ''; # same as git(1) + chdir $d or die "cd $d: $!"; + } +} + +sub fmt_localtime ($) { + require POSIX; + my @lt = localtime $_[0]; + my (undef, $M, $H, $d, $m, $Y) = @lt; + sprintf('%u-%02u-%02u % 2u:%02u ', $Y + 1900, $m + 1, $d, $H, $M) + .POSIX::strftime('%z', @lt); +} + 1; diff --git a/lib/PublicInbox/AdminEdit.pm b/lib/PublicInbox/AdminEdit.pm index 4448dcc2..654141a7 100644 --- a/lib/PublicInbox/AdminEdit.pm +++ b/lib/PublicInbox/AdminEdit.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # common stuff between -edit, -purge (and maybe -learn in the future) @@ -19,16 +19,17 @@ sub check_editable ($) { } # Undefined indexlevel, so `full'... - # Search::Xapian exists and the DB can be read, at least, fine + # Xapian exists and the DB can be read, at least, fine $ibx->search and next; # it's possible for a Xapian directory to exist, - # but Search::Xapian to go missing/broken. + # but Xapian to go missing/broken. # Make sure it's purged in that case: $ibx->over or die "no over.sqlite3 in $ibx->{inboxdir}\n"; - # $ibx->{search} is populated by $ibx->over call - my $xdir_ro = $ibx->{search}->xdir(1); + require PublicInbox::Search; + my $xdir_ro = PublicInbox::Search->new($ibx)->xdir(1); + my $nshard = 0; foreach my $shard (<$xdir_ro/*>) { if (-d $shard && $shard =~ m!/[0-9]+\z!) { diff --git a/lib/PublicInbox/AltId.pm b/lib/PublicInbox/AltId.pm index 6d16242a..80757ceb 100644 --- a/lib/PublicInbox/AltId.pm +++ b/lib/PublicInbox/AltId.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Used for giving serial numbers to messages. This can be tied to diff --git a/lib/PublicInbox/Aspawn.pm b/lib/PublicInbox/Aspawn.pm new file mode 100644 index 00000000..49f8651a --- /dev/null +++ b/lib/PublicInbox/Aspawn.pm @@ -0,0 +1,34 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# async system()/qx() which takes callback +package PublicInbox::Aspawn; +use v5.12; +use parent qw(Exporter); +use PublicInbox::DS qw(awaitpid); +use PublicInbox::Spawn qw(spawn); +our @EXPORT_OK = qw(run_await); + +sub _await_cb { # awaitpid cb + my ($pid, $cmd, $env, $opt, $cb, @args) = @_; + PublicInbox::Spawn::read_out_err($opt); + if ($? && !$opt->{quiet}) { + my ($status, $sig) = ($? >> 8, $? & 127); + my $msg = ''; + $msg .= " (-C=$opt->{-C})" if defined $opt->{-C}; + $msg .= " status=$status" if $status; + $msg .= " signal=$sig" if $sig; + warn "E: @$cmd", $msg, "\n"; + } + $cb->($pid, $cmd, $env, $opt, @args) if $cb; +} + +sub run_await { + my ($cmd, $env, $opt, $cb, @args) = @_; + $opt->{1} //= \(my $out); + my $pid = spawn($cmd, $env, $opt); + awaitpid($pid, \&_await_cb, $cmd, $env, $opt, $cb, @args); + awaitpid($pid); # synchronous for non-$in_loop +} + +1; diff --git a/lib/PublicInbox/AutoReap.pm b/lib/PublicInbox/AutoReap.pm new file mode 100644 index 00000000..ae4984b8 --- /dev/null +++ b/lib/PublicInbox/AutoReap.pm @@ -0,0 +1,33 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# automatically kill + reap children when this goes out-of-scope +package PublicInbox::AutoReap; +use v5.12; + +sub new { + my (undef, $pid, $cb) = @_; + bless { pid => $pid, cb => $cb, owner => $$ }, __PACKAGE__ +} + +sub kill { + my ($self, $sig) = @_; + CORE::kill($sig // 'TERM', $self->{pid}); +} + +sub join { + my ($self, $sig) = @_; + my $pid = delete $self->{pid} or return; + $self->{cb}->() if defined $self->{cb}; + CORE::kill($sig, $pid) if defined $sig; + my $r = waitpid($pid, 0); + $r == $pid or die "BUG? waitpid($pid) => $r (\$?=$? \$!=$!)"; +} + +sub DESTROY { + my ($self) = @_; + return if $self->{owner} != $$; + $self->join('TERM'); +} + +1; diff --git a/lib/PublicInbox/Cgit.pm b/lib/PublicInbox/Cgit.pm index fb0d0e60..78fc9ca0 100644 --- a/lib/PublicInbox/Cgit.pm +++ b/lib/PublicInbox/Cgit.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # wrapper for cgit(1) and git-http-backend(1) for browsing and @@ -6,7 +6,8 @@ # directive to be set in the public-inbox config file. package PublicInbox::Cgit; -use strict; +use v5.12; +use parent qw(PublicInbox::WwwCoderepo); use PublicInbox::GitHTTPBackend; use PublicInbox::Git; # not bothering with Exporter for a one-off @@ -16,9 +17,9 @@ use PublicInbox::Qspawn; use PublicInbox::WwwStatic qw(r); sub locate_cgit ($) { - my ($pi_config) = @_; - my $cgit_bin = $pi_config->{'publicinbox.cgitbin'}; - my $cgit_data = $pi_config->{'publicinbox.cgitdata'}; + my ($pi_cfg) = @_; + my $cgit_bin = $pi_cfg->{'publicinbox.cgitbin'}; + my $cgit_data = $pi_cfg->{'publicinbox.cgitdata'}; # /var/www/htdocs/cgit is the default install path from cgit.git # /usr/{lib,share}/cgit is where Debian puts cgit @@ -40,10 +41,9 @@ sub locate_cgit ($) { if (defined($cgit_bin) && $cgit_bin =~ m!\A(.+?)/[^/]+\z!) { unshift @dirs, $1 if -d $1; } - foreach my $d (@dirs) { - my $f = "$d/cgit.css"; - next unless -f $f; - $cgit_data = $d; + for (@dirs) { + next unless -f "$_/cgit.css"; + $cgit_data = $_; last; } } @@ -51,30 +51,20 @@ sub locate_cgit ($) { } sub new { - my ($class, $pi_config) = @_; - my ($cgit_bin, $cgit_data) = locate_cgit($pi_config); - + my ($class, $pi_cfg) = @_; + my ($cgit_bin, $cgit_data) = locate_cgit($pi_cfg); + $cgit_bin // return; # fall back in WWW->cgit my $self = bless { cmd => [ $cgit_bin ], cgit_data => $cgit_data, - pi_config => $pi_config, + pi_cfg => $pi_cfg, + cgitrc => $pi_cfg->{'publicinbox.cgitrc'} // $ENV{CGIT_CONFIG}, }, $class; - $pi_config->fill_all; # fill in -code_repos mapped to inboxes - # some cgit repos may not be mapped to inboxes, so ensure those exist: - my $code_repos = $pi_config->{-code_repos}; - foreach my $k (keys %$pi_config) { - $k =~ /\Acoderepo\.(.+)\.dir\z/ or next; - my $dir = $pi_config->{$k}; - $code_repos->{$1} ||= PublicInbox::Git->new($dir); - } - while (my ($nick, $repo) = each %$code_repos) { - $self->{"\0$nick"} = $repo; - } - my $cgit_static = $pi_config->{-cgit_static}; - my $static = join('|', map { quotemeta $_ } keys %$cgit_static); - $self->{static} = qr/\A($static)\z/; + PublicInbox::WwwCoderepo::prepare_coderepos($self); + my $s = join('|', map { quotemeta } keys %{$pi_cfg->{-cgit_static}}); + $self->{static} = qr/\A($s)\z/; $self; } @@ -95,14 +85,14 @@ my @PASS_ENV = qw( my $parse_cgi_headers = \&PublicInbox::GitHTTPBackend::parse_cgi_headers; sub call { - my ($self, $env) = @_; + my ($self, $env, $ctx) = @_; # $ctx is optional, used by WWW my $path_info = $env->{PATH_INFO}; my $cgit_data; # handle requests without spawning cgit iff possible: if ($path_info =~ m!\A/(.+?)/($PublicInbox::GitHTTPBackend::ANY)\z!ox) { my ($nick, $path) = ($1, $2); - if (my PublicInbox::Git $git = $self->{"\0$nick"}) { + if (my $git = $self->{pi_cfg}->get_coderepo($nick)) { return serve($env, $git, $path); } } elsif ($path_info =~ m!$self->{static}! && @@ -111,17 +101,14 @@ sub call { return PublicInbox::WwwStatic::response($env, [], $f); } - my $cgi_env = { PATH_INFO => $path_info }; - foreach (@PASS_ENV) { - defined(my $v = $env->{$_}) or next; - $cgi_env->{$_} = $v; - } - $cgi_env->{'HTTPS'} = 'on' if $env->{'psgi.url_scheme'} eq 'https'; + my %cgi_env = (CGIT_CONFIG => $self->{cgitrc}, PATH_INFO => $path_info); + @cgi_env{@PASS_ENV} = @$env{@PASS_ENV}; # spawn ignores undef vals + $cgi_env{HTTPS} = 'on' if $env->{'psgi.url_scheme'} eq 'https'; my $rdr = input_prepare($env) or return r(500); - my $qsp = PublicInbox::Qspawn->new($self->{cmd}, $cgi_env, $rdr); - my $limiter = $self->{pi_config}->limiter('-cgit'); - $qsp->psgi_return($env, $limiter, $parse_cgi_headers); + my $qsp = PublicInbox::Qspawn->new($self->{cmd}, \%cgi_env, $rdr); + my $limiter = $self->{pi_cfg}->limiter('-cgit'); + $qsp->psgi_yield($env, $limiter, $parse_cgi_headers, $ctx); } 1; diff --git a/lib/PublicInbox/CidxComm.pm b/lib/PublicInbox/CidxComm.pm new file mode 100644 index 00000000..80a235e9 --- /dev/null +++ b/lib/PublicInbox/CidxComm.pm @@ -0,0 +1,28 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Waits for initial comm(1) output for PublicInbox::CodeSearchIdx. +# The initial output from `comm' can take a while to generate because +# it needs to wait on: +# `git cat-file --batch-all-objects --batch-check --unordered | sort' +# We still rely on blocking reads, here, since comm should be fast once +# it's seeing input. (`--unordered | sort' is intentional for HDDs) +package PublicInbox::CidxComm; +use v5.12; +use parent qw(PublicInbox::DS); +use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); + +sub new { + my ($cls, $rd, $cidx, $drs) = @_; + my $self = bless { cidx => $cidx, drs => $drs }, $cls; + $self->SUPER::new($rd, EPOLLIN|EPOLLONESHOT); +} + +sub event_step { + my ($self) = @_; + my $rd = $self->{sock} // return warn('BUG?: no {sock}'); + $self->close; # EPOLL_CTL_DEL + delete($self->{cidx})->cidx_read_comm($rd, delete $self->{drs}); +} + +1; diff --git a/lib/PublicInbox/CidxLogP.pm b/lib/PublicInbox/CidxLogP.pm new file mode 100644 index 00000000..5ea675bf --- /dev/null +++ b/lib/PublicInbox/CidxLogP.pm @@ -0,0 +1,28 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Waits for initial `git log -p' output for PublicInbox::CodeSearchIdx. +# The initial output from `git log -p' can take a while to generate, +# CodeSearchIdx can process prune work while it's happening. Once +# `git log -p' starts generating output, it should be able to keep +# up with Xapian indexing, so we still rely on blocking reads to simplify +# cidx_read_log_p +package PublicInbox::CidxLogP; +use v5.12; +use parent qw(PublicInbox::DS); +use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); + +sub new { + my ($cls, $rd, $cidx, $git, $roots) = @_; + my $self = bless { cidx => $cidx, git => $git, roots => $roots }, $cls; + $self->SUPER::new($rd, EPOLLIN|EPOLLONESHOT); +} + +sub event_step { + my ($self) = @_; + my $rd = $self->{sock} // return warn('BUG?: no {sock}'); + $self->close; # EPOLL_CTL_DEL + delete($self->{cidx})->cidx_read_log_p($self, $rd); +} + +1; diff --git a/lib/PublicInbox/CidxXapHelperAux.pm b/lib/PublicInbox/CidxXapHelperAux.pm new file mode 100644 index 00000000..91c9b021 --- /dev/null +++ b/lib/PublicInbox/CidxXapHelperAux.pm @@ -0,0 +1,48 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Intended for PublicInbox::DS::event_loop for -cindex --associate, +# this reports auxilliary status while dumping +package PublicInbox::CidxXapHelperAux; +use v5.12; +use parent qw(PublicInbox::DS); +use PublicInbox::Syscall qw(EPOLLIN); + +# rpipe connects to req->fp[1] in xap_helper.h +sub new { + my ($cls, $rpipe, $cidx, $pfx) = @_; + my $self = bless { cidx => $cidx, pfx => $pfx }, $cls; + $rpipe->blocking(0); + $self->SUPER::new($rpipe, EPOLLIN); +} + +sub event_step { + my ($self) = @_; # xap_helper.h is line-buffered + my $buf = delete($self->{buf}) // ''; + my $n = sysread($self->{sock}, $buf, 65536, length($buf)); + if (!defined($n)) { + return if $!{EAGAIN}; + die "sysread: $!"; + } + my $pfx = $self->{pfx}; + if ($n == 0) { + warn "BUG? $pfx buf=$buf" if $buf ne ''; + if (delete $self->{cidx}->{PENDING}->{$pfx}) { + warn "BUG? $pfx did not get mset.size"; + $self->{cidx}->index_next; + } + return $self->close; + } + my @lines = split(/^/m, $buf); + $self->{buf} = pop @lines if substr($lines[-1], -1) ne "\n"; + for my $l (@lines) { + if ($l =~ /\Amset\.size=[0-9]+ nr_out=[0-9]+\n\z/) { + delete $self->{cidx}->{PENDING}->{$pfx}; + $self->{cidx}->index_next; + } + chomp $l; + $self->{cidx}->progress("$pfx $l"); + } +} + +1; diff --git a/lib/PublicInbox/CmdIPC4.pm b/lib/PublicInbox/CmdIPC4.pm new file mode 100644 index 00000000..fc77bd03 --- /dev/null +++ b/lib/PublicInbox/CmdIPC4.pm @@ -0,0 +1,56 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# callers should use PublicInbox::CmdIPC4->can('send_cmd4') (or recv_cmd4) +# first choice for script/lei front-end and 2nd choice for lei backend +# libsocket-msghdr-perl is in Debian but not many other distros as of 2021. +package PublicInbox::CmdIPC4; +use v5.12; +use Socket qw(SOL_SOCKET SCM_RIGHTS); + +sub sendmsg_retry ($) { + return 1 if $!{EINTR}; + return unless ($!{ENOMEM} || $!{ENOBUFS} || $!{ETOOMANYREFS}); + return if --$_[0] < 0; + warn "# sleeping on sendmsg: $! ($_[0] tries left)\n"; + select(undef, undef, undef, 0.1); + 1; +} + +BEGIN { eval { +require Socket::MsgHdr; # XS +no warnings 'once'; + +# any number of FDs per-sendmsg(2) + buffer +*send_cmd4 = sub ($$$$;$) { # (sock, fds, buf, flags) = @_; + my ($sock, $fds, undef, $flags, $tries) = @_; + $tries //= 50; + my $mh = Socket::MsgHdr->new(buf => $_[2]); + $mh->cmsghdr(SOL_SOCKET, SCM_RIGHTS, pack('i' x scalar(@$fds), @$fds)); + my $s; + do { + $s = Socket::MsgHdr::sendmsg($sock, $mh, $flags); + } while (!defined($s) && sendmsg_retry($tries)); + $s; +}; + +*recv_cmd4 = sub ($$$) { + my ($s, undef, $len) = @_; # $_[1] = destination buffer + my $mh = Socket::MsgHdr->new(buflen => $len, controllen => 256); + my $r; + do { + $r = Socket::MsgHdr::recvmsg($s, $mh, 0); + } while (!defined($r) && $!{EINTR}); + if (!defined($r)) { + $_[1] = ''; + return (undef); + } + $_[1] = $mh->buf; + return () if $r == 0; + my (undef, undef, $data) = $mh->cmsghdr; + defined($data) ? unpack('i' x (length($data) / 4), $data) : (); +}; + +} } # /eval /BEGIN + +1; diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm new file mode 100644 index 00000000..e5fa4480 --- /dev/null +++ b/lib/PublicInbox/CodeSearch.pm @@ -0,0 +1,372 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# read-only external index for coderepos +# currently, it only indexes commits and repository metadata +# (pathname, root commits); not blob contents +package PublicInbox::CodeSearch; +use v5.12; +use parent qw(PublicInbox::Search); +use PublicInbox::Config; +use PublicInbox::Search qw(retry_reopen int_val xap_terms); +use PublicInbox::Compat qw(uniqstr); +use Compress::Zlib qw(uncompress); +use constant { + AT => 0, # author time YYYYMMDDHHMMSS, dt: for mail) + CT => 1, # commit time (Unix time stamp, like TS/rt: in mail) + CIDX_SCHEMA_VER => 1, # brand new schema for code search + # for repos (`Tr'), CT(col=1) is used for the latest tip commit time + # in refs/{heads,tags}. AT(col=0) may be used to store disk usage + # in the future, but disk usage calculation is espensive w/ alternates +}; +our @CODE_NRP; +our @CODE_VMAP = ( + [ AT, 'd:' ], # mairix compat + [ AT, 'dt:' ], # public-inbox mail compat + [ CT, 'ct:' ], +); + +# note: the non-X term prefix allocations are shared with Xapian omega, +# see xapian-applications/omega/docs/termprefixes.rst +# bool_pfx_internal: +# type => 'T', # 'c' - commit, 'r' - repo GIT_DIR +# tags are not indexed, only normal branches (refs/heads/*), not hidden +# 'P' # (pathname) GIT_DIR # uniq +# 'G' # (group) root commit (may have multiple roots) +my %bool_pfx_external = ( + oid => 'Q', # type:commit - git OID hex (40|64)-byte SHA-(1|256) + # type:repo - rel2abs_collapsed(GIT_DIR) + parent => 'XP', + %PublicInbox::Search::PATCH_BOOL_COMMON, +); + +my %prob_prefix = ( # copied from PublicInbox::Search + # do we care about committer? or partial commit OID via Xapian? + # o => 'XQ', # 'oid:' (bool) is exact, 'o:' (prob) can do partial + %PublicInbox::Search::PATCH_PROB_COMMON, + + # default: + '' => 'S A XQUOT XFN ' . $PublicInbox::Search::NON_QUOTED_BODY +); + +sub new { + my ($cls, $dir, $cfg) = @_; + # can't have a PublicInbox::Config here due to circular refs + bless { topdir => $dir, xpfx => "$dir/cidx".CIDX_SCHEMA_VER, + -cfg_f => $cfg->{-f} }, $cls; +} + +sub join_data_key ($) { "join:$_[0]->{-cfg_f}" } + +sub join_data { + my ($self) = @_; + my $key = join_data_key($self); + my $cur = $self->xdb->get_metadata($key) or return; + $cur = eval { PublicInbox::Config::json()->decode(uncompress($cur)) }; + warn "E: $@ (corrupt metadata in `$key' key?)" if $@; + my @m = grep { ref($cur->{$_}) ne 'ARRAY' } qw(ekeys roots ibx2root); + if (@m) { + warn <<EOM; +W: $self->{topdir} join data for $self->{-cfg_f} missing: @m +EOM + undef; + } elsif (@{$cur->{ekeys}} < @{$cur->{ibx2root}}) { + warn <<EOM; +W: $self->{topdir} join data for $self->{-cfg_f} mismatched ekeys and ibx2root +EOM + undef; + } else { + $cur; + } +} + +sub qparse_new ($) { + my ($self) = @_; + my $qp = $self->qp_init_common; + my $cb = $qp->can('add_valuerangeprocessor') // + $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ + if (!@CODE_NRP) { + @CODE_NRP = map { + $PublicInbox::Search::NVRP->new(@$_) + } @CODE_VMAP; + } + $cb->($qp, $_) for @CODE_NRP; + while (my ($name, $pfx) = each %bool_pfx_external) { + $qp->add_boolean_prefix($name, $_) for split(/ /, $pfx); + } + while (my ($name, $pfx) = each %prob_prefix) { + $qp->add_prefix($name, $_) for split(/ /, $pfx); + } + $qp; +} + +sub generate_cxx () { # generates snippet for xap_helper.h + my $ret = <<EOM; +# line ${\__LINE__} "${\__FILE__}" +static NRP *code_nrp[${\scalar(@CODE_VMAP)}]; +static void code_nrp_init(void) +{ +EOM + for (0..$#CODE_VMAP) { + my $x = $CODE_VMAP[$_]; + $ret .= qq{\tcode_nrp[$_] = new NRP($x->[0], "$x->[1]");\n} + } +$ret .= <<EOM; +} + +# line ${\__LINE__} "${\__FILE__}" +static void qp_init_code_search(Xapian::QueryParser *qp) +{ + for (size_t i = 0; i < MY_ARRAY_SIZE(code_nrp); i++) + qp->ADD_RP(code_nrp[i]); +EOM + for my $name (sort keys %bool_pfx_external) { + for (split(/ /, $bool_pfx_external{$name})) { + $ret .= qq{\tqp->add_boolean_prefix("$name", "$_");\n} + } + } + for my $name (sort keys %prob_prefix) { + for (split(/ /, $prob_prefix{$name})) { + $ret .= qq{\tqp->add_prefix("$name", "$_");\n} + } + } + $ret .= "}\n"; +} + +# returns a Xapian::Query to filter by roots +sub roots_filter { # retry_reopen callback + my ($self, $git_dir) = @_; + my $xdb = $self->xdb; + my $P = 'P'.$git_dir; + my ($cur, $end) = ($xdb->postlist_begin($P), $xdb->postlist_end($P)); + if ($cur == $end) { + warn "W: $git_dir not indexed?\n"; + return; + } + my @roots = xap_terms('G', $xdb, $cur->get_docid); + if (!@roots) { + warn "W: $git_dir has no root commits?\n"; + return; + } + my $q = $PublicInbox::Search::X{Query}->new('G'.shift(@roots)); + for my $r (@roots) { + $q = $PublicInbox::Search::X{Query}->new( + PublicInbox::Search::OP_OR(), + $q, 'G'.$r); + } + $q; +} + +sub mset { + my ($self, $qry_str, $opt) = @_; + my $qp = $self->{qp} //= qparse_new($self); + my $qry = $qp->parse_query($qry_str, $self->{qp_flags}); + + # limit to commits with shared roots + if (defined(my $git_dir = $opt->{git_dir})) { + my $rf = retry_reopen($self, \&roots_filter, $git_dir) + or return; + + $qry = $PublicInbox::Search::X{Query}->new( + PublicInbox::Search::OP_FILTER(), + $qry, $rf); + } + + # we only want commits: + $qry = $PublicInbox::Search::X{Query}->new( + PublicInbox::Search::OP_FILTER(), + $qry, 'T'.'c'); + $self->do_enquire($qry, $opt, CT); +} + +sub roots2paths { # for diagnostics + my ($self) = @_; + my $cur = $self->xdb->allterms_begin('G'); + my $end = $self->{xdb}->allterms_end('G'); + my $qrepo = $PublicInbox::Search::X{Query}->new('T'.'r'); + my $enq = $PublicInbox::Search::X{Enquire}->new($self->{xdb}); + $enq->set_weighting_scheme($PublicInbox::Search::X{BoolWeight}->new); + $enq->set_docid_order($PublicInbox::Search::ENQ_ASCENDING); + my %ret; + for (; $cur != $end; $cur++) { + my $G_oidhex = $cur->get_termname; + my $qry = $PublicInbox::Search::X{Query}->new( + PublicInbox::Search::OP_FILTER(), + $qrepo, $G_oidhex); + $enq->set_query($qry); + my ($size, $off, $lim) = (0, 0, 100000); + my $dirs = $ret{substr($G_oidhex, 1)} = []; + do { + my $mset = $enq->get_mset($off += $size, $lim); + for my $x ($mset->items) { + push @$dirs, xap_terms('P', $x->get_document); + } + $size = $mset->size; + } while ($size); + @$dirs = sort(uniqstr(@$dirs)); + } + \%ret; +} + +sub docids_of_git_dir ($$) { + my ($self, $git_dir) = @_; + my @ids = $self->docids_by_postlist('P'.$git_dir); + warn <<"" if @ids > 1; +BUG: (non-fatal) $git_dir indexed multiple times in $self->{topdir} + + @ids; +} + +sub root_oids ($$) { + my ($self, $git_dir) = @_; + my @ids = docids_of_git_dir $self, $git_dir or warn <<""; +BUG? (non-fatal) `$git_dir' not indexed in $self->{topdir} + + my @ret = map { xap_terms('G', $self->xdb, $_) } @ids; + @ret = uniqstr(@ret) if @ids > 1; + @ret; +} + +sub paths2roots { + my ($self, $paths) = @_; + my %ret; + if ($paths) { + for my $p (keys %$paths) { @{$ret{$p}} = root_oids($self, $p) } + } else { + my $tmp = roots2paths($self); + for my $root_oidhex (keys %$tmp) { + my $paths = delete $tmp->{$root_oidhex}; + push @{$ret{$_}}, $root_oidhex for @$paths; + } + @$_ = sort(@$_) for values %ret; + } + \%ret; +} + +sub load_ct { # retry_reopen cb + my ($self, $git_dir) = @_; + my @ids = docids_of_git_dir $self, $git_dir or return; + for (@ids) { + my $doc = $self->get_doc($_) // next; + return int_val($doc, CT); + } +} + +sub load_commit_times { # each_cindex callback + my ($self, $todo) = @_; # todo = [ [ time, git ], [ time, git ] ...] + my (@pending, $rec, $ct); + while ($rec = shift @$todo) { + $ct = $self->retry_reopen(\&load_ct, $rec->[1]->{git_dir}); + if (defined $ct) { + $rec->[0] = $ct; + } else { # may be in another cindex: + push @pending, $rec; + } + } + @$todo = @pending; +} + +sub load_coderepos { # each_cindex callback + my ($self, $pi_cfg) = @_; + my $name = $self->{name}; + my $cfg_f = $pi_cfg->{-f}; + my $lpfx = $self->{localprefix} or return warn <<EOM; +W: cindex.$name.localprefix unset in $cfg_f, ignoring cindex.$name +EOM + my $lre = join('|', map { $_ .= '/'; tr!/!/!s; quotemeta } @$lpfx); + $lre = qr!\A(?:$lre)!; + my $coderepos = $pi_cfg->{-coderepos}; + my $nick_pfx = $name eq '' ? '' : "$name/"; + my %dir2cr; + for my $p ($self->all_terms('P')) { + my $nick = $p; + $nick =~ s!$lre!$nick_pfx!s or next; + $dir2cr{$p} = $coderepos->{$nick} //= do { + my $git = PublicInbox::Git->new($p); + my %dedupe = ($nick => undef); + ($git->{nick}) = keys %dedupe; # for git->pub_urls + $git; + }; + } + my $jd = $self->retry_reopen(\&join_data, $self) or return warn <<EOM; +W: cindex.$name.topdir=$self->{topdir} has no usable join data for $cfg_f +EOM + my ($ekeys, $roots, $ibx2root) = @$jd{qw(ekeys roots ibx2root)}; + my $roots2paths = roots2paths($self); + my %dedupe; # 50x alloc reduction w/ lore + gko mirror (Mar 2024) + for my $root_offs (@$ibx2root) { + my $ekey = shift(@$ekeys) // die 'BUG: {ekeys} empty'; + scalar(@$root_offs) or next; + my $ibx = $pi_cfg->lookup_eidx_key($ekey) // do { + warn "W: `$ekey' gone from $cfg_f\n"; + next; + }; + my $gits = $ibx->{-repo_objs} //= []; + my $cr_score = $ibx->{-cr_score} //= {}; + my %ibx_p2g = map { $_->{git_dir} => $_ } @$gits; + my $ibx2self; # cindex has an association w/ inbox? + for (@$root_offs) { # sorted by $nr descending + my ($nr, $root_off) = @$_; + my $root_oid = $roots->[$root_off] // do { + warn <<EOM; +BUG: root #$root_off invalid in join data for `$ekey' with $cfg_f +EOM + next; + }; + my $git_dirs = $roots2paths->{$root_oid}; + my @gits = map { $dir2cr{$_} // () } @$git_dirs; + $cr_score->{$_->{nick}} //= $nr for @gits; + @$git_dirs = grep { !$ibx_p2g{$_} } @$git_dirs; + # @$git_dirs or warn "W: no matches for $root_oid\n"; + for (@$git_dirs) { + if (my $git = $dir2cr{$_}) { + $ibx_p2g{$_} = $git; + $ibx2self = 1; + if (!$ibx->{-hide_www}) { + # don't stringify $nr directly + # to avoid long-lived PV + my $k = ($nr + 0)."\0". + ($ibx + 0); + my $s = $dedupe{$k} //= + [ $nr, $ibx->{name} ]; + push @{$git->{ibx_score}}, $s; + } + push @$gits, $git; + } else { + warn <<EOM; +W: no coderepo available for $_ (localprefix=@$lpfx) +EOM + } + } + } + if (@$gits) { + push @{$ibx->{-csrch}}, $self if $ibx2self; + } else { + delete $ibx->{-repo_objs}; + delete $ibx->{-cr_score}; + } + } + for my $git (values %dir2cr) { + my $s = $git->{ibx_score}; + @$s = sort { $b->[0] <=> $a->[0] } @$s if $s; + } + my $ALL = $pi_cfg->ALL or return; + my @alls_gits = sort { + scalar @{$b->{ibx_score} // []} <=> + scalar @{$a->{ibx_score} // []} + } values %$coderepos; + my $gits = $ALL->{-repo_objs} //= []; + push @$gits, @alls_gits; + my $cr_score = $ALL->{-cr_score} //= {}; + $cr_score->{$_->{nick}} //= scalar(@{$_->{ibx_score}//[]}) for @$gits; +} + +sub repos_sorted { + my $pi_cfg = shift; + my @recs = map { [ 0, $_ ] } @_; # PublicInbox::Git objects + my @todo = @recs; + $pi_cfg->each_cindex(\&load_commit_times, \@todo); + @recs = sort { $b->[0] <=> $a->[0] } @recs; # sort by commit time +} + +1; diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm new file mode 100644 index 00000000..6d777bf6 --- /dev/null +++ b/lib/PublicInbox/CodeSearchIdx.pm @@ -0,0 +1,1391 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# indexer for git coderepos, just commits and repo paths for now +# this stores normalized absolute paths of indexed GIT_DIR inside +# the DB itself and is designed to handle forks by designating roots +# At minimum, it needs to have the pathnames of all git repos in +# memory at runtime. --join also requires all inbox pathnames to +# be in memory (as it happens when loaded from ~/.public-inbox/config). +# +# Unlike mail search, docid isn't tied to NNTP artnum or IMAP UID, +# there's no serial number dependency at all. The first 32-bits of +# the commit SHA-(1|256) is used to select a shard. +# +# We shard repos using the first 32-bits of sha256($ABS_GIT_DIR) +# +# --join associates root commits of coderepos to inboxes based on prefixes. +# +# Internally, each inbox is assigned a non-negative integer index ($IBX_OFF), +# and each root commit object ID (SHA-1/SHA-256 hex) is also assigned +# a non-negative integer index ($ROOT_COMMIT_OID_ID). +# +# join dumps to 2 intermediate files in $TMPDIR: +# +# * to_root_off - each line is of the format: +# +# $PFX @ROOT_COMMIT_OID_OFFS +# +# * to_ibx_off - each line is of the format: +# +# $PFX @IBX_OFFS +# +# $IBX_OFFS is a comma-delimited list of integers ($IBX_ID) +# The $IBX_OFF here is ephemeral (per-join_data) and NOT related to +# the `ibx_off' column of `over.sqlite3' for extindex. +# @ROOT_COMMIT_OID_OFFS is space-delimited +# In both cases, $PFX is typically the value of the 7-(hex)char dfpost +# XDFPOST but it can be configured to use any combination of patchid, +# dfpre, dfpost or dfblob. +# +# WARNING: this is vulnerable to arbitrary memory usage attacks if we +# attempt to index or join against malicious coderepos with +# thousands/millions of root commits. Most coderepos have only one +# root commit, some have several: git.git currently has 7, +# torvalds/linux.git has 4. +# --max-size= is required to keep memory usage reasonable for gigantic +# commits. +# +# See PublicInbox::CodeSearch (read-only API) for more +package PublicInbox::CodeSearchIdx; +use v5.12; +# parent order matters, we want ->DESTROY from IPC, not SearchIdx +use parent qw(PublicInbox::CodeSearch PublicInbox::IPC PublicInbox::SearchIdx); +use PublicInbox::DS qw(awaitpid); +use PublicInbox::PktOp; +use PublicInbox::IPC qw(nproc_shards); +use POSIX qw(WNOHANG SEEK_SET strftime); +use File::Path (); +use File::Spec (); +use List::Util qw(max); +use PublicInbox::SHA qw(sha256_hex sha_all); +use PublicInbox::Search qw(xap_terms); +use PublicInbox::SearchIdx qw(add_val); +use PublicInbox::Config qw(glob2re rel2abs_collapsed); +use PublicInbox::Spawn qw(which spawn popen_rd); +use PublicInbox::OnDestroy; +use PublicInbox::CidxLogP; +use PublicInbox::CidxComm; +use PublicInbox::Git qw(%OFMT2HEXLEN); +use PublicInbox::Compat qw(uniqstr); +use PublicInbox::Aspawn qw(run_await); +use Compress::Zlib qw(compress); +use Carp qw(croak); +use Time::Local qw(timegm); +use autodie qw(close pipe open sysread seek sysseek send); +our $DO_QUIT = 15; # signal number +our ( + $LIVE_JOBS, # integer + $GITS_NR, # number of coderepos + $MY_SIG, # like %SIG + $SIGSET, + $TXN_BYTES, # number of bytes in current shard transaction + $BATCH_BYTES, + @RDONLY_XDB, # Xapian::Database + @IDX_SHARDS, # clones of self + $MAX_SIZE, + $REINDEX, # PublicInbox::SharedKV + @GIT_DIR_GONE, # [ git_dir1, git_dir2 ] + $PRUNE_DONE, # marks off prune completions + $NCHANGE, # current number of changes + $NPROC, + $XHC, # XapClient + $REPO_CTX, # current repo being indexed in shards + $IDXQ, # PublicInbox::Git object arrayref + $SCANQ, # PublicInbox::Git object arrayref + %ALT_FH, # hexlen => tmp IO for TMPDIR git alternates + $TMPDIR, # File::Temp->newdir object for prune + @PRUNEQ, # GIT_DIRs to prepare for pruning + %TODO, @IBXQ, @IBX, + @JOIN, # join(1) command for --join + $CMD_ENV, # env for awk(1), comm(1), sort(1) commands during prune + @AWK, @COMM, @SORT, # awk(1), comm(1), sort(1) commands + %JOIN, # CLI --join= suboptions + @JOIN_PFX, # any combination of XDFID, XDFPRE, XDFPOST + @JOIN_DT, # YYYYmmddHHMMSS for dt: + $QRY_STR, # common query string for both code and inbox associations + $DUMP_IBX_WPIPE, # goes to sort(1) + $ANY_SHARD, # shard round-robin for scan fingerprinting + @OFF2ROOT, + $GIT_VER, + @NO_ABBREV, +); + +# stop walking history if we see >$SEEN_MAX existing commits, this assumes +# branches don't diverge by more than this number of commits... +# git walks commits quickly if it doesn't have to read trees +our $SEEN_MAX = 100000; + +# window for commits/emails to determine a inbox <-> coderepo association +my $JOIN_WINDOW = 50000; + +our @PRUNE_BATCH = qw(cat-file --batch-all-objects --batch-check); + +# TODO: do we care about committer name + email? or tree OID? +my @FMT = qw(H P ct an ae at s b); # (b)ody must be last + +# git log --stdin buffers all commits before emitting, thus --reverse +# doesn't incur extra overhead. We use --reverse to keep Xapian docids +# increasing so we may be able to avoid sorting results in some cases +my @LOG_STDIN = (qw(log --no-decorate --no-color --no-notes -p --stat -M + --reverse --stdin --no-walk=unsorted), '--pretty=format:%n%x00'. + join('%n', map { "%$_" } @FMT)); + +sub new { + my (undef, $dir, $opt) = @_; + my $l = $opt->{indexlevel} // 'full'; + $l !~ $PublicInbox::SearchIdx::INDEXLEVELS and + die "invalid indexlevel=$l\n"; + $l eq 'basic' and die "E: indexlevel=basic not supported\n"; + my $self = bless { + xpfx => "$dir/cidx". PublicInbox::CodeSearch::CIDX_SCHEMA_VER, + cidx_dir => $dir, + creat => 1, # TODO: get rid of this, should be implicit + transact_bytes => 0, # for checkpoint + total_bytes => 0, # for lock_release + current_info => '', + parallel => 1, + -opt => $opt, + lock_path => "$dir/cidx.lock", + }, __PACKAGE__; + $self->{nshard} = count_shards($self) || + nproc_shards({nproc => $opt->{jobs}}); + $self->{-no_fsync} = 1 if !$opt->{fsync}; + $self->{-dangerous} = 1 if $opt->{dangerous}; + $self; +} + +# This is similar to uniq(1) on the first column, but combines the +# contents of subsequent columns using $OFS. +our @UNIQ_FOLD = ($^X, $^W ? ('-w') : (), qw(-MList::Util=uniq -ane), <<'EOM'); +BEGIN { $ofs = $ENV{OFS} // ','; $apfx = '' } +if ($F[0] eq $apfx) { + shift @F; + push @ids, @F; +} else { + print $apfx.' '.join($ofs, uniq(@ids))."\n" if @ids; + ($apfx, @ids) = @F; +} +END { print $apfx.' '.join($ofs, uniq(@ids))."\n" if @ids } +EOM + +# TODO: may be used for reshard/compact +sub count_shards { scalar($_[0]->xdb_shards_flat) } + +sub update_commit ($$$) { + my ($self, $cmt, $roots) = @_; # fields from @FMT + my $x = 'Q'.$cmt->{H}; + my ($docid, @extra) = sort { $a <=> $b } $self->docids_by_postlist($x); + @extra and warn "W: $cmt->{H} indexed multiple times, pruning ", + join(', ', map { "#$_" } @extra), "\n"; + $self->{xdb}->delete_document($_) for @extra; + my $doc = $PublicInbox::Search::X{Document}->new; + $doc->add_boolean_term($x); + $doc->add_boolean_term('G'.$_) for @$roots; + $doc->add_boolean_term('XP'.$_) for split(/ /, $cmt->{P}); + $doc->add_boolean_term('T'.'c'); + + # Author-Time is compatible with dt: for mail search schema_version=15 + add_val($doc, PublicInbox::CodeSearch::AT, + POSIX::strftime('%Y%m%d%H%M%S', gmtime($cmt->{at}))); + + # Commit-Time is the fallback used by rt: (TS) for mail search: + add_val($doc, PublicInbox::CodeSearch::CT, $cmt->{ct}); + + $self->term_generator->set_document($doc); + + # email address is always indexed with positional data for usability + $self->index_phrase("$cmt->{an} <$cmt->{ae}>", 1, 'A'); + + $x = $cmt->{'s'}; + $self->index_text($x, 1, 'S') if $x =~ /\S/s; + $doc->set_data($x); # subject is the first (and currently only) line + + $x = delete $cmt->{b}; + $self->index_body_text($doc, \$x) if $x =~ /\S/s; + defined($docid) ? $self->{xdb}->replace_document($docid, $doc) : + $self->{xdb}->add_document($doc); +} + +sub progress { + my ($self, @msg) = @_; + my $pr = $self->{-opt}->{-progress} or return; + $pr->($self->{git} ? ("$self->{git}->{git_dir}: ") : (), @msg, "\n"); +} + +sub check_objfmt_status ($$$) { + my ($git, $chld_err, $fmt) = @_; + my ($status, $sig) = ($chld_err >> 8, $chld_err & 127); + if (!$sig && $status == 1) { # unset, default is '' (SHA-1) + $fmt = 'sha1'; + } elsif (!$sig && $status == 0) { + chomp($fmt ||= 'sha1'); + } + $fmt // warn("git --git-dir=$git->{git_dir} config \$?=$chld_err"); + $fmt; +} + +sub store_repo { # wq_io_do, sends docid back + my ($self, $repo) = @_; + my $op_p = delete($self->{0}) // die 'BUG: no {0} op_p'; + my $git = bless $repo, 'PublicInbox::Git'; + my $rd = $git->popen(qw(config extensions.objectFormat)); + $self->begin_txn_lazy; + $self->{xdb}->delete_document($_) for @{$repo->{to_delete}}; + my $doc = $PublicInbox::Search::X{Document}->new; + add_val($doc, PublicInbox::CodeSearch::CT, $repo->{ct}); + $doc->add_boolean_term("P$repo->{git_dir}"); + $doc->add_boolean_term('T'.'r'); + $doc->add_boolean_term('G'.$_) for @{$repo->{roots}}; + $doc->set_data($repo->{fp}); # \n delimited + my $fmt = readline($rd); + $rd->close; + $fmt = check_objfmt_status $git, $?, $fmt; + $OFMT2HEXLEN{$fmt} // warn <<EOM; # store unknown formats anyways +E: unknown extensions.objectFormat=$fmt in $repo->{git_dir} +EOM + $doc->add_boolean_term('H'.$fmt); + my $did = $repo->{docid}; + $did ? $self->{xdb}->replace_document($did, $doc) + : ($did = $self->{xdb}->add_document($doc)); + send($op_p, "repo_stored $did", 0); +} + +sub cidx_ckpoint ($;$) { + my ($self, $msg) = @_; + progress($self, $msg) if defined($msg); + $TXN_BYTES = $BATCH_BYTES; # reset + return if $PublicInbox::Search::X{CLOEXEC_UNSET}; + $self->commit_txn_lazy; + $self->begin_txn_lazy; +} + +sub truncate_cmt ($$) { + my ($cmt) = @_; # _[1] is $buf (giant) + my ($orig_len, $len); + $len = $orig_len = length($_[1]); + @$cmt{@FMT} = split(/\n/, $_[1], scalar(@FMT)); + undef $_[1]; + $len -= length($cmt->{b}); + + # try to keep the commit message body. + # n.b. this diffstat split may be unreliable but it's not worth + # perfection for giant commits: + my ($bdy) = split(/^---\n/sm, delete($cmt->{b}), 2); + if (($len + length($bdy)) <= $MAX_SIZE) { + $len += length($bdy); + $cmt->{b} = $bdy; + warn <<EOM; +W: $cmt->{H}: truncated body ($orig_len => $len bytes) +W: to be under --max-size=$MAX_SIZE +EOM + } else { + $cmt->{b} = ''; + warn <<EOM; +W: $cmt->{H}: deleted body ($orig_len => $len bytes) +W: to be under --max-size=$MAX_SIZE +EOM + } + $len; +} + +sub cidx_reap_log { # awaitpid cb + my ($pid, $cmd, $self, $op_p) = @_; + if (!$? || ($DO_QUIT && (($? & 127) == $DO_QUIT || + ($? & 127) == POSIX::SIGPIPE))) { + send($op_p, "shard_done $self->{shard}", 0); + } else { + warn "W: @$cmd (\$?=$?)\n"; + $self->{xdb}->cancel_transaction; + } +} + +sub shard_index { # via wq_io_do in IDX_SHARDS + my ($self, $git, $roots) = @_; + + my $in = delete($self->{0}) // die 'BUG: no {0} input'; + my $op_p = delete($self->{1}) // die 'BUG: no {1} op_p'; + sysseek($in, 0, SEEK_SET); + my $cmd = $git->cmd(@NO_ABBREV, @LOG_STDIN); + my $rd = popen_rd($cmd, undef, { 0 => $in }, + \&cidx_reap_log, $cmd, $self, $op_p); + PublicInbox::CidxLogP->new($rd, $self, $git, $roots); + # CidxLogP->event_step will call cidx_read_log_p once there's input +} + +# sharded reader for `git log --pretty=format: --stdin' +sub cidx_read_log_p { + my ($self, $log_p, $rd) = @_; + my $git = delete $log_p->{git} // die 'BUG: no {git}'; + local $self->{current_info} = "$git->{git_dir} [$self->{shard}]"; + my $roots = delete $log_p->{roots} // die 'BUG: no {roots}'; + # local-ized in parent before fork + $TXN_BYTES = $BATCH_BYTES; + local $self->{git} = $git; # for patchid + return if $DO_QUIT; + my $nr = 0; + + # a patch may have \0, see c4201214cbf10636e2c1ab9131573f735b42c8d4 + # in linux.git, so we use $/ = "\n\0" to check end-of-patch + my $FS = "\n\0"; + my $len; + my $cmt = {}; + local $/ = $FS; + my $buf = <$rd> // return; # leading $FS + $buf eq $FS or die "BUG: not LF-NUL: $buf\n"; + $self->begin_txn_lazy; + while (!$DO_QUIT && defined($buf = <$rd>)) { + chomp($buf); + $/ = "\n"; + $len = length($buf); + if (defined($MAX_SIZE) && $len > $MAX_SIZE) { + $len = truncate_cmt($cmt, $buf); + } else { + @$cmt{@FMT} = split(/\n/, $buf, scalar(@FMT)); + } + if (($TXN_BYTES -= $len) <= 0) { + cidx_ckpoint($self, "[$self->{shard}] $nr"); + $TXN_BYTES -= $len; # len may be huge, >TXN_BYTES; + } + update_commit($self, $cmt, $roots); + ++$nr; + cidx_ckpoint($self, "[$self->{shard}] $nr") if $TXN_BYTES <= 0; + $/ = $FS; + } + # return and wait for cidx_reap_log +} + +sub shard_done { # called via PktOp on shard_index completion + my ($self, $repo_ctx, $on_destroy, $n) = @_; + $repo_ctx->{shard_ok}->{$n} = 1; +} + +sub repo_stored { + my ($self, $repo_ctx, $drs, $did) = @_; + # check @IDX_SHARDS instead of DO_QUIT to avoid wasting prior work + # because shard_commit is fast + return unless @IDX_SHARDS; + $did > 0 or die "BUG: $repo_ctx->{repo}->{git_dir}: docid=$did"; + my ($c, $p) = PublicInbox::PktOp->pair; + $c->{ops}->{shard_done} = [ $self, $repo_ctx, + on_destroy(\&next_repos, $repo_ctx, $drs)]; + # shard_done fires when all shards are committed + my @active = keys %{$repo_ctx->{active}}; + $IDX_SHARDS[$_]->wq_io_do('shard_commit', [ $p->{op_p} ]) for @active; +} + +sub prune_done { # called via prune_do completion + my ($self, $drs, $n) = @_; + return if $DO_QUIT || !$PRUNE_DONE; + die "BUG: \$PRUNE_DONE->[$n] already defined" if $PRUNE_DONE->[$n]; + $PRUNE_DONE->[$n] = 1; + if (grep(defined, @$PRUNE_DONE) == @IDX_SHARDS) { + progress($self, 'prune done'); + index_next($self); # may kick dump_roots_start + } +} + +sub seen ($$) { + my ($xdb, $q) = @_; # $q = "Q$COMMIT_HASH" + for (1..100) { + my $ret = eval { + $xdb->postlist_begin($q) != $xdb->postlist_end($q); + }; + return $ret unless $@; + if (ref($@) =~ /\bDatabaseModifiedError\b/) { + $xdb->reopen; + } else { + Carp::croak($@); + } + } + Carp::croak('too many Xapian DB modifications in progress'); +} + +# used to select the shard for a GIT_DIR +sub git_dir_hash ($) { hex(substr(sha256_hex($_[0]), 0, 8)) } + +sub _cb { # run_await cb + my ($pid, $cmd, undef, $opt, $cb, $self, $git, @arg) = @_; + return if $DO_QUIT; + return $cb->($opt, $self, $git, @arg) if $opt->{quiet}; + $? ? ($git->{-cidx_err} = warn("W: @$cmd (\$?=$?)\n")) : + $cb->($opt, $self, $git, @arg); +} + +sub run_git { + my ($cmd, $opt, $cb, $self, $git, @arg) = @_; + run_await($git->cmd(@$cmd), undef, $opt, \&_cb, $cb, $self, $git, @arg) +} + +# this is different from the grokmirror-compatible fingerprint since we +# only care about --heads (branches) and --tags, and not even their names +sub fp_start ($$) { + my ($self, $git) = @_; + return if $DO_QUIT; + open my $refs, '+>', undef; + $git->{-repo}->{refs} = $refs; + my ($c, $p) = PublicInbox::PktOp->pair; + my $next_on_err = on_destroy \&index_next, $self; + $c->{ops}->{fp_done} = [ $self, $git, $next_on_err ]; + $IDX_SHARDS[++$ANY_SHARD % scalar(@IDX_SHARDS)]->wq_io_do('fp_async', + [ $p->{op_p}, $refs ], $git->{git_dir}) +} + +sub fp_async { # via wq_io_do in worker + my ($self, $git_dir) = @_; + my $op_p = delete $self->{0} // die 'BUG: no {0} op_p'; + my $refs = delete $self->{1} // die 'BUG: no {1} refs'; + my $git = PublicInbox::Git->new($git_dir); + run_git([qw(show-ref --heads --tags --hash)], { 1 => $refs }, + \&fp_async_done, $self, $git, $op_p); +} + +sub fp_async_done { # run_git cb from worker + my ($opt, $self, $git, $op_p) = @_; + my $refs = delete $opt->{1} // 'BUG: no {-repo}->{refs}'; + sysseek($refs, 0, SEEK_SET); + send($op_p, 'fp_done '.sha_all(256, $refs)->hexdigest, 0); +} + +sub fp_done { # called parent via PktOp by fp_async_done + my ($self, $git, $next_on_err, $hex) = @_; + $next_on_err->cancel; + return if $DO_QUIT; + $git->{-repo}->{fp} = $hex; + my $n = git_dir_hash($git->{git_dir}) % scalar(@RDONLY_XDB); + my $shard = bless { %$self, shard => $n }, ref($self); + $git->{-repo}->{shard_n} = $n; + delete @$shard{qw(lockfh lock_path)}; + local $shard->{xdb} = $RDONLY_XDB[$n] // die "BUG: shard[$n] undef"; + $shard->retry_reopen(\&check_existing, $self, $git); +} + +sub check_existing { # retry_reopen callback + my ($shard, $self, $git) = @_; + my @docids = $shard->docids_of_git_dir($git->{git_dir}); + my $docid = shift(@docids) // return prep_repo($self, $git); # new repo + my $doc = $shard->get_doc($docid) // + die "BUG: no #$docid ($git->{git_dir})"; + my $old_fp = $REINDEX ? "\0invalid" : $doc->get_data; + if ($old_fp eq $git->{-repo}->{fp}) { # no change + delete $git->{-repo}; + return index_next($self); + } + $git->{-repo}->{docid} = $docid; + if (@docids) { + warn "BUG: $git->{git_dir} indexed multiple times, culling\n"; + $git->{-repo}->{to_delete} = \@docids; # XXX needed? + } + prep_repo($self, $git); +} + +sub partition_refs ($$$) { + my ($self, $git, $refs) = @_; # show-ref --heads --tags --hash output + sysseek($refs, 0, SEEK_SET); + my $rfh = $git->popen(qw(rev-list --stdin), undef, { 0 => $refs }); + my $seen = 0; + my @shard_in = map { + $_->reopen; + open my $fh, '+>', undef; + $fh; + } @RDONLY_XDB; + + my $n0 = $NCHANGE; + while (defined(my $cmt = <$rfh>)) { + chomp $cmt; + my $n = hex(substr($cmt, 0, 8)) % scalar(@RDONLY_XDB); + if ($REINDEX && $REINDEX->set_maybe(pack('H*', $cmt), '')) { + say { $shard_in[$n] } $cmt; + ++$NCHANGE; + } elsif (seen($RDONLY_XDB[$n], 'Q'.$cmt)) { + last if ++$seen > $SEEN_MAX; + } else { + say { $shard_in[$n] } $cmt; + ++$NCHANGE; + $seen = 0; + } + if ($DO_QUIT) { + $rfh->close; + return (); + } + } + $rfh->close; + return () if $DO_QUIT; + if (!$? || (($? & 127) == POSIX::SIGPIPE && $seen > $SEEN_MAX)) { + my $n = $NCHANGE - $n0; + progress($self, "$git->{git_dir}: $n commits") if $n; + return @shard_in; + } + die "git --git-dir=$git->{git_dir} rev-list: \$?=$?\n"; +} + +sub shard_commit { # via wq_io_do + my ($self) = @_; + my $op_p = delete($self->{0}) // die 'BUG: no {0} op_p'; + $self->commit_txn_lazy; + send($op_p, "shard_done $self->{shard}", 0); +} + +sub dump_roots_start { + my ($self, $do_join) = @_; + return if $DO_QUIT; + $XHC //= PublicInbox::XapClient::start_helper("-j$NPROC"); + $do_join // die 'BUG: no $do_join'; + progress($self, 'dumping IDs from coderepos'); + local $self->{xdb}; + @OFF2ROOT = $self->all_terms('G'); + my $root2id = "$TMPDIR/root2id"; + open my $fh, '>', $root2id; + my $nr = -1; + for (@OFF2ROOT) { print $fh $_, "\0", ++$nr, "\0" } # mmap-friendly + close $fh; + # dump_roots | sort -k1,1 | OFS=' ' uniq_fold >to_root_off + my ($sort_opt, $fold_opt); + pipe(local $sort_opt->{0}, my $sort_w); + pipe(local $fold_opt->{0}, local $sort_opt->{1}); + my @sort = (@SORT, '-k1,1'); + my $dst = "$TMPDIR/to_root_off"; + open $fold_opt->{1}, '>', $dst; + my $fold_env = { %$CMD_ENV, OFS => ' ' }; + run_await(\@sort, $CMD_ENV, $sort_opt, \&cmd_done, $do_join); + run_await(\@UNIQ_FOLD, $fold_env, $fold_opt, \&cmd_done, $do_join); + my $window = $JOIN{window} // $JOIN_WINDOW; + my @m = $window <= 0 ? () : ('-m', $window); + my @arg = ((map { ('-A', $_) } @JOIN_PFX), '-c', + @m, $root2id, $QRY_STR); + for my $d ($self->shard_dirs) { + pipe(my $err_r, my $err_w); + $XHC->mkreq([$sort_w, $err_w], qw(dump_roots -d), $d, @arg); + my $desc = "dump_roots $d"; + $self->{PENDING}->{$desc} = $do_join; + PublicInbox::CidxXapHelperAux->new($err_r, $self, $desc); + } + progress($self, 'waiting on dump_roots sort'); +} + +sub dump_ibx { # sends to xap_helper.h + my ($self, $ibx_off) = @_; + my $ibx = $IBX[$ibx_off] // die "BUG: no IBX[$ibx_off]"; + my $ekey = $ibx->eidx_key; + my $srch = $ibx->isrch or return warn <<EOM; +W: $ekey not indexed for search +EOM + # note: we don't send `-m MAX' to dump_ibx since we have to + # post-filter non-patch messages for now... + my @cmd = ('dump_ibx', $srch->xh_args, + (map { ('-A', $_) } @JOIN_PFX), $ibx_off, $QRY_STR); + pipe(my $r, my $w); + $XHC->mkreq([$DUMP_IBX_WPIPE, $w], @cmd); + $self->{PENDING}->{$ekey} = $TODO{do_join}; + PublicInbox::CidxXapHelperAux->new($r, $self, $ekey); +} + +sub dump_ibx_start { + my ($self, $do_join) = @_; + return if $DO_QUIT; + $XHC //= PublicInbox::XapClient::start_helper("-j$NPROC"); + my ($sort_opt, $fold_opt); + pipe(local $sort_opt->{0}, $DUMP_IBX_WPIPE); + pipe(local $fold_opt->{0}, local $sort_opt->{1}); + my @sort = (@SORT, '-k1,1'); # sort only on JOIN_PFX + # pipeline: dump_ibx | sort -k1,1 | uniq_fold >to_ibx_off + open $fold_opt->{1}, '>', "$TMPDIR/to_ibx_off"; + run_await(\@sort, $CMD_ENV, $sort_opt, \&cmd_done, $do_join); + run_await(\@UNIQ_FOLD, $CMD_ENV, $fold_opt, \&cmd_done, $do_join); +} + +sub index_next ($) { + my ($self) = @_; + return if $DO_QUIT; + if ($IDXQ && @$IDXQ) { + index_repo($self, shift @$IDXQ); + } elsif ($SCANQ && @$SCANQ) { + fp_start $self, shift @$SCANQ; + } elsif ($TMPDIR) { + delete $TODO{dump_roots_start}; + delete $TODO{dump_ibx_start}; # runs OnDestroy once + return dump_ibx($self, shift @IBXQ) if @IBXQ; + undef $DUMP_IBX_WPIPE; # done dumping inboxes + delete $TODO{do_join}; + } + # else: wait for shards_active (post_loop_do) callback +} + +sub next_repos { # OnDestroy cb + my ($repo_ctx, $drs) = @_; + my ($self, $repo, $active) = @$repo_ctx{qw(self repo active)}; + progress($self, "$repo->{git_dir}: done"); + return if $DO_QUIT || !$REPO_CTX; + my $n = grep { ! $repo_ctx->{shard_ok}->{$_} } keys %$active; + die "E: $repo->{git_dir} $n shards failed" if $n; + $REPO_CTX == $repo_ctx or die "BUG: $REPO_CTX != $repo_ctx"; + $REPO_CTX = undef; + index_next($self); +} + +sub index_done { # OnDestroy cb called when done indexing each code repo + my ($repo_ctx, $drs) = @_; + return if $DO_QUIT; + my ($self, $repo, $active) = @$repo_ctx{qw(self repo active)}; + # $active may be undef here, but it's fine to vivify + my $n = grep { ! $repo_ctx->{shard_ok}->{$_} } keys %$active; + die "E: $repo->{git_dir} $n shards failed" if $n; + $repo_ctx->{shard_ok} = {}; # reset for future shard_done + $n = $repo->{shard_n}; + $repo_ctx->{active}->{$n} = undef; # may vivify $repo_ctx->{active} + my ($c, $p) = PublicInbox::PktOp->pair; + $c->{ops}->{repo_stored} = [ $self, $repo_ctx, $drs ]; + $IDX_SHARDS[$n]->wq_io_do('store_repo', [ $p->{op_p} ], $repo); + # repo_stored will fire once store_repo is done +} + +sub index_repo { + my ($self, $git) = @_; + return if $DO_QUIT; + my $repo = $git->{-repo} // die 'BUG: no {-repo}'; + return index_next($self) if $git->{-cidx_err}; + if (!defined($repo->{ct})) { + warn "W: $git->{git_dir} has no commits, skipping\n"; + return index_next($self); + } + return push(@$IDXQ, $git) if $REPO_CTX; # busy + delete $git->{-repo}; + my $roots_fh = delete $repo->{roots_fh} // die 'BUG: no {roots_fh}'; + seek($roots_fh, 0, SEEK_SET); + chomp(my @roots = PublicInbox::IO::read_all $roots_fh); + if (!@roots) { + warn("E: $git->{git_dir} has no root commits\n"); + return index_next($self); + } + $repo->{roots} = \@roots; + local $self->{current_info} = $git->{git_dir}; + my @shard_in = partition_refs($self, $git, delete($repo->{refs})); + $repo->{git_dir} = $git->{git_dir}; + my $repo_ctx = $REPO_CTX = { self => $self, repo => $repo }; + delete $git->{-cidx_gits_fini}; # may fire gits_fini + my $drs = delete $git->{-cidx_dump_roots_start}; + my $index_done = on_destroy \&index_done, $repo_ctx, $drs; + my ($c, $p) = PublicInbox::PktOp->pair; + $c->{ops}->{shard_done} = [ $self, $repo_ctx, $index_done ]; + for my $n (0..$#shard_in) { + $shard_in[$n]->flush or die "flush shard[$n]: $!"; + -s $shard_in[$n] or next; + last if $DO_QUIT; + $IDX_SHARDS[$n]->wq_io_do('shard_index', + [ $shard_in[$n], $p->{op_p} ], + $git, \@roots); + $repo_ctx->{active}->{$n} = undef; + } + # shard_done fires when shard_index is done +} + +sub ct_fini { # run_git cb + my ($opt, $self, $git, $index_repo) = @_; + my ($ct) = split(/\s+/, ${$opt->{1}}); # drop TZ + LF + $git->{-repo}->{ct} = $ct + 0; +} + +# TODO: also index gitweb.owner and the full fingerprint for grokmirror? +sub prep_repo ($$) { + my ($self, $git) = @_; + return if $DO_QUIT; + my $index_repo = on_destroy \&index_repo, $self, $git; + my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}'; + sysseek($refs, 0, SEEK_SET); + open my $roots_fh, '+>', undef; + $git->{-repo}->{roots_fh} = $roots_fh; + run_git([ qw(rev-list --stdin --max-parents=0) ], + { 0 => $refs, 1 => $roots_fh }, \&PublicInbox::Config::noop, + $self, $git, $index_repo); + run_git([ qw[for-each-ref --sort=-committerdate + --format=%(committerdate:raw) --count=1 + refs/heads/ refs/tags/] ], undef, # capture like qx + \&ct_fini, $self, $git, $index_repo); +} + +# for PublicInbox::SearchIdx `git patch-id' call and with_umask +sub git { $_[0]->{git} } + +sub load_existing ($) { # for -u/--update + my ($self) = @_; + my $dirs = $self->{git_dirs} //= []; + if ($self->{-opt}->{update} || $self->{-opt}->{prune}) { + local $self->{xdb}; + $self->xdb or + die "E: $self->{cidx_dir} non-existent for --update\n"; + my @cur = grep { + if (-e $_) { + 1; + } else { + push @GIT_DIR_GONE, $_; + undef; + } + } $self->all_terms('P'); + if (@GIT_DIR_GONE && !$self->{-opt}->{prune}) { + warn "W: the following repos no longer exist:\n", + (map { "W:\t$_\n" } @GIT_DIR_GONE), + "W: use --prune to remove them from ", + $self->{cidx_dir}, "\n"; + } + push @$dirs, @cur; + } + @$dirs = uniqstr @$dirs; +} + +# SIG handlers: +sub shard_quit { $DO_QUIT = POSIX->can("SIG$_[0]")->() } +sub shard_usr1 { $TXN_BYTES = -1 } + +sub cidx_init ($) { + my ($self) = @_; + my $dir = $self->{cidx_dir}; + unless (-d $dir) { + warn "# creating $dir\n" if !$self->{-opt}->{quiet}; + File::Path::mkpath($dir); + } + $self->lock_acquire; + my @shards; + my $l = $self->{indexlevel} //= $self->{-opt}->{indexlevel}; + + for my $n (0..($self->{nshard} - 1)) { + my $shard = bless { %$self, shard => $n }, ref($self); + delete @$shard{qw(lockfh lock_path)}; + my $xdb = $shard->idx_acquire; + if (!$n) { + if (($l // '') eq 'medium') { + $xdb->set_metadata('indexlevel', $l); + } elsif (($l // '') eq 'full') { + $xdb->set_metadata('indexlevel', ''); # unset + } + $l ||= $xdb->get_metadata('indexlevel') || 'full'; + } + $shard->{indexlevel} = $l; + $shard->idx_release; + $shard->wq_workers_start("cidx shard[$n]", 1, $SIGSET, { + siblings => \@shards, # for ipc_atfork_child + }, \&shard_done_wait, $self); + push @shards, $shard; + } + $self->{indexlevel} //= $l; + # this warning needs to happen after idx_acquire + state $once; + warn <<EOM if $PublicInbox::Search::X{CLOEXEC_UNSET} && !$once++; +W: Xapian v1.2.21..v1.2.24 were missing close-on-exec on OFD locks, +W: memory usage may be high for large indexing runs +EOM + @shards; +} + +# called when all git coderepos are done +sub gits_fini { + undef $GITS_NR; + PublicInbox::DS::enqueue_reap(); # kick @post_loop_do +} + +sub scan_git_dirs ($) { + my ($self) = @_; + @$SCANQ = () unless $self->{-opt}->{scan}; + $GITS_NR = @$SCANQ or return; + my $gits_fini = on_destroy \&gits_fini; + $_->{-cidx_gits_fini} = $gits_fini for @$SCANQ; + if (my $drs = $TODO{dump_roots_start}) { + $_->{-cidx_dump_roots_start} = $drs for @$SCANQ; + } + progress($self, "scanning $GITS_NR code repositories..."); +} + +sub prune_init { # via wq_io_do in IDX_SHARDS + my ($self) = @_; + $self->{nr_prune} = 0; + $TXN_BYTES = $BATCH_BYTES; + $self->begin_txn_lazy; +} + +sub prune_one { # via wq_io_do in IDX_SHARDS + my ($self, $term) = @_; + my @docids = $self->docids_by_postlist($term); + for (@docids) { + $TXN_BYTES -= $self->{xdb}->get_doclength($_) * 42; + $self->{xdb}->delete_document($_); + } + ++$self->{nr_prune}; + $TXN_BYTES < 0 and + cidx_ckpoint($self, "prune [$self->{shard}] $self->{nr_prune}"); +} + +sub prune_commit { # via wq_io_do in IDX_SHARDS + my ($self) = @_; + my $prune_op_p = delete $self->{0} // die 'BUG: no {0} op_p'; + my $nr = delete $self->{nr_prune} // die 'BUG: nr_prune undef'; + cidx_ckpoint($self, "prune [$self->{shard}] $nr done") if $nr; + send($prune_op_p, "prune_done $self->{shard}", 0); +} + +sub shards_active { # post_loop_do + return if $DO_QUIT; + return if grep(defined, $PRUNE_DONE, $SCANQ, $IDXQ) != 3; + return 1 if grep(defined, @$PRUNE_DONE) != @IDX_SHARDS; + return 1 if $GITS_NR || scalar(@$IDXQ) || $REPO_CTX; + return 1 if @IBXQ || keys(%TODO); + for my $s (grep { $_->{-wq_s1} } @IDX_SHARDS) { + $s->{-cidx_quit} = 1 if defined($s->{-wq_s1}); + $s->wq_close; # may recurse via awaitpid outside of event_loop + } + scalar(grep { $_->{-cidx_quit} } @IDX_SHARDS); +} + +# signal handlers +sub kill_shards { $_->wq_kill(@_) for (@IDX_SHARDS) } + +sub parent_quit { + $DO_QUIT = POSIX->can("SIG$_[0]")->(); + $XHC = 0; # stops the process + kill_shards(@_); + warn "# SIG$_[0] received, quitting...\n"; +} + +sub prep_umask ($) { + my ($self) = @_; + if ($self->{-cidx_internal}) { # respect core.sharedRepository + @{$self->{git_dirs}} == 1 or die 'BUG: only for GIT_DIR'; + local $self->{git} = + PublicInbox::Git->new($self->{git_dirs}->[0]); + $self->with_umask; + } elsif (-d $self->{cidx_dir}) { # respect existing perms + my @st = stat(_); + my $um = (~$st[2] & 0777); + $self->{umask} = $um; # for SearchIdx->with_umask + umask == $um or progress($self, 'using umask from ', + $self->{cidx_dir}, ': ', + sprintf('0%03o', $um)); + on_destroy \&CORE::umask, umask($um); + } else { + $self->{umask} = umask; # for SearchIdx->with_umask + undef; + } +} + +sub prep_alternate_end ($$) { + my ($objdir, $fmt) = @_; + my $hexlen = $OFMT2HEXLEN{$fmt} // return warn <<EOM; +E: ignoring objdir=$objdir, unknown extensions.objectFormat=$fmt +EOM + unless ($ALT_FH{$hexlen}) { + require PublicInbox::Import; + my $git_dir = "$TMPDIR/hexlen$hexlen.git"; + PublicInbox::Import::init_bare($git_dir, 'cidx-all', $fmt); + open $ALT_FH{$hexlen}, '>', "$git_dir/objects/info/alternates"; + } + say { $ALT_FH{$hexlen} } $objdir; +} + +sub store_objfmt { # via wq_do - make early cidx users happy + my ($self, $docid, $git_dir, $fmt) = @_; + $self->begin_txn_lazy; + my $doc = $self->get_doc($docid) // return + warn "BUG? #$docid for $git_dir missing"; + my @p = xap_terms('P', $doc) or return + warn "BUG? #$docid for $git_dir has no P(ath)"; + @p == 1 or return warn "BUG? #$docid $git_dir multi: @p"; + $p[0] eq $git_dir or return warn "BUG? #$docid $git_dir != @p"; + $doc->add_boolean_term('H'.$fmt); + $self->{xdb}->replace_document($docid, $doc); + # wait for prune_commit to commit... +} + +# TODO: remove prep_alternate_read and store_objfmt 1-2 years after 2.0 is out +# they are for compatibility with pre-release indices +sub prep_alternate_read { # run_git cb for config extensions.objectFormat + my ($opt, $self, $git, $objdir, $docid, $shard_n, $run_prune) = @_; + return if $DO_QUIT; + my $chld_err = $?; + prep_alternate_start($self, shift(@PRUNEQ), $run_prune) if @PRUNEQ; + my $fmt = check_objfmt_status $git, $chld_err, ${$opt->{1}}; + $IDX_SHARDS[$shard_n]->wq_do('store_objfmt', # async + $docid, $git->{git_dir}, $fmt); + prep_alternate_end $objdir, $fmt; +} + +sub prep_alternate_start { + my ($self, $git, $run_prune) = @_; + local $self->{xdb}; + my ($o, $n, @ids, @fmt); +start: + $o = $git->git_path('objects'); + while (!-d $o) { + $git = shift(@PRUNEQ) // return; + $o = $git->git_path('objects'); + } + $n = git_dir_hash($git->{git_dir}) % scalar(@RDONLY_XDB); + $self->{xdb} = $RDONLY_XDB[$n] // croak("BUG: no shard[$n]"); + @ids = $self->docids_by_postlist('P'.$git->{git_dir}); + @fmt = @ids ? xap_terms('H', $self->{xdb}, $ids[0]) : (); + @fmt > 1 and warn "BUG? multi `H' for shard[$n] #$ids[0]: @fmt"; + + if (@fmt) { # cache hit + prep_alternate_end $o, $fmt[0]; + $git = shift(@PRUNEQ) and goto start; + } else { # compatibility w/ early cidx format + run_git([qw(config extensions.objectFormat)], { quiet => 1 }, + \&prep_alternate_read, $self, $git, $o, $ids[0], $n, + $run_prune); + } +} + +sub cmd_done { # run_await cb for sort, xapian-delve, sed failures + my ($pid, $cmd, undef, undef, $run_on_destroy) = @_; + $? and die "fatal: @$cmd (\$?=$?)\n"; + # $run_on_destroy calls do_join() or run_prune() +} + +sub current_join_data ($) { + my ($self) = @_; + local $self->{xdb} = $RDONLY_XDB[0] // die 'BUG: shard[0] undef'; + # we support multiple PI_CONFIG files for a cindex: + $self->join_data; +} + +# combined previously stored stats with new +sub score_old_join_data ($$$) { + my ($self, $score, $ekeys_new) = @_; + my $old = ($JOIN{reset} ? undef : current_join_data($self)) or return; + progress($self, 'merging old join data...'); + my ($ekeys_old, $roots_old, $ibx2root_old) = + @$old{qw(ekeys roots ibx2root)}; + # score: "ibx_off root_off" => nr + my $i = -1; + my %root2id_new = map { $_ => ++$i } @OFF2ROOT; + $i = -1; + my %ekey2id_new = map { $_ => ++$i } @$ekeys_new; + for my $ibx_off_old (0..$#$ibx2root_old) { + my $root_offs_old = $ibx2root_old->[$ibx_off_old]; + my $ekey = $ekeys_old->[$ibx_off_old] // do { + warn "W: no ibx #$ibx_off_old in old join data\n"; + next; + }; + my $ibx_off_new = $ekey2id_new{$ekey} // do { + warn "W: `$ekey' no longer exists\n"; + next; + }; + for (@$root_offs_old) { + my ($nr, $rid_old) = @$_; + my $root_old = $roots_old->[$rid_old] // do { + warn "W: no root #$rid_old in old data\n"; + next; + }; + my $rid_new = $root2id_new{$root_old} // do { + warn "W: root `$root_old' no longer exists\n"; + next; + }; + $score->{"$ibx_off_new $rid_new"} += $nr; + } + } +} + +sub metadata_set { # via wq_do + my ($self, $key, $val, $commit) = @_; + $self->begin_txn_lazy; + $self->{xdb}->set_metadata($key, $val); + $self->commit_txn_lazy if $commit || defined(wantarray); +} + +# runs once all inboxes and shards are dumped via OnDestroy +sub do_join { + my ($self) = @_; + return if $DO_QUIT; + $XHC = 0; # should not be recreated again + @IDX_SHARDS or return warn("# aborting on no shards\n"); + unlink("$TMPDIR/root2id"); + my @pending = keys %{$self->{PENDING}}; + die "BUG: pending=@pending jobs not done\n" if @pending; + progress($self, 'joining...'); + my @join = (@JOIN, 'to_ibx_off', 'to_root_off'); + if (my $time = which('time')) { unshift @join, $time }; + my $rd = popen_rd(\@join, $CMD_ENV, { -C => "$TMPDIR" }); + my %score; + while (<$rd>) { # PFX ibx_offs root_off + chop eq "\n" or die "no newline from @join: <$_>"; + my (undef, $ibx_offs, @root_offs) = split / /, $_; + for my $ibx_off (split(/,/, $ibx_offs)) { + ++$score{"$ibx_off $_"} for @root_offs; + } + } + $rd->close or die "fatal: @join failed: \$?=$?"; + my $nr = scalar(keys %score) or do { + delete $TODO{joining}; + return progress($self, 'no potential new pairings'); + }; + progress($self, "$nr potential new pairings..."); + my @ekeys = map { $_->eidx_key } @IBX; + score_old_join_data($self, \%score, \@ekeys); + my $new; + while (my ($k, $nr) = each %score) { + my ($ibx_off, $root_off) = split(/ /, $k); + my ($ekey, $root) = ($ekeys[$ibx_off], $OFF2ROOT[$root_off]); + progress($self, "$ekey => $root has $nr matches"); + push @{$new->{ibx2root}->[$ibx_off]}, [ $nr, $root_off ]; + } + for my $ary (values %$new) { # sort by nr (largest first) + for (@$ary) { @$_ = sort { $b->[0] <=> $a->[0] } @$_ } + } + $new->{ekeys} = \@ekeys; + $new->{roots} = \@OFF2ROOT; + $new->{dt} = \@JOIN_DT; + $new = compress(PublicInbox::Config::json()->encode($new)); + my $key = $self->join_data_key; + my $wait = $IDX_SHARDS[0]->wq_do('metadata_set', $key, $new); + delete $TODO{joining}; +} + +sub require_progs { + my $op = shift; + while (my ($x, $argv) = splice(@_, 0, 2)) { + my $e = $x; + $e =~ tr/a-z-/A-Z_/; + my $c = $ENV{$e} // $x; + $argv->[0] //= which($c) // die "E: `$x' required for --$op\n"; + } +} + +sub init_join_postfork ($) { + my ($self) = @_; + return unless $self->{-opt}->{join}; + require_progs('join', join => \@JOIN); + my $d2 = '([0-9]{2})'; + my $dt_re = qr!([0-9]{4})$d2$d2$d2$d2$d2!; + if (my $cur = $JOIN{reset} ? undef : current_join_data($self)) { + if (($cur->{dt}->[1] // '') =~ m!\A$dt_re\z!o) { + my ($Y, $m, $d, $H, $M, $S) = ($1, $2, $3, $4, $5, $6); + my $t = timegm($S, $M, $H, $d, $m - 1, $Y); + $t = strftime('%Y%m%d%H%M%S', gmtime($t + 1)); + $JOIN{dt} //= "$t.."; + } else { + warn <<EOM; +BUG?: previous --join invocation did not store usable `dt' key +EOM + } + } + if ($JOIN{aggressive}) { + $JOIN{window} //= -1; + $JOIN{dt} //= '..1.month.ago'; + } + $QRY_STR = $JOIN{dt} // '1.year.ago..'; + index($QRY_STR, '..') >= 0 or die "E: dt:$QRY_STR is not a range\n"; + # Account for send->apply delay (torvalds/linux.git mean is ~20 days + # from Author to CommitDate in cases where CommitDate > AuthorDate + $QRY_STR .= '1.month.ago' if $QRY_STR =~ /\.\.\z/; + @{$self->{git_dirs} // []} or die "E: no coderepos to join\n"; + @IBX or die "E: no inboxes to join\n"; + my $approx_git = PublicInbox::Git->new($self->{git_dirs}->[0]); # ugh + substr($QRY_STR, 0, 0) = 'dt:'; + $self->query_approxidate($approx_git, $QRY_STR); # in-place + ($JOIN_DT[1]) = ($QRY_STR =~ /\.\.([0-9]{14})\z/); # YYYYmmddHHMMSS + ($JOIN_DT[0]) = ($QRY_STR =~ /\Adt:([0-9]{14})/); # YYYYmmddHHMMSS + $JOIN_DT[0] //= '19700101'.'000000'; # git uses unsigned times + $TODO{do_join} = on_destroy \&do_join, $self; + $TODO{joining} = 1; # keep shards_active() happy + $TODO{dump_ibx_start} = on_destroy \&dump_ibx_start, + $self, $TODO{do_join}; + $TODO{dump_roots_start} = on_destroy \&dump_roots_start, + $self, $TODO{do_join}; + progress($self, "will join in $QRY_STR date range..."); + my $id = -1; + @IBXQ = map { ++$id } @IBX; +} + +sub init_prune ($) { + my ($self) = @_; + return (@$PRUNE_DONE = map { 1 } @IDX_SHARDS) if !$self->{-opt}->{prune}; + + # Dealing with millions of commits here at once, so use faster tools. + # xapian-delve is nearly an order-of-magnitude faster than Xapian Perl + # bindings. sed/awk are faster than Perl for simple stream ops, and + # sort+comm are more memory-efficient with gigantic lists. + # pipeline: delve | sed | sort >indexed_commits + my @delve = (undef, qw(-A Q -1)); + my @sed = (undef, '-ne', 's/^Q//p'); + @COMM = (undef, qw(-2 -3 indexed_commits -)); + @AWK = (undef, '$2 == "commit" { print $1 }'); # --batch-check output + require_progs('prune', 'xapian-delve' => \@delve, sed => \@sed, + comm => \@COMM, awk => \@AWK); + for (0..$#IDX_SHARDS) { push @delve, "$self->{xpfx}/$_" } + my $run_prune = on_destroy \&run_prune, $self, $TODO{dump_roots_start}; + my ($sort_opt, $sed_opt, $delve_opt); + pipe(local $sed_opt->{0}, local $delve_opt->{1}); + pipe(local $sort_opt->{0}, local $sed_opt->{1}); + open($sort_opt->{1}, '+>', "$TMPDIR/indexed_commits"); + run_await([@SORT, '-u'], $CMD_ENV, $sort_opt, \&cmd_done, $run_prune); + run_await(\@sed, $CMD_ENV, $sed_opt, \&cmd_done, $run_prune); + run_await(\@delve, undef, $delve_opt, \&cmd_done, $run_prune); + @PRUNEQ = @$SCANQ; + for (1..$LIVE_JOBS) { + prep_alternate_start($self, shift(@PRUNEQ) // last, $run_prune); + } +} + +sub dump_git_commits { # run_await cb + my ($pid, $cmd, undef, $batch_opt, $self) = @_; + (defined($pid) && $?) and die "E: @$cmd \$?=$?"; + return if $DO_QUIT; + my ($hexlen) = keys(%ALT_FH) or return; # done, DESTROY batch_opt->{1} + close(delete $ALT_FH{$hexlen}); # flushes `say' buffer + progress($self, "preparing $hexlen-byte hex OID commits for prune..."); + my $g = PublicInbox::Git->new("$TMPDIR/hexlen$hexlen.git"); + run_await($g->cmd(@PRUNE_BATCH), undef, $batch_opt, + \&dump_git_commits, $self); +} + +sub run_prune { # OnDestroy when `git config extensions.objectFormat' are done + my ($self, $drs) = @_; + return if $DO_QUIT; + # setup the following pipeline: ( + # git --git-dir=hexlen40.git cat-file \ + # --batch-all-objects --batch-check && + # git --git-dir=hexlen64.git cat-file \ + # --batch-all-objects --batch-check + # ) | awk | sort | comm | cidx_read_comm() + my ($awk_opt, $sort_opt, $batch_opt); + my $comm_opt = { -C => "$TMPDIR" }; + pipe(local $awk_opt->{0}, $batch_opt->{1}); + pipe(local $sort_opt->{0}, local $awk_opt->{1}); + pipe(local $comm_opt->{0}, local $sort_opt->{1}); + run_await(\@AWK, $CMD_ENV, $awk_opt, \&cmd_done); + run_await([@SORT, '-u'], $CMD_ENV, $sort_opt, \&cmd_done); + my $comm_rd = popen_rd(\@COMM, $CMD_ENV, $comm_opt, \&cmd_done, \@COMM); + PublicInbox::CidxComm->new($comm_rd, $self, $drs); # ->cidx_read_comm + push @PRUNE_BATCH, '--buffer' if $GIT_VER ge v2.6; + + # Yes, we pipe --unordered git output to sort(1) because sorting + # inside git leads to orders-of-magnitude slowdowns on rotational + # storage. GNU sort(1) also works well on larger-than-memory + # datasets, and it's not worth eliding sort(1) for old git. + push @PRUNE_BATCH, '--unordered' if $GIT_VER ge v2.19; + warn(sprintf(<<EOM, $GIT_VER)) if $GIT_VER lt v2.19; +W: git v2.19+ recommended for high-latency storage (have git v%vd) +EOM + dump_git_commits(undef, undef, undef, $batch_opt, $self); +} + +sub cidx_read_comm { # via PublicInbox::CidxComm::event_step + my ($self, $comm_rd, $drs) = @_; + return if $DO_QUIT; + progress($self, 'starting prune...'); + $_->wq_do('prune_init') for @IDX_SHARDS; + while (defined(my $cmt = <$comm_rd>)) { + chop($cmt) eq "\n" or die "BUG: no LF in comm output ($cmt)"; + my $n = hex(substr($cmt, 0, 8)) % scalar(@IDX_SHARDS); + $IDX_SHARDS[$n]->wq_do('prune_one', 'Q'.$cmt); + last if $DO_QUIT; + } + for my $git_dir (@GIT_DIR_GONE) { + my $n = git_dir_hash($git_dir) % scalar(@IDX_SHARDS); + $IDX_SHARDS[$n]->wq_do('prune_one', 'P'.$git_dir); + last if $DO_QUIT; + } + my ($c, $p) = PublicInbox::PktOp->pair; + $c->{ops}->{prune_done} = [ $self, $drs ]; + $_->wq_io_do('prune_commit', [ $p->{op_p} ]) for @IDX_SHARDS; +} + +sub init_join_prefork ($) { + my ($self) = @_; + my $subopt = $self->{-opt}->{join} // return; + %JOIN = map { + my ($k, $v) = split /:/, $_, 2; + $k => $v // 1; + } split(/,/, join(',', @$subopt)); + require PublicInbox::CidxXapHelperAux; + require PublicInbox::XapClient; + my @unknown; + my $pfx = $JOIN{prefixes} // 'dfpost7'; + for my $p (split /\+/, $pfx) { + my $n = ''; + $p =~ s/([0-9]+)\z// and $n = $1; + my $v = $PublicInbox::Search::PATCH_BOOL_COMMON{$p} // + push(@unknown, $p); + push(@JOIN_PFX, map { $_.$n } split(/ /, $v)); + } + @unknown and die <<EOM; +E: --join=prefixes= contains unsupported prefixes: @unknown +EOM + @JOIN_PFX = uniqstr @JOIN_PFX; + my %incl = map { + if (-f "$_/inbox.lock" || -d "$_/public-inbox") { + rel2abs_collapsed($_) => undef; + } else { + warn "W: `$_' is not a public inbox, skipping\n"; + (); + } + } (@{$self->{-opt}->{include} // []}); + my $all = $self->{-opt}->{all}; + if (my $only = $self->{-opt}->{only}) { + die <<'' if $all; +E: --all is incompatible with --only + + $incl{rel2abs_collapsed($_)} = undef for @$only; + } + unless (keys(%incl)) { + $all = 1; + warn <<EOM unless $self->{opt}->{quiet}; +# --all implied since no inboxes were specified with --only or --include +EOM + } + $self->{-opt}->{-pi_cfg}->each_inbox(\&_prep_ibx, $self, \%incl, $all); + my $nr = scalar(@IBX) or die "E: no inboxes to join with\n"; + progress($self, "will join with $nr inboxes in ", + $self->{-opt}->{-pi_cfg}->{-f}, " using: $pfx"); +} + +sub _prep_ibx { # each_inbox callback + my ($ibx, $self, $incl, $all) = @_; + ($all || exists($incl->{$ibx->{inboxdir}})) and push @IBX, $ibx; +} + +sub show_json { # for diagnostics (unstable output) + my ($self) = @_; + my $s = $self->{-opt}->{show} or return; # for diagnostics + local $self->{xdb}; + my %ret; + my @todo = @$s; + while (defined(my $f = shift @todo)) { + if ($f =~ /\A(?:roots2paths|paths2roots|join_data)\z/) { + $ret{$f} = $self->$f; + } elsif ($f eq '') { # default --show (no args) + push @todo, qw(roots2paths join_data); + } else { + warn "E: cannot show `$f'\n"; + } + } + my $json = ref(PublicInbox::Config::json())->new; + $json->utf8->canonical->pretty; # n.b. FS pathnames may not be UTF-8... + say $json->encode(\%ret); +} + +sub do_inits { # called via PublicInbox::DS::add_timer + my ($self) = @_; + grep !!$_, @{$self->{-opt}}{qw(scan prune)} and + @$SCANQ = map PublicInbox::Git->new($_), @{$self->{git_dirs}}; + init_join_postfork $self; + init_prune $self; + scan_git_dirs $self; + my $max = $TODO{do_join} ? max($LIVE_JOBS, $NPROC) : $LIVE_JOBS; + index_next($self) for (1..$max); +} + +sub cidx_run { # main entry point + my ($self) = @_; + my $restore_umask = prep_umask($self); + local $SIGSET = PublicInbox::DS::block_signals( + POSIX::SIGTSTP, POSIX::SIGCONT); + my $restore = on_destroy \&PublicInbox::DS::sig_setmask, $SIGSET; + local $PRUNE_DONE = []; + local $IDXQ = []; + local $SCANQ = []; + local ($DO_QUIT, $REINDEX, $TXN_BYTES, @GIT_DIR_GONE, @PRUNEQ, + $REPO_CTX, %ALT_FH, $TMPDIR, @AWK, @COMM, $CMD_ENV, + %TODO, @IBXQ, @IBX, @JOIN, %JOIN, @JOIN_PFX, @NO_ABBREV, + @JOIN_DT, $DUMP_IBX_WPIPE, @OFF2ROOT, $XHC, @SORT, $GITS_NR); + local $BATCH_BYTES = $self->{-opt}->{batch_size} // + $PublicInbox::SearchIdx::BATCH_BYTES; + local $MAX_SIZE = $self->{-opt}->{max_size}; + local $self->{PENDING} = {}; # used by PublicInbox::CidxXapHelperAux + my $cfg = $self->{-opt}->{-pi_cfg} // die 'BUG: -pi_cfg unset'; + $self->{-cfg_f} = $cfg->{-f} = rel2abs_collapsed($cfg->{-f}); + local $GIT_VER = PublicInbox::Git::git_version(); + @NO_ABBREV = ('-c', 'core.abbrev='.($GIT_VER lt v2.31.0 ? 40 : 'no')); + if (grep { $_ } @{$self->{-opt}}{qw(prune join)}) { + require File::Temp; + $TMPDIR = File::Temp->newdir('cidx-all-git-XXXX', TMPDIR => 1); + $CMD_ENV = { TMPDIR => "$TMPDIR", LC_ALL => 'C', LANG => 'C' }; + require_progs('(prune|join)', sort => \@SORT); + for (qw(parallel compress-program buffer-size)) { # GNU sort + my $v = $self->{-opt}->{"sort-$_"}; + push @SORT, "--$_=$v" if defined $v; + } + ($self->{-opt}->{prune} && $GIT_VER le v2.6) and + die "W: --prune requires git v2.6+\n"; + init_join_prefork($self) + } + local @IDX_SHARDS = cidx_init($self); # forks workers + local $ANY_SHARD = -1; + local $self->{current_info} = ''; + local $MY_SIG = { + CHLD => \&PublicInbox::DS::enqueue_reap, + USR1 => \&kill_shards, + }; + local @PRUNE_BATCH = @PRUNE_BATCH; + $MY_SIG->{$_} = \&parent_quit for qw(TERM QUIT INT); + my $cb = $SIG{__WARN__} || \&CORE::warn; + local $SIG{__WARN__} = sub { + my $m = shift @_; + $self->{current_info} eq '' or + $m =~ s/\A(#?\s*)/$1$self->{current_info}: /; + $cb->($m, @_); + }; + load_existing($self) unless $self->{-cidx_internal}; + if ($self->{-opt}->{reindex}) { + require PublicInbox::SharedKV; + $REINDEX = PublicInbox::SharedKV->new; + delete $REINDEX->{lock_path}; + $REINDEX->dbh; + } + my @nc = grep { File::Spec->canonpath($_) ne $_ } @{$self->{git_dirs}}; + if (@nc) { + warn "E: BUG? paths in $self->{cidx_dir} not canonicalized:\n"; + for my $d (@{$self->{git_dirs}}) { + my $c = File::Spec->canonpath($_); + warn "E: $d => $c\n"; + $d = $c; + } + warn "E: canonicalized and attempting to continue\n"; + } + if (defined(my $excl = $self->{-opt}->{exclude})) { + my $re = '(?:'.join('\\z|', map { + glob2re($_) // qr/\A\Q$_\E/ + } @$excl).'\\z)'; + my @excl; + @{$self->{git_dirs}} = grep { + $_ =~ /$re/ ? (push(@excl, $_), 0) : 1; + } @{$self->{git_dirs}}; + warn("# excluding $_\n") for @excl; + @GIT_DIR_GONE = uniqstr @GIT_DIR_GONE, @excl; + } + local $NCHANGE = 0; + local $NPROC = PublicInbox::IPC::detect_nproc(); + local $LIVE_JOBS = $self->{-opt}->{jobs} || $NPROC || 2; + local @RDONLY_XDB = $self->xdb_shards_flat; + PublicInbox::DS::add_timer(0, \&do_inits, $self); + + # FreeBSD ignores/discards SIGCHLD while signals are blocked and + # EVFILT_SIGNAL is inactive, so we pretend we have a SIGCHLD pending + PublicInbox::DS::enqueue_reap(); + + local @PublicInbox::DS::post_loop_do = (\&shards_active); + PublicInbox::DS::event_loop($MY_SIG, $SIGSET); + $self->lock_release(!!$NCHANGE); + show_json($self); +} + +sub ipc_atfork_child { # @IDX_SHARDS + my ($self) = @_; + $self->SUPER::ipc_atfork_child; + $SIG{USR1} = \&shard_usr1; + $SIG{$_} = \&shard_quit for qw(INT TERM QUIT); + my $x = delete $self->{siblings} // die 'BUG: no {siblings}'; + $_->wq_close for @$x; + undef; +} + +sub shard_done_wait { # awaitpid cb via ipc_worker_reap + my ($pid, $shard, $self) = @_; + my $quit_req = delete($shard->{-cidx_quit}); + return if $DO_QUIT; + if ($? == 0) { # success + $quit_req // warn 'BUG: {-cidx_quit} unset'; + } else { + warn "PID:$pid $shard->{shard} exited with \$?=$?\n"; + ++$self->{shard_err} if defined($self->{shard_err}); + } +} + +1; diff --git a/lib/PublicInbox/Compat.pm b/lib/PublicInbox/Compat.pm new file mode 100644 index 00000000..78cba90e --- /dev/null +++ b/lib/PublicInbox/Compat.pm @@ -0,0 +1,24 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# compatibility code for old Perl and standard modules, mainly +# List::Util but maybe other stuff +package PublicInbox::Compat; +use v5.12; +use parent qw(Exporter); +require List::Util; + +our @EXPORT_OK = qw(uniqstr); + +# uniqstr is in List::Util 1.45+, which means Perl 5.26+; +# so maybe 2030 for us since we need to support enterprise distros. +# We can use uniqstr everywhere in our codebase and don't need +# to account for special cases of `uniqnum' nor `uniq' in List::Util +# even if they make more sense in some contexts +no warnings 'once'; +*uniqstr = List::Util->can('uniqstr') // sub (@) { + my %seen; + grep { !$seen{$_}++ } @_; +}; + +1; diff --git a/lib/PublicInbox/CompressNoop.pm b/lib/PublicInbox/CompressNoop.pm index fe73c2d1..5135299f 100644 --- a/lib/PublicInbox/CompressNoop.pm +++ b/lib/PublicInbox/CompressNoop.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Provide the same methods as Compress::Raw::Zlib::Deflate but @@ -10,7 +10,7 @@ use Compress::Raw::Zlib qw(Z_OK); sub new { bless \(my $self), __PACKAGE__ } sub deflate { # ($self, $input, $output) - $_[2] .= $_[1]; + $_[2] .= ref($_[1]) ? ${$_[1]} : $_[1]; Z_OK; } diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index d57c361a..49659a2e 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Used throughout the project for reading configuration @@ -10,33 +10,38 @@ package PublicInbox::Config; use strict; use v5.10.1; +use parent qw(Exporter); +our @EXPORT_OK = qw(glob2re rel2abs_collapsed); use PublicInbox::Inbox; -use PublicInbox::Spawn qw(popen_rd); +use PublicInbox::Git qw(git_exe); +use PublicInbox::Spawn qw(popen_rd run_qx); +our $LD_PRELOAD = $ENV{LD_PRELOAD}; # only valid at startup +our $DEDUPE; # set to {} to dedupe or clear cache sub _array ($) { ref($_[0]) eq 'ARRAY' ? $_[0] : [ $_[0] ] } # returns key-value pairs of config directives in a hash # if keys may be multi-value, the value is an array ref containing all values sub new { - my ($class, $file) = @_; + my ($class, $file, $lei) = @_; $file //= default_file(); - my $self; - if (ref($file) eq 'SCALAR') { # used by some tests - open my $fh, '<', $file or die; # PerlIO::scalar - $self = config_fh_parse($fh, "\n", '='); - } else { - $self = git_config_dump($file); + my ($self, $set_dedupe); + if (-f $file && $DEDUPE) { + $file = rel2abs_collapsed($file); + $self = $DEDUPE->{$file} and return $self; + $set_dedupe = 1; } - bless $self, $class; + $self = git_config_dump($class, $file, $lei); + $self->{-f} = $file; # caches $self->{-by_addr} = {}; $self->{-by_list_id} = {}; $self->{-by_name} = {}; $self->{-by_newsgroup} = {}; + $self->{-by_eidx_key} = {}; $self->{-no_obfuscate} = {}; $self->{-limiters} = {}; - $self->{-code_repos} = {}; # nick => PublicInbox::Git object - $self->{-cgitrc_unparsed} = $self->{'publicinbox.cgitrc'}; + $self->{-coderepos} = {}; # nick => PublicInbox::Git object if (my $no = delete $self->{'publicinbox.noobfuscate'}) { $no = _array($no); @@ -59,7 +64,7 @@ sub new { if (my $css = delete $self->{'publicinbox.css'}) { $self->{css} = _array($css); } - + $DEDUPE->{$file} = $self if $set_dedupe; $self; } @@ -86,9 +91,22 @@ sub lookup_list_id { sub lookup_name ($$) { my ($self, $name) = @_; - $self->{-by_name}->{$name} // _fill($self, "publicinbox.$name"); + $self->{-by_name}->{$name} // _fill_ibx($self, $name); +} + +sub lookup_ei { + my ($self, $name) = @_; + $self->{-ei_by_name}->{$name} //= _fill_ei($self, $name); +} + +sub lookup_eidx_key { + my ($self, $eidx_key) = @_; + _lookup_fill($self, '-by_eidx_key', $eidx_key); } +# special case for [extindex "all"] +sub ALL { lookup_ei($_[0], 'all') } + sub each_inbox { my ($self, $cb, @arg) = @_; # may auto-vivify if config file is non-existent: @@ -107,9 +125,9 @@ sub lookup_newsgroup { sub limiter { my ($self, $name) = @_; $self->{-limiters}->{$name} //= do { - require PublicInbox::Qspawn; + require PublicInbox::Limiter; my $max = $self->{"publicinboxlimiter.$name.max"} || 1; - my $limiter = PublicInbox::Qspawn::Limiter->new($max); + my $limiter = PublicInbox::Limiter->new($max); $limiter->setup_rlimit($name, $self); $limiter; }; @@ -123,20 +141,19 @@ sub default_file { sub config_fh_parse ($$$) { my ($fh, $rs, $fs) = @_; - my %rv; - my (%section_seen, @section_order); + my (%rv, %seen, @section_order, $line, $k, $v, $section, $cur, $i); local $/ = $rs; - while (defined(my $line = <$fh>)) { - chomp $line; - my ($k, $v) = split($fs, $line, 2); - my ($section) = ($k =~ /\A(\S+)\.[^\.]+\z/); - unless (defined $section_seen{$section}) { - $section_seen{$section} = 1; - push @section_order, $section; - } - - my $cur = $rv{$k}; - if (defined $cur) { + while (defined($line = <$fh>)) { # perf critical with giant configs + $i = index($line, $fs); + # $i may be -1 if $fs not found and it's a key-only entry + # (meaning boolean true). Either way the -1 will drop the + # $rs either from $k or $v. + $k = substr($line, 0, $i); + $v = $i >= 0 ? substr($line, $i + 1, -1) : 1; + $section = substr($k, 0, rindex($k, '.')); + $seen{$section} //= push(@section_order, $section); + + if (defined($cur = $rv{$k})) { if (ref($cur) eq "ARRAY") { push @$cur, $v; } else { @@ -151,19 +168,39 @@ sub config_fh_parse ($$$) { \%rv; } +sub tmp_cmd_opt ($$) { + my ($env, $opt) = @_; + # quiet global and system gitconfig if supported by installed git, + # but normally harmless if too noisy (NOGLOBAL no longer exists) + $env->{GIT_CONFIG_NOSYSTEM} = 1; + $env->{GIT_CONFIG_GLOBAL} = '/dev/null'; # git v2.32+ + $opt->{-C} = '/'; # avoid $worktree/.git/config on MOST systems :P +} + sub git_config_dump { - my ($file) = @_; - return {} unless -e $file; - my @cmd = (qw/git config -z -l --includes/, "--file=$file"); - my $cmd = join(' ', @cmd); - my $fh = popen_rd(\@cmd); + my ($class, $file, $lei) = @_; + my @opt_c = map { ('-c', $_) } @{$lei->{opt}->{c} // []}; + $file = undef if !-e $file; + # XXX should we set {-f} if !-e $file? + return bless {}, $class if (!@opt_c && !defined($file)); + my %env; + my $opt = { 2 => $lei->{2} // 2 }; + if (@opt_c) { + unshift(@opt_c, '-c', "include.path=$file") if defined($file); + tmp_cmd_opt(\%env, $opt); + } + my @cmd = (git_exe, @opt_c, qw(config -z -l --includes)); + push(@cmd, '-f', $file) if !@opt_c && defined($file); + my $fh = popen_rd(\@cmd, \%env, $opt); my $rv = config_fh_parse($fh, "\0", "\n"); - close $fh or die "failed to close ($cmd) pipe: $?"; - $rv; + $fh->close or die "@cmd failed: \$?=$?\n"; + $rv->{-opt_c} = \@opt_c if @opt_c; # for ->urlmatch + $rv->{-f} = $file; + bless $rv, $class; } -sub valid_inbox_name ($) { - my ($name) = @_; +sub valid_foo_name ($;$) { + my ($name, $pfx) = @_; # Similar rules found in git.git/remote.c::valid_remote_nick # and git.git/refs.c::check_refname_component @@ -171,6 +208,7 @@ sub valid_inbox_name ($) { if ($name eq '' || $name =~ /\@\{/ || $name =~ /\.\./ || $name =~ m![/:\?\[\]\^~\s\f[:cntrl:]\*]! || $name =~ /\A\./ || $name =~ /\.\z/) { + warn "invalid $pfx name: `$name'\n" if $pfx; return 0; } @@ -208,7 +246,6 @@ sub cgit_repo_merge ($$$) { $rel =~ s!/?\.git\z!!; } $self->{"coderepo.$rel.dir"} //= $path; - $self->{"coderepo.$rel.cgiturl"} //= _array($rel); } sub is_git_dir ($) { @@ -244,10 +281,11 @@ sub scan_tree_coderepo ($$) { scan_path_coderepo($self, $path, $path); } -sub scan_projects_coderepo ($$$) { - my ($self, $list, $path) = @_; - open my $fh, '<', $list or do { - warn "failed to open cgit projectlist=$list: $!\n"; +sub scan_projects_coderepo ($$) { + my ($self, $path) = @_; + my $l = $self->{-cgit_project_list} // die 'BUG: no cgit_project_list'; + open my $fh, '<', $l or do { + warn "failed to open cgit project-list=$l: $!\n"; return; }; while (<$fh>) { @@ -256,8 +294,20 @@ sub scan_projects_coderepo ($$$) { } } +sub apply_cgit_scan_path { + my ($self, @paths) = @_; + @paths or @paths = @{$self->{-cgit_scan_path}}; + if (defined($self->{-cgit_project_list})) { + for my $p (@paths) { scan_projects_coderepo($self, $p) } + } else { + for my $p (@paths) { scan_tree_coderepo($self, $p) } + } +} + sub parse_cgitrc { my ($self, $cgitrc, $nesting) = @_; + $cgitrc //= $self->{'publicinbox.cgitrc'} // + $ENV{CGIT_CONFIG} // return; if ($nesting == 0) { # defaults: my %s = map { $_ => 1 } qw(/cgit.css /cgit.png @@ -297,58 +347,50 @@ sub parse_cgitrc { my ($k, $v) = ($1, $2); $k =~ tr/-/_/; $self->{"-cgit_$k"} = $v; + delete $self->{-cgit_scan_path} if $k eq 'project_list'; } elsif (m!\Ascan-path=(.+)\z!) { - if (defined(my $list = $self->{-cgit_project_list})) { - scan_projects_coderepo($self, $list, $1); - } else { - scan_tree_coderepo($self, $1); - } + # this depends on being after project-list in the + # config file, just like cgit.c + push @{$self->{-cgit_scan_path}}, $1; + apply_cgit_scan_path($self, $1); } elsif (m!\A(?:css|favicon|logo|repo\.logo)=(/.+)\z!) { # absolute paths for static files via PublicInbox::Cgit $self->{-cgit_static}->{$1} = 1; + } elsif (s!\Asnapshots=\s*!!) { + $self->{'coderepo.snapshots'} = $_; } } cgit_repo_merge($self, $repo->{dir}, $repo) if $repo; } -# parse a code repo -# Only git is supported at the moment, but SVN and Hg are possibilities -sub _fill_code_repo { - my ($self, $nick) = @_; - my $pfx = "coderepo.$nick"; - - # TODO: support gitweb and other repository viewers? - if (defined(my $cgitrc = delete $self->{-cgitrc_unparsed})) { - parse_cgitrc($self, $cgitrc, 0); - } - my $dir = $self->{"$pfx.dir"}; # aka "GIT_DIR" - unless (defined $dir) { - warn "$pfx.dir unset\n"; - return; - } - - my $git = PublicInbox::Git->new($dir); - foreach my $t (qw(blob commit tree tag)) { - $git->{$t.'_url_format'} = - _array($self->{lc("$pfx.${t}UrlFormat")}); +sub valid_dir ($$) { + my $dir = get_1($_[0], $_[1]) // return; + index($dir, "\n") < 0 ? $dir : do { + warn "E: `$_[1]=$dir' must not contain `\\n'\n"; + undef; } +} +# parse a code repo, only git is supported at the moment +sub fill_coderepo { + my ($self, $nick) = @_; + my $pfx = "coderepo.$nick"; + my $git = PublicInbox::Git->new(valid_dir($self, "$pfx.dir") // return); if (defined(my $cgits = $self->{"$pfx.cgiturl"})) { $git->{cgit_url} = $cgits = _array($cgits); $self->{"$pfx.cgiturl"} = $cgits; - - # cgit supports "/blob/?id=%s", but it's only a plain-text - # display and requires an unabbreviated id= - foreach my $t (qw(blob commit tag)) { - $git->{$t.'_url_format'} //= map { - "$_/$t/?id=%s" - } @$cgits; - } } - + my %dedupe = ($nick => undef); + ($git->{nick}) = keys %dedupe; $git; } +sub get_all { + my ($self, $key) = @_; + my $v = $self->{$key} // return; + _array($v); +} + sub git_bool { my ($val) = $_[-1]; # $_[0] may be $self, or $val if ($val =~ /\A(?:false|no|off|[\-\+]?(?:0x)?0+)\z/i) { @@ -360,37 +402,75 @@ sub git_bool { } } -sub _fill { - my ($self, $pfx) = @_; - my $ibx = {}; +# abs_path resolves symlinks, so we want to avoid it if rel2abs +# is sufficient and doesn't leave "/.." or "/../" +sub rel2abs_collapsed { + require File::Spec; + my $p = File::Spec->rel2abs(@_); + return $p if substr($p, -3, 3) ne '/..' && index($p, '/../') < 0; + require Cwd; + Cwd::abs_path($p); +} - for my $k (qw(watch nntpserver)) { - my $v = $self->{"$pfx.$k"}; - $ibx->{$k} = $v if defined $v; - } - for my $k (qw(filter inboxdir newsgroup replyto httpbackendmax feedmax - indexlevel indexsequentialshard)) { - if (defined(my $v = $self->{"$pfx.$k"})) { - if (ref($v) eq 'ARRAY') { - warn <<EOF; -W: $pfx.$k has multiple values, only using `$v->[-1]' -EOF - $ibx->{$k} = $v->[-1]; - } else { - $ibx->{$k} = $v; +sub get_1 { + my ($self, $key) = @_; + my $v = $self->{$key}; + return $v if !ref($v); + warn "W: $key has multiple values, only using `$v->[-1]'\n"; + $v->[-1]; +} + +sub repo_objs { + my ($self, $ibxish) = @_; + $ibxish->{-repo_objs} // do { + my $ibx_coderepos = $ibxish->{coderepo} // return; + parse_cgitrc($self, undef, 0); + my $coderepos = $self->{-coderepos}; + my @repo_objs; + for my $nick (@$ibx_coderepos) { + my @parts = split(m!/!, $nick); + for (@parts) { + @parts = () unless valid_foo_name($_); + } + unless (@parts) { + warn "invalid coderepo name: `$nick'\n"; + next; } + my $repo = $coderepos->{$nick} //= + fill_coderepo($self, $nick); + $repo ? push(@repo_objs, $repo) : + warn("coderepo.$nick.dir unset\n"); + } + if (scalar @repo_objs) { + for (@repo_objs) { + push @{$_->{ibx_names}}, $ibxish->{name}; + } + $ibxish->{-repo_objs} = \@repo_objs; + } else { + delete $ibxish->{coderepo}; } } +} - # backwards compatibility: - $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"}; - if (($ibx->{inboxdir} // '') =~ /\n/s) { - warn "E: `$ibx->{inboxdir}' must not contain `\\n'\n"; - return; - } - foreach my $k (qw(obfuscate)) { +sub _fill_ibx { + my ($self, $name) = @_; + my $pfx = "publicinbox.$name"; + my $ibx = {}; + for my $k (qw(watch)) { my $v = $self->{"$pfx.$k"}; - defined $v or next; + $ibx->{$k} = $v if defined $v; + } + for my $k (qw(filter newsgroup replyto httpbackendmax feedmax + indexlevel indexsequentialshard boost)) { + my $v = get_1($self, "$pfx.$k") // next; + $ibx->{$k} = $v; + } + + # "mainrepo" is backwards compatibility: + my $dir = $ibx->{inboxdir} = valid_dir($self, "$pfx.inboxdir") // + valid_dir($self, "$pfx.mainrepo") // return; + for my $k (qw(obfuscate)) { + my $v = $self->{"$pfx.$k"} // next; if (defined(my $bval = git_bool($v))) { $ibx->{$k} = $bval; } else { @@ -399,24 +479,18 @@ EOF } # TODO: more arrays, we should support multi-value for # more things to encourage decentralization - foreach my $k (qw(address altid nntpmirror coderepo hide listid url - infourl watchheader)) { - if (defined(my $v = $self->{"$pfx.$k"})) { - $ibx->{$k} = _array($v); - } + for my $k (qw(address altid nntpmirror imapmirror + coderepo hide listid url + infourl watchheader + nntpserver imapserver pop3server)) { + my $v = $self->{"$pfx.$k"} // next; + $ibx->{$k} = _array($v); } - return unless defined($ibx->{inboxdir}); - my $name = $pfx; - $name =~ s/\Apublicinbox\.//; - - if (!valid_inbox_name($name)) { - warn "invalid inbox name: '$name'\n"; - return; - } - - $ibx->{name} = $name; - $ibx->{-pi_config} = $self; + return unless valid_foo_name($name, 'publicinbox'); + my %dedupe = ($name => undef); + ($ibx->{name}) = keys %dedupe; # used as a key everywhere + $ibx->{-pi_cfg} = $self; $ibx = PublicInbox::Inbox->new($ibx); foreach (@{$ibx->{address}}) { my $lc_addr = lc($_); @@ -429,8 +503,38 @@ EOF $self->{-by_list_id}->{lc($list_id)} = $ibx; } } - if (my $ng = $ibx->{newsgroup}) { - $self->{-by_newsgroup}->{$ng} = $ibx; + if (defined(my $ngname = $ibx->{newsgroup})) { + if (ref($ngname)) { + delete $ibx->{newsgroup}; + warn 'multiple newsgroups not supported: '. + join(', ', @$ngname). "\n"; + # Newsgroup name needs to be compatible with RFC 3977 + # wildmat-exact and RFC 3501 (IMAP) ATOM-CHAR. + # Leave out a few chars likely to cause problems or conflicts: + # '|', '<', '>', ';', '#', '$', '&', + } elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]! || + $ngname eq '') { + delete $ibx->{newsgroup}; + warn "newsgroup name invalid: `$ngname'\n"; + } else { + my $lc = $ibx->{newsgroup} = lc $ngname; + warn <<EOM if $lc ne $ngname; +W: newsgroup=`$ngname' lowercased to `$lc' +EOM + # PublicInbox::NNTPD does stricter ->nntp_usable + # checks, keep this lean for startup speed + my $cur = $self->{-by_newsgroup}->{$lc} //= $ibx; + warn <<EOM if $cur != $ibx; +W: newsgroup=`$lc' is used by both `$cur->{name}' and `$ibx->{name}' +EOM + } + } + unless (defined $ibx->{newsgroup}) { # for ->eidx_key + my $abs = rel2abs_collapsed($dir); + if ($abs ne $dir) { + warn "W: `$dir' canonicalized to `$abs'\n"; + $ibx->{inboxdir} = $abs; + } } $self->{-by_name}->{$name} = $ibx; if ($ibx->{obfuscate}) { @@ -438,42 +542,171 @@ EOF $ibx->{-no_obfuscate_re} = $self->{-no_obfuscate_re}; fill_all($self); # noop to populate -no_obfuscate } + if (my $es = ALL($self)) { + require PublicInbox::Isearch; + $ibx->{isrch} = PublicInbox::Isearch->new($ibx, $es); + } + my $cur = $self->{-by_eidx_key}->{my $ekey = $ibx->eidx_key} //= $ibx; + $cur == $ibx or warn + "W: `$ekey' used by both `$cur->{name}' and `$ibx->{name}'\n"; + $ibx; +} - if (my $ibx_code_repos = $ibx->{coderepo}) { - my $code_repos = $self->{-code_repos}; - my $repo_objs = $ibx->{-repo_objs} = []; - foreach my $nick (@$ibx_code_repos) { - my @parts = split(m!/!, $nick); - my $valid = 0; - $valid += valid_inbox_name($_) foreach (@parts); - $valid == scalar(@parts) or next; +sub _fill_ei ($$) { + my ($self, $name) = @_; + eval { require PublicInbox::ExtSearch } or return; + my $pfx = "extindex.$name"; + my $d = valid_dir($self, "$pfx.topdir") // return; + -d $d or return; + my $es = PublicInbox::ExtSearch->new($d); + for my $k (qw(indexlevel indexsequentialshard)) { + my $v = get_1($self, "$pfx.$k") // next; + $es->{$k} = $v; + } + for my $k (qw(coderepo hide url infourl)) { + my $v = $self->{"$pfx.$k"} // next; + $es->{$k} = _array($v); + } + return unless valid_foo_name($name, 'extindex'); + $es->{name} = $name; + $es; +} - my $repo = $code_repos->{$nick} //= - _fill_code_repo($self, $nick); - push @$repo_objs, $repo if $repo; - } +sub _fill_csrch ($$) { + my ($self, $name) = @_; # "" is a valid name for cindex + return if $name ne '' && !valid_foo_name($name, 'cindex'); + eval { require PublicInbox::CodeSearch } or return; + my $pfx = "cindex.$name"; + my $d = valid_dir($self, "$pfx.topdir") // return; + -d $d or return; + my $csrch = PublicInbox::CodeSearch->new($d, $self); + for my $k (qw(localprefix)) { + my $v = $self->{"$pfx.$k"} // next; + $csrch->{$k} = _array($v); } + $csrch->{name} = $name; + $csrch; +} - $ibx +sub lookup_cindex ($$) { + my ($self, $name) = @_; + $self->{-csrch_by_name}->{$name} //= _fill_csrch($self, $name); +} + +sub each_cindex { + my ($self, $cb, @arg) = @_; + my @csrch = map { + lookup_cindex($self, substr($_, length('cindex.'))) // () + } grep(m!\Acindex\.[^\./]*\z!, @{$self->{-section_order}}); + if (ref($cb) eq 'CODE') { + $cb->($_, @arg) for @csrch; + } else { # string function + $_->$cb(@arg) for @csrch; + } +} + +sub config_cmd { + my ($self, $env, $opt) = @_; + my $f = $self->{-f} // default_file(); + my @opt_c = @{$self->{-opt_c} // []}; + my @cmd = (git_exe, @opt_c, 'config'); + @opt_c ? tmp_cmd_opt($env, $opt) : push(@cmd, '-f', $f); + \@cmd; } sub urlmatch { - my ($self, $key, $url) = @_; + my $self = shift; + my @bool = $_[0] eq '--bool' ? (shift) : (); + my ($key, $url, $try_git) = @_; state $urlmatch_broken; # requires git 1.8.5 return if $urlmatch_broken; - my $file = default_file(); - my $cmd = [qw/git config -z --includes --get-urlmatch/, - "--file=$file", $key, $url ]; - my $fh = popen_rd($cmd); - local $/ = "\0"; - my $val = <$fh>; - if (close($fh)) { - chomp($val); - $val; - } else { - $urlmatch_broken = 1 if (($? >> 8) != 1); - undef; + my (%env, %opt); + my $cmd = $self->config_cmd(\%env, \%opt); + push @$cmd, @bool, qw(--includes -z --get-urlmatch), $key, $url; + my $val = run_qx($cmd, \%env, \%opt); + if ($?) { + undef $val; + if (@bool && ($? >> 8) == 128) { # not boolean + } elsif (($? >> 8) != 1) { + $urlmatch_broken = 1; + } elsif ($try_git) { # n.b. this takes cwd into account + $val = run_qx([qw(git config), @bool, + qw(-z --get-urlmatch), $key, $url]); + undef $val if $?; + } + } + $? = 0; # don't influence lei exit status + if (defined($val)) { + local $/ = "\0"; + chomp $val; + $val = git_bool($val) if @bool; } + $val; +} + +sub json { + state $json; + $json //= do { + for my $mod (qw(Cpanel::JSON::XS JSON::MaybeXS JSON JSON::PP)) { + eval "require $mod" or next; + # ->ascii encodes non-ASCII to "\uXXXX" + $json = $mod->new->ascii(1) and last; + } + $json; + }; +} + +sub squote_maybe ($) { + my ($val) = @_; + if ($val =~ m{([^\w@\./,\%\+\-])}) { + $val =~ s/(['!])/'\\$1'/g; # '!' for csh + return "'$val'"; + } + $val; +} + +my %re_map = ( '*' => '[^/]*?', '?' => '[^/]', + '/**' => '/.*', '**/' => '.*/', '/**/' => '(?:/.*?/|/)', + '[' => '[', ']' => ']', ',' => ',' ); + +sub glob2re ($) { + my ($re) = @_; + my $p = ''; + my $in_bracket = 0; + my $qm = 0; + my $schema_host_port = ''; + + # don't glob URL-looking things that look like IPv6 + if ($re =~ s!\A([a-z0-9\+]+://\[[a-f0-9\:]+\](?::[0-9]+)?/)!!i) { + $schema_host_port = quotemeta $1; # "http://[::1]:1234" + } + my $changes = ($re =~ s!(/\*\*/|\A\*\*/|/\*\*\z|.)! + $re_map{$p eq '\\' ? '' : do { + if ($1 eq '[') { ++$in_bracket } + elsif ($1 eq ']') { --$in_bracket } + elsif ($1 eq ',') { ++$qm } # no change + $p = $1; + }} // do { + $p = $1; + ($p eq '-' && $in_bracket) ? $p : (++$qm, "\Q$p") + }!sge); + # bashism (also supported by curl): {a,b,c} => (a|b|c) + $changes += ($re =~ s/([^\\]*)\\\{([^,]*,[^\\]*)\\\}/ + (my $in_braces = $2) =~ tr!,!|!; + $1."($in_braces)"; + /sge); + ($changes - $qm) ? $schema_host_port.$re : undef; +} + +sub get_coderepo { + my ($self, $nick) = @_; + $self->{-coderepos}->{$nick} // do { + defined($self->{-cgit_scan_path}) ? do { + apply_cgit_scan_path($self); + my $cr = fill_coderepo($self, $nick); + $cr ? ($self->{-coderepos}->{$nick} = $cr) : undef; + } : undef; + }; } 1; diff --git a/lib/PublicInbox/ConfigIter.pm b/lib/PublicInbox/ConfigIter.pm index e6fa8172..f9e3451a 100644 --- a/lib/PublicInbox/ConfigIter.pm +++ b/lib/PublicInbox/ConfigIter.pm @@ -1,12 +1,11 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# Intended for PublicInbox::DS->EventLoop in read-only daemons +# Intended for PublicInbox::DS::event_loop in read-only daemons # to avoid each_inbox() monopolizing the event loop when hundreds/thousands # of inboxes are in play. package PublicInbox::ConfigIter; -use strict; -use v5.10.1; +use v5.12; sub new { my ($class, $pi_cfg, $cb, @args) = @_; @@ -25,7 +24,7 @@ sub event_step { PublicInbox::DS::requeue($self) if defined($section); } -# for generic PSGI servers +# for generic PSGI servers, but also ManifestJsGz w/ ALL extindex sub each_section { my $self = shift; my ($pi_cfg, $i, $cb, @arg) = @$self; diff --git a/lib/PublicInbox/ContentDigestDbg.pm b/lib/PublicInbox/ContentDigestDbg.pm new file mode 100644 index 00000000..853624f1 --- /dev/null +++ b/lib/PublicInbox/ContentDigestDbg.pm @@ -0,0 +1,22 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +package PublicInbox::ContentDigestDbg; # cf. PublicInbox::ContentDigest +use v5.12; +use Data::Dumper; +use PublicInbox::SHA; +$Data::Dumper::Useqq = $Data::Dumper::Terse = 1; + +sub new { bless [ PublicInbox::SHA->new(256), $_[1] ], __PACKAGE__ } + +sub add { + $_[0]->[0]->add($_[1]); + my @dbg = split(/^/sm, $_[1]); + if (@dbg && $dbg[0] =~ /\A(To|Cc)\0/) { # fold excessively long lines + @dbg = map { split(/,/s, $_) } @dbg; + } + print { $_[0]->[1] } Dumper(\@dbg) or die "print $!"; +} + +sub hexdigest { $_[0]->[0]->hexdigest } + +1; diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm index 1fe22955..95ca2929 100644 --- a/lib/PublicInbox/ContentHash.pm +++ b/lib/PublicInbox/ContentHash.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Unstable internal API. @@ -8,18 +8,20 @@ # See L<public-inbox-v2-format(5)> manpage for more details. package PublicInbox::ContentHash; use strict; -use warnings; -use base qw/Exporter/; -our @EXPORT_OK = qw/content_hash content_digest/; +use v5.10.1; +use parent qw(Exporter); +our @EXPORT_OK = qw(content_hash content_digest git_sha); use PublicInbox::MID qw(mids references); use PublicInbox::MsgIter; # not sure if less-widely supported hash families are worth bothering with -use Digest::SHA; +use PublicInbox::SHA; # faster, but no ->clone +use Digest::SHA; # we still need this for ->clone sub digest_addr ($$$) { my ($dig, $h, $v) = @_; $v =~ tr/"//d; + $v =~ tr/\r\n\t / /s; $v =~ s/@([a-z0-9\_\.\-\(\)]*([A-Z])\S*)/'@'.lc($1)/ge; utf8::encode($v); $dig->add("$h\0$v\0"); @@ -43,7 +45,7 @@ sub content_dig_i { my $ct = $part->content_type || 'text/plain'; my ($s, undef) = msg_part_text($part, $ct); if (defined $s) { - $s =~ s/\r\n/\n/gs; + $s =~ s/\r\n/\n/gs; # TODO: consider \r+\n to match View $s =~ s/\s*\z//s; utf8::encode($s); } else { @@ -52,30 +54,36 @@ sub content_dig_i { $dig->add($s); } -sub content_digest ($) { - my ($eml) = @_; - my $dig = Digest::SHA->new(256); +sub content_digest ($;$$) { + my ($eml, $dig, $hash_mids) = @_; + $dig //= Digest::SHA->new(256); # References: and In-Reply-To: get used interchangeably # in some "duplicates" in LKML. We treat them the same # in SearchIdx, so treat them the same for this: # do NOT consider the Message-ID as part of the content_hash - # if we got here, we've already got Message-ID reuse - my %seen = map { $_ => 1 } @{mids($eml)}; - foreach my $mid (@{references($eml)}) { - $dig->add("ref\0$mid\0") unless $seen{$mid}++; + # if we got here, we've already got Message-ID reuse for v2. + # + # However, `lei q --dedupe=content' does use $hash_mids since + # it doesn't have any other dedupe + my $mids = mids($eml); + if ($hash_mids) { + $dig->add("mid\0$_\0") for @$mids; + } + my %seen = map { $_ => 1 } @$mids; + for (grep { !$seen{$_}++ } @{references($eml)}) { + utf8::encode($_); + $dig->add("ref\0$_\0"); } # Only use Sender: if From is not present foreach my $h (qw(From Sender)) { - my @v = $eml->header($h); - if (@v) { - digest_addr($dig, $h, $_) foreach @v; - } + my @v = $eml->header($h) or next; + digest_addr($dig, $h, $_) foreach @v; + last; } foreach my $h (qw(Subject Date)) { - my @v = $eml->header($h); - foreach my $v (@v) { + for my $v ($eml->header($h)) { utf8::encode($v); $dig->add("$h\0$v\0"); } @@ -84,15 +92,23 @@ sub content_digest ($) { # not in the original message. For the purposes of deduplication, # do not take it into account: foreach my $h (qw(To Cc)) { - my @v = $eml->header($h); - digest_addr($dig, $h, $_) foreach @v; + digest_addr($dig, $h, $_) for ($eml->header($h)); } msg_iter($eml, \&content_dig_i, $dig); $dig; } sub content_hash ($) { - content_digest($_[0])->digest; + content_digest($_[0], PublicInbox::SHA->new(256))->digest; +} + +# don't clone the result of this +sub git_sha ($$) { + my ($n, $eml) = @_; + my $dig = PublicInbox::SHA->new($n); + my $bref = ref($eml) eq 'SCALAR' ? $eml : \($eml->as_string); + $dig->add('blob '.length($$bref)."\0", $$bref); + $dig; } 1; diff --git a/lib/PublicInbox/DS.pm b/lib/PublicInbox/DS.pm index 9c278307..a6fec954 100644 --- a/lib/PublicInbox/DS.pm +++ b/lib/PublicInbox/DS.pm @@ -21,42 +21,43 @@ # (tmpio = [ GLOB, offset, [ length ] ]) package PublicInbox::DS; use strict; -use bytes; -use POSIX qw(WNOHANG); -use IO::Handle qw(); +use v5.10.1; +use parent qw(Exporter); +use bytes qw(length substr); # FIXME(?): needed for PublicInbox::NNTP +use POSIX qw(WNOHANG sigprocmask SIG_SETMASK SIG_UNBLOCK); use Fcntl qw(SEEK_SET :DEFAULT O_APPEND); use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); -use parent qw(Exporter); -our @EXPORT_OK = qw(now msg_more); -use 5.010_001; use Scalar::Util qw(blessed); -use PublicInbox::Syscall qw(:epoll); +use PublicInbox::Syscall qw(%SIGNUM + EPOLLIN EPOLLOUT EPOLLONESHOT EPOLLEXCLUSIVE); use PublicInbox::Tmpfile; -use Errno qw(EAGAIN EINVAL); -use Carp qw(confess carp); +use PublicInbox::Select; +use PublicInbox::OnDestroy; +use Errno qw(EAGAIN EINVAL ECHILD); +use Carp qw(carp croak); +our @EXPORT_OK = qw(now msg_more awaitpid add_timer add_uniq_timer); my $nextq; # queue for next_tick -my $wait_pids; # list of [ pid, callback, callback_arg ] -my $later_queue; # list of callbacks to run at some later interval -my $EXPMAP; # fd -> idle_time -our $EXPTIME = 180; # 3 minutes -my ($later_timer, $reap_armed, $exp_timer); -my $ToClose; # sockets to close when event loop is done -our ( - %DescriptorMap, # fd (num) -> PublicInbox::DS object - $Epoll, # Global epoll fd (or DSKQXS ref) - $_io, # IO::Handle for Epoll - - $PostLoopCallback, # subref to call at the end of each loop, if defined (global) - - $LoopTimeout, # timeout of event loop in milliseconds - $DoneInit, # if we've done the one-time module init yet +my $reap_armed; +my @active; # FDs (or objs) returned by epoll/kqueue +our (%AWAIT_PIDS, # pid => [ $callback, @args ] + $cur_runq, # only set inside next_tick + @FD_MAP, # fd (num) -> PublicInbox::DS object + $Poller, # global Select, Epoll, DSPoll, or DSKQXS ref + + @post_loop_do, # subref + args to call at the end of each loop + + $loop_timeout, # timeout of event loop in milliseconds @Timers, # timers + %UniqTimer, $in_loop, ); Reset(); +# clobber everything explicitly to avoid DESTROY ordering problems w/ DBI +END { Reset() } + ##################################################################### ### C L A S S M E T H O D S ##################################################################### @@ -67,180 +68,156 @@ Reset all state =cut sub Reset { - %DescriptorMap = (); - $in_loop = $wait_pids = $later_queue = $reap_armed = undef; - $EXPMAP = {}; - $nextq = $ToClose = $later_timer = $exp_timer = undef; - $LoopTimeout = -1; # no timeout by default - @Timers = (); - - $PostLoopCallback = undef; - $DoneInit = 0; - - $_io = undef; # closes real $Epoll FD - $Epoll = undef; # may call DSKQXS::DESTROY + $Poller = bless [], 'PublicInbox::DummyPoller'; + do { + $in_loop = undef; # first in case DESTROY callbacks use this + # clobbering $Poller may call DSKQXS::DESTROY, + # we must always have this set to something to avoid + # needing branches before ep_del/ep_mod calls (via ->close). + @FD_MAP = (); + @Timers = (); + %UniqTimer = (); + @post_loop_do = (); + + # we may be called from an *atfork_child inside next_tick: + @$cur_runq = () if $cur_runq; + @active = (); + $nextq = undef; # may call ep_del + %AWAIT_PIDS = (); + } while (@Timers || $nextq || keys(%AWAIT_PIDS) || + @active || @FD_MAP || + @post_loop_do || keys(%UniqTimer) || + scalar(@{$cur_runq // []})); # do not vivify cur_runq - *EventLoop = *FirstTimeEventLoop; -} - -=head2 C<< CLASS->SetLoopTimeout( $timeout ) >> - -Set the loop timeout for the event loop to some value in milliseconds. - -A timeout of 0 (zero) means poll forever. A timeout of -1 means poll and return -immediately. - -=cut -sub SetLoopTimeout { - return $LoopTimeout = $_[1] + 0; + $reap_armed = undef; + $loop_timeout = -1; # no timeout by default + $Poller = PublicInbox::Select->new; } -=head2 C<< PublicInbox::DS::add_timer( $seconds, $coderef, $arg) >> - -Add a timer to occur $seconds from now. $seconds may be fractional, but timers -are not guaranteed to fire at the exact time you ask for. - -=cut -sub add_timer ($$;$) { - my ($secs, $coderef, $arg) = @_; - - my $fire_time = now() + $secs; - - my $timer = [$fire_time, $coderef, $arg]; - - if (!@Timers || $fire_time >= $Timers[-1][0]) { - push @Timers, $timer; - return $timer; - } - - # Now, where do we insert? (NOTE: this appears slow, algorithm-wise, - # but it was compared against calendar queues, heaps, naive push/sort, - # and a bunch of other versions, and found to be fastest with a large - # variety of datasets.) - for (my $i = 0; $i < @Timers; $i++) { - if ($Timers[$i][0] > $fire_time) { - splice(@Timers, $i, 0, $timer); - return $timer; - } - } - - die "Shouldn't get here."; -} +sub _add_named_timer { + my ($name, $secs, $coderef, @args) = @_; + my $fire_time = now() + $secs; + my $timer = [$fire_time, $name, $coderef, @args]; -# keeping this around in case we support other FD types for now, -# epoll_create1(EPOLL_CLOEXEC) requires Linux 2.6.27+... -sub set_cloexec ($) { - my ($fd) = @_; + if (!@Timers || $fire_time >= $Timers[-1][0]) { + push @Timers, $timer; + return $timer; + } - $_io = IO::Handle->new_from_fd($fd, 'r+') or return; - defined(my $fl = fcntl($_io, F_GETFD, 0)) or return; - fcntl($_io, F_SETFD, $fl | FD_CLOEXEC); + # Now, where do we insert? (NOTE: this appears slow, algorithm-wise, + # but it was compared against calendar queues, heaps, naive push/sort, + # and a bunch of other versions, and found to be fastest with a large + # variety of datasets.) + for (my $i = 0; $i < @Timers; $i++) { + if ($Timers[$i][0] > $fire_time) { + splice(@Timers, $i, 0, $timer); + return $timer; + } + } + die "Shouldn't get here."; } -sub _InitPoller -{ - return if $DoneInit; - $DoneInit = 1; +sub add_timer { _add_named_timer(undef, @_) } - if (PublicInbox::Syscall::epoll_defined()) { - $Epoll = epoll_create(); - set_cloexec($Epoll) if (defined($Epoll) && $Epoll >= 0); - } else { - my $cls; - for (qw(DSKQXS DSPoll)) { - $cls = "PublicInbox::$_"; - last if eval "require $cls"; - } - $cls->import(qw(epoll_ctl epoll_wait)); - $Epoll = $cls->new; - } - *EventLoop = *EpollEventLoop; +sub add_uniq_timer { # ($name, $secs, $coderef, @args) = @_; + $UniqTimer{$_[0]} //= _add_named_timer(@_); } -=head2 C<< CLASS->EventLoop() >> - -Start processing IO events. In most daemon programs this never exits. See -C<PostLoopCallback> below for how to exit the loop. - -=cut -sub FirstTimeEventLoop { - my $class = shift; - - _InitPoller(); - - EventLoop($class); +# caller sets return value to $Poller +sub _InitPoller () { + my @try = ($^O eq 'linux' ? 'Epoll' : 'DSKQXS'); + my $cls; + for (@try, 'DSPoll') { + $cls = "PublicInbox::$_"; + last if eval "require $cls"; + } + $cls->new; } sub now () { clock_gettime(CLOCK_MONOTONIC) } sub next_tick () { - my $q = $nextq or return; - $nextq = undef; - for (@$q) { - # we avoid "ref" on blessed refs to workaround a Perl 5.16.3 leak: - # https://rt.perl.org/Public/Bug/Display.html?id=114340 - if (blessed($_)) { - $_->event_step; - } else { - $_->(); - } - } + $cur_runq = $nextq or return; + $nextq = undef; + while (my $obj = shift @$cur_runq) { + # avoid "ref" on blessed refs to workaround a Perl 5.16.3 leak: + # https://rt.perl.org/Public/Bug/Display.html?id=114340 + blessed($obj) ? $obj->event_step : $obj->(); + } + 1; } # runs timers and returns milliseconds for next one, or next event loop sub RunTimers { - next_tick(); + my $ran = next_tick(); - return (($nextq || $ToClose) ? 0 : $LoopTimeout) unless @Timers; + return ($nextq || $ran ? 0 : $loop_timeout) unless @Timers; - my $now = now(); + my $now = now(); - # Run expired timers - while (@Timers && $Timers[0][0] <= $now) { - my $to_run = shift(@Timers); - $to_run->[1]->($to_run->[2]); - } + # Run expired timers + while (@Timers && $Timers[0][0] <= $now) { + my $to_run = shift(@Timers); + delete $UniqTimer{$to_run->[1] // ''}; + $to_run->[2]->(@$to_run[3..$#$to_run]); + $ran = 1; + } - # timers may enqueue into nextq: - return 0 if ($nextq || $ToClose); + # timers may enqueue into nextq: + return 0 if $nextq || $ran; - return $LoopTimeout unless @Timers; + return $loop_timeout unless @Timers; - # convert time to an even number of milliseconds, adding 1 - # extra, otherwise floating point fun can occur and we'll - # call RunTimers like 20-30 times, each returning a timeout - # of 0.0000212 seconds - my $timeout = int(($Timers[0][0] - $now) * 1000) + 1; + # convert time to an even number of milliseconds, adding 1 + # extra, otherwise floating point fun can occur and we'll + # call RunTimers like 20-30 times, each returning a timeout + # of 0.0000212 seconds + my $t = int(($Timers[0][0] - $now) * 1000) + 1; - # -1 is an infinite timeout, so prefer a real timeout - return $timeout if $LoopTimeout == -1; + # -1 is an infinite timeout, so prefer a real timeout + ($loop_timeout < 0 || $loop_timeout >= $t) ? $t : $loop_timeout +} - # otherwise pick the lower of our regular timeout and time until - # the next timer - return $LoopTimeout if $LoopTimeout < $timeout; - return $timeout; +sub sig_setmask { sigprocmask(SIG_SETMASK, @_) or die "sigprocmask: $!" } + +# ensure we detect bugs, HW problems and user rlimits +our @UNBLOCKABLE = (POSIX::SIGABRT, POSIX::SIGBUS, POSIX::SIGFPE, + POSIX::SIGILL, POSIX::SIGSEGV, POSIX::SIGXCPU, POSIX::SIGXFSZ); + +sub block_signals { # anything in @_ stays unblocked + my $newset = POSIX::SigSet->new; + $newset->fillset or die "fillset: $!"; + for (@_, @UNBLOCKABLE) { $newset->delset($_) or die "delset($_): $!" } + my $oldset = POSIX::SigSet->new; + sig_setmask($newset, $oldset); + $oldset; } -# We can't use waitpid(-1) safely here since it can hit ``, system(), -# and other things. So we scan the $wait_pids list, which is hopefully -# not too big. We keep $wait_pids small by not calling dwaitpid() -# until we've hit EOF when reading the stdout of the child. +sub await_cb ($;@) { + my ($pid, @cb_args) = @_; + my $cb = shift @cb_args or return; + eval { $cb->($pid, @cb_args) }; + warn "E: awaitpid($pid): $@" if $@; +} +# This relies on our Perl process being single-threaded, or at least +# no threads spawning and waiting on processes (``, system(), etc...) +# Threads are officially discouraged by the Perl5 team, and I expect +# that to remain the case. sub reap_pids { $reap_armed = undef; - my $tmp = $wait_pids or return; - $wait_pids = undef; - foreach my $ary (@$tmp) { - my ($pid, $cb, $arg) = @$ary; - my $ret = waitpid($pid, WNOHANG); - if ($ret == 0) { - push @$wait_pids, $ary; # autovivifies @$wait_pids - } elsif ($cb) { - eval { $cb->($arg, $pid) }; + while (1) { + my $pid = waitpid(-1, WNOHANG) or return; + if (defined(my $cb_args = delete $AWAIT_PIDS{$pid})) { + await_cb($pid, @$cb_args) if $cb_args; + } elsif ($pid == -1 && $! == ECHILD) { + return requeue(\&dflush); # force @post_loop_do to run + } elsif ($pid > 0) { + warn "W: reaped unknown PID=$pid: \$?=$?\n"; + } else { # does this happen? + return warn("W: waitpid(-1, WNOHANG) => $pid ($!)"); } } - # we may not be done, yet, and could've missed/masked a SIGCHLD: - $reap_armed //= requeue(\&reap_pids) if $wait_pids; } # reentrant SIGCHLD handler (since reap_pids is not reentrant) @@ -248,65 +225,80 @@ sub enqueue_reap () { $reap_armed //= requeue(\&reap_pids) } sub in_loop () { $in_loop } +# use inside @post_loop_do, returns number of busy clients +sub close_non_busy () { + my $n = 0; + for my $s (grep defined, @FD_MAP) { + # close as much as possible, early as possible + ($s->busy ? ++$n : $s->close) if $s->can('busy'); + } + $n; +} + # Internal function: run the post-event callback, send read events # for pushed-back data, and close pending connections. returns 1 # if event loop should continue, or 0 to shut it all down. sub PostEventLoop () { - # now we can close sockets that wanted to close during our event - # processing. (we didn't want to close them during the loop, as we - # didn't want fd numbers being reused and confused during the event - # loop) - if (my $close_now = $ToClose) { - $ToClose = undef; # will be autovivified on push - @$close_now = map { fileno($_) } @$close_now; - - # order matters, destroy expiry times, first: - delete @$EXPMAP{@$close_now}; - - # ->DESTROY methods may populate ToClose - delete @DescriptorMap{@$close_now}; - } - # by default we keep running, unless a postloop callback cancels it - $PostLoopCallback ? $PostLoopCallback->(\%DescriptorMap) : 1; -} - -sub EpollEventLoop { - local $in_loop = 1; - do { - my @events; - my $i; - my $timeout = RunTimers(); - - # get up to 1000 events - my $evcount = epoll_wait($Epoll, 1000, $timeout, \@events); - for ($i=0; $i<$evcount; $i++) { - # it's possible epoll_wait returned many events, including some at the end - # that ones in the front triggered unregister-interest actions. if we - # can't find the %sock entry, it's because we're no longer interested - # in that event. - $DescriptorMap{$events[$i]->[0]}->event_step; - } - } while (PostEventLoop()); - _run_later(); + @post_loop_do ? $post_loop_do[0]->(@post_loop_do[1..$#post_loop_do]) : 1 } -=head2 C<< CLASS->SetPostLoopCallback( CODEREF ) >> - -Sets post loop callback function. Pass a subref and it will be -called every time the event loop finishes. - -Return 1 (or any true value) from the sub to make the loop continue, 0 or false -and it will exit. +sub sigset_prep ($$$) { + my ($sig, $init, $each) = @_; # $sig: { signame => whatever } + my $ret = POSIX::SigSet->new; + $ret->$init or die "$init: $!"; + for my $s (keys %$sig) { + my $num = $SIGNUM{$s} // POSIX->can("SIG$s")->(); + $ret->$each($num) or die "$each ($s => $num): $!"; + } + for (@UNBLOCKABLE) { $ret->$each($_) or die "$each ($_): $!" } + $ret; +} + +sub allowset ($) { sigset_prep $_[0], 'fillset', 'delset' } +sub unblockset ($) { sigset_prep $_[0], 'emptyset', 'addset' } + +# Start processing IO events. In most daemon programs this never exits. See +# C<post_loop_do> for how to exit the loop. +sub event_loop (;$$) { + my ($sig, $oldset) = @_; + $Poller //= _InitPoller(); + require PublicInbox::Sigfd if $sig; + my $sigfd = $sig ? PublicInbox::Sigfd->new($sig) : undef; + if ($sigfd && $sigfd->{is_kq}) { + my $tmp = allowset($sig); + local @SIG{keys %$sig} = values(%$sig); + sig_setmask($tmp, my $old = POSIX::SigSet->new); + # Unlike Linux signalfd, EVFILT_SIGNAL can't handle + # signals received before the filter is created, + # so we peek at signals here. + sig_setmask($old); + } + local @SIG{keys %$sig} = values(%$sig) if $sig && !$sigfd; + local $SIG{PIPE} = 'IGNORE'; + if (!$sigfd && $sig) { + # wake up every second to accept signals if we don't + # have signalfd or IO::KQueue: + sig_setmask($oldset) if $oldset; + sigprocmask(SIG_UNBLOCK, unblockset($sig)) or + die "SIG_UNBLOCK: $!"; + $loop_timeout = 1000; + } + $_[0] = $sigfd = $sig = undef; # $_[0] == sig + local $in_loop = 1; + do { + my $timeout = RunTimers(); -The callback function will be passed two parameters: \%DescriptorMap + # grab whatever FDs are ready + $Poller->ep_wait($timeout, \@active); -=cut -sub SetPostLoopCallback { - my ($class, $ref) = @_; + # map all FDs to their associated Perl object + @active = @FD_MAP[@active]; - # global callback - $PostLoopCallback = (defined $ref && ref $ref eq 'CODE') ? $ref : undef; + while (my $obj = shift @active) { + $obj->event_step; + } + } while (PostEventLoop()); } ##################################################################### @@ -318,7 +310,7 @@ sub SetPostLoopCallback { =head2 C<< CLASS->new( $socket ) >> Create a new PublicInbox::DS subclass object for the given I<socket> which will -react to events on it during the C<EventLoop>. +react to events on it during the C<event_loop>. This is normally (always?) called from your subclass via: @@ -330,62 +322,54 @@ sub new { $self->{sock} = $sock; my $fd = fileno($sock); - _InitPoller(); - - if (epoll_ctl($Epoll, EPOLL_CTL_ADD, $fd, $ev)) { + $Poller //= _InitPoller(); +retry: + if ($Poller->ep_add($sock, $ev)) { if ($! == EINVAL && ($ev & EPOLLEXCLUSIVE)) { $ev &= ~EPOLLEXCLUSIVE; goto retry; } - die "couldn't add epoll watch for $fd: $!\n"; + die "EPOLL_CTL_ADD $self/$sock/$fd: $!"; } - confess("DescriptorMap{$fd} defined ($DescriptorMap{$fd})") - if defined($DescriptorMap{$fd}); + defined($FD_MAP[$fd]) and + croak("BUG: FD:$fd in use by $FD_MAP[$fd] (for $self/$sock)"); - $DescriptorMap{$fd} = $self; + $FD_MAP[$fd] = $self; } - -##################################################################### -### I N S T A N C E M E T H O D S -##################################################################### +# for IMAP, NNTP, and POP3 which greet clients upon connect +sub greet { + my ($self, $sock) = @_; + my $ev = EPOLLIN; + my $wbuf; + if ($sock->can('accept_SSL') && !$sock->accept_SSL) { + return if $! != EAGAIN || !($ev = PublicInbox::TLS::epollbit()); + $wbuf = [ \&accept_tls_step, $self->can('do_greet')]; + } + new($self, $sock, $ev | EPOLLONESHOT); + if ($wbuf) { + $self->{wbuf} = $wbuf; + } else { + $self->do_greet; + } + $self; +} sub requeue ($) { push @$nextq, $_[0] } # autovivifies -=head2 C<< $obj->close >> - -Close the socket. - -=cut +# drop the IO::Handle ref, true if successful, false if not (or already dropped) +# (this is closer to CORE::close than Danga::Socket::close) sub close { - my ($self) = @_; - my $sock = delete $self->{sock} or return; - - # we need to flush our write buffer, as there may - # be self-referential closures (sub { $client->close }) - # preventing the object from being destroyed - delete $self->{wbuf}; - - # if we're using epoll, we have to remove this from our epoll fd so we stop getting - # notifications about it - my $fd = fileno($sock); - epoll_ctl($Epoll, EPOLL_CTL_DEL, $fd, 0) and - confess("EPOLL_CTL_DEL: $!"); - - # we explicitly don't delete from DescriptorMap here until we - # actually close the socket, as we might be in the middle of - # processing an epoll_wait/etc that returned hundreds of fds, one - # of which is not yet processed and is what we're closing. if we - # keep it in DescriptorMap, then the event harnesses can just - # looked at $pob->{sock} == undef and ignore it. but if it's an - # un-accounted for fd, then it (understandably) freak out a bit - # and emit warnings, thinking their state got off. + my ($self) = @_; + my $sock = delete $self->{sock} or return; - # defer closing the actual socket until the event loop is done - # processing this round of events. (otherwise we might reuse fds) - push @$ToClose, $sock; # autovivifies $ToClose + # we need to clear our write buffer, as there may + # be self-referential closures (sub { $client->close }) + # preventing the object from being destroyed + delete $self->{wbuf}; + $FD_MAP[fileno($sock)] = undef; - return 0; + !$Poller->ep_del($sock); # stop getting notifications } # portable, non-thread-safe sendfile emulation (no pread, yet) @@ -431,8 +415,8 @@ next_buf: shift @$wbuf; goto next_buf; } - } elsif ($! == EAGAIN) { - epwait($sock, epbit($sock, EPOLLOUT) | EPOLLONESHOT); + } elsif ($! == EAGAIN && (my $ev = epbit($sock, EPOLLOUT))) { + epwait($sock, $ev | EPOLLONESHOT); return 0; } else { return $self->close; @@ -461,39 +445,40 @@ sub rbuf_idle ($$) { } } +# returns true if bytes are read, false otherwise sub do_read ($$$;$) { - my ($self, $rbuf, $len, $off) = @_; - my $r = sysread(my $sock = $self->{sock}, $$rbuf, $len, $off // 0); - return ($r == 0 ? $self->close : $r) if defined $r; - # common for clients to break connections without warning, - # would be too noisy to log here: - if ($! == EAGAIN) { - epwait($sock, epbit($sock, EPOLLIN) | EPOLLONESHOT); - rbuf_idle($self, $rbuf); - 0; - } else { - $self->close; - } + my ($self, $rbuf, $len, $off) = @_; + my ($ev, $r, $s); + $r = sysread($s = $self->{sock}, $$rbuf, $len, $off // 0) and return $r; + + if (!defined($r) && $! == EAGAIN && ($ev = epbit($s, EPOLLIN))) { + epwait($s, $ev | EPOLLONESHOT); + rbuf_idle($self, $rbuf); + } else { + $self->close; + } + undef; } # drop the socket if we hit unrecoverable errors on our system which # require BOFH attention: ENOSPC, EFBIG, EIO, EMFILE, ENFILE... sub drop { - my $self = shift; - carp(@_); - $self->close; + my $self = shift; + carp(@_); + $self->close; + undef; } -# n.b.: use ->write/->read for this buffer to allow compatibility with -# PerlIO::mmap or PerlIO::scalar if needed sub tmpio ($$$) { - my ($self, $bref, $off) = @_; - my $fh = tmpfile('wbuf', $self->{sock}, O_APPEND) or - return drop($self, "tmpfile $!"); - $fh->autoflush(1); - my $len = bytes::length($$bref) - $off; - $fh->write($$bref, $len, $off) or return drop($self, "write ($len): $!"); - [ $fh, 0 ] # [1] = offset, [2] = length, not set by us + my ($self, $bref, $off) = @_; + my $fh = tmpfile('wbuf', $self->{sock}, O_APPEND) or + return drop($self, "tmpfile $!"); + $fh->autoflush(1); + my $len = length($$bref) - $off; + my $n = syswrite($fh, $$bref, $len, $off) // + return drop($self, "write ($len): $!"); + $n == $len or return drop($self, "wrote $n < $len bytes"); + [ $fh, 0 ] # [1] = offset, [2] = length, not set by us } =head2 C<< $obj->write( $data ) >> @@ -523,7 +508,8 @@ sub write { push @$wbuf, $bref; } else { my $tmpio = $wbuf->[-1]; - if ($tmpio && !defined($tmpio->[2])) { # append to tmp file buffer + if (ref($tmpio) eq 'ARRAY' && !defined($tmpio->[2])) { + # append to tmp file buffer $tmpio->[0]->print($$bref) or return drop($self, "print: $!"); } else { my $tmpio = tmpio($self, $bref, 0) or return 0; @@ -535,14 +521,15 @@ sub write { $bref->($self); return 1; } else { - my $to_write = bytes::length($$bref); + my $to_write = length($$bref); my $written = syswrite($sock, $$bref, $to_write); if (defined $written) { return 1 if $written == $to_write; requeue($self); # runs: event_step -> flush_write } elsif ($! == EAGAIN) { - epwait($sock, epbit($sock, EPOLLOUT) | EPOLLONESHOT); + my $ev = epbit($sock, EPOLLOUT) or return $self->close; + epwait($sock, $ev | EPOLLONESHOT); $written = 0; } else { return $self->close; @@ -569,7 +556,7 @@ sub msg_more ($$) { !$sock->can('stop_SSL')) { my $n = send($sock, $_[1], MSG_MORE); if (defined $n) { - my $nlen = bytes::length($_[1]) - $n; + my $nlen = length($_[1]) - $n; return 1 if $nlen == 0; # all done! # queue up the unwritten substring: my $tmpio = tmpio($self, \($_[1]), $n) or return 0; @@ -584,9 +571,8 @@ sub msg_more ($$) { } sub epwait ($$) { - my ($sock, $ev) = @_; - epoll_ctl($Epoll, EPOLL_CTL_MOD, fileno($sock), $ev) and - confess("EPOLL_CTL_MOD $!"); + my ($io, $ev) = @_; + $Poller->ep_mod($io, $ev) and croak("EPOLL_CTL_MOD($io): $!"); } # return true if complete, false if incomplete (or failure) @@ -595,84 +581,123 @@ sub accept_tls_step ($) { my $sock = $self->{sock} or return; return 1 if $sock->accept_SSL; return $self->close if $! != EAGAIN; - epwait($sock, PublicInbox::TLS::epollbit() | EPOLLONESHOT); + my $ev = PublicInbox::TLS::epollbit() or return $self->close; + epwait($sock, $ev | EPOLLONESHOT); unshift(@{$self->{wbuf}}, \&accept_tls_step); # autovivifies 0; } -# return true if complete, false if incomplete (or failure) +# return value irrelevant sub shutdn_tls_step ($) { my ($self) = @_; my $sock = $self->{sock} or return; return $self->close if $sock->stop_SSL(SSL_fast_shutdown => 1); return $self->close if $! != EAGAIN; - epwait($sock, PublicInbox::TLS::epollbit() | EPOLLONESHOT); + my $ev = PublicInbox::TLS::epollbit() or return $self->close; + epwait($sock, $ev | EPOLLONESHOT); unshift(@{$self->{wbuf}}, \&shutdn_tls_step); # autovivifies - 0; } # don't bother with shutdown($sock, 2), we don't fork+exec w/o CLOEXEC # or fork w/o exec, so no inadvertent socket sharing sub shutdn ($) { - my ($self) = @_; - my $sock = $self->{sock} or return; - if ($sock->can('stop_SSL')) { - shutdn_tls_step($self); - } else { - $self->close; - } + my ($self) = @_; + my $sock = $self->{sock} or return; + $sock->can('stop_SSL') ? shutdn_tls_step($self) : $self->close; } -# must be called with eval, PublicInbox::DS may not be loaded (see t/qspawn.t) -sub dwaitpid ($$$) { - die "Not in EventLoop\n" unless $in_loop; - push @$wait_pids, [ @_ ]; # [ $pid, $cb, $arg ] +sub dflush {} # overridden by DSdeflate +sub compressed {} # overridden by DSdeflate +sub long_response_done {} # overridden by Net::NNTP - # We could've just missed our SIGCHLD, cover it, here: - enqueue_reap(); +sub long_step { + my ($self) = @_; + # wbuf is unset or empty, here; {long} may add to it + my ($fd, $cb, $t0, @args) = @{$self->{long_cb}}; + my $more = eval { $cb->($self, @args) }; + if ($@ || !$self->{sock}) { # something bad happened... + delete $self->{long_cb}; + my $elapsed = now() - $t0; + $@ and warn("$@ during long response[$fd] - ", + sprintf('%0.6f', $elapsed),"\n"); + $self->out(" deferred[$fd] aborted - %0.6f", $elapsed); + $self->close; + } elsif ($more) { # $self->{wbuf}: + # control passed to ibx_async_cat if $more == \undef + requeue_once($self) if !ref($more); + } else { # all done! + delete $self->{long_cb}; + $self->long_response_done; + my $elapsed = now() - $t0; + $self->out(" deferred[$fd] done - %0.6f", $elapsed); + my $wbuf = $self->{wbuf}; # do NOT autovivify + requeue($self) unless $wbuf && @$wbuf; + } } -sub _run_later () { - my $run = $later_queue or return; - $later_timer = $later_queue = undef; - $_->() for @$run; -} +sub requeue_once { + my ($self) = @_; + # COMPRESS users all share the same DEFLATE context. + # Flush it here to ensure clients don't see each other's data + $self->dflush; -sub later ($) { - push @$later_queue, $_[0]; # autovivifies @$later_queue - $later_timer //= add_timer(60, \&_run_later); + # no recursion, schedule another call ASAP, + # but only after all pending writes are done. + # autovivify wbuf. wbuf may be populated by $cb, + # no need to rearm if so: (push returns new size of array) + $self->requeue if push(@{$self->{wbuf}}, \&long_step) == 1; } -sub expire_old () { - my $now = now(); - my $exp = $EXPTIME; - my $old = $now - $exp; - my %new; - while (my ($fd, $idle_at) = each %$EXPMAP) { - if ($idle_at < $old) { - my $ds_obj = $DescriptorMap{$fd}; - $new{$fd} = $idle_at if !$ds_obj->shutdn; +sub long_response ($$;@) { + my ($self, $cb, @args) = @_; # cb returns true if more, false if done + my $sock = $self->{sock} or return; + # make sure we disable reading during a long response, + # clients should not be sending us stuff and making us do more + # work while we are stream a response to them + $self->{long_cb} = [ fileno($sock), $cb, now(), @args ]; + long_step($self); # kick off! + undef; +} + +sub awaitpid { + my ($pid, @cb_args) = @_; # @cb_args = ($cb, @args), $cb may be undef + $AWAIT_PIDS{$pid} = \@cb_args if @cb_args; + # provide synchronous API + if (defined(wantarray) || (!$in_loop && !@cb_args)) { + my $ret = waitpid($pid, 0); + if ($ret == $pid) { + my $cb_args = delete $AWAIT_PIDS{$pid}; + @cb_args = @$cb_args if !@cb_args && $cb_args; + await_cb($pid, @cb_args); } else { - $new{$fd} = $idle_at; + carp "waitpid($pid) => $ret ($!)"; + delete $AWAIT_PIDS{$pid}; } + return $ret; + } elsif ($in_loop) { # We could've just missed our SIGCHLD, cover it, here: + enqueue_reap(); } - $EXPMAP = \%new; - $exp_timer = scalar(keys %new) ? later(\&expire_old) : undef; } -sub update_idle_time { - my ($self) = @_; - my $sock = $self->{sock} or return; - $EXPMAP->{fileno($sock)} = now(); - $exp_timer //= later(\&expire_old); +# for persistent child process +sub fork_persist () { + my $seed = rand(0xffffffff); + my $pid = PublicInbox::OnDestroy::fork_tmp; + if ($pid == 0) { + srand($seed); + eval { Net::SSLeay::randomize() }; # may not be loaded + Reset(); + } + $pid; } -sub not_idle_long { - my ($self, $now) = @_; - my $sock = $self->{sock} or return; - my $idle_at = $EXPMAP->{fileno($sock)} or return; - ($idle_at + $EXPTIME) > $now; -} +package PublicInbox::DummyPoller; # only used during Reset +use v5.12; + +sub ep_del {} +no warnings 'once'; +*ep_add = \&ep_del; +*ep_mod = \&ep_del; 1; diff --git a/lib/PublicInbox/DSKQXS.pm b/lib/PublicInbox/DSKQXS.pm index d1d3fe60..dc6621e4 100644 --- a/lib/PublicInbox/DSKQXS.pm +++ b/lib/PublicInbox/DSKQXS.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # Licensed the same as Danga::Socket (and Perl5) # License: GPL-1.0+ or Artistic-1.0-Perl # <https://www.gnu.org/licenses/gpl-1.0.txt> @@ -11,15 +11,12 @@ # # It also implements signalfd(2) emulation via "tie". package PublicInbox::DSKQXS; -use strict; -use warnings; -use parent qw(Exporter); +use v5.12; use Symbol qw(gensym); use IO::KQueue; use Errno qw(EAGAIN); -use PublicInbox::Syscall qw(EPOLLONESHOT EPOLLIN EPOLLOUT EPOLLET - EPOLL_CTL_ADD EPOLL_CTL_MOD EPOLL_CTL_DEL $SFD_NONBLOCK); -our @EXPORT_OK = qw(epoll_ctl epoll_wait); +use PublicInbox::OnDestroy; +use PublicInbox::Syscall qw(EPOLLONESHOT EPOLLIN EPOLLOUT EPOLLET); sub EV_DISPATCH () { 0x0080 } @@ -41,23 +38,23 @@ sub kq_flag ($$) { sub new { my ($class) = @_; - bless { kq => IO::KQueue->new, owner_pid => $$ }, $class; + my $fgen = $PublicInbox::OnDestroy::fork_gen; + bless { kq => IO::KQueue->new, fgen => $fgen }, $class; } # returns a new instance which behaves like signalfd on Linux. # It's wasteful in that it uses another FD, but it simplifies # our epoll-oriented code. sub signalfd { - my ($class, $signo, $flags) = @_; + my ($class, $signo) = @_; my $sym = gensym; - tie *$sym, $class, $signo, $flags; # calls TIEHANDLE + tie *$sym, $class, $signo; # calls TIEHANDLE $sym } sub TIEHANDLE { # similar to signalfd() - my ($class, $signo, $flags) = @_; + my ($class, $signo) = @_; my $self = $class->new; - $self->{timeout} = ($flags & $SFD_NONBLOCK) ? 0 : -1; my $kq = $self->{kq}; $kq->EV_SET($_, EVFILT_SIGNAL, EV_ADD) for @$signo; $self; @@ -66,12 +63,11 @@ sub TIEHANDLE { # similar to signalfd() sub READ { # called by sysread() for signalfd compatibility my ($self, undef, $len, $off) = @_; # $_[1] = buf die "bad args for signalfd read" if ($len % 128) // defined($off); - my $timeout = $self->{timeout}; my $sigbuf = $self->{sigbuf} //= []; my $nr = $len / 128; my $r = 0; $_[1] = ''; - do { + while (1) { while ($nr--) { my $signo = shift(@$sigbuf) or last; # caller only cares about signalfd_siginfo.ssi_signo: @@ -79,13 +75,13 @@ sub READ { # called by sysread() for signalfd compatibility $r += 128; } return $r if $r; - my @events = eval { $self->{kq}->kevent($timeout) }; + my @events = eval { $self->{kq}->kevent(0) }; # workaround https://rt.cpan.org/Ticket/Display.html?id=116615 if ($@) { next if $@ =~ /Interrupted system call/; die; } - if (!scalar(@events) && $timeout == 0) { + if (!scalar(@events)) { $! = EAGAIN; return; } @@ -94,36 +90,37 @@ sub READ { # called by sysread() for signalfd compatibility # field shows coalesced signals, and maybe we'll use it # in the future... @$sigbuf = map { $_->[0] } @events; - } while (1); + } } # for fileno() calls in PublicInbox::DS sub FILENO { ${$_[0]->{kq}} } -sub epoll_ctl { - my ($self, $op, $fd, $ev) = @_; - my $kq = $self->{kq}; - if ($op == EPOLL_CTL_MOD) { - $kq->EV_SET($fd, EVFILT_READ, kq_flag(EPOLLIN, $ev)); - eval { $kq->EV_SET($fd, EVFILT_WRITE, kq_flag(EPOLLOUT, $ev)) }; - } elsif ($op == EPOLL_CTL_DEL) { - $kq->EV_SET($fd, EVFILT_READ, EV_DISABLE); - eval { $kq->EV_SET($fd, EVFILT_WRITE, EV_DISABLE) }; - } else { # EPOLL_CTL_ADD - $kq->EV_SET($fd, EVFILT_READ, EV_ADD|kq_flag(EPOLLIN, $ev)); - - # we call this blindly for read-only FDs such as tied - # DSKQXS (signalfd emulation) and Listeners - eval { - $kq->EV_SET($fd, EVFILT_WRITE, EV_ADD | - kq_flag(EPOLLOUT, $ev)); - }; - } +sub _ep_mod_add ($$$$) { + my ($kq, $fd, $ev, $add) = @_; + $kq->EV_SET($fd, EVFILT_READ, $add|kq_flag(EPOLLIN, $ev)); + + # we call this blindly for read-only FDs such as tied + # DSKQXS (signalfd emulation) and Listeners + eval { $kq->EV_SET($fd, EVFILT_WRITE, $add|kq_flag(EPOLLOUT, $ev)) }; 0; } -sub epoll_wait { - my ($self, $maxevents, $timeout_msec, $events) = @_; +sub ep_add { _ep_mod_add($_[0]->{kq}, fileno($_[1]), $_[2], EV_ADD) }; +sub ep_mod { _ep_mod_add($_[0]->{kq}, fileno($_[1]), $_[2], 0) }; + +sub ep_del { + my ($self, $io, $ev) = @_; + my $kq = $_[0]->{kq} // return; # called in cleanup + my $fd = fileno($io); + $kq->EV_SET($fd, EVFILT_READ, EV_DISABLE); + eval { $kq->EV_SET($fd, EVFILT_WRITE, EV_DISABLE) }; + 0; +} + +sub ep_wait { + my ($self, $timeout_msec, $events) = @_; + # n.b.: IO::KQueue is hard-coded to return up to 1000 events @$events = eval { $self->{kq}->kevent($timeout_msec) }; if (my $err = $@) { # workaround https://rt.cpan.org/Ticket/Display.html?id=116615 @@ -134,7 +131,7 @@ sub epoll_wait { } } # caller only cares for $events[$i]->[0] - scalar(@$events); + $_ = $_->[0] for @$events; } # kqueue is close-on-fork (not exec), so we must not close it @@ -142,9 +139,8 @@ sub epoll_wait { sub DESTROY { my ($self) = @_; my $kq = delete $self->{kq} or return; - if (delete($self->{owner_pid}) == $$) { + delete($self->{fgen}) == $PublicInbox::OnDestroy::fork_gen and POSIX::close($$kq); - } } 1; diff --git a/lib/PublicInbox/DSPoll.pm b/lib/PublicInbox/DSPoll.pm index 1d9b51d9..a7055ec9 100644 --- a/lib/PublicInbox/DSPoll.pm +++ b/lib/PublicInbox/DSPoll.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # Licensed the same as Danga::Socket (and Perl5) # License: GPL-1.0+ or Artistic-1.0-Perl # <https://www.gnu.org/licenses/gpl-1.0.txt> @@ -9,50 +9,47 @@ # an all encompassing emulation of epoll via IO::Poll, but just to # support cases public-inbox-nntpd/httpd care about. package PublicInbox::DSPoll; -use strict; -use warnings; -use parent qw(Exporter); +use v5.12; use IO::Poll; -use PublicInbox::Syscall qw(EPOLLONESHOT EPOLLIN EPOLLOUT EPOLL_CTL_DEL); -our @EXPORT_OK = qw(epoll_ctl epoll_wait); +use PublicInbox::Syscall qw(EPOLLONESHOT EPOLLIN EPOLLOUT); +use Carp qw(carp); +use Errno (); -sub new { bless {}, $_[0] } # fd => events +sub new { bless {}, __PACKAGE__ } # fd => events -sub epoll_ctl { - my ($self, $op, $fd, $ev) = @_; - - # not wasting time on error checking - if ($op != EPOLL_CTL_DEL) { - $self->{$fd} = $ev; - } else { - delete $self->{$fd}; - } - 0; -} - -sub epoll_wait { - my ($self, $maxevents, $timeout_msec, $events) = @_; - my @pset; +sub ep_wait { + my ($self, $timeout_msec, $events) = @_; + my (@pset, $n, $fd, $revents, $nval); while (my ($fd, $events) = each %$self) { my $pevents = $events & EPOLLIN ? POLLIN : 0; $pevents |= $events & EPOLLOUT ? POLLOUT : 0; push(@pset, $fd, $pevents); } @$events = (); - my $n = IO::Poll::_poll($timeout_msec, @pset); - if ($n >= 0) { - for (my $i = 0; $i < @pset; ) { - my $fd = $pset[$i++]; - my $revents = $pset[$i++] or next; - delete($self->{$fd}) if $self->{$fd} & EPOLLONESHOT; - push @$events, [ $fd ]; - } - my $nevents = scalar @$events; - if ($n != $nevents) { - warn "BUG? poll() returned $n, but got $nevents"; + $n = IO::Poll::_poll($timeout_msec, @pset) or return; # timeout expired + return if $n < 0 && $! == Errno::EINTR; # caller recalculates timeout + die "poll: $!" if $n < 0; + while (defined($fd = shift @pset)) { + $revents = shift @pset or next; # no event + if ($revents & POLLNVAL) { + carp "E: FD=$fd invalid in poll"; + delete $self->{$fd}; + $nval = 1; + } else { + delete $self->{$fd} if $self->{$fd} & EPOLLONESHOT; + push @$events, $fd; } } - $n; + if ($nval && !@$events) { + $! = Errno::EBADF; + die "poll: $!"; + } } +sub ep_del { delete($_[0]->{fileno($_[1])}); 0 } +sub ep_add { $_[0]->{fileno($_[1])} = $_[2]; 0 } + +no warnings 'once'; +*ep_mod = \&ep_add; + 1; diff --git a/lib/PublicInbox/NNTPdeflate.pm b/lib/PublicInbox/DSdeflate.pm index 02af935f..539adf0f 100644 --- a/lib/PublicInbox/NNTPdeflate.pm +++ b/lib/PublicInbox/DSdeflate.pm @@ -1,7 +1,8 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # RFC 8054 NNTP COMPRESS DEFLATE implementation +# RFC 4978 IMAP COMPRESS=DEFLATE extension # # RSS usage for 10K idle-but-did-something NNTP clients on 64-bit: # TLS + DEFLATE[a] : 1.8 GB (MemLevel=9, 1.2 GB with MemLevel=8) @@ -14,23 +15,22 @@ # [b] - memory-optimized implementation using a global deflate context. # It's less efficient in terms of compression, but way more # efficient in terms of server memory usage. -package PublicInbox::NNTPdeflate; +package PublicInbox::DSdeflate; use strict; -use 5.010_001; -use parent qw(PublicInbox::NNTP); +use v5.10.1; use Compress::Raw::Zlib; my %IN_OPT = ( - -Bufsize => PublicInbox::NNTP::LINE_MAX, + -Bufsize => 1024, -WindowBits => -15, # RFC 1951 -AppendOutput => 1, ); # global deflate context and buffer -my $zbuf = \(my $buf = ''); -my $zout; +my ($zout, $zbuf); { my $err; + $zbuf = \(my $initial = ''); # replaced by $next in dflush/write ($zout, $err) = Compress::Raw::Zlib::Deflate->new( # nnrpd (INN) and Compress::Raw::Zlib favor MemLevel=9, # the zlib C library and git use MemLevel=8 as the default @@ -42,21 +42,18 @@ my $zout; $err == Z_OK or die "Failed to initialize zlib deflate stream: $err"; } - sub enable { my ($class, $self) = @_; my ($in, $err) = Compress::Raw::Zlib::Inflate->new(%IN_OPT); if ($err != Z_OK) { - $self->err("Inflate->new failed: $err"); - $self->res('403 Unable to activate compression'); + warn("Inflate->new failed: $err\n"); return; } - $self->res('206 Compression active'); bless $self, $class; $self->{zin} = $in; } -# overrides PublicInbox::NNTP::compressed +# overrides PublicInbox::DS::compressed sub compressed { 1 } sub do_read ($$$$) { @@ -103,7 +100,7 @@ sub msg_more ($$) { 1; } -sub zflush ($) { +sub dflush ($) { my ($self) = @_; my $deflated = $zbuf; diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm index 1520f8f2..28458b19 100644 --- a/lib/PublicInbox/Daemon.pm +++ b/lib/PublicInbox/Daemon.pm @@ -1,93 +1,178 @@ -# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# contains common daemon code for the httpd, imapd, and nntpd servers. -# This may be used for read-only IMAP server if we decide to implement it. +# +# Contains common daemon code for the httpd, imapd, and nntpd servers +# and designed for handling thousands of untrusted clients over slow +# and/or lossy connections. package PublicInbox::Daemon; -use strict; -use warnings; -use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/; +use v5.12; +use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); use IO::Handle; # ->autoflush use IO::Socket; -use POSIX qw(WNOHANG :signal_h); +use File::Spec; +use POSIX qw(WNOHANG :signal_h F_SETFD); use Socket qw(IPPROTO_TCP SOL_SOCKET); -sub SO_ACCEPTFILTER () { 0x1000 } -use Cwd qw/abs_path/; STDOUT->autoflush(1); STDERR->autoflush(1); -use PublicInbox::DS qw(now); -use PublicInbox::Syscall qw($SFD_NONBLOCK); -require PublicInbox::Listener; +use PublicInbox::DS qw(now awaitpid); +use PublicInbox::Listener; use PublicInbox::EOFpipe; -use PublicInbox::Sigfd; +use PublicInbox::Git; use PublicInbox::GitAsyncCat; +use PublicInbox::Eml; +use PublicInbox::Config; +use PublicInbox::OnDestroy; +use PublicInbox::Search; +use PublicInbox::XapClient; +our $SO_ACCEPTFILTER = 0x1000; my @CMD; -my ($set_user, $oldset); +my ($set_user, $oldset, $xh_workers); my (@cfg_listen, $stdout, $stderr, $group, $user, $pid_file, $daemonize); -my $worker_processes = 1; -my @listeners; -my %pids; -my %tls_opt; # scheme://sockname => args for IO::Socket::SSL->start_SSL +my ($nworker, @listeners, %WORKERS, %logs); +my %tls_opt; # scheme://sockname => args for IO::Socket::SSL::SSL_Context->new my $reexec_pid; my ($uid, $gid); my ($default_cert, $default_key); -my %KNOWN_TLS = ( 443 => 'https', 563 => 'nntps', 993 => 'imaps' ); -my %KNOWN_STARTTLS = ( 119 => 'nntp', 143 => 'imap' ); - -sub accept_tls_opt ($) { - my ($opt_str) = @_; - # opt_str: opt1=val1,opt2=val2 (opt may repeat for multi-value) - require PublicInbox::TLS; +my %KNOWN_TLS = (443 => 'https', 563 => 'nntps', 993 => 'imaps', 995 =>'pop3s'); +my %KNOWN_STARTTLS = (110 => 'pop3', 119 => 'nntp', 143 => 'imap'); +my %SCHEME2PORT = map { $KNOWN_TLS{$_} => $_ + 0 } keys %KNOWN_TLS; +for (keys %KNOWN_STARTTLS) { $SCHEME2PORT{$KNOWN_STARTTLS{$_}} = $_ + 0 } +$SCHEME2PORT{http} = 80; + +our ($parent_pipe, %POST_ACCEPT, %XNETD); +our %WORKER_SIG = ( + INT => \&worker_quit, + QUIT => \&worker_quit, + TERM => \&worker_quit, + TTIN => 'IGNORE', + TTOU => 'IGNORE', + USR1 => \&reopen_logs, + USR2 => 'IGNORE', + WINCH => 'IGNORE', + CHLD => \&PublicInbox::DS::enqueue_reap, +); + +sub listener_opt ($) { + my ($str) = @_; # opt1=val1,opt2=val2 (opt may repeat for multi-value) my $o = {}; # allow ',' as delimiter since '&' is shell-unfriendly - foreach (split(/[,&]/, $opt_str)) { + for (split(/[,&]/, $str)) { my ($k, $v) = split(/=/, $_, 2); - push @{$o->{$k} ||= []}, $v; + push @{$o->{$k}}, $v; } # key may be a part of cert. At least # p5-io-socket-ssl/example/ssl_server.pl has this fallback: - $o->{cert} //= [ $default_cert ]; + $o->{cert} //= [ $default_cert ] if defined($default_cert); $o->{key} //= defined($default_key) ? [ $default_key ] : $o->{cert}; - my %ctx_opt = (SSL_server => 1); + $o; +} + +sub check_absolute ($$) { + my ($var, $val) = @_; + die <<EOM if index($val // '/', '/') != 0; +$var must be an absolute path when using --daemonize: $val +EOM +} + +sub accept_tls_opt ($) { + my ($opt) = @_; + my $o = ref($opt) eq 'HASH' ? $opt : listener_opt($opt); + return if !defined($o->{cert}); + require PublicInbox::TLS; + my @ctx_opt; # parse out hostname:/path/to/ mappings: - foreach my $k (qw(cert key)) { - my $x = $ctx_opt{'SSL_'.$k.'_file'} = {}; + for my $k (qw(cert key)) { + $o->{$k} // next; + push(@ctx_opt, "SSL_${k}_file", {}); foreach my $path (@{$o->{$k}}) { my $host = ''; $path =~ s/\A([^:]+):// and $host = $1; - $x->{$host} = $path; + $ctx_opt[-1]->{$host} = $path; + check_absolute($k, $path) if $daemonize; } } - my $ctx = IO::Socket::SSL::SSL_Context->new(%ctx_opt) or - die 'SSL_Context->new: '.PublicInbox::TLS::err(); - - # save ~34K per idle connection (cf. SSL_CTX_set_mode(3ssl)) - # RSS goes from 346MB to 171MB with 10K idle NNTPS clients on amd64 - # cf. https://rt.cpan.org/Ticket/Display.html?id=129463 - my $mode = eval { Net::SSLeay::MODE_RELEASE_BUFFERS() }; - if ($mode && $ctx->{context}) { - eval { Net::SSLeay::CTX_set_mode($ctx->{context}, $mode) }; - warn "W: $@ (setting SSL_MODE_RELEASE_BUFFERS)\n" if $@; - } + \@ctx_opt; +} - { SSL_server => 1, SSL_startHandshake => 0, SSL_reuse_ctx => $ctx }; +sub do_chown ($) { + $uid // return; + my ($path) = @_; + chown($uid, $gid, $path) or warn "chown $path: $!\n"; +} + +sub open_log_path ($$) { # my ($fh, $path) = @_; # $_[0] is modified + open $_[0], '>>', $_[1] or die "open(>> $_[1]): $!"; + $_[0]->autoflush(1); + do_chown($_[1]); + $_[0]; +} + +sub load_mod ($;$$) { + my ($scheme, $opt, $addr) = @_; + my $modc = "PublicInbox::\U$scheme"; + $modc =~ s/S\z//; + my $mod = $modc.'D'; + eval "require $mod"; # IMAPD|HTTPD|NNTPD|POP3D + die $@ if $@; + my %xn; + my $tlsd = $xn{tlsd} = $mod->new; + my %env = map { + substr($_, length('env.')) => $opt->{$_}->[-1]; + } grep(/\Aenv\./, keys %$opt); + $xn{refresh} = sub { + my ($sig) = @_; + local @ENV{keys %env} = values %env; + $tlsd->refresh_groups($sig); + }; + $xn{post_accept} = $tlsd->can('post_accept_cb') ? + $tlsd->post_accept_cb : sub { $modc->new($_[0], $tlsd) }; + my @paths = qw(out err); + if ($modc eq 'PublicInbox::HTTP') { + @paths = qw(err); + $xn{af_default} = 'httpready'; + if (my $p = $opt->{psgi}) { + die "multiple psgi= options specified\n" if @$p > 1; + check_absolute('psgi=', $p->[0]) if $daemonize; + $tlsd->{psgi} = $p->[0]; + warn "# $scheme://$addr psgi=$p->[0]\n"; + } + } + for my $f (@paths) { + my $p = $opt->{$f} or next; + die "multiple $f= options specified\n" if @$p > 1; + check_absolute("$f=", $p->[0]) if $daemonize; + $p = File::Spec->canonpath($p->[0]); + $tlsd->{$f} = $logs{$p} //= open_log_path(my $fh, $p); + warn "# $scheme://$addr $f=$p\n"; + } + # for per-listener $SIG{__WARN__}: + my $err = $tlsd->{err}; + $tlsd->{warn_cb} = sub { + print $err @_ unless PublicInbox::Eml::warn_ignore(@_) + }; + $opt->{'multi-accept'} and + $xn{'multi-accept'} = $opt->{'multi-accept'}->[-1]; + \%xn; } sub daemon_prepare ($) { my ($default_listen) = @_; my $listener_names = {}; # sockname => IO::Handle - $oldset = PublicInbox::Sigfd::block_signals(); + $oldset = PublicInbox::DS::block_signals(); @CMD = ($0, @ARGV); my ($prog) = ($CMD[0] =~ m!([^/]+)\z!g); + my $dh = defined($default_listen) ? " (default: $default_listen)" : ''; my $help = <<EOF; usage: $prog [-l ADDRESS] [--cert=FILE] [--key=FILE] options: - -l ADDRESS address to listen on (default: $default_listen) + -l ADDRESS address to listen on$dh --cert=FILE default SSL/TLS certificate - --key=FILE default SSL/TLS certificate + --key=FILE default SSL/TLS certificate key -W WORKERS number of worker processes to spawn (default: 1) + -X XWORKERS number of Xapian helper processes (default: undefined) See public-inbox-daemon(8) and $prog(1) man pages for more. EOF @@ -95,51 +180,64 @@ EOF 'l|listen=s' => \@cfg_listen, '1|stdout=s' => \$stdout, '2|stderr=s' => \$stderr, - 'W|worker-processes=i' => \$worker_processes, + 'W|worker-processes=i' => \$nworker, 'P|pid-file=s' => \$pid_file, 'u|user=s' => \$user, 'g|group=s' => \$group, 'D|daemonize' => \$daemonize, + 'multi-accept=i' => \$PublicInbox::Listener::MULTI_ACCEPT, 'cert=s' => \$default_cert, 'key=s' => \$default_key, + 'X|xapian-helpers=i' => \$xh_workers, 'help|h' => \(my $show_help), ); GetOptions(%opt) or die $help; if ($show_help) { print $help; exit 0 }; + $_ = File::Spec->canonpath($_ // next) for ($stdout, $stderr); if (defined $pid_file && $pid_file =~ /\.oldbin\z/) { die "--pid-file cannot end with '.oldbin'\n"; } @listeners = inherit($listener_names); - - # allow socket-activation users to set certs once and not - # have to configure each socket: - my @inherited_names = keys(%$listener_names) if defined($default_cert); + my @inherited_names = keys(%$listener_names); # ignore daemonize when inheriting $daemonize = undef if scalar @listeners; - push @cfg_listen, $default_listen unless (@listeners || @cfg_listen); - + unless (@listeners || @cfg_listen) { + $default_listen // die "no listeners specified\n"; + push @cfg_listen, $default_listen + } + my ($default_scheme) = (($default_listen // '') =~ m!\A([^:]+)://!); foreach my $l (@cfg_listen) { my $orig = $l; - my $scheme = ''; - if ($l =~ s!\A([^:]+)://!!) { - $scheme = $1; - } elsif ($l =~ /\A(?:\[[^\]]+\]|[^:]+):([0-9])+/) { - my $s = $KNOWN_TLS{$1} // $KNOWN_STARTTLS{$1}; - $scheme = $s if defined $s; + my ($scheme, $port, $opt); + $l =~ s!\A([a-z0-9]+)://!! and $scheme = $1; + $scheme //= $default_scheme; + if ($l =~ /\A(?:\[[^\]]+\]|[^:]+):([0-9]+)/) { + $port = $1 + 0; + $scheme //= $KNOWN_TLS{$port} // $KNOWN_STARTTLS{$port}; + } + $scheme // die "unable to determine URL scheme of $orig\n"; + if (!defined($port) && index($l, '/') != 0) { # AF_UNIX socket + $port = $SCHEME2PORT{$scheme} // + die "no port in listen=$orig\n"; + $l =~ s!\A([^/]+)!$1:$port! or + die "unable to add port=$port to $l\n"; } + $l =~ s!/\z!!; # chop one trailing slash if ($l =~ s!/?\?(.+)\z!!) { - $tls_opt{"$scheme://$l"} = accept_tls_opt($1); + $opt = listener_opt($1); + $tls_opt{"$scheme://$l"} = accept_tls_opt($opt); } elsif (defined($default_cert)) { $tls_opt{"$scheme://$l"} = accept_tls_opt(''); - } elsif ($scheme =~ /\A(?:https|imaps|imaps)\z/) { + } elsif ($scheme =~ /\A(?:https|imaps|nntps|pop3s)\z/) { die "$orig specified w/o cert=\n"; } - # TODO: use scheme to load either NNTP.pm or HTTP.pm - - next if $listener_names->{$l}; # already inherited + if ($listener_names->{$l}) { # already inherited + $XNETD{$l} = load_mod($scheme, $opt, $l); + next; + } my (%o, $sock_pkg); if (index($l, '/') == 0) { $sock_pkg = 'IO::Socket::UNIX'; @@ -166,63 +264,67 @@ EOF } $o{Listen} = 1024; my $prev = umask 0000; - my $s = eval { $sock_pkg->new(%o) }; - warn "error binding $l: $! ($@)\n" unless $s; + my $s = eval { $sock_pkg->new(%o) } or + warn "error binding $l: $! ($@)\n"; umask $prev; - if ($s) { - $listener_names->{sockname($s)} = $s; - $s->blocking(0); - push @listeners, $s; - } + $s // next; + $s->blocking(0); + my $sockname = sockname($s); + warn "# bound $scheme://$sockname\n"; + $XNETD{$sockname} //= load_mod($scheme, $opt); + $listener_names->{$sockname} = $s; + push @listeners, $s; } # cert/key options in @cfg_listen takes precedence when inheriting, # but map well-known inherited ports if --listen isn't specified - # at all - for my $sockname (@inherited_names) { - $sockname =~ /:([0-9]+)\z/ or next; - if (my $scheme = $KNOWN_TLS{$1}) { - $tls_opt{"$scheme://$sockname"} ||= accept_tls_opt(''); - } elsif (($scheme = $KNOWN_STARTTLS{$1})) { - next if $tls_opt{"$scheme://$sockname"}; - $tls_opt{''} ||= accept_tls_opt(''); + # at all. This allows socket-activation users to set certs once + # and not have to configure each socket: + if (defined $default_cert) { + my ($stls) = (($default_scheme // '') =~ /\A(pop3|nntp|imap)/); + for my $x (@inherited_names) { + $x =~ /:([0-9]+)\z/ or next; # no TLS for AF_UNIX + if (my $scheme = $KNOWN_TLS{$1}) { + $XNETD{$x} //= load_mod($scheme); + $tls_opt{"$scheme://$x"} ||= accept_tls_opt(''); + } elsif (($scheme = $KNOWN_STARTTLS{$1})) { + $XNETD{$x} //= load_mod($scheme); + $tls_opt{"$scheme://$x"} ||= accept_tls_opt(''); + } elsif (defined $stls) { + $tls_opt{"$stls://$x"} ||= accept_tls_opt(''); + } } } - - die "No listeners bound\n" unless @listeners; -} - -sub check_absolute ($$) { - my ($var, $val) = @_; - if (defined $val && index($val, '/') != 0) { - die -"--$var must be an absolute path when using --daemonize: $val\n"; + if (defined $default_scheme) { + for my $x (@inherited_names) { + $XNETD{$x} //= load_mod($default_scheme); + } } + die "No listeners bound\n" unless @listeners; } sub daemonize () { if ($daemonize) { + require Cwd; foreach my $i (0..$#ARGV) { my $arg = $ARGV[$i]; next unless -e $arg; - $ARGV[$i] = abs_path($arg); + $ARGV[$i] = Cwd::abs_path($arg); } - check_absolute('stdout', $stdout); - check_absolute('stderr', $stderr); - check_absolute('pid-file', $pid_file); + check_absolute('--stdout', $stdout); + check_absolute('--stderr', $stderr); + check_absolute('--pid-file', $pid_file); + check_absolute('--cert', $default_cert); + check_absolute('--key', $default_key); chdir '/' or die "chdir failed: $!"; } - - return unless (defined $pid_file || defined $group || defined $user - || $daemonize); - - eval { require Net::Server::Daemonize }; - if ($@) { - die -"Net::Server required for --pid-file, --group, --user, and --daemonize\n$@\n"; + if (defined($pid_file) || defined($group) || defined($user)) { + eval { require Net::Server::Daemonize; 1 } // die <<EOF; +Net::Server required for --pid-file, --group, --user +$@ +EOF } - Net::Server::Daemonize::check_pid_file($pid_file) if defined $pid_file; $uid = Net::Server::Daemonize::get_uid($user) if defined $user; if (defined $group) { @@ -241,24 +343,38 @@ sub daemonize () { }; if ($daemonize) { - my $pid = fork; - die "could not fork: $!\n" unless defined $pid; + my $pid = PublicInbox::OnDestroy::fork_tmp; exit if $pid; - open(STDIN, '+<', '/dev/null') or die "redirect stdin failed: $!\n"; open STDOUT, '>&STDIN' or die "redirect stdout failed: $!\n"; open STDERR, '>&STDIN' or die "redirect stderr failed: $!\n"; POSIX::setsid(); - $pid = fork; - die "could not fork: $!\n" unless defined $pid; + $pid = PublicInbox::OnDestroy::fork_tmp; exit if $pid; } return unless defined $pid_file; write_pid($pid_file); - # for ->DESTROY: - bless { pid => $$, pid_file => \$pid_file }, __PACKAGE__; + on_destroy \&unlink_pid_file_safe_ish, \$pid_file; +} + +sub has_busy_clients { # post_loop_do CB + my ($state) = @_; + my $now = now(); + my $n = PublicInbox::DS::close_non_busy(); + if ($n) { + if ($state->{-w} < now()) { + warn "$$ quitting, $n client(s) left\n"; + $state->{-w} = now() + 5; + } + unless (defined $state->{0}) { + $state->{0} = (split(/\s+/, $0))[0]; + $state->{0} =~ s!\A.*?([^/]+)\z!$1!; + } + $0 = "$state->{0} quitting, $n client(s) left"; + } + $n; # true: loop continues, false: loop breaks } sub worker_quit { # $_[0] = signal name or number (unused) @@ -267,51 +383,35 @@ sub worker_quit { # $_[0] = signal name or number (unused) $_->close foreach @listeners; # call PublicInbox::DS::close @listeners = (); - my $proc_name; - my $warn = 0; + # drop idle connections and try to quit gracefully - PublicInbox::DS->SetPostLoopCallback(sub { - my ($dmap, undef) = @_; - my $n = 0; - my $now = now(); - - foreach my $s (values %$dmap) { - $s->can('busy') or next; - if ($s->busy($now)) { - ++$n; - } else { - # close as much as possible, early as possible - $s->close; - } - } - if ($n) { - if (($warn + 5) < now()) { - warn "$$ quitting, $n client(s) left\n"; - $warn = now(); - } - unless (defined $proc_name) { - $proc_name = (split(/\s+/, $0))[0]; - $proc_name =~ s!\A.*?([^/]+)\z!$1!; - } - $0 = "$proc_name quitting, $n client(s) left"; - } - $n; # true: loop continues, false: loop breaks - }); + @PublicInbox::DS::post_loop_do = (\&has_busy_clients, { -w => 0 }) +} + +sub spawn_xh () { + $xh_workers // return; + require PublicInbox::XhcMset; + local $) = $gid if defined $gid; + local $( = $gid if defined $gid; + local $> = $uid if defined $uid; + local $< = $uid if defined $uid; + $PublicInbox::Search::XHC = eval { + local $ENV{STDERR_PATH} = $stderr; + local $ENV{STDOUT_PATH} = $stdout; + PublicInbox::XapClient::start_helper('-j', $xh_workers) + }; + warn "E: $@" if $@; + awaitpid($PublicInbox::Search::XHC->{io}->attached_pid, \&respawn_xh) + if $PublicInbox::Search::XHC; } sub reopen_logs { - if ($stdout) { - open STDOUT, '>>', $stdout or - warn "failed to redirect stdout to $stdout: $!\n"; - STDOUT->autoflush(1); - do_chown($stdout); - } - if ($stderr) { - open STDERR, '>>', $stderr or - warn "failed to redirect stderr to $stderr: $!\n"; - STDERR->autoflush(1); - do_chown($stderr); - } + my ($sig) = @_; + $logs{$stdout} //= \*STDOUT if defined $stdout; + $logs{$stderr} //= \*STDERR if defined $stderr; + while (my ($p, $fh) = each %logs) { open_log_path($fh, $p) } + ($sig && defined($xh_workers) && $PublicInbox::Search::XHC) and + kill('USR1', $PublicInbox::Search::XHC->{io}->attached_pid); } sub sockname ($) { @@ -371,17 +471,16 @@ sub inherit ($) { my $end = $fds + 2; # LISTEN_FDS_START - 1 my @rv = (); foreach my $fd (3..$end) { - my $s = IO::Handle->new_from_fd($fd, 'r'); + open(my $s, '<&=', $fd) or warn "fdopen fd=$fd: $!"; if (my $k = sockname($s)) { - if ($s->blocking) { - $s->blocking(0); - warn <<""; -Inherited socket (fd=$fd) is blocking, making it non-blocking. + my $prev_was_blocking = $s->blocking(0); + warn <<"" if $prev_was_blocking; +Inherited socket ($k fd=$fd) is blocking, making it non-blocking. Set 'NonBlocking = true' in the systemd.service unit to avoid stalled processes when multiple service instances start. - } $listener_names->{$k} = $s; + warn "# inherited $k fd=$fd\n"; push @rv, $s; } else { warn "failed to inherit fd=$fd (LISTEN_FDS=$fds)"; @@ -400,177 +499,147 @@ sub upgrade { # $_[0] = signal name or number (unused) warn "BUG: .oldbin suffix exists: $pid_file\n"; return; } - unlink_pid_file_safe_ish($$, $pid_file); + unlink_pid_file_safe_ish(\$pid_file); $pid_file .= '.oldbin'; write_pid($pid_file); } - my $pid = fork; - unless (defined $pid) { - warn "fork failed: $!\n"; - return; - } - if ($pid == 0) { - use Fcntl qw(FD_CLOEXEC F_SETFD F_GETFD); + my $pid = eval { PublicInbox::OnDestroy::fork_tmp }; + if (!defined($pid)) { + warn "fork failed: $! $@\n"; + } elsif ($pid == 0) { $ENV{LISTEN_FDS} = scalar @listeners; $ENV{LISTEN_PID} = $$; foreach my $s (@listeners) { # @listeners are globs with workers, PI::L w/o workers $s = $s->{sock} if ref($s) eq 'PublicInbox::Listener'; - - my $fl = fcntl($s, F_GETFD, 0); - fcntl($s, F_SETFD, $fl &= ~FD_CLOEXEC); + fcntl($s, F_SETFD, 0) // die "F_SETFD: $!"; } exec @CMD; die "Failed to exec: $!\n"; + } else { + awaitpid($pid, \&upgrade_aborted); + $reexec_pid = $pid; } - $reexec_pid = $pid; } -sub kill_workers ($) { - my ($s) = @_; - - while (my ($pid, $id) = each %pids) { - kill $s, $pid; - } -} +sub kill_workers ($) { kill $_[0], values(%WORKERS) } -sub upgrade_aborted ($) { - my ($p) = @_; - warn "reexec PID($p) died with: $?\n"; +sub upgrade_aborted { + my ($pid) = @_; + warn "reexec PID($pid) died with: $?\n"; $reexec_pid = undef; return unless $pid_file; my $file = $pid_file; $file =~ s/\.oldbin\z// or die "BUG: no '.oldbin' suffix in $file"; - unlink_pid_file_safe_ish($$, $pid_file); + unlink_pid_file_safe_ish(\$pid_file); $pid_file = $file; eval { write_pid($pid_file) }; warn $@, "\n" if $@; } -sub reap_children { # $_[0] = 'CHLD' or POSIX::SIGCHLD() - while (1) { - my $p = waitpid(-1, WNOHANG) or return; - if (defined $reexec_pid && $p == $reexec_pid) { - upgrade_aborted($p); - } elsif (defined(my $id = delete $pids{$p})) { - warn "worker[$id] PID($p) died with: $?\n"; - } elsif ($p > 0) { - warn "unknown PID($p) reaped: $?\n"; - } else { - return; - } - } -} - -sub unlink_pid_file_safe_ish ($$) { - my ($unlink_pid, $file) = @_; - return unless defined $unlink_pid && $unlink_pid == $$; +sub unlink_pid_file_safe_ish ($) { + my ($fref) = @_; - open my $fh, '<', $file or return; + open my $fh, '<', $$fref or return; local $/ = "\n"; defined(my $read_pid = <$fh>) or return; chomp $read_pid; - if ($read_pid == $unlink_pid) { - Net::Server::Daemonize::unlink_pid_file($file); - } + Net::Server::Daemonize::unlink_pid_file($$fref) if $read_pid == $$; } sub master_quit ($) { exit unless @listeners; @listeners = (); - kill_workers($_[0]); + exit unless kill_workers($_[0]); +} + +sub reap_worker { # awaitpid CB + my ($pid, $nr) = @_; + warn "worker[$nr] died \$?=$?\n" if $?; + delete $WORKERS{$nr}; + exit if !@listeners && !keys(%WORKERS); + PublicInbox::DS::requeue(\&start_workers); +} + +sub start_worker ($) { + my ($nr) = @_; + return unless @listeners; + my $pid = PublicInbox::DS::fork_persist; + if ($pid == 0) { + undef %WORKERS; + undef $xh_workers; + local $PublicInbox::DS::Poller; # allow epoll/kqueue + $set_user->() if $set_user; + PublicInbox::EOFpipe->new($parent_pipe, \&worker_quit); + worker_loop(); + exit 0; + } else { + $WORKERS{$nr} = $pid; + awaitpid($pid, \&reap_worker, $nr); + } +} + +sub start_workers { + my @idx = grep { !defined($WORKERS{$_}) } (0..($nworker - 1)) or return; + eval { start_worker($_) for @idx }; + warn "E: $@\n" if $@; +} + +sub trim_workers { + my @nr = grep { $_ >= $nworker } keys %WORKERS; + kill('TERM', @WORKERS{@nr}); } sub master_loop { - pipe(my ($p0, $p1)) or die "failed to create parent-pipe: $!"; - my $set_workers = $worker_processes; + local $parent_pipe; + pipe($parent_pipe, my $p1) or die "failed to create parent-pipe: $!"; + my $set_workers = $nworker; # for SIGWINCH reopen_logs(); - my $ignore_winch; - my $sig = { - USR1 => sub { reopen_logs(); kill_workers($_[0]); }, + spawn_xh; + my $msig = { + USR1 => sub { reopen_logs($_[0]); kill_workers($_[0]); }, USR2 => \&upgrade, QUIT => \&master_quit, INT => \&master_quit, TERM => \&master_quit, WINCH => sub { - return if $ignore_winch || !@listeners; - if (-t STDIN || -t STDOUT || -t STDERR) { - $ignore_winch = 1; - warn <<EOF; -ignoring SIGWINCH since we are not daemonized -EOF - } else { - $worker_processes = 0; - } + $nworker = 0; + trim_workers(); }, HUP => sub { - return unless @listeners; - $worker_processes = $set_workers; + $nworker = $set_workers; # undo WINCH kill_workers($_[0]); + PublicInbox::DS::requeue(\&start_workers) }, TTIN => sub { - return unless @listeners; - if ($set_workers > $worker_processes) { - ++$worker_processes; + if ($set_workers > $nworker) { + ++$nworker; } else { - $worker_processes = ++$set_workers; + $nworker = ++$set_workers; } + PublicInbox::DS::requeue(\&start_workers); }, TTOU => sub { - $worker_processes = --$set_workers if $set_workers > 0; + return if $nworker <= 0; + --$nworker; + trim_workers(); }, - CHLD => \&reap_children, + CHLD => \&PublicInbox::DS::enqueue_reap, }; - my $sigfd = PublicInbox::Sigfd->new($sig, 0); - local %SIG = (%SIG, %$sig) if !$sigfd; - PublicInbox::Sigfd::sig_setmask($oldset) if !$sigfd; - while (1) { # main loop - my $n = scalar keys %pids; - unless (@listeners) { - exit if $n == 0; - $set_workers = $worker_processes = $n = 0; - } - - if ($n > $worker_processes) { - while (my ($k, $v) = each %pids) { - kill('TERM', $k) if $v >= $worker_processes; - } - $n = $worker_processes; - } - my $want = $worker_processes - 1; - if ($n <= $want) { - PublicInbox::Sigfd::block_signals() if !$sigfd; - for my $i ($n..$want) { - my $pid = fork; - if (!defined $pid) { - warn "failed to fork worker[$i]: $!\n"; - } elsif ($pid == 0) { - $set_user->() if $set_user; - return $p0; # run normal work code - } else { - warn "PID=$pid is worker[$i]\n"; - $pids{$pid} = $i; - } - } - PublicInbox::Sigfd::sig_setmask($oldset) if !$sigfd; - } - - if ($sigfd) { # Linux and IO::KQueue users: - $sigfd->wait_once; - } else { # wake up every second - sleep(1); - } - } + $msig->{WINCH} = sub { + warn "ignoring SIGWINCH since we are not daemonized\n"; + } if -t STDIN || -t STDOUT || -t STDERR; + start_workers(); + PublicInbox::DS::event_loop($msig, $oldset); exit # never gets here, just for documentation } -sub tls_start_cb ($$) { - my ($opt, $orig_post_accept) = @_; +sub tls_cb { + my ($post_accept, $tlsd) = @_; sub { my ($io, $addr, $srv) = @_; - my $ssl = IO::Socket::SSL->start_SSL($io, %$opt); - $orig_post_accept->($ssl, $addr, $srv); + $post_accept->(PublicInbox::TLS::start($io, $tlsd), $addr, $srv) } } @@ -584,95 +653,91 @@ sub defer_accept ($$) { my $sec = unpack('i', $x); return if $sec > 0; # systemd users may set a higher value setsockopt($s, IPPROTO_TCP, $TCP_DEFER_ACCEPT, 1); - } elsif ($^O eq 'freebsd') { - my $x = getsockopt($s, SOL_SOCKET, SO_ACCEPTFILTER); - return if defined $x; # don't change if set + } elsif ($^O =~ /\A(?:freebsd|netbsd|dragonfly)\z/) { + my $x = getsockopt($s, SOL_SOCKET, $SO_ACCEPTFILTER); + return if ($x // "\0") =~ /[^\0]/s; # don't change if set my $accf_arg = pack('a16a240', $af_name, ''); - setsockopt($s, SOL_SOCKET, SO_ACCEPTFILTER, $accf_arg); + setsockopt($s, SOL_SOCKET, $SO_ACCEPTFILTER, $accf_arg); } } -sub daemon_loop ($$$$) { - my ($refresh, $post_accept, $tlsd, $af_default) = @_; - my %post_accept; - while (my ($k, $v) = each %tls_opt) { - if ($k =~ s!\A(?:https|imaps|nntps)://!!) { - $post_accept{$k} = tls_start_cb($v, $post_accept); - } elsif ($tlsd) { # STARTTLS, $k eq '' is OK - $tlsd->{accept_tls} = $v; +sub daemon_loop () { + local $PublicInbox::Config::DEDUPE = {}; # enable dedupe cache + my $refresh = $WORKER_SIG{HUP} = sub { + my ($sig) = @_; + %$PublicInbox::Config::DEDUPE = (); # clear cache + for my $xn (values %XNETD) { + delete $xn->{tlsd}->{ssl_ctx}; # PublicInbox::TLS::start + eval { $xn->{refresh}->($sig) }; + warn "refresh $@\n" if $@; } - } - my $sig = { - HUP => $refresh, - INT => \&worker_quit, - QUIT => \&worker_quit, - TERM => \&worker_quit, - TTIN => 'IGNORE', - TTOU => 'IGNORE', - USR1 => \&reopen_logs, - USR2 => 'IGNORE', - WINCH => 'IGNORE', - CHLD => \&PublicInbox::DS::enqueue_reap, }; - if ($worker_processes > 0) { + while (my ($k, $ctx_opt) = each %tls_opt) { + $ctx_opt // next; + my ($scheme, $l) = split(m!://!, $k, 2); + my $xn = $XNETD{$l} // die "BUG: no xnetd for $k"; + $xn->{tlsd}->{ssl_ctx_opt} //= $ctx_opt; + $scheme =~ m!\A(?:https|imaps|nntps|pop3s)! and + $POST_ACCEPT{$l} = tls_cb(@$xn{qw(post_accept tlsd)}); + } + undef %tls_opt; + if ($nworker > 0) { $refresh->(); # preload by default - my $fh = master_loop(); # returns if in child process - PublicInbox::EOFpipe->new($fh, \&worker_quit, undef); + return master_loop(); } else { reopen_logs(); $set_user->() if $set_user; - $sig->{USR2} = sub { worker_quit() if upgrade() }; + $WORKER_SIG{USR2} = sub { worker_quit() if upgrade() }; $refresh->(); } + local $PublicInbox::DS::Poller; # allow epoll/kqueue + worker_loop(); +} + +sub worker_loop { $uid = $gid = undef; reopen_logs(); + spawn_xh; # only for -W0 @listeners = map {; - my $tls_cb = $post_accept{sockname($_)}; + my $l = sockname($_); + my $tls_cb = $POST_ACCEPT{$l}; + my $xn = $XNETD{$l} // die "BUG: no xnetd for $l"; # NNTPS, HTTPS, HTTP, IMAPS and POP3S are client-first traffic # IMAP, NNTP and POP3 are server-first - defer_accept($_, $tls_cb ? 'dataready' : $af_default); + defer_accept($_, $tls_cb ? 'dataready' : $xn->{af_default}); # this calls epoll_create: - PublicInbox::Listener->new($_, $tls_cb || $post_accept) + PublicInbox::Listener->new($_, $tls_cb || $xn->{post_accept}, + $xn->{'multi-accept'}) } @listeners; - my $sigfd = PublicInbox::Sigfd->new($sig, $SFD_NONBLOCK); - local %SIG = (%SIG, %$sig) if !$sigfd; - if (!$sigfd) { - # wake up every second to accept signals if we don't - # have signalfd or IO::KQueue: - PublicInbox::Sigfd::sig_setmask($oldset); - PublicInbox::DS->SetLoopTimeout(1000); - } - PublicInbox::DS->EventLoop; + PublicInbox::DS::event_loop(\%WORKER_SIG, $oldset); } -sub run ($$$;$) { - my ($default, $refresh, $post_accept, $tlsd) = @_; - local $SIG{PIPE} = 'IGNORE'; - daemon_prepare($default); - my $af_default = $default =~ /:8080\z/ ? 'httpready' : undef; - my $for_destroy = daemonize(); - - # this wastes a bit of memory for non-PublicInbox::WWW -httpd users - # oh well... - eval { - require PublicInbox::Gcf2; - require PublicInbox::Gcf2Client; - }; - local $PublicInbox::GitAsyncCat::GCF2C = - PublicInbox::Gcf2Client::new() if !$@; - - daemon_loop($refresh, $post_accept, $tlsd, $af_default); - PublicInbox::DS->Reset; - # ->DESTROY runs when $for_destroy goes out-of-scope +sub respawn_xh { # awaitpid cb + my ($pid) = @_; + return unless @listeners; + warn "W: xap_helper PID:$pid died: \$?=$?, respawning...\n"; + spawn_xh; } -sub do_chown ($) { - my ($path) = @_; - if (defined $uid and !chown($uid, $gid, $path)) { - warn "could not chown $path: $!\n"; - } +sub run { + my ($default_listen) = @_; + $nworker = 1; + local (%XNETD, %POST_ACCEPT); + daemon_prepare($default_listen); + my $unlink_on_leave = daemonize(); + + # localize GCF2C for tests: + local $PublicInbox::GitAsyncCat::GCF2C; + local $PublicInbox::Git::async_warn = 1; + local $SIG{__WARN__} = PublicInbox::Eml::warn_ignore_cb(); + local %WORKER_SIG = %WORKER_SIG; + local $PublicInbox::XapClient::tries = 0; + local $PublicInbox::Search::XHC if defined($xh_workers); + + daemon_loop(); + # $unlink_on_leave runs } sub write_pid ($) { @@ -681,8 +746,4 @@ sub write_pid ($) { do_chown($path); } -sub DESTROY { - unlink_pid_file_safe_ish($_[0]->{pid}, ${$_[0]->{pid_file}}); -} - 1; diff --git a/lib/PublicInbox/DirIdle.pm b/lib/PublicInbox/DirIdle.pm index 458285e2..230df166 100644 --- a/lib/PublicInbox/DirIdle.pm +++ b/lib/PublicInbox/DirIdle.pm @@ -1,56 +1,99 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Used by public-inbox-watch for Maildir (and possibly MH in the future) package PublicInbox::DirIdle; -use strict; +use v5.12; use parent 'PublicInbox::DS'; -use PublicInbox::Syscall qw(EPOLLIN EPOLLET); +use PublicInbox::Syscall qw(EPOLLIN); use PublicInbox::In2Tie; -my ($MAIL_IN, $ino_cls); -if ($^O eq 'linux' && eval { require Linux::Inotify2; 1 }) { - $MAIL_IN = Linux::Inotify2::IN_MOVED_TO() | - Linux::Inotify2::IN_CREATE(); - $ino_cls = 'Linux::Inotify2'; +my ($MAIL_IN, $MAIL_GONE, $ino_cls); +if ($^O eq 'linux' && eval { require PublicInbox::Inotify; 1 }) { + $MAIL_IN = PublicInbox::Inotify::IN_MOVED_TO() | + PublicInbox::Inotify::IN_CREATE(); + $MAIL_GONE = PublicInbox::Inotify::IN_DELETE() | + PublicInbox::Inotify::IN_DELETE_SELF() | + PublicInbox::Inotify::IN_MOVE_SELF() | + PublicInbox::Inotify::IN_MOVED_FROM(); + $ino_cls = 'PublicInbox::Inotify'; # Perl 5.22+ is needed for fileno(DIRHANDLE) support: } elsif ($^V ge v5.22 && eval { require PublicInbox::KQNotify }) { $MAIL_IN = PublicInbox::KQNotify::MOVED_TO_OR_CREATE(); + $MAIL_GONE = PublicInbox::KQNotify::NOTE_DELETE() | + PublicInbox::KQNotify::NOTE_REVOKE() | + PublicInbox::KQNotify::NOTE_RENAME(); $ino_cls = 'PublicInbox::KQNotify'; } else { require PublicInbox::FakeInotify; $MAIL_IN = PublicInbox::FakeInotify::MOVED_TO_OR_CREATE(); + $MAIL_GONE = PublicInbox::FakeInotify::IN_DELETE() | + PublicInbox::FakeInotify::IN_DELETE_SELF() | + PublicInbox::FakeInotify::IN_MOVE_SELF(); } sub new { - my ($class, $dirs, $cb) = @_; + my ($class, $cb) = @_; my $self = bless { cb => $cb }, $class; my $inot; if ($ino_cls) { $inot = $ino_cls->new or die "E: $ino_cls->new: $!"; my $io = PublicInbox::In2Tie::io($inot); - $self->SUPER::new($io, EPOLLIN | EPOLLET); + $self->SUPER::new($io, EPOLLIN); } else { require PublicInbox::FakeInotify; $inot = PublicInbox::FakeInotify->new; # starts timer } - - # Linux::Inotify2->watch or similar - $inot->watch($_, $MAIL_IN) for @$dirs; $self->{inot} = $inot; - PublicInbox::FakeInotify::poll_once($self) if !$ino_cls; $self; } +sub add_watches { + my ($self, $dirs, $gone) = @_; + my $fl = $MAIL_IN | ($gone ? $MAIL_GONE : 0); + my @ret; + for my $d (@$dirs) { + my $w = $self->{inot}->watch($d, $fl) or next; + push @ret, $w; + } + PublicInbox::FakeInotify::poll_once($self) if !$ino_cls; + @ret +} + +sub rm_watches { + my ($self, $dir) = @_; + my $inot = $self->{inot}; + if (my $cb = $inot->can('rm_watches')) { # TODO for fake watchers + $cb->($inot, $dir); + } +} + +sub close { + my ($self) = @_; + delete $self->{cb}; + $self->SUPER::close; # if using real kevent/inotify +} + sub event_step { my ($self) = @_; - my $cb = $self->{cb}; - local $PublicInbox::DS::in_loop = 0; # waitpid() synchronously + my $cb = $self->{cb} or return; + local $PublicInbox::DS::in_loop = 0; # waitpid() synchronously (FIXME) eval { - my @events = $self->{inot}->read; # Linux::Inotify2->read + my @events = $self->{inot}->read; # Inotify3->read $cb->($_) for @events; }; warn "$self->{inot}->read err: $@\n" if $@; } +sub force_close { + my ($self) = @_; + my $inot = delete $self->{inot} // return; + if ($inot->can('fh')) { # Inotify3 or Linux::Inotify2 2.3+ + $inot->fh->close or warn "CLOSE ERROR: $!"; + } elsif ($inot->isa('Linux::Inotify2')) { + require PublicInbox::LI2Wrap; + PublicInbox::LI2Wrap::wrapclose($inot); + } +} + 1; diff --git a/lib/PublicInbox/DummyInbox.pm b/lib/PublicInbox/DummyInbox.pm index 69b0b683..c516eec4 100644 --- a/lib/PublicInbox/DummyInbox.pm +++ b/lib/PublicInbox/DummyInbox.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # An EXAMINE-able, PublicInbox::Inbox-like object for IMAP. Some @@ -7,16 +7,16 @@ package PublicInbox::DummyInbox; use strict; -sub created_at { 0 } # Msgmap::created_at +sub uidvalidity { 0 } # Msgmap::created_at sub mm { shift } sub uid_range { [] } # Over::uid_range sub subscribe_unlock { undef }; no warnings 'once'; -*max = \&created_at; +*max = \&uidvalidity; *query_xover = \&uid_range; *over = \&mm; -*search = *unsubscribe_unlock = +*isrch = *search = *unsubscribe_unlock = *get_art = *description = *base_url = \&subscribe_unlock; 1; diff --git a/lib/PublicInbox/EOFpipe.pm b/lib/PublicInbox/EOFpipe.pm index 489caf82..77b699a2 100644 --- a/lib/PublicInbox/EOFpipe.pm +++ b/lib/PublicInbox/EOFpipe.pm @@ -1,23 +1,24 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::EOFpipe; -use strict; +use v5.12; use parent qw(PublicInbox::DS); -use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); +use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT $F_SETPIPE_SZ); sub new { - my (undef, $rd, $cb, $arg) = @_; - my $self = bless { cb => $cb, arg => $arg }, __PACKAGE__; - # 1031: F_SETPIPE_SZ, 4096: page size - fcntl($rd, 1031, 4096) if $^O eq 'linux'; + my (undef, $rd, @cb_args) = @_; + my $self = bless { cb_args => \@cb_args }, __PACKAGE__; + # 4096: page size + fcntl($rd, $F_SETPIPE_SZ, 4096) if $F_SETPIPE_SZ; $self->SUPER::new($rd, EPOLLIN|EPOLLONESHOT); } sub event_step { my ($self) = @_; if ($self->do_read(my $buf, 1) == 0) { # auto-closed - $self->{cb}->($self->{arg}); + my ($cb, @args) = @{delete $self->{cb_args}}; + $cb->(@args); } } diff --git a/lib/PublicInbox/Emergency.pm b/lib/PublicInbox/Emergency.pm index b705e776..968d7d6f 100644 --- a/lib/PublicInbox/Emergency.pm +++ b/lib/PublicInbox/Emergency.pm @@ -1,97 +1,78 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Emergency Maildir delivery for MDA package PublicInbox::Emergency; -use strict; -use warnings; +use v5.12; use Fcntl qw(:DEFAULT SEEK_SET); use Sys::Hostname qw(hostname); -use IO::Handle; # ->flush, ->autoflush +use IO::Handle; # ->flush +use Errno qw(EEXIST); +use File::Path (); sub new { my ($class, $dir) = @_; - - foreach (qw(new tmp cur)) { - my $d = "$dir/$_"; - next if -d $d; - require File::Path; - if (!File::Path::mkpath($d) && !-d $d) { - die "failed to mkpath($d): $!\n"; - } - } - bless { dir => $dir, files => {}, t => 0, cnt => 0, pid => $$ }, $class; + File::Path::make_path(map { $dir.$_ } qw(/tmp /new /cur)); + bless { dir => $dir, t => 0 }, $class; } sub _fn_in { - my ($self, $dir) = @_; - my @host = split(/\./, hostname); + my ($self, $pid, $dir) = @_; + my $host = $self->{-host} //= (split(/\./, hostname))[0] // 'localhost'; my $now = time; + my $n; if ($self->{t} != $now) { $self->{t} = $now; - $self->{cnt} = 0; + $n = $self->{cnt} = 0; } else { - $self->{cnt}++; + $n = ++$self->{cnt}; } - - my $f; - do { - $f = "$self->{dir}/$dir/$self->{t}.$$"."_$self->{cnt}.$host[0]"; - $self->{cnt}++; - } while (-e $f); - $f; + "$self->{dir}/$dir/$self->{t}.$pid"."_$n.$host"; } sub prepare { my ($self, $strref) = @_; - - die "already in transaction: $self->{tmp}" if $self->{tmp}; + my $pid = $$; + my $tmp_key = "tmp.$pid"; + die "BUG: in transaction: $self->{$tmp_key}" if $self->{$tmp_key}; my ($tmp, $fh); do { - $tmp = _fn_in($self, 'tmp'); + $tmp = _fn_in($self, $pid, 'tmp'); $! = undef; - } while (!sysopen($fh, $tmp, O_CREAT|O_EXCL|O_RDWR) && $!{EEXIST}); - print $fh $$strref or die "write failed: $!"; - $fh->flush or die "flush failed: $!"; - $fh->autoflush(1); + } while (!sysopen($fh, $tmp, O_CREAT|O_EXCL|O_RDWR) and $! == EEXIST); + print $fh $$strref or die "print: $!"; + $fh->flush or die "flush: $!"; $self->{fh} = $fh; - $self->{tmp} = $tmp; + $self->{$tmp_key} = $tmp; } sub abort { my ($self) = @_; delete $self->{fh}; - my $tmp = delete $self->{tmp} or return; - - unlink($tmp) or warn "Failed to unlink $tmp: $!"; + my $tmp = delete $self->{"tmp.$$"} or return; + unlink($tmp) or warn "W: unlink($tmp): $!"; undef; } sub fh { my ($self) = @_; - my $fh = $self->{fh} or die "{fh} not open!\n"; - seek($fh, 0, SEEK_SET) or die "seek(fh) failed: $!"; - sysseek($fh, 0, SEEK_SET) or die "sysseek(fh) failed: $!"; + my $fh = $self->{fh} or die "BUG: {fh} not open"; + seek($fh, 0, SEEK_SET) or die "seek: $!"; + sysseek($fh, 0, SEEK_SET) or die "sysseek: $!"; $fh; } sub commit { my ($self) = @_; - $$ == $self->{pid} or return; # no-op in forked child - + my $pid = $$; + my $tmp = delete $self->{"tmp.$pid"} or return; delete $self->{fh}; - my $tmp = delete $self->{tmp} or return; - my $new; + my ($new, $ok); do { - $new = _fn_in($self, 'new'); - } while (!link($tmp, $new) && $!{EEXIST}); - my @sn = stat($new) or die "stat $new failed: $!"; - my @st = stat($tmp) or die "stat $tmp failed: $!"; - if ($st[0] == $sn[0] && $st[1] == $sn[1]) { - unlink($tmp) or warn "Failed to unlink $tmp: $!"; - } else { - warn "stat($new) and stat($tmp) differ"; - } + $new = _fn_in($self, $pid, 'new'); + } while (!($ok = link($tmp, $new)) && $! == EEXIST); + die "link($tmp, $new): $!" unless $ok; + unlink($tmp) or warn "W: unlink($tmp): $!"; } sub DESTROY { commit($_[0]) } diff --git a/lib/PublicInbox/Eml.pm b/lib/PublicInbox/Eml.pm index 571edc5c..d59d7c3f 100644 --- a/lib/PublicInbox/Eml.pm +++ b/lib/PublicInbox/Eml.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Lazy MIME parser, it still slurps the full message but keeps short @@ -28,7 +28,7 @@ package PublicInbox::Eml; use strict; use v5.10.1; use Carp qw(croak); -use Encode qw(find_encoding decode encode); # stdlib +use Encode qw(find_encoding); # stdlib use Text::Wrap qw(wrap); # stdlib, we need Perl 5.6+ for $huge use MIME::Base64 3.05; # Perl 5.10.0 / 5.9.2 use MIME::QuotedPrint 3.05; # ditto @@ -51,7 +51,9 @@ $MIME_ENC{quotedprint} = $MIME_ENC{'quoted-printable'} = $MIME_ENC{qp}; $MIME_DEC{quotedprint} = $MIME_DEC{'quoted-printable'} = $MIME_DEC{qp}; $MIME_ENC{$_} = \&identity_codec for qw(7bit 8bit binary); -my %DECODE_ADDRESS = map { $_ => 1 } qw(From To Cc Sender Reply-To); +my %DECODE_ADDRESS = map { + ($_ => 1, "Resent-$_" => 1) +} qw(From To Cc Sender Reply-To Bcc); my %DECODE_FULL = ( Subject => 1, 'Content-Description' => 1, @@ -120,9 +122,10 @@ sub new { my $hdr = substr($$ref, 0, $header_size_limit + 1); hdr_truncate($hdr) if length($hdr) > $header_size_limit; bless { hdr => \$hdr, crlf => $1 }, __PACKAGE__; - } else { # nothing useful - my $hdr = $$ref = ''; - bless { hdr => \$hdr, crlf => "\n" }, __PACKAGE__; + } else { # just a body w/o header? + my $hdr = ''; + my $eol = ($$ref =~ /(\r?\n)/) ? $1 : "\n"; + bless { hdr => \$hdr, crlf => $eol, bdy => $ref }, __PACKAGE__; } } @@ -141,6 +144,7 @@ sub header_raw { my $re = re_memo($_[1]); my @v = (${ $_[0]->{hdr} } =~ /$re/g); for (@v) { + utf8::decode($_); # SMTPUTF8 # for compatibility w/ Email::Simple::Header, s/\s+\z//s; s/\A\s+//s; @@ -217,7 +221,7 @@ sub mp_descend ($$) { # There's also a case where quoted text showed up in the # preamble # <20060515162817.65F0F1BBAE@citi.umich.edu> - unshift(@parts, new_sub(undef, \$pre)) if $pre =~ /:/s; + unshift(@parts, new_sub(undef, \$pre)) if index($pre, ':') >= 0; return \@parts; } # "multipart", but no boundary found, treat as single part @@ -234,6 +238,7 @@ sub mp_descend ($$) { # $cb - user-supplied callback sub # $arg - user-supplied arg (think pthread_create) # $once - unref body scalar during iteration +# $all - used by IMAP server, only sub each_part { my ($self, $cb, $arg, $once, $all) = @_; my $p = mp_descend($self, $once // 0) or @@ -329,11 +334,24 @@ sub body_set { undef; } +# workaround https://rt.cpan.org/Public/Bug/Display.html?id=139622 +# Encode 2.87..3.12 leaks on croak, so we defer and croak ourselves +our @enc_warn; +my $enc_warn = sub { push @enc_warn, @_ }; + sub body_str_set { - my ($self, $body_str) = @_; - my $charset = ct($self)->{attributes}->{charset} or - Carp::confess('body_str was given, but no charset is defined'); - body_set($self, \(encode($charset, $body_str, Encode::FB_CROAK))); + my ($self, $str) = @_; + my $cs = ct($self)->{attributes}->{charset} // + croak('body_str was given, but no charset is defined'); + my $enc = find_encoding($cs) // croak "unknown encoding `$cs'"; + my $tmp; + { + local @enc_warn; + local $SIG{__WARN__} = $enc_warn; + $tmp = $enc->encode($str, Encode::FB_WARN); + croak(@enc_warn) if @enc_warn; + }; + body_set($self, \$tmp); } sub content_type { scalar header($_[0], 'Content-Type') } @@ -347,14 +365,15 @@ sub header_set { $pfx .= ': '; my $len = 78 - length($pfx); @vals = map {; + utf8::encode(my $v = $_); # to bytes, support SMTPUTF8 # folding differs from Email::Simple::Header, # we favor tabs for visibility (and space savings :P) if (length($_) >= $len && (/\n[^ \t]/s || !/\n/s)) { local $Text::Wrap::columns = $len; local $Text::Wrap::huge = 'overflow'; - $pfx . wrap('', "\t", $_) . $self->{crlf}; + $pfx . wrap('', "\t", $v) . $self->{crlf}; } else { - $pfx . $_ . $self->{crlf}; + $pfx . $v . $self->{crlf}; } } @vals; $$hdr =~ s!$re!shift(@vals) // ''!ge; # replace current headers, first @@ -378,7 +397,9 @@ sub header_str_set { header_set($self, $name, @vals); } -sub mhdr_decode ($) { eval { $MIME_Header->decode($_[0]) } // $_[0] } +sub mhdr_decode ($) { + eval { $MIME_Header->decode($_[0], Encode::FB_DEFAULT) } // $_[0]; +} sub filename { my $dis = header_raw($_[0], 'Content-Disposition'); @@ -447,15 +468,19 @@ sub body { sub body_str { my ($self) = @_; my $ct = ct($self); - my $charset = $ct->{attributes}->{charset}; - if (!$charset) { - if ($STR_TYPE{$ct->{type}} && $STR_SUBTYPE{$ct->{subtype}}) { + my $cs = $ct->{attributes}->{charset} // do { + ($STR_TYPE{$ct->{type}} && $STR_SUBTYPE{$ct->{subtype}}) and return body($self); - } - Carp::confess("can't get body as a string for ", + croak("can't get body as a string for ", join("\n\t", header_raw($self, 'Content-Type'))); - } - decode($charset, body($self), Encode::FB_CROAK); + }; + my $enc = find_encoding($cs) or croak "unknown encoding `$cs'"; + my $ret = body($self); + local @enc_warn; + local $SIG{__WARN__} = $enc_warn; + $ret = $enc->decode($ret, Encode::FB_WARN); + croak(@enc_warn) if @enc_warn; + $ret; } sub as_string { @@ -475,9 +500,42 @@ sub charset_set { sub crlf { $_[0]->{crlf} // "\n" } +sub raw_size { + my ($self) = @_; + my $len = length(${$self->{hdr}}); + defined($self->{bdy}) and + $len += length(${$self->{bdy}}) + length($self->{crlf}); + $len; +} + +# warnings to ignore when handling spam mailboxes and maybe other places +sub warn_ignore { + my $s = "@_"; + # Email::Address::XS warnings + $s =~ /^Argument contains empty / + || $s =~ /^Element at index [0-9]+.*? contains / + # PublicInbox::MsgTime + || $s =~ /^bogus TZ offset: .+?, ignoring and assuming \+0000/ + || $s =~ /^bad Date: .+? in / + # Encode::Unicode::UTF7 + || $s =~ /^Bad UTF7 data escape at / +} + +# this expects to be RHS in this assignment: "local $SIG{__WARN__} = ..." +sub warn_ignore_cb { + my $cb = $SIG{__WARN__} // \&CORE::warn; + sub { $cb->(@_) unless warn_ignore(@_) } +} + sub willneed { re_memo($_) for @_ } willneed(qw(From To Cc Date Subject Content-Type In-Reply-To References Message-ID X-Alt-Message-ID)); +# This fixes an old bug from import (pre-a0c07cba0e5d8b6a) +# mutt also pipes single RFC822 messages with a "From " line, +# but no Content-Length or "From " escaping. +# "git format-patch" also generates such files by default. +sub strip_from { $_[0] =~ s/\A[\r\n]*From [^\n]*\n//s } + 1; diff --git a/lib/PublicInbox/EmlContentFoo.pm b/lib/PublicInbox/EmlContentFoo.pm index c163eaf5..80fc7364 100644 --- a/lib/PublicInbox/EmlContentFoo.pm +++ b/lib/PublicInbox/EmlContentFoo.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> # Copyright (C) 2004- Simon Cozens, Casey West, Ricardo SIGNES # This library is free software; you can redistribute it and/or modify # it under the same terms as Perl itself. diff --git a/lib/PublicInbox/Epoll.pm b/lib/PublicInbox/Epoll.pm new file mode 100644 index 00000000..7e0aa6e7 --- /dev/null +++ b/lib/PublicInbox/Epoll.pm @@ -0,0 +1,26 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# OO API for epoll +package PublicInbox::Epoll; +use v5.12; +use PublicInbox::Syscall qw(epoll_create epoll_ctl epoll_wait + EPOLL_CTL_ADD EPOLL_CTL_MOD EPOLL_CTL_DEL); +use Fcntl qw(F_SETFD FD_CLOEXEC); +use autodie qw(open fcntl); + +sub new { + open(my $fh, '+<&=', epoll_create()); + fcntl($fh, F_SETFD, FD_CLOEXEC); + bless \$fh, __PACKAGE__; +} + +sub ep_add { epoll_ctl(fileno(${$_[0]}), EPOLL_CTL_ADD, fileno($_[1]), $_[2]) } +sub ep_mod { epoll_ctl(fileno(${$_[0]}), EPOLL_CTL_MOD, fileno($_[1]), $_[2]) } +sub ep_del { epoll_ctl(fileno(${$_[0]}), EPOLL_CTL_DEL, fileno($_[1]), 0) } + +# n.b. maxevents=1000 is the historical default. maxevents=1 (yes, one) +# is more fair under load with multiple worker processes sharing one listener +sub ep_wait { epoll_wait(fileno(${$_[0]}), 1000, @_[1, 2]) } + +1; diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm index 03faf3a1..95feb885 100644 --- a/lib/PublicInbox/ExtMsg.pm +++ b/lib/PublicInbox/ExtMsg.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Used by the web interface to link to messages outside of the our @@ -11,7 +11,7 @@ use warnings; use PublicInbox::Hval qw(ascii_html prurl mid_href); use PublicInbox::WwwStream qw(html_oneshot); use PublicInbox::Smsg; -our $MIN_PARTIAL_LEN = 16; +our $MIN_PARTIAL_LEN = 14; # for 'XXXXXXXXXX.fsf' msgids gnus generates # TODO: user-configurable our @EXT_URL = map { ascii_html($_) } ( @@ -32,8 +32,8 @@ sub PARTIAL_MAX () { 100 } sub search_partial ($$) { my ($ibx, $mid) = @_; return if length($mid) < $MIN_PARTIAL_LEN; - my $srch = $ibx->search or return; - my $opt = { limit => PARTIAL_MAX, mset => 2 }; + my $srch = $ibx->isrch or return; + my $opt = { limit => PARTIAL_MAX, relevance => -1 }; my @try = ("m:$mid*"); my $chop = $mid; if ($chop =~ s/(\W+)(\w*)\z//) { @@ -76,7 +76,7 @@ sub search_partial ($$) { sub ext_msg_i { my ($other, $ctx) = @_; - return if $other->{name} eq $ctx->{-inbox}->{name} || !$other->base_url; + return if $other->{name} eq $ctx->{ibx}->{name} || !$other->base_url; my $mm = $other->mm or return; @@ -103,25 +103,54 @@ sub ext_msg_step { } } +sub ext_msg_ALL ($) { + my ($ctx) = @_; + my $ALL = $ctx->{www}->{pi_cfg}->ALL or return; + my $by_eidx_key = $ctx->{www}->{pi_cfg}->{-by_eidx_key}; + my $cur_key = eval { $ctx->{ibx}->eidx_key } // + return partial_response($ctx); # $cur->{ibx} == $ALL + my %seen = ($cur_key => 1); + my ($id, $prev); + while (my $x = $ALL->over->next_by_mid($ctx->{mid}, \$id, \$prev)) { + my $xr3 = $ALL->over->get_xref3($x->{num}); + for my $k (@$xr3) { + $k =~ s/:[0-9]+:$x->{blob}\z// or next; + next if $k eq $cur_key; + my $ibx = $by_eidx_key->{$k} // next; + $ibx->base_url or next; + push(@{$ctx->{found}}, $ibx) unless $seen{$k}++; + } + } + return exact($ctx) if $ctx->{found}; + + # fall back to partial MID matching + for my $ibxish ($ctx->{ibx}, $ALL) { + my $mids = search_partial($ibxish, $ctx->{mid}) or next; + push @{$ctx->{partial}}, [ $ibxish, $mids ]; + last if ($ctx->{n_partial} += scalar(@$mids)) >= PARTIAL_MAX; + } + partial_response($ctx); +} + sub ext_msg { my ($ctx) = @_; - sub { + ext_msg_ALL($ctx) // sub { $ctx->{-wcb} = $_[0]; # HTTP server write callback if ($ctx->{env}->{'pi-httpd.async'}) { require PublicInbox::ConfigIter; my $iter = PublicInbox::ConfigIter->new( - $ctx->{www}->{pi_config}, + $ctx->{www}->{pi_cfg}, \&ext_msg_step, $ctx); $iter->event_step; } else { - $ctx->{www}->{pi_config}->each_inbox(\&ext_msg_i, $ctx); + $ctx->{www}->{pi_cfg}->each_inbox(\&ext_msg_i, $ctx); finalize_exact($ctx); } }; } -# called via PublicInbox::DS->EventLoop +# called via PublicInbox::DS::event_loop sub event_step { my ($ctx, $sync) = @_; # can't find a partial match in current inbox, try the others: @@ -141,7 +170,7 @@ sub finalize_exact { # fall back to partial MID matching my $mid = $ctx->{mid}; - my $cur = $ctx->{-inbox}; + my $cur = $ctx->{ibx}; my $mids = search_partial($cur, $mid); if ($mids) { $ctx->{n_partial} = scalar(@$mids); @@ -159,7 +188,13 @@ sub finalize_exact { finalize_partial($ctx); } -sub finalize_partial { +sub _url_pfx ($$) { + my ($ctx, $u) = @_; + (index($u, '://') < 0 && index($u, '/') != 0) ? + "$ctx->{-upfx}../$u" : $u; +} + +sub partial_response ($) { my ($ctx) = @_; my $mid = $ctx->{mid}; my $code = 404; @@ -167,16 +202,17 @@ sub finalize_partial { my $html = ascii_html($mid); my $title = "<$html> not found"; my $s = "<pre>Message-ID <$html>\nnot found\n"; + $ctx->{-upfx} //= '../'; if (my $n_partial = $ctx->{n_partial}) { $code = 300; my $es = $n_partial == 1 ? '' : 'es'; $n_partial .= '+' if ($n_partial == PARTIAL_MAX); $s .= "\n$n_partial partial match$es found:\n\n"; - my $cur_name = $ctx->{-inbox}->{name}; + my $cur_name = $ctx->{ibx}->{name}; foreach my $pair (@{$ctx->{partial}}) { my ($ibx, $res) = @$pair; - my $env = $ctx->{env} if $ibx->{name} eq $cur_name; - my $u = $ibx->base_url($env) or next; + my $e = $ibx->{name} eq $cur_name ? $ctx->{env} : undef; + my $u = _url_pfx($ctx, $ibx->base_url($e) // next); foreach my $m (@$res) { my $href = mid_href($m); my $html = ascii_html($m); @@ -191,10 +227,11 @@ sub finalize_partial { } $ctx->{-html_tip} = $s .= '</pre>'; $ctx->{-title_html} = $title; - $ctx->{-upfx} = '../'; - $ctx->{-wcb}->(html_oneshot($ctx, $code)); + html_oneshot($ctx, $code); } +sub finalize_partial ($) { $_[0]->{-wcb}->(partial_response($_[0])) } + sub ext_urls { my ($ctx, $mid, $href, $html) = @_; @@ -222,13 +259,13 @@ sub exact { my $title = "<$html> found in "; my $end = @$found == 1 ? 'another inbox' : 'other inboxes'; $ctx->{-title_html} = $title . $end; - $ctx->{-upfx} = '../'; + $ctx->{-upfx} //= '../'; my $ext_urls = ext_urls($ctx, $mid, $href, $html); my $code = (@$found == 1 && $ext_urls eq '') ? 200 : 300; $ctx->{-html_tip} = join('', "<pre>Message-ID: <$html>\nfound in $end:\n\n", (map { - my $u = $_->base_url; + my $u = _url_pfx($ctx, $_->base_url); qq(<a\nhref="$u$href/">$u$html/</a>\n) } @$found), $ext_urls, '</pre>'); diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm new file mode 100644 index 00000000..d43c23e6 --- /dev/null +++ b/lib/PublicInbox/ExtSearch.pm @@ -0,0 +1,134 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Read-only external (detached) index for cross inbox search. +# This is a read-only counterpart to PublicInbox::ExtSearchIdx +# and behaves like PublicInbox::Inbox AND PublicInbox::Search +package PublicInbox::ExtSearch; +use strict; +use v5.10.1; +use PublicInbox::Over; +use PublicInbox::Inbox; +use PublicInbox::MiscSearch; +use DBI qw(:sql_types); # SQL_BLOB + +# for ->reopen, ->mset, ->mset_to_artnums +use parent qw(PublicInbox::Search); + +sub new { + my ($class, $topdir) = @_; + bless { + topdir => $topdir, + -primary_address => 'unknown@example.com', + # xpfx => 'ei15' + xpfx => "$topdir/ei".PublicInbox::Search::SCHEMA_VERSION + }, $class; +} + +sub misc { + my ($self) = @_; + $self->{misc} //= PublicInbox::MiscSearch->new("$self->{xpfx}/misc"); +} + +# same as per-inbox ->over, for now... +sub over { + my ($self) = @_; + $self->{over} // eval { + PublicInbox::Inbox::_cleanup_later($self); + my $over = PublicInbox::Over->new("$self->{xpfx}/over.sqlite3"); + $over->dbh; # may die + $self->{over} = $over; + }; +} + +sub git { + my ($self) = @_; + $self->{git} //= do { + PublicInbox::Inbox::_cleanup_later($self); + PublicInbox::Git->new("$self->{topdir}/ALL.git"); + }; +} + +# returns a hashref of { $NEWSGROUP_NAME => $ART_NO } using the `xref3' table +sub nntp_xref_for { # NNTP only + my ($self, $xibx, $xsmsg) = @_; + my $dbh = over($self)->dbh; + + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1 + + $sth->execute($xibx->{newsgroup}); + my $xibx_id = $sth->fetchrow_array // do { + warn "W: `$xibx->{newsgroup}' not found in $self->{topdir}\n"; + return; + }; + + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT docid FROM xref3 WHERE oidbin = ? AND xnum = ? AND ibx_id = ? LIMIT 1 + + $sth->bind_param(1, $xsmsg->oidbin, SQL_BLOB); + + # NNTP::cmd_over can set {num} to zero according to RFC 3977 8.3.2 + $sth->bind_param(2, $xsmsg->{num} || $xsmsg->{-orig_num}); + $sth->bind_param(3, $xibx_id); + $sth->execute; + my $docid = $sth->fetchrow_array // do { + warn <<EOF; +W: `$xibx->{newsgroup}:$xsmsg->{num}' not found in $self->{topdir}" +EOF + return; + }; + + # LIMIT is number of newsgroups on server: + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id,xnum FROM xref3 WHERE docid = ? AND ibx_id != ? + + $sth->execute($docid, $xibx_id); + my $rows = $sth->fetchall_arrayref; + + my $eidx_key_sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1 + + my %xref = map { + my ($ibx_id, $xnum) = @$_; + + $eidx_key_sth->execute($ibx_id); + my $eidx_key = $eidx_key_sth->fetchrow_array; + + # only include if there's a newsgroup name + $eidx_key && index($eidx_key, '/') >= 0 ? + () : ($eidx_key => $xnum) + } @$rows; + $xref{$xibx->{newsgroup}} = $xsmsg->{num}; + \%xref; +} + +sub mm { undef } + +sub altid_map { {} } + +sub description { + my ($self) = @_; + ($self->{description} //= + PublicInbox::Git::cat_desc("$self->{topdir}/description")) // + '$EXTINDEX_DIR/description missing'; +} + +sub search { + PublicInbox::Inbox::_cleanup_later($_[0]); + $_[0]; +} + +sub thing_type { 'external index' } + +no warnings 'once'; +*base_url = \&PublicInbox::Inbox::base_url; +*smsg_eml = \&PublicInbox::Inbox::smsg_eml; +*smsg_by_mid = \&PublicInbox::Inbox::smsg_by_mid; +*msg_by_mid = \&PublicInbox::Inbox::msg_by_mid; +*modified = \&PublicInbox::Inbox::modified; + +*max_git_epoch = *nntp_usable = *msg_by_path = \&mm; # undef +*isrch = \&search; + +1; diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm new file mode 100644 index 00000000..883dbea3 --- /dev/null +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -0,0 +1,1429 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Detached/external index cross inbox search indexing support +# read-write counterpart to PublicInbox::ExtSearch +# +# It's based on the same ideas as public-inbox-v2-format(5) using +# over.sqlite3 for dedupe and sharded Xapian. msgmap.sqlite3 is +# missing, so there is no Message-ID conflict resolution, meaning +# no NNTP support for now. +# +# v2 has a 1:1 mapping of index:inbox or msgmap for NNTP support. +# This is intended to be an M:N index:inbox mapping, but it'll likely +# be 1:N in common practice (M==1) + +package PublicInbox::ExtSearchIdx; +use strict; +use v5.10.1; +use parent qw(PublicInbox::ExtSearch PublicInbox::Umask PublicInbox::Lock); +use Carp qw(croak carp); +use Scalar::Util qw(blessed); +use Sys::Hostname qw(hostname); +use File::Glob qw(bsd_glob GLOB_NOSORT); +use PublicInbox::MultiGit; +use PublicInbox::Spawn (); +use PublicInbox::Search; +use PublicInbox::SearchIdx qw(prepare_stack is_ancestor is_bad_blob); +use PublicInbox::OverIdx; +use PublicInbox::MiscIdx; +use PublicInbox::MID qw(mids); +use PublicInbox::V2Writable; +use PublicInbox::InboxWritable; +use PublicInbox::ContentHash qw(content_hash); +use PublicInbox::Eml; +use PublicInbox::DS qw(now add_timer); +use DBI qw(:sql_types); # SQL_BLOB +use PublicInbox::Admin qw(fmt_localtime); + +sub new { + my (undef, $dir, $opt) = @_; + my $l = $opt->{indexlevel} // 'full'; + $l !~ $PublicInbox::SearchIdx::INDEXLEVELS and + die "invalid indexlevel=$l\n"; + $l eq 'basic' and die "E: indexlevel=basic not yet supported\n"; + my $self = bless { + xpfx => "$dir/ei".PublicInbox::Search::SCHEMA_VERSION, + topdir => $dir, + creat => $opt->{creat}, + ibx_map => {}, # (newsgroup//inboxdir) => $ibx + ibx_active => [], # by config section order + ibx_known => [], # by config section order + indexlevel => $l, + transact_bytes => 0, + total_bytes => 0, + current_info => '', + parallel => 1, + lock_path => "$dir/ei.lock", + }, __PACKAGE__; + $self->{shards} = $self->count_shards || + nproc_shards({ nproc => $opt->{jobs} }); + my $oidx = PublicInbox::OverIdx->new("$self->{xpfx}/over.sqlite3"); + $self->{-no_fsync} = $oidx->{-no_fsync} = 1 if !$opt->{fsync}; + $self->{-dangerous} = 1 if $opt->{dangerous}; + $self->{oidx} = $oidx; + $self +} + +sub attach_inbox { + my ($self, $ibx, $types) = @_; + $self->{ibx_map}->{$ibx->eidx_key} //= do { + delete $self->{-ibx_ary_known}; # invalidate cache + delete $self->{-ibx_ary_active}; # invalidate cache + $types //= [ qw(active known) ]; + for my $t (@$types) { + push @{$self->{"ibx_$t"}}, $ibx; + } + $ibx; + } +} + +sub _ibx_attach { # each_inbox callback + my ($ibx, $self, $types) = @_; + attach_inbox($self, $ibx, $types); +} + +sub attach_config { + my ($self, $cfg, $ibxs) = @_; + $self->{cfg} = $cfg; + my $types; + if ($ibxs) { + for my $ibx (@$ibxs) { + $self->{ibx_map}->{$ibx->eidx_key} //= do { + push @{$self->{ibx_active}}, $ibx; + push @{$self->{ibx_known}}, $ibx; + $ibx; + } + } + # invalidate cache + delete $self->{-ibx_ary_known}; + delete $self->{-ibx_ary_active}; + $types = [ 'known' ]; + } + $types //= [ qw(known active) ]; + $cfg->each_inbox(\&_ibx_attach, $self, $types); +} + +sub check_batch_limit ($) { + my ($req) = @_; + my $self = $req->{self}; + my $new_smsg = $req->{new_smsg}; + my $n = $self->{transact_bytes} += $new_smsg->{bytes}; + + # set flag for PublicInbox::V2Writable::index_todo: + ${$req->{need_checkpoint}} = 1 if $n >= $self->{batch_bytes}; +} + +sub bad_ibx_id ($$;$) { + my ($self, $ibx_id, $cb) = @_; + my $msg = "E: bad/stale ibx_id=#$ibx_id encountered"; + my $ekey = $self->{oidx}->dbh->selectrow_array(<<EOM, undef, $ibx_id); +SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1 +EOM + $msg .= " (formerly `$ekey')" if defined $ekey; + $cb //= \&carp; + $cb->($msg, "\nE: running $0 --gc may be required"); +} + +sub check_xr3 ($$$) { + my ($self, $id2pos, $xr3) = @_; + @$xr3 = grep { + defined($id2pos->{$_->[0]}) ? 1 : bad_ibx_id($self, $_->[0]) + } @$xr3; +} + +sub apply_boost ($$) { + my ($req, $smsg) = @_; + my $id2pos = $req->{id2pos}; # index in ibx_sorted + my $xr3 = $req->{self}->{oidx}->get_xref3($smsg->{num}, 1); + check_xr3($req->{self}, $id2pos, $xr3); + @$xr3 = sort { # sort ascending + $id2pos->{$a->[0]} <=> $id2pos->{$b->[0]} + || + $a->[1] <=> $b->[1] # break ties with {xnum} + } @$xr3; + my $new_smsg = $req->{new_smsg}; + return if $xr3->[0]->[2] ne $new_smsg->oidbin; # loser + + # replace the old smsg with the more boosted one + $new_smsg->{num} = $smsg->{num}; + $new_smsg->populate($req->{eml}, $req); + $req->{self}->{oidx}->add_overview($req->{eml}, $new_smsg); +} + +sub remove_doc ($$) { + my ($self, $docid) = @_; + $self->{oidx}->delete_by_num($docid); + $self->{oidx}->eidxq_del($docid); + $self->idx_shard($docid)->ipc_do('xdb_remove', $docid); +} + +sub _unref_doc ($$$$$;$) { + my ($sync, $docid, $ibx, $xnum, $oidbin, $eml) = @_; + my $smsg; + if (ref($docid)) { + $smsg = $docid; + $docid = $smsg->{num}; + } + if (defined($oidbin) && defined($xnum) && blessed($ibx) && $ibx->over) { + my $smsg = $ibx->over->get_art($xnum); + if ($smsg && $smsg->oidbin eq $oidbin) { + carp("BUG: (non-fatal) ".$ibx->eidx_key. + " #$xnum $smsg->{blob} still valid"); + return; + } + } + my $s = 'DELETE FROM xref3 WHERE oidbin = ?'; + $s .= ' AND ibx_id = ?' if defined($ibx); + $s .= ' AND xnum = ?' if defined($xnum); + my $del = $sync->{self}->{oidx}->dbh->prepare_cached($s); + my $col = 0; + $del->bind_param(++$col, $oidbin, SQL_BLOB); + $del->bind_param(++$col, $ibx->{-ibx_id}) if $ibx; + $del->bind_param(++$col, $xnum) if defined($xnum); + $del->execute; + my $xr3 = $sync->{self}->{oidx}->get_xref3($docid); + if (scalar(@$xr3) == 0) { # all gone + remove_doc($sync->{self}, $docid); + } else { # enqueue for reindex of remaining messages + if ($ibx) { + my $ekey = $ibx->{-gc_eidx_key} // $ibx->eidx_key; + my $idx = $sync->{self}->idx_shard($docid); + $idx->ipc_do('remove_eidx_info', $docid, $ekey, $eml); + } # else: we can't remove_eidx_info in reindex-only path + + # replace invalidated blob ASAP with something which should be + # readable since we may commit the transaction on checkpoint. + # eidxq processing will re-apply boost + $smsg //= $sync->{self}->{oidx}->get_art($docid); + my $hex = unpack('H*', $oidbin); + if ($smsg && $smsg->{blob} eq $hex) { + $xr3->[0] =~ /:([a-f0-9]{40,}+)\z/ or + die "BUG: xref $xr3->[0] has no OID"; + $sync->{self}->{oidx}->update_blob($smsg, $1); + } + # yes, add, we'll need to re-apply boost + $sync->{self}->{oidx}->eidxq_add($docid); + } + @$xr3 +} + +sub do_xpost ($$) { + my ($req, $smsg) = @_; + my $self = $req->{self}; + my $docid = $smsg->{num}; + my $oid = $req->{oid}; + my $xibx = $req->{ibx}; + my $eml = $req->{eml}; + if (my $new_smsg = $req->{new_smsg}) { # 'm' on cross-posted message + my $eidx_key = $xibx->eidx_key; + my $xnum = $req->{xnum}; + $self->{oidx}->add_xref3($docid, $xnum, $oid, $eidx_key); + my $idx = $self->idx_shard($docid); + $idx->ipc_do('add_eidx_info', $docid, $eidx_key, $eml); + apply_boost($req, $smsg) if $req->{boost_in_use}; + } else { # 'd' no {xnum} + $self->git->async_wait_all; + $oid = pack('H*', $oid); + _unref_doc($req, $docid, $xibx, undef, $oid, $eml); + } +} + +# called by V2Writable::sync_prepare +sub artnum_max { $_[0]->{oidx}->eidx_max } + +sub index_unseen ($) { + my ($req) = @_; + my $new_smsg = $req->{new_smsg} or die 'BUG: {new_smsg} unset'; + my $eml = delete $req->{eml}; + $new_smsg->populate($eml, $req); + my $self = $req->{self}; + my $docid = $self->{oidx}->adj_counter('eidx_docid', '+'); + $new_smsg->{num} = $docid; + my $idx = $self->idx_shard($docid); + $self->{oidx}->add_overview($eml, $new_smsg); + my $oid = $new_smsg->{blob}; + my $ibx = delete $req->{ibx} or die 'BUG: {ibx} unset'; + $self->{oidx}->add_xref3($docid, $req->{xnum}, $oid, $ibx->eidx_key); + $idx->index_eml($eml, $new_smsg, $ibx->eidx_key); + check_batch_limit($req); +} + +sub do_finalize ($) { + my ($req) = @_; + if (my $indexed = $req->{indexed}) { # duplicated messages + do_xpost($req, $_) for @$indexed; + } elsif (exists $req->{new_smsg}) { # totally unseen messsage + index_unseen($req); + } else { + # `d' message was already unindexed in the v1/v2 inboxes, + # so it's too noisy to warn, here. + } + # cur_cmt may be undef for unindex_oid, set by V2Writable::index_todo + if (defined(my $cur_cmt = $req->{cur_cmt})) { + ${$req->{latest_cmt}} = $cur_cmt; + } +} + +sub do_step ($) { # main iterator for adding messages to the index + my ($req) = @_; + my $self = $req->{self} // die 'BUG: {self} missing'; + while (1) { + if (my $next_arg = $req->{next_arg}) { + if (my $smsg = $self->{oidx}->next_by_mid(@$next_arg)) { + $req->{cur_smsg} = $smsg; + $self->git->cat_async($smsg->{blob}, + \&ck_existing, $req); + return; # ck_existing calls do_step + } + delete $req->{next_arg}; + } + die "BUG: {cur_smsg} still set" if $req->{cur_smsg}; + my $mid = shift(@{$req->{mids}}) // last; + my ($id, $prev); + $req->{next_arg} = [ $mid, \$id, \$prev ]; + # loop again + } + do_finalize($req); +} + +sub _blob_missing ($$) { # called when a known $smsg->{blob} is gone + my ($req, $smsg) = @_; + # xnum and ibx are unknown, we only call this when an entry from + # /ei*/over.sqlite3 is bad, not on entries from xap*/over.sqlite3 + $req->{self}->git->async_wait_all; + _unref_doc($req, $smsg, undef, undef, $smsg->oidbin); +} + +sub ck_existing { # git->cat_async callback + my ($bref, $oid, $type, $size, $req) = @_; + my $smsg = delete $req->{cur_smsg} or die 'BUG: {cur_smsg} missing'; + if ($type eq 'missing') { + _blob_missing($req, $smsg); + } elsif (!is_bad_blob($oid, $type, $size, $smsg->{blob})) { + my $self = $req->{self} // die 'BUG: {self} missing'; + local $self->{current_info} = "$self->{current_info} $oid"; + my $cur = PublicInbox::Eml->new($bref); + if (content_hash($cur) eq $req->{chash}) { + push @{$req->{indexed}}, $smsg; # for do_xpost + } # else { index_unseen later } + } + do_step($req); +} + +# is the messages visible in the inbox currently being indexed? +# return the number if so +sub cur_ibx_xnum ($$;$) { + my ($req, $bref, $mismatch) = @_; + my $ibx = $req->{ibx} or die 'BUG: current {ibx} missing'; + + $req->{eml} = PublicInbox::Eml->new($bref); + $req->{chash} = content_hash($req->{eml}); + $req->{mids} = mids($req->{eml}); + for my $mid (@{$req->{mids}}) { + my ($id, $prev); + while (my $x = $ibx->over->next_by_mid($mid, \$id, \$prev)) { + return $x->{num} if $x->{blob} eq $req->{oid}; + push @$mismatch, $x if $mismatch; + } + } + undef; +} + +sub index_oid { # git->cat_async callback for 'm' + my ($bref, $oid, $type, $size, $req) = @_; + my $self = $req->{self}; + local $self->{current_info} = "$self->{current_info} $oid"; + return if is_bad_blob($oid, $type, $size, $req->{oid}); + my $new_smsg = $req->{new_smsg} = bless { + blob => $oid, + }, 'PublicInbox::Smsg'; + $new_smsg->set_bytes($$bref, $size); + ++${$req->{nr}}; + my $mismatch = []; + $req->{xnum} = cur_ibx_xnum($req, $bref, $mismatch) // do { + warn "# deleted\n"; + warn "# mismatch $_->{blob}\n" for @$mismatch; + ${$req->{latest_cmt}} = $req->{cur_cmt} // + die "BUG: {cur_cmt} unset ($oid)\n"; + return; + }; + do_step($req); +} + +sub unindex_oid { # git->cat_async callback for 'd' + my ($bref, $oid, $type, $size, $req) = @_; + my $self = $req->{self}; + local $self->{current_info} = "$self->{current_info} $oid"; + return if is_bad_blob($oid, $type, $size, $req->{oid}); + return if defined(cur_ibx_xnum($req, $bref)); # was re-added + do_step($req); +} + +# overrides V2Writable::last_commits, called by sync_ranges via sync_prepare +sub last_commits { + my ($self, $sync) = @_; + my $heads = []; + my $ekey = $sync->{ibx}->eidx_key; + my $uv = $sync->{ibx}->uidvalidity; + for my $i (0..$sync->{epoch_max}) { + $heads->[$i] = $self->{oidx}->eidx_meta("lc-v2:$ekey//$uv;$i"); + } + $heads; +} + +sub _ibx_index_reject ($) { + my ($ibx) = @_; + $ibx->mm // return 'unindexed, no msgmap.sqlite3'; + $ibx->uidvalidity // return 'no UIDVALIDITY'; + $ibx->over // return 'unindexed, no over.sqlite3'; + undef; +} + +sub _sync_inbox ($$$) { + my ($self, $sync, $ibx) = @_; + my $ekey = $ibx->eidx_key; + if (defined(my $err = _ibx_index_reject($ibx))) { + return "W: skipping $ekey ($err)"; + } + $sync->{ibx} = $ibx; + $sync->{nr} = \(my $nr = 0); + my $v = $ibx->version; + if ($v == 2) { + $sync->{epoch_max} = $ibx->max_git_epoch // return; + sync_prepare($self, $sync); # or return # TODO: once MiscIdx is stable + } elsif ($v == 1) { + my $uv = $ibx->uidvalidity; + my $lc = $self->{oidx}->eidx_meta("lc-v1:$ekey//$uv"); + my $head = $ibx->mm->last_commit // + return "E: $ibx->{inboxdir} is not indexed"; + my $stk = prepare_stack($sync, $lc ? "$lc..$head" : $head); + my $unit = { stack => $stk, git => $ibx->git }; + push @{$sync->{todo}}, $unit; + } else { + return "E: $ekey unsupported inbox version (v$v)"; + } + for my $unit (@{delete($sync->{todo}) // []}) { + last if $sync->{quit}; + index_todo($self, $sync, $unit); + } + $self->{midx}->index_ibx($ibx) unless $sync->{quit}; + $ibx->git->cleanup; # done with this inbox, now + undef; +} + +sub eidx_gc_scan_inboxes ($$) { + my ($self, $sync) = @_; + my ($x3_doc, $ibx_ck); +restart: + $x3_doc = $self->{oidx}->dbh->prepare(<<EOM); +SELECT docid,xnum,oidbin FROM xref3 WHERE ibx_id = ? +EOM + $ibx_ck = $self->{oidx}->dbh->prepare(<<EOM); +SELECT ibx_id,eidx_key FROM inboxes +EOM + $ibx_ck->execute; + while (my ($ibx_id, $eidx_key) = $ibx_ck->fetchrow_array) { + next if $self->{ibx_map}->{$eidx_key}; + $self->{midx}->remove_eidx_key($eidx_key); + warn "# deleting messages for $eidx_key...\n"; + $x3_doc->execute($ibx_id); + my $ibx = { -ibx_id => $ibx_id, -gc_eidx_key => $eidx_key }; + while (my ($docid, $xnum, $oid) = $x3_doc->fetchrow_array) { + my $r = _unref_doc($sync, $docid, $ibx, $xnum, $oid); + $oid = unpack('H*', $oid); + $r = $r ? 'unref' : 'remove'; + warn "# $r #$docid $eidx_key $oid\n"; + if (checkpoint_due($sync)) { + $x3_doc = $ibx_ck = undef; + reindex_checkpoint($self, $sync); + goto restart; + } + } + $self->{oidx}->dbh->do(<<'', undef, $ibx_id); +DELETE FROM inboxes WHERE ibx_id = ? + + # drop last_commit info + my $pat = $eidx_key; + $pat =~ s/([_%\\])/\\$1/g; + $self->{oidx}->dbh->do('PRAGMA case_sensitive_like = ON'); + my $lc_i = $self->{oidx}->dbh->prepare(<<''); +SELECT key FROM eidx_meta WHERE key LIKE ? ESCAPE ? + + $lc_i->execute("lc-%:$pat//%", '\\'); + while (my ($key) = $lc_i->fetchrow_array) { + next if $key !~ m!\Alc-v[1-9]+:\Q$eidx_key\E//!; + warn "# removing $key\n"; + $self->{oidx}->dbh->do(<<'', undef, $key); +DELETE FROM eidx_meta WHERE key = ? + + } + warn "# $eidx_key removed\n"; + } +} + +sub eidx_gc_scan_shards ($$) { # TODO: use for lei/store + my ($self, $sync) = @_; + my $nr = $self->{oidx}->dbh->do(<<''); +DELETE FROM xref3 WHERE docid NOT IN (SELECT num FROM over) + + warn "# eliminated $nr stale xref3 entries\n" if $nr != 0; + reindex_checkpoint($self, $sync) if checkpoint_due($sync); + + # fixup from old bugs: + $nr = $self->{oidx}->dbh->do(<<''); +DELETE FROM over WHERE num > 0 AND num NOT IN (SELECT docid FROM xref3) + + warn "# eliminated $nr stale over entries\n" if $nr != 0; + reindex_checkpoint($self, $sync) if checkpoint_due($sync); + + $nr = $self->{oidx}->dbh->do(<<''); +DELETE FROM eidxq WHERE docid NOT IN (SELECT num FROM over) + + warn "# eliminated $nr stale reindex queue entries\n" if $nr != 0; + reindex_checkpoint($self, $sync) if checkpoint_due($sync); + + my ($cur) = $self->{oidx}->dbh->selectrow_array(<<EOM); +SELECT MIN(num) FROM over WHERE num > 0 +EOM + $cur // return; # empty + my ($r, $n, %active_shards); + $nr = 0; + while (1) { + $r = $self->{oidx}->dbh->selectcol_arrayref(<<"", undef, $cur); +SELECT num FROM over WHERE num >= ? ORDER BY num ASC LIMIT 10000 + + last unless scalar(@$r); + while (defined($n = shift @$r)) { + for my $i ($cur..($n - 1)) { + my $idx = idx_shard($self, $i); + $idx->ipc_do('xdb_remove_quiet', $i); + $active_shards{$idx} = $idx; + } + $cur = $n + 1; + } + if (checkpoint_due($sync)) { + for my $idx (values %active_shards) { + $nr += $idx->ipc_do('nr_quiet_rm') + } + %active_shards = (); + reindex_checkpoint($self, $sync); + } + } + warn "# eliminated $nr stale Xapian documents\n" if $nr != 0; +} + +sub eidx_gc { + my ($self, $opt) = @_; + $self->{cfg} or die "E: GC requires ->attach_config\n"; + $opt->{-idx_gc} = 1; + my $sync = { + need_checkpoint => \(my $need_checkpoint = 0), + check_intvl => 10, + next_check => now() + 10, + checkpoint_unlocks => 1, + -opt => $opt, + self => $self, + }; + $self->idx_init($opt); # acquire lock via V2Writable::_idx_init + eidx_gc_scan_inboxes($self, $sync); + eidx_gc_scan_shards($self, $sync); + done($self); +} + +sub _ibx_for ($$$) { + my ($self, $sync, $smsg) = @_; + my $ibx_id = delete($smsg->{ibx_id}) // die 'BUG: {ibx_id} unset'; + my $pos = $sync->{id2pos}->{$ibx_id} // + bad_ibx_id($self, $ibx_id, \&croak); + $self->{-ibx_ary_known}->[$pos] // + die "BUG: ibx for $smsg->{blob} not mapped" +} + +sub _fd_constrained ($) { + my ($self) = @_; + $self->{-fd_constrained} //= do { + my $soft; + if (eval { require BSD::Resource; 1 }) { + my $NOFILE = BSD::Resource::RLIMIT_NOFILE(); + ($soft, undef) = BSD::Resource::getrlimit($NOFILE); + } else { + chomp($soft = `sh -c 'ulimit -n'`); + } + if (defined($soft)) { + # $want is an estimate + my $want = scalar(@{$self->{ibx_active}}) + 64; + my $ret = $want > $soft; + if ($ret) { + warn <<EOF; +RLIMIT_NOFILE=$soft insufficient (want: $want), will close DB handles early +EOF + } + $ret; + } else { + warn "Unable to determine RLIMIT_NOFILE: $@\n"; + 1; + } + }; +} + +sub _reindex_finalize ($$$) { + my ($req, $smsg, $eml) = @_; + my $sync = $req->{sync}; + my $self = $sync->{self}; + my $by_chash = delete $req->{by_chash} or die 'BUG: no {by_chash}'; + my $nr = scalar(keys(%$by_chash)) or die 'BUG: no content hashes'; + my $orig_smsg = $req->{orig_smsg} // die 'BUG: no {orig_smsg}'; + my $docid = $smsg->{num} = $orig_smsg->{num}; + $self->{oidx}->add_overview($eml, $smsg); # may rethread + check_batch_limit({ %$sync, new_smsg => $smsg }); + my $chash0 = $smsg->{chash} // die "BUG: $smsg->{blob} no {chash}"; + my $stable = delete($by_chash->{$chash0}) // + die "BUG: $smsg->{blob} chash missing"; + my $idx = $self->idx_shard($docid); + my $top_smsg = pop @$stable; + $top_smsg == $smsg or die 'BUG: top_smsg != smsg'; + my $ibx = _ibx_for($self, $sync, $smsg); + $idx->index_eml($eml, $smsg, $ibx->eidx_key); + for my $x (reverse @$stable) { + $ibx = _ibx_for($self, $sync, $x); + my $hdr = delete $x->{hdr} // die 'BUG: no {hdr}'; + $idx->ipc_do('add_eidx_info', $docid, $ibx->eidx_key, $hdr); + } + return if $nr == 1; # likely, all good + + $self->git->async_wait_all; + warn "W: #$docid split into $nr due to deduplication change\n"; + my @todo; + for my $ary (values %$by_chash) { + for my $x (reverse @$ary) { + warn "removing #$docid xref3 $x->{blob}\n"; + my $bin = $x->oidbin; + my $n = _unref_doc($sync, $docid, undef, undef, $bin); + die "BUG: $x->{blob} invalidated #$docid" if $n == 0; + } + my $x = pop(@$ary) // die "BUG: #$docid {by_chash} empty"; + $x->{num} = delete($x->{xnum}) // die '{xnum} unset'; + $ibx = _ibx_for($self, $sync, $x); + if (my $over = $ibx->over) { + my $e = $over->get_art($x->{num}); + $e->{blob} eq $x->{blob} or die <<EOF; +$x->{blob} != $e->{blob} (${\$ibx->eidx_key}:$e->{num}); +EOF + push @todo, $ibx, $e; + $over->dbh_close if _fd_constrained($self); + } else { + die "$ibx->{inboxdir}: over.sqlite3 unusable: $!\n"; + } + } + undef $by_chash; + while (my ($ibx, $e) = splice(@todo, 0, 2)) { + reindex_unseen($self, $sync, $ibx, $e); + } +} + +sub _reindex_oid { # git->cat_async callback + my ($bref, $oid, $type, $size, $req) = @_; + my $sync = $req->{sync}; + my $self = $sync->{self}; + my $orig_smsg = $req->{orig_smsg} // die 'BUG: no {orig_smsg}'; + my $expect_oid = $req->{xr3r}->[$req->{ix}]->[2]; + my $docid = $orig_smsg->{num}; + if (is_bad_blob($oid, $type, $size, $expect_oid)) { + my $oidbin = pack('H*', $expect_oid); + my $remain = _unref_doc($sync, $docid, undef, undef, $oidbin); + if ($remain == 0) { + warn "W: #$docid ($oid) gone or corrupt\n"; + } elsif (my $next_oid = $req->{xr3r}->[++$req->{ix}]->[2]) { + $self->git->cat_async($next_oid, \&_reindex_oid, $req); + } else { + warn "BUG: #$docid ($oid) gone (UNEXPECTED)\n"; + } + return; + } + my $ci = $self->{current_info}; + local $self->{current_info} = "$ci #$docid $oid"; + my $re_smsg = bless { blob => $oid }, 'PublicInbox::Smsg'; + $re_smsg->set_bytes($$bref, $size); + my $eml = PublicInbox::Eml->new($bref); + $re_smsg->populate($eml, { autime => $orig_smsg->{ds}, + cotime => $orig_smsg->{ts} }); + my $chash = content_hash($eml); + $re_smsg->{chash} = $chash; + $re_smsg->{xnum} = $req->{xr3r}->[$req->{ix}]->[1]; + $re_smsg->{ibx_id} = $req->{xr3r}->[$req->{ix}]->[0]; + $re_smsg->{hdr} = $eml->header_obj; + push @{$req->{by_chash}->{$chash}}, $re_smsg; + if (my $next_oid = $req->{xr3r}->[++$req->{ix}]->[2]) { + $self->git->cat_async($next_oid, \&_reindex_oid, $req); + } else { # last $re_smsg is the highest priority xref3 + local $self->{current_info} = "$ci #$docid"; + _reindex_finalize($req, $re_smsg, $eml); + } +} + +sub _reindex_smsg ($$$) { + my ($self, $sync, $smsg) = @_; + my $docid = $smsg->{num}; + my $xr3 = $self->{oidx}->get_xref3($docid, 1); + if (scalar(@$xr3) == 0) { # _reindex_check_stale should've covered this + warn <<""; +BUG? #$docid $smsg->{blob} is not referenced by inboxes during reindex + + remove_doc($self, $docid); + return; + } + + # we sort {xr3r} in the reverse order of ibx_sorted so we can + # hit the common case in _reindex_finalize without rereading + # from git (or holding multiple messages in memory). + my $id2pos = $sync->{id2pos}; # index in ibx_sorted + check_xr3($self, $id2pos, $xr3); + @$xr3 = sort { # sort descending + $id2pos->{$b->[0]} <=> $id2pos->{$a->[0]} + || + $b->[1] <=> $a->[1] # break ties with {xnum} + } @$xr3; + @$xr3 = map { [ $_->[0], $_->[1], unpack('H*', $_->[2]) ] } @$xr3; + my $req = { orig_smsg => $smsg, sync => $sync, xr3r => $xr3, ix => 0 }; + $self->git->cat_async($xr3->[$req->{ix}]->[2], \&_reindex_oid, $req); +} + +sub checkpoint_due ($) { + my ($sync) = @_; + ${$sync->{need_checkpoint}} || (now() > $sync->{next_check}); +} + +sub host_ident () { + # I've copied FS images and only changed the hostname before, + # so prepend hostname. Use `state' since these a BOFH can change + # these while this process is running and we always want to be + # able to release locks taken by this process. + state $retval = hostname . '-' . do { + my $m; # machine-id(5) is systemd + if (open(my $fh, '<', '/etc/machine-id')) { $m = <$fh> } + # (g)hostid(1) is in GNU coreutils, kern.hostid is most BSDs + chomp($m ||= `{ sysctl -n kern.hostid || + hostid || ghostid; } 2>/dev/null` + || "no-machine-id-or-hostid-on-$^O"); + $m; + }; +} + +sub eidxq_release { + my ($self) = @_; + my $expect = delete($self->{-eidxq_locked}) or return; + my ($owner_pid, undef) = split(/-/, $expect); + return if $owner_pid != $$; # shards may fork + my $oidx = $self->{oidx}; + $oidx->begin_lazy; + my $cur = $oidx->eidx_meta('eidxq_lock') // ''; + if ($cur eq $expect) { + $oidx->eidx_meta('eidxq_lock', ''); + return 1; + } elsif ($cur ne '') { + warn "E: eidxq_lock($expect) stolen by $cur\n"; + } else { + warn "E: eidxq_lock($expect) released by another process\n"; + } + undef; +} + +sub DESTROY { + my ($self) = @_; + eidxq_release($self) and $self->{oidx}->commit_lazy; +} + +sub _eidxq_take ($) { + my ($self) = @_; + my $val = "$$-${\time}-$>-".host_ident; + $self->{oidx}->eidx_meta('eidxq_lock', $val); + $self->{-eidxq_locked} = $val; +} + +sub eidxq_lock_acquire ($) { + my ($self) = @_; + my $oidx = $self->{oidx}; + $oidx->begin_lazy; + my $cur = $oidx->eidx_meta('eidxq_lock') || return _eidxq_take($self); + if (my $locked = $self->{-eidxq_locked}) { # be lazy + return $locked if $locked eq $cur; + } + my ($pid, $time, $euid, $ident) = split(/-/, $cur, 4); + my $t = fmt_localtime($time); + local $self->{current_info} = 'eidxq'; + if ($euid == $> && $ident eq host_ident) { + kill(0, $pid) and warn <<EOM and return; +# PID:$pid (re)indexing since $t, it will continue our work +EOM + if ($!{ESRCH}) { + warn "# eidxq_lock is stale ($cur), clobbering\n"; + return _eidxq_take($self); + } + warn "E: kill(0, $pid) failed: $!\n"; # fall-through: + } + my $fn = $oidx->dbh->sqlite_db_filename; + warn <<EOF; +W: PID:$pid, UID:$euid on $ident is indexing Xapian since $t +W: If this is unexpected, delete `eidxq_lock' from the `eidx_meta' table: +W: sqlite3 $fn 'DELETE FROM eidx_meta WHERE key = "eidxq_lock"' +EOF + undef; +} + +sub ibx_sorted ($$) { + my ($self, $type) = @_; + $self->{"-ibx_ary_$type"} //= do { + # highest boost first, stable for config-ordering tiebreaker + use sort 'stable'; + [ sort { + ($b->{boost} // 0) <=> ($a->{boost} // 0) + } @{$self->{'ibx_'.$type} // die "BUG: $type unknown"} ]; + } +} + +sub prep_id2pos ($) { + my ($self) = @_; + my %id2pos; + my $pos = 0; + $id2pos{$_->{-ibx_id}} = $pos++ for (@{ibx_sorted($self, 'known')}); + \%id2pos; +} + +sub eidxq_process ($$) { # for reindexing + my ($self, $sync) = @_; + local $self->{current_info} = 'eidxq process'; + return unless ($self->{cfg} && eidxq_lock_acquire($self)); + my $dbh = $self->{oidx}->dbh; + my $tot = $dbh->selectrow_array('SELECT COUNT(*) FROM eidxq') or return; + ${$sync->{nr}} = 0; + local $sync->{-regen_fmt} = "%u/$tot\n"; + my $pr = $sync->{-opt}->{-progress}; + if ($pr) { + my $min = $dbh->selectrow_array('SELECT MIN(docid) FROM eidxq'); + my $max = $dbh->selectrow_array('SELECT MAX(docid) FROM eidxq'); + $pr->("Xapian indexing $min..$max (total=$tot)\n"); + } + $sync->{id2pos} //= prep_id2pos($self); + my ($del, $iter); +restart: + $del = $dbh->prepare('DELETE FROM eidxq WHERE docid = ?'); + $iter = $dbh->prepare('SELECT docid FROM eidxq ORDER BY docid ASC'); + $iter->execute; + while (defined(my $docid = $iter->fetchrow_array)) { + last if $sync->{quit}; + if (my $smsg = $self->{oidx}->get_art($docid)) { + _reindex_smsg($self, $sync, $smsg); + } else { + warn "E: #$docid does not exist in over\n"; + } + $del->execute($docid); + ++${$sync->{nr}}; + + if (checkpoint_due($sync)) { + $dbh = $del = $iter = undef; + reindex_checkpoint($self, $sync); # release lock + $dbh = $self->{oidx}->dbh; + goto restart; + } + } + $self->git->async_wait_all; + $pr->("reindexed ${$sync->{nr}}/$tot\n") if $pr; +} + +sub _reindex_unseen { # git->cat_async callback + my ($bref, $oid, $type, $size, $req) = @_; + return if is_bad_blob($oid, $type, $size, $req->{oid}); + my $self = $req->{self} // die 'BUG: {self} unset'; + local $self->{current_info} = "$self->{current_info} $oid"; + my $new_smsg = bless { blob => $oid, }, 'PublicInbox::Smsg'; + $new_smsg->set_bytes($$bref, $size); + my $eml = $req->{eml} = PublicInbox::Eml->new($bref); + $req->{new_smsg} = $new_smsg; + $req->{chash} = content_hash($eml); + $req->{mids} = mids($eml); # do_step iterates through this + do_step($req); # enter the normal indexing flow +} + +# --reindex may catch totally unseen messages, this handles them +sub reindex_unseen ($$$$) { + my ($self, $sync, $ibx, $xsmsg) = @_; + my $req = { + %$sync, # has {self} + autime => $xsmsg->{ds}, + cotime => $xsmsg->{ts}, + oid => $xsmsg->{blob}, + ibx => $ibx, + xnum => $xsmsg->{num}, + # {mids} and {chash} will be filled in at _reindex_unseen + }; + warn "# reindex_unseen ${\$ibx->eidx_key}:$req->{xnum}:$req->{oid}\n"; + $self->git->cat_async($xsmsg->{blob}, \&_reindex_unseen, $req); +} + +sub _unref_stale_range ($$$) { + my ($sync, $ibx, $lt_or_gt) = @_; + my $r; + my $lim = 10000; + do { + $r = $sync->{self}->{oidx}->dbh->selectall_arrayref( + <<EOS, undef, $ibx->{-ibx_id}); +SELECT docid,xnum,oidbin FROM xref3 +WHERE ibx_id = ? AND $lt_or_gt LIMIT $lim +EOS + return if $sync->{quit}; + for (@$r) { # hopefully rare, not worth optimizing: + my ($docid, $xnum, $oidbin) = @$_; + my $hex = unpack('H*', $oidbin); + warn("# $xnum:$hex (#$docid): stale\n"); + _unref_doc($sync, $docid, $ibx, $xnum, $oidbin); + } + } while (scalar(@$r) == $lim); + 1; +} + +sub _reindex_check_ibx ($$$) { + my ($self, $sync, $ibx) = @_; + my $ibx_id = $ibx->{-ibx_id}; + my $slice = 10000; + my $opt = { limit => $slice }; + my ($beg, $end) = (1, $slice); + my $ekey = $ibx->eidx_key; + my ($max, $max0); + do { + $max0 = $ibx->mm->num_highwater; + sync_inbox($self, $sync, $ibx) and return; # warned + $max = $ibx->mm->num_highwater; + return if $sync->{quit}; + } while ($max > $max0 && + warn("# $ekey moved $max0..$max, resyncing..\n")); + $end = $max if $end > $max; + + # first, check if we missed any messages in target $ibx + my $msgs; + my $pr = $sync->{-opt}->{-progress}; + local $sync->{-regen_fmt} = "$ekey checking %u/$max\n"; + ${$sync->{nr}} = 0; + my $fast = $sync->{-opt}->{fast}; + my $usr; # _unref_stale_range (< $lo) called + my ($lo, $hi); + while (scalar(@{$msgs = $ibx->over->query_xover($beg, $end, $opt)})) { + ${$sync->{nr}} = $beg; + $beg = $msgs->[-1]->{num} + 1; + $end = $beg + $slice; + $end = $max if $end > $max; + if (checkpoint_due($sync)) { + reindex_checkpoint($self, $sync); # release lock + } + ($lo, $hi) = ($msgs->[0]->{num}, $msgs->[-1]->{num}); + $usr //= _unref_stale_range($sync, $ibx, "xnum < $lo"); + my $x3a = $self->{oidx}->dbh->selectall_arrayref( + <<"", undef, $ibx_id, $lo, $hi); +SELECT xnum,oidbin,docid FROM xref3 WHERE +ibx_id = ? AND xnum >= ? AND xnum <= ? + + my %x3m; + for (@$x3a) { + my $k = pack('J', $_->[0]) . $_->[1]; + push @{$x3m{$k}}, $_->[2]; + } + undef $x3a; + for my $xsmsg (@$msgs) { + my $k = pack('JH*', $xsmsg->{num}, $xsmsg->{blob}); + my $docids = delete($x3m{$k}); + if (!defined($docids)) { + reindex_unseen($self, $sync, $ibx, $xsmsg); + } elsif (!$fast) { + for my $num (@$docids) { + $self->{oidx}->eidxq_add($num); + } + } + return if $sync->{quit}; + } + next unless scalar keys %x3m; + $self->git->async_wait_all; # wait for reindex_unseen + + # eliminate stale/mismatched entries + my %mismatch = map { $_->{num} => $_->{blob} } @$msgs; + while (my ($k, $docids) = each %x3m) { + my ($xnum, $hex) = unpack('JH*', $k); + my $bin = pack('H*', $hex); + my $exp = $mismatch{$xnum}; + if (defined $exp) { + my $smsg = $ibx->over->get_art($xnum) // next; + # $xnum may be expired by another process + if ($smsg->{blob} eq $hex) { + warn <<""; +BUG: (non-fatal) $ekey #$xnum $smsg->{blob} still matches (old exp: $exp) + + next; + } # else: continue to unref + } + my $m = defined($exp) ? "mismatch (!= $exp)" : 'stale'; + warn("# $xnum:$hex (#@$docids): $m\n"); + for my $i (@$docids) { + _unref_doc($sync, $i, $ibx, $xnum, $bin); + } + return if $sync->{quit}; + } + } + defined($hi) and ($hi < $max) and + _unref_stale_range($sync, $ibx, "xnum > $hi AND xnum <= $max"); +} + +sub _reindex_inbox ($$$) { + my ($self, $sync, $ibx) = @_; + my $ekey = $ibx->eidx_key; + local $self->{current_info} = $ekey; + if (defined(my $err = _ibx_index_reject($ibx))) { + warn "W: cannot reindex $ekey ($err)\n"; + } else { + _reindex_check_ibx($self, $sync, $ibx); + } + delete @$ibx{qw(over mm search git)}; # won't need these for a bit +} + +sub eidx_reindex { + my ($self, $sync) = @_; + return unless $self->{cfg}; + + # acquire eidxq_lock early because full reindex takes forever + # and incremental -extindex processes can run during our checkpoints + if (!eidxq_lock_acquire($self)) { + warn "E: aborting --reindex\n"; + return; + } + for my $ibx (@{ibx_sorted($self, 'active')}) { + _reindex_inbox($self, $sync, $ibx); + last if $sync->{quit}; + } + $self->git->async_wait_all; # ensure eidxq gets filled completely + eidxq_process($self, $sync) unless $sync->{quit}; +} + +sub sync_inbox { + my ($self, $sync, $ibx) = @_; + my $err = _sync_inbox($self, $sync, $ibx); + delete @$ibx{qw(mm over)}; + warn $err, "\n" if defined($err); + $err; +} + +sub dd_smsg { # git->cat_async callback + my ($bref, $oid, $type, $size, $dd) = @_; + my $smsg = $dd->{smsg} // die 'BUG: dd->{smsg} missing'; + my $self = $dd->{self} // die 'BUG: {self} missing'; + my $per_mid = $dd->{per_mid} // die 'BUG: {per_mid} missing'; + if ($type eq 'missing') { + _blob_missing($dd, $smsg); + } elsif (!is_bad_blob($oid, $type, $size, $smsg->{blob})) { + local $self->{current_info} = "$self->{current_info} $oid"; + my $chash = content_hash(PublicInbox::Eml->new($bref)); + push(@{$per_mid->{dd_chash}->{$chash}}, $smsg); + } + return if $per_mid->{last_smsg} != $smsg; + while (my ($chash, $ary) = each %{$per_mid->{dd_chash}}) { + my $keep = shift @$ary; + next if !scalar(@$ary); + $per_mid->{sync}->{dedupe_cull} += scalar(@$ary); + print STDERR + "# <$keep->{mid}> keeping #$keep->{num}, dropping ", + join(', ', map { "#$_->{num}" } @$ary),"\n"; + next if $per_mid->{sync}->{-opt}->{'dry-run'}; + my $oidx = $self->{oidx}; + for my $smsg (@$ary) { + my $gone = $smsg->{num}; + $oidx->merge_xref3($keep->{num}, $gone, $smsg->oidbin); + remove_doc($self, $gone); + } + } +} + +sub eidx_dedupe ($$$) { + my ($self, $sync, $msgids) = @_; + $sync->{dedupe_cull} = 0; + my $candidates = 0; + my $nr_mid = 0; + return unless eidxq_lock_acquire($self); + my ($iter, $cur_mid); + my $min_id = 0; + my $idx = 0; + my ($max_id) = $self->{oidx}->dbh->selectrow_array(<<EOS); +SELECT MAX(id) FROM msgid +EOS + local $sync->{-regen_fmt} = "dedupe %u/$max_id\n"; + + # note: we could write this query more intelligently, + # but that causes lock contention with read-only processes +dedupe_restart: + $cur_mid = $msgids->[$idx]; + if ($cur_mid eq '') { # all Message-IDs + $iter = $self->{oidx}->dbh->prepare(<<EOS); +SELECT mid,id FROM msgid WHERE id > ? ORDER BY id ASC +EOS + $iter->execute($min_id); + } else { + $iter = $self->{oidx}->dbh->prepare(<<EOS); +SELECT mid,id FROM msgid WHERE mid = ? AND id > ? ORDER BY id ASC +EOS + $iter->execute($cur_mid, $min_id); + } + while (my ($mid, $id) = $iter->fetchrow_array) { + last if $sync->{quit}; + $self->{current_info} = "dedupe $mid"; + ${$sync->{nr}} = $min_id = $id; + my ($prv, @smsg); + while (my $x = $self->{oidx}->next_by_mid($mid, \$id, \$prv)) { + push @smsg, $x; + } + next if scalar(@smsg) < 2; + my $per_mid = { + dd_chash => {}, # chash => [ary of smsgs] + last_smsg => $smsg[-1], + sync => $sync + }; + $nr_mid++; + $candidates += scalar(@smsg) - 1; + for my $smsg (@smsg) { + my $dd = { + per_mid => $per_mid, + smsg => $smsg, + self => $self, + }; + $self->git->cat_async($smsg->{blob}, \&dd_smsg, $dd); + } + # need to wait on every single one @smsg contents can get + # invalidated inside dd_smsg for messages with multiple + # Message-IDs. + $self->git->async_wait_all; + + if (checkpoint_due($sync)) { + undef $iter; + reindex_checkpoint($self, $sync); + goto dedupe_restart; + } + } + goto dedupe_restart if defined($msgids->[++$idx]); + + my $n = delete $sync->{dedupe_cull}; + if (my $pr = $sync->{-opt}->{-progress}) { + $pr->("culled $n/$candidates candidates ($nr_mid msgids)\n"); + } + ${$sync->{nr}} = 0; +} + +sub eidx_sync { # main entry point + my ($self, $opt) = @_; + + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; + local $self->{current_info} = ''; + local $SIG{__WARN__} = sub { + return if PublicInbox::Eml::warn_ignore(@_); + $warn_cb->($self->{current_info}, ': ', @_); + }; + $self->idx_init($opt); # acquire lock via V2Writable::_idx_init + $self->{oidx}->rethread_prepare($opt); + my $sync = { + need_checkpoint => \(my $need_checkpoint = 0), + check_intvl => 10, + next_check => now() + 10, + -opt => $opt, + # DO NOT SET {reindex} here, it's incompatible with reused + # V2Writable code, reindex is totally different here + # compared to v1/v2 inboxes because we have multiple histories + self => $self, + -regen_fmt => "%u/?\n", + }; + local $SIG{USR1} = sub { $need_checkpoint = 1 }; + my $quit = PublicInbox::SearchIdx::quit_cb($sync); + local $SIG{QUIT} = $quit; + local $SIG{INT} = $quit; + local $SIG{TERM} = $quit; + for my $ibx (@{ibx_sorted($self, 'known')}) { + $ibx->{-ibx_id} //= $self->{oidx}->ibx_id($ibx->eidx_key); + } + + if (scalar(grep { defined($_->{boost}) } @{$self->{ibx_known}})) { + $sync->{id2pos} //= prep_id2pos($self); + $sync->{boost_in_use} = 1; + } + + if (my $msgids = delete($opt->{dedupe})) { + local $sync->{checkpoint_unlocks} = 1; + eidx_dedupe($self, $sync, $msgids); + } + if (delete($opt->{reindex})) { + local $sync->{checkpoint_unlocks} = 1; + eidx_reindex($self, $sync); + } + + # don't use $_ here, it'll get clobbered by reindex_checkpoint + if ($opt->{scan} // 1) { + for my $ibx (@{ibx_sorted($self, 'active')}) { + last if $sync->{quit}; + sync_inbox($self, $sync, $ibx); + } + } + $self->{oidx}->rethread_done($opt) unless $sync->{quit}; + eidxq_process($self, $sync) unless $sync->{quit}; + + eidxq_release($self); + done($self); + $sync; # for eidx_watch +} + +sub update_last_commit { # overrides V2Writable + my ($self, $sync, $stk) = @_; + my $unit = $sync->{unit} // return; + my $latest_cmt = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}}; + defined($latest_cmt) or return; + my $ibx = $sync->{ibx} or die 'BUG: {ibx} missing'; + my $ekey = $ibx->eidx_key; + my $uv = $ibx->uidvalidity; + my $epoch = $unit->{epoch}; + my $meta_key; + my $v = $ibx->version; + if ($v == 2) { + die 'No {epoch} for v2 unit' unless defined $epoch; + $meta_key = "lc-v2:$ekey//$uv;$epoch"; + } elsif ($v == 1) { + die 'Unexpected {epoch} for v1 unit' if defined $epoch; + $meta_key = "lc-v1:$ekey//$uv"; + } else { + die "Unsupported inbox version: $v"; + } + my $last = $self->{oidx}->eidx_meta($meta_key); + if (defined $last && is_ancestor($self->git, $last, $latest_cmt)) { + my @cmd = (qw(rev-list --count), "$last..$latest_cmt"); + chomp(my $n = $unit->{git}->qx(@cmd)); + return if $n ne '' && $n == 0; + } + $self->{oidx}->eidx_meta($meta_key, $latest_cmt); +} + +sub symlink_packs ($$) { + my ($ibx, $pd) = @_; + my $ret = 0; + my $glob = "$ibx->{inboxdir}/git/*.git/objects/pack/*.idx"; + for my $idx (bsd_glob($glob, GLOB_NOSORT)) { + my $src = substr($idx, 0, -length('.idx')); + my $dst = $pd . substr($src, rindex($src, '/')); + if (-f "$src.pack" and + symlink("$src.pack", "$dst.pack") and + symlink($idx, "$dst.idx") and + -f $idx) { + ++$ret; + # .promisor, .bitmap, .rev and .keep are optional + # XXX should we symlink .keep here? + for my $s (qw(promisor bitmap rev)) { + symlink("$src.$s", "$dst.$s") if -f "$src.$s"; + } + } elsif (!$!{EEXIST}) { + warn "W: ln -s $src.{pack,idx} => $dst.*: $!\n"; + unlink "$dst.pack", "$dst.idx"; + } + } + $ret; +} + +sub idx_init { # similar to V2Writable + my ($self, $opt) = @_; + return if $self->{idx_shards}; + + $self->git->cleanup; + my $mode = 0644; + my $ALL = $self->git->{git_dir}; # topdir/ALL.git + my ($has_new, $alt, $seen, $prune, $prune_nr); + if ($opt->{-private}) { # LeiStore + my $local = "$self->{topdir}/local"; # lei/store + $self->{mg} //= PublicInbox::MultiGit->new($self->{topdir}, + 'ALL.git', 'local'); + $mode = 0600; + unless (-d $ALL) { + umask 077; # don't bother restoring for lei + PublicInbox::Import::init_bare($ALL); + $self->git->qx(qw(config core.sharedRepository 0600)); + } + ($alt, $seen) = $self->{mg}->read_alternates(\$mode); + $has_new = $self->{mg}->merge_epochs($alt, $seen); + } else { # extindex has no epochs + $self->{mg} //= PublicInbox::MultiGit->new($self->{topdir}, + 'ALL.git'); + $prune = $opt->{-idx_gc} ? \$prune_nr : undef; + ($alt, $seen) = $self->{mg}->read_alternates(\$mode, $prune); + PublicInbox::Import::init_bare($ALL); + } + + # git-multi-pack-index(1) can speed up "git cat-file" startup slightly + my $git_midx = 0; + my $pd = "$ALL/objects/pack"; + if (opendir(my $dh, $pd)) { # drop stale symlinks + while (defined(my $dn = readdir($dh))) { + if ($dn =~ /\.(?:idx|pack|promisor|bitmap|rev)\z/) { + my $f = "$pd/$dn"; + unlink($f) if -l $f && !-e $f; + } + } + } elsif ($!{ENOENT}) { + mkdir($pd) or die "mkdir($pd): $!"; + } else { + die "opendir($pd): $!"; + } + my $new = ''; + for my $ibx (@{ibx_sorted($self, 'active')}) { + # create symlinks for multi-pack-index + $git_midx += symlink_packs($ibx, $pd); + # add new lines to our alternates file + my $d = $ibx->git->{git_dir} . '/objects'; + next if exists $alt->{$d}; + if (my @st = stat($d)) { + next if $seen->{"$st[0]\0$st[1]"}++; + } else { + warn "W: stat($d) failed (from $ibx->{inboxdir}): $!\n"; + next if $opt->{-idx_gc}; + } + $new .= "$d\n"; + } + ($has_new || $prune_nr || $new ne '') and + $self->{mg}->write_alternates($mode, $alt, $new); + my $restore = $self->with_umask; + if ($git_midx && ($opt->{'multi-pack-index'} // 1)) { + my $cmd = $self->git->cmd('multi-pack-index'); + push @$cmd, '--no-progress' if ($opt->{quiet}//0) > 1; + my $lk = $self->lock_for_scope; + system(@$cmd, 'write'); + # ignore errors, fairly new command, may not exist + } + $self->parallel_init($self->{indexlevel}); + PublicInbox::V2Writable::_idx_init($self, $opt); # acquires ei.lock + $self->{midx} = PublicInbox::MiscIdx->new($self); + $self->{oidx}->begin_lazy; + $self->{oidx}->eidx_prep; + $self->{midx}->create_xdb if $new ne ''; +} + +sub _watch_commit { # PublicInbox::DS::add_timer callback + my ($self) = @_; + delete $self->{-commit_timer}; + eidxq_process($self, $self->{-watch_sync}); + eidxq_release($self); + my $fmt = delete $self->{-watch_sync}->{-regen_fmt}; + reindex_checkpoint($self, $self->{-watch_sync}); + $self->{-watch_sync}->{-regen_fmt} = $fmt; + + # call event_step => done unless commit_timer is armed + PublicInbox::DS::requeue($self); +} + +sub on_inbox_unlock { # called by PublicInbox::InboxIdle + my ($self, $ibx) = @_; + my $opt = $self->{-watch_sync}->{-opt}; + my $pr = $opt->{-progress}; + my $ekey = $ibx->eidx_key; + local $0 = "sync $ekey"; + $pr->("indexing $ekey\n") if $pr; + $self->idx_init($opt); + sync_inbox($self, $self->{-watch_sync}, $ibx); + $self->{-commit_timer} //= add_timer($opt->{'commit-interval'} // 10, + \&_watch_commit, $self); +} + +sub eidx_reload { # -extindex --watch SIGHUP handler + my ($self, $idler) = @_; + if ($self->{cfg}) { + my $pr = $self->{-watch_sync}->{-opt}->{-progress}; + $pr->('reloading ...') if $pr; + delete $self->{-resync_queue}; + delete $self->{-ibx_ary_known}; + delete $self->{-ibx_ary_active}; + $self->{ibx_known} = []; + $self->{ibx_active} = []; + %{$self->{ibx_map}} = (); + delete $self->{-watch_sync}->{id2pos}; + my $cfg = PublicInbox::Config->new; + attach_config($self, $cfg); + $idler->refresh($cfg); + $pr->(" done\n") if $pr; + } else { + warn "reload not supported without --all\n"; + } +} + +sub eidx_resync_start ($) { # -extindex --watch SIGUSR1 handler + my ($self) = @_; + $self->{-resync_queue} //= [ @{ibx_sorted($self, 'active')} ]; + PublicInbox::DS::requeue($self); # trigger our ->event_step +} + +sub event_step { # PublicInbox::DS::requeue callback + my ($self) = @_; + if (my $resync_queue = $self->{-resync_queue}) { + if (my $ibx = shift(@$resync_queue)) { + on_inbox_unlock($self, $ibx); + PublicInbox::DS::requeue($self); + } else { + delete $self->{-resync_queue}; + _watch_commit($self); + } + } else { + done($self) unless $self->{-commit_timer}; + } +} + +sub eidx_watch { # public-inbox-extindex --watch main loop + my ($self, $opt) = @_; + local @SIG{keys %SIG} = values %SIG; + for my $sig (qw(HUP USR1 TSTP QUIT INT TERM)) { + $SIG{$sig} = sub { warn "SIG$sig ignored while scanning\n" }; + } + require PublicInbox::InboxIdle; + require PublicInbox::DS; + require PublicInbox::Syscall; + require PublicInbox::Sigfd; + my $idler = PublicInbox::InboxIdle->new($self->{cfg}); + if (!$self->{cfg}) { + $idler->watch_inbox($_) for (@{ibx_sorted($self, 'active')}); + } + for my $ibx (@{ibx_sorted($self, 'active')}) { + $ibx->subscribe_unlock(__PACKAGE__, $self) + } + my $pr = $opt->{-progress}; + $pr->("performing initial scan ...\n") if $pr; + my $sync = eidx_sync($self, $opt); # initial sync + return if $sync->{quit}; + my $oldset = PublicInbox::DS::block_signals(); + local $self->{current_info} = ''; + my $cb = $SIG{__WARN__} || \&CORE::warn; + local $SIG{__WARN__} = sub { + return if PublicInbox::Eml::warn_ignore(@_); + $cb->($self->{current_info}, ': ', @_); + }; + my $sig = { + HUP => sub { eidx_reload($self, $idler) }, + USR1 => sub { eidx_resync_start($self) }, + TSTP => sub { kill('STOP', $$) }, + }; + my $quit = PublicInbox::SearchIdx::quit_cb($sync); + $sig->{QUIT} = $sig->{INT} = $sig->{TERM} = $quit; + local $self->{-watch_sync} = $sync; # for ->on_inbox_unlock + local @PublicInbox::DS::post_loop_do = (sub { !$sync->{quit} }); + $pr->("initial scan complete, entering event loop\n") if $pr; + # calls InboxIdle->event_step: + PublicInbox::DS::event_loop($sig, $oldset); + done($self); +} + +no warnings 'once'; +*done = \&PublicInbox::V2Writable::done; +*parallel_init = \&PublicInbox::V2Writable::parallel_init; +*nproc_shards = \&PublicInbox::V2Writable::nproc_shards; +*sync_prepare = \&PublicInbox::V2Writable::sync_prepare; +*index_todo = \&PublicInbox::V2Writable::index_todo; +*count_shards = \&PublicInbox::V2Writable::count_shards; +*atfork_child = \&PublicInbox::V2Writable::atfork_child; +*idx_shard = \&PublicInbox::V2Writable::idx_shard; +*reindex_checkpoint = \&PublicInbox::V2Writable::reindex_checkpoint; +*checkpoint = \&PublicInbox::V2Writable::checkpoint; +*barrier = \&PublicInbox::V2Writable::barrier; + +1; diff --git a/lib/PublicInbox/FakeImport.pm b/lib/PublicInbox/FakeImport.pm new file mode 100644 index 00000000..bccc3321 --- /dev/null +++ b/lib/PublicInbox/FakeImport.pm @@ -0,0 +1,26 @@ +# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# pretend to do PublicInbox::Import::add for "lei index" +package PublicInbox::FakeImport; +use strict; +use v5.10.1; +use PublicInbox::ContentHash qw(git_sha); +use PublicInbox::Import; + +sub new { bless { bytes_added => 0 }, __PACKAGE__ } + +sub add { + my ($self, $eml, $check_cb, $smsg) = @_; + PublicInbox::Import::drop_unwanted_headers($eml); + $smsg->populate($eml); + my $raw = $eml->as_string; + $smsg->{blob} = git_sha(1, \$raw)->hexdigest; + $smsg->set_bytes($raw, length($raw)); + if (my $oidx = delete $smsg->{-oidx}) { # used by LeiStore + $oidx->vivify_xvmd($smsg) or return; + } + 1; +} + +1; diff --git a/lib/PublicInbox/FakeInotify.pm b/lib/PublicInbox/FakeInotify.pm index 92758613..8be07135 100644 --- a/lib/PublicInbox/FakeInotify.pm +++ b/lib/PublicInbox/FakeInotify.pm @@ -1,89 +1,176 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # for systems lacking Linux::Inotify2 or IO::KQueue, just emulates -# enough of Linux::Inotify2 +# enough of Linux::Inotify2 we use. package PublicInbox::FakeInotify; -use strict; +use v5.12; use Time::HiRes qw(stat); -use PublicInbox::DS; +use PublicInbox::DS qw(add_timer); +use Errno qw(ENOTDIR ENOENT); sub IN_MODIFY () { 0x02 } # match Linux inotify +# my $IN_MOVED_FROM 0x00000040 /* File was moved from X. */ # my $IN_MOVED_TO = 0x80; # my $IN_CREATE = 0x100; sub MOVED_TO_OR_CREATE () { 0x80 | 0x100 } +sub IN_DELETE () { 0x200 } +sub IN_DELETE_SELF () { 0x400 } +sub IN_MOVE_SELF () { 0x800 } my $poll_intvl = 2; # same as Filesys::Notify::Simple -sub new { bless { watch => {} }, __PACKAGE__ } +sub new { bless {}, __PACKAGE__ } -# behaves like Linux::Inotify2->watch -sub watch { - my ($self, $path, $mask) = @_; - my @st = stat($path) or return; - my $k = "$path\0$mask"; - $self->{watch}->{$k} = $st[10]; # 10 - ctime - bless [ $self->{watch}, $k ], 'PublicInbox::FakeInotify::Watch'; +sub on_dir_change ($$$$$) { # used by KQNotify subclass + my ($self, $events, $dh, $path, $dir_delete) = @_; + my $old = $self->{dirlist}->{$path}; + my @cur = grep(!/\A\.\.?\z/, readdir($dh)); + $self->{dirlist}->{$path} = \@cur; + + # new files: + my %tmp = map { $_ => undef } @cur; + delete @tmp{@$old}; + push(@$events, map { + bless \"$path/$_", 'PublicInbox::FakeInotify::Event' + } keys %tmp); + + if ($dir_delete) { + %tmp = map { $_ => undef } @$old; + delete @tmp{@cur}; + push(@$events, map { + bless \"$path/$_", 'PublicInbox::FakeInotify::GoneEvent' + } keys %tmp); + } } -sub on_new_files ($$$$) { - my ($events, $dh, $path, $old_ctime) = @_; - while (defined(my $base = readdir($dh))) { - next if $base =~ /\A\.\.?\z/; - my $full = "$path/$base"; - my @st = stat($full); - if (@st && $st[10] > $old_ctime) { - push @$events, - bless(\$full, 'PublicInbox::FakeInotify::Event') +sub watch_open ($$$) { # used by KQNotify subclass + my ($self, $path, $dir_delete) = @_; + my ($fh, @st, @st0, $tries); + do { +again: + unless (@st0 = stat($path)) { + warn "W: stat($path): $!" if $! != ENOENT; + return; + } + if (!(-d _ ? opendir($fh, $path) : open($fh, '<', $path))) { + goto again if $! == ENOTDIR && ++$tries < 10; + warn "W: open($path): $!" if $! != ENOENT; + return; } + @st = stat($fh) or die "fstat($path): $!"; + } while ("@st[0,1]" ne "@st0[0,1]" && + ((++$tries < 10) || (warn(<<EOM) && return))); +E: $path switching inodes too frequently to watch +EOM + if (-d _) { + $self->{dirlist}->{$path} = []; + on_dir_change($self, [], $fh, $path, $$dir_delete); + } else { + $$dir_delete = 0; } + bless [ @st[0, 1, 10], $path, $fh ], 'PublicInbox::FakeInotify::Watch' +} + +# behaves like Linux::Inotify2->watch +sub watch { + my ($self, $path, $mask) = @_; # mask is ignored + my $dir_delete = $mask & IN_DELETE ? 1 : 0; + my $w = watch_open($self, $path, \$dir_delete) or return; + pop @$w; # no need to keep $fh open for non-kqueue + $self->{watch}->{"$path\0$dir_delete"} = $w; +} + +sub gone ($$$) { # used by KQNotify subclass + my ($self, $ident, $path) = @_; + delete $self->{watch}->{$ident}; + delete $self->{dirlist}->{$path}; + bless(\$path, 'PublicInbox::FakeInotify::SelfGoneEvent'); +} + +# fuzz the time for freshly modified directories for low-res VFS +sub dir_adj ($) { + my ($old_ctime) = @_; + my $now = Time::HiRes::time; + my $diff = $now - $old_ctime; + my $adj = $poll_intvl + 1; + ($diff > -$adj && $diff < $adj) ? 1 : 0; } # behaves like non-blocking Linux::Inotify2->read sub read { my ($self) = @_; - my $watch = $self->{watch} or return (); - my $events = []; - for my $x (keys %$watch) { - my ($path, $mask) = split(/\0/, $x, 2); - my @now = stat($path) or next; - my $old_ctime = $watch->{$x}; - $watch->{$x} = $now[10]; - next if $old_ctime == $now[10]; - if ($mask & IN_MODIFY) { - push @$events, - bless(\$path, 'PublicInbox::FakeInotify::Event') - } elsif ($mask & MOVED_TO_OR_CREATE) { - opendir(my $dh, $path) or do { - warn "W: opendir $path: $!\n"; + my $ret = []; + while (my ($ident, $w) = each(%{$self->{watch}})) { + if (!@$w) { # cancelled + delete($self->{watch}->{$ident}); + next; + } + my $dir_delete = (split(/\0/, $ident, 2))[1]; + my ($old_dev, $old_ino, $old_ctime, $path) = @$w; + my @new_st = stat($path); + warn "W: stat($path): $!\n" if !@new_st && $! != ENOENT; + if (!@new_st || "$old_dev $old_ino" ne "@new_st[0,1]") { + push @$ret, gone($self, $ident, $path); + next; + } + if (-d _ && $new_st[10] > ($old_ctime - dir_adj($old_ctime))) { + opendir(my $fh, $path) or do { + if ($! == ENOENT || $! == ENOTDIR) { + push @$ret, gone($self, $ident, $path); + } else { + warn "W: opendir($path): $!"; + } next; }; - on_new_files($events, $dh, $path, $old_ctime); + @new_st = stat($fh) or die "fstat($path): $!"; + if ("$old_dev $old_ino" ne "@new_st[0,1]") { + push @$ret, gone($self, $ident, $path); + next; + } + $w->[2] = $new_st[10]; + on_dir_change($self, $ret, $fh, $path, $dir_delete); + } elsif ($new_st[10] > $old_ctime) { # regular files, etc + $w->[2] = $new_st[10]; + push @$ret, bless(\$path, + 'PublicInbox::FakeInotify::Event'); } } - @$events; + @$ret; } sub poll_once { my ($obj) = @_; $obj->event_step; # PublicInbox::InboxIdle::event_step - PublicInbox::DS::add_timer($poll_intvl, \&poll_once, $obj); + add_timer($poll_intvl, \&poll_once, $obj); } package PublicInbox::FakeInotify::Watch; -use strict; +use v5.12; -sub cancel { - my ($self) = @_; - delete $self->[0]->{$self->[1]}; -} +sub cancel { @{$_[0]} = () } -sub name { - my ($self) = @_; - (split(/\0/, $self->[1], 2))[0]; -} +sub name { $_[0]->[3] } package PublicInbox::FakeInotify::Event; -use strict; +use v5.12; sub fullname { ${$_[0]} } + +sub IN_DELETE { 0 } +sub IN_MOVED_FROM { 0 } +sub IN_DELETE_SELF { 0 } + +package PublicInbox::FakeInotify::GoneEvent; +use v5.12; +our @ISA = qw(PublicInbox::FakeInotify::Event); + +sub IN_DELETE { 1 } +sub IN_MOVED_FROM { 0 } + +package PublicInbox::FakeInotify::SelfGoneEvent; +use v5.12; +our @ISA = qw(PublicInbox::FakeInotify::GoneEvent); + +sub IN_DELETE_SELF { 1 } + 1; diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index 805076f0..225565f4 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -1,13 +1,13 @@ -# Copyright (C) 2013-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Used for generating Atom feeds for web-accessible mailing list archives. package PublicInbox::Feed; use strict; -use warnings; +use v5.10.1; use PublicInbox::View; use PublicInbox::WwwAtomStream; -use PublicInbox::Smsg; # this loads w/o Search::Xapian +use PublicInbox::Smsg; # this loads w/o Xapian sub generate_i { my ($ctx) = @_; @@ -19,29 +19,25 @@ sub generate { my ($ctx) = @_; my $msgs = $ctx->{msgs} = recent_msgs($ctx); return _no_thread() unless @$msgs; - PublicInbox::WwwAtomStream->response($ctx, 200, \&generate_i); + PublicInbox::WwwAtomStream->response($ctx, \&generate_i); } sub generate_thread_atom { my ($ctx) = @_; - my $msgs = $ctx->{msgs} = $ctx->{-inbox}->over->get_thread($ctx->{mid}); + my $msgs = $ctx->{msgs} = $ctx->{ibx}->over->get_thread($ctx->{mid}); return _no_thread() unless @$msgs; - PublicInbox::WwwAtomStream->response($ctx, 200, \&generate_i); + PublicInbox::WwwAtomStream->response($ctx, \&generate_i); } sub generate_html_index { my ($ctx) = @_; # if the 'r' query parameter is given, it is a legacy permalink # which we must continue supporting: - my $qp = $ctx->{qp}; - my $ibx = $ctx->{-inbox}; - if ($qp && !$qp->{r} && $ibx->over) { + !$ctx->{qp}->{r} && $ctx->{ibx}->over and return PublicInbox::View::index_topics($ctx); - } - my $env = $ctx->{env}; - my $url = $ibx->base_url($env) . 'new.html'; - my $qs = $env->{QUERY_STRING}; + my $url = $ctx->{ibx}->base_url($ctx->{env}) . 'new.html'; + my $qs = $ctx->{env}->{QUERY_STRING}; $url .= "?$qs" if $qs ne ''; [302, [ 'Location', $url, 'Content-Type', 'text/plain'], [ "Redirecting to $url\n" ] ]; @@ -49,12 +45,15 @@ sub generate_html_index { sub new_html_i { my ($ctx, $eml) = @_; - $ctx->zmore($ctx->html_top) if exists $ctx->{-html_tip}; + print { $ctx->zfh } $ctx->html_top if exists $ctx->{-html_tip}; - $eml and return PublicInbox::View::eml_entry($ctx, $eml); + if ($eml) { + $ctx->{smsg}->populate($eml) if !$ctx->{ibx}->{over}; + return PublicInbox::View::eml_entry($ctx, $eml); + } my $smsg = shift @{$ctx->{msgs}} or - $ctx->zmore(PublicInbox::View::pagination_footer( - $ctx, './new.html')); + print { $ctx->zfh } PublicInbox::View::pagination_footer( + $ctx, './new.html'); $smsg; } @@ -67,8 +66,9 @@ sub new_html { } $ctx->{-html_tip} = '<pre>'; $ctx->{-upfx} = ''; + $ctx->{-spfx} = '' if $ctx->{ibx}->{coderepo}; $ctx->{-hr} = 1; - PublicInbox::WwwStream::aresponse($ctx, 200, \&new_html_i); + PublicInbox::WwwStream::aresponse($ctx, \&new_html_i); } # private subs @@ -79,12 +79,11 @@ sub _no_thread () { sub recent_msgs { my ($ctx) = @_; - my $ibx = $ctx->{-inbox}; - my $max = $ibx->{feedmax}; + my $ibx = $ctx->{ibx}; + my $max = $ibx->{feedmax} // 25; return PublicInbox::View::paginate_recent($ctx, $max) if $ibx->over; # only for rare v1 inboxes which aren't indexed at all - my $qp = $ctx->{qp}; my $hex = '[a-f0-9]'; my $addmsg = qr!^:000000 100644 \S+ (\S+) A\t${hex}{2}/${hex}{38}$!; my $delmsg = qr!^:100644 000000 (\S+) \S+ D\t(${hex}{2}/${hex}{38})$!; @@ -92,7 +91,7 @@ sub recent_msgs { # revision ranges may be specified my $range = 'HEAD'; - my $r = $qp->{r} if $qp; + my $r = $ctx->{qp}->{r}; if ($r && ($r =~ /\A(?:$refhex\.\.)?$refhex\z/o)) { $range = $r; } @@ -108,13 +107,13 @@ sub recent_msgs { my $last; my $last_commit; local $/ = "\n"; - my @oids; + my @ret; while (defined(my $line = <$log>)) { if ($line =~ /$addmsg/o) { my $add = $1; next if $deleted{$add}; # optimization-only - push @oids, $add; - if (scalar(@oids) >= $max) { + push(@ret, bless { blob => $add }, 'PublicInbox::Smsg'); + if (scalar(@ret) >= $max) { $last = 1; last; } @@ -136,8 +135,7 @@ sub recent_msgs { $last_commit and $ctx->{next_page} = qq[<a\nhref="?r=$last_commit"\nrel=next>] . 'next (older)</a>'; - - [ map { bless {blob => $_ }, 'PublicInbox::Smsg' } @oids ]; + \@ret; } 1; diff --git a/lib/PublicInbox/Fetch.pm b/lib/PublicInbox/Fetch.pm new file mode 100644 index 00000000..814d6e8e --- /dev/null +++ b/lib/PublicInbox/Fetch.pm @@ -0,0 +1,225 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# Wrapper to "git fetch" remote public-inboxes +package PublicInbox::Fetch; +use v5.12; +use parent qw(PublicInbox::IPC); +use URI (); +use PublicInbox::Spawn qw(popen_rd run_qx run_wait); +use PublicInbox::Admin; +use PublicInbox::LEI; +use PublicInbox::LeiCurl; +use PublicInbox::LeiMirror; +use PublicInbox::SHA qw(sha_all); +use File::Temp (); +use PublicInbox::Git qw(git_exe); + +sub new { bless {}, __PACKAGE__ } + +sub remote_url ($$) { + my ($lei, $dir) = @_; + my $rn = $lei->{opt}->{'try-remote'} // [ 'origin', '_grokmirror' ]; + for my $r (@$rn) { + my $cmd = [ git_exe, 'config', "remote.$r.url" ]; + my $url = run_qx($cmd, undef, { -C => $dir, 2 => $lei->{2} }); + next if $?; + $url =~ s!/*\n!!s; + return $url; + } + undef +} + +# PSGI mount prefixes and manifest.js.gz prefixes don't always align... +# TODO: remove, handle multi-inbox fetch +sub deduce_epochs ($$) { + my ($m, $path) = @_; + my ($v1_ent, @v2_epochs); + my $path_pfx = ''; + $path =~ s!/+\z!!; + do { + $v1_ent = $m->{$path}; + @v2_epochs = grep(m!\A\Q$path\E/git/[0-9]+\.git\z!, keys %$m); + } while (!defined($v1_ent) && !@v2_epochs && + $path =~ s!\A(/[^/]+)/!/! and $path_pfx .= $1); + ($path_pfx, $v1_ent ? $path : undef, @v2_epochs); +} + +sub do_manifest ($$$) { + my ($lei, $dir, $ibx_uri) = @_; + my $muri = URI->new("$ibx_uri/manifest.js.gz"); + my $ft = File::Temp->new(TEMPLATE => 'm-XXXX', + UNLINK => 1, DIR => $dir, SUFFIX => '.tmp'); + my $mf = "$dir/manifest.js.gz"; + my $m0; # current manifest.js.gz contents + if (open my $fh, '<', $mf) { + $m0 = eval { + PublicInbox::LeiMirror::decode_manifest($fh, $mf, $mf) + }; + warn($@) if $@; + } + my ($bn) = ($ft->filename =~ m!/([^/]+)\z!); + my $curl_cmd = $lei->{curl}->for_uri($lei, $muri, qw(-R -o), $bn); + my $opt = { -C => $dir }; + $opt->{$_} = $lei->{$_} for (0..2); + my $cerr = PublicInbox::LeiMirror::run_reap($lei, $curl_cmd, $opt); + if ($cerr) { + return [ 404, $muri ] if ($cerr >> 8) == 22; # 404 Missing + $lei->child_error($cerr, "@$curl_cmd failed"); + return; + } + my $m1 = eval { + PublicInbox::LeiMirror::decode_manifest($ft, $ft, $muri); + } or return [ 404, $muri ]; + my $mdiff = { %$m1 }; + + # filter out unchanged entries. We check modified, too, since + # fingerprints are SHA-1, so there's a teeny chance they'll collide + while (my ($k, $v0) = each %{$m0 // {}}) { + my $cur = $m1->{$k} // next; + my $f0 = $v0->{fingerprint} // next; + my $f1 = $cur->{fingerprint} // next; + my $t0 = $v0->{modified} // next; + my $t1 = $cur->{modified} // next; + delete($mdiff->{$k}) if $f0 eq $f1 && $t0 == $t1; + } + unless (keys %$mdiff) { + $lei->child_error(127 << 8) if $lei->{opt}->{'exit-code'}; + return; + } + my (undef, $v1_path, @v2_epochs) = + deduce_epochs($mdiff, $ibx_uri->path); + [ 200, $muri, $v1_path, \@v2_epochs, $ft, $mf, $m1 ]; +} + +sub get_fingerprint2 { + my ($git_dir) = @_; + my $rd = popen_rd([git_exe, 'show-ref'], undef, { -C => $git_dir }); + sha_all(256, $rd)->digest; # ignore show-ref errors +} + +sub writable_dir ($) { + my ($dir) = @_; + return unless -d $dir && -w _; + my @st = stat($dir); + $st[2] & 0222; # any writable bits set? (in case of root) +} + +sub do_fetch { # main entry point + my ($cls, $lei, $cd) = @_; + my $ibx_ver; + $lei->{curl} //= PublicInbox::LeiCurl->new($lei) or return; + my $dir = PublicInbox::Admin::resolve_inboxdir($cd, \$ibx_ver); + my ($ibx_uri, @git_dir, @epochs, $mg, @new_epoch, $skip); + if ($ibx_ver == 1) { + my $url = remote_url($lei, $dir) // + die "E: $dir missing remote.*.url\n"; + $ibx_uri = URI->new($url); + } else { # v2: + require PublicInbox::MultiGit; + $mg = PublicInbox::MultiGit->new($dir, 'all.git', 'git'); + @epochs = $mg->git_epochs; + my ($git_url, $epoch); + for my $nr (@epochs) { # try newest epoch, first + my $edir = "$dir/git/$nr.git"; + if (!writable_dir($edir)) { + $skip->{$nr} = 1; + next; + } + next if defined $git_url; + if (defined(my $url = remote_url($lei, $edir))) { + $git_url = $url; + $epoch = $nr; + } else { + warn "W: $edir missing remote.*.url\n"; + my $o = { -C => $edir }; + $o->{1} = $o->{2} = $lei->{2}; + run_wait([git_exe, qw(config -l)], undef, $o) + and $lei->child_error($?); + } + } + @epochs = grep { !$skip->{$_} } @epochs if $skip; + $skip //= {}; # makes code below easier + $git_url or die "Unable to determine git URL\n"; + my $inbox_url = $git_url; + $inbox_url =~ s!/git/$epoch(?:\.git)?/?\z!! or + $inbox_url =~ s!/$epoch(?:\.git)?/?\z!! or die <<EOM; +Unable to infer inbox URL from <$git_url> +EOM + $ibx_uri = URI->new($inbox_url); + } + PublicInbox::LeiMirror::write_makefile($dir, $ibx_ver); + $lei->qerr("# inbox URL: $ibx_uri/"); + my $res = do_manifest($lei, $dir, $ibx_uri) or return; + my ($code, $muri, $v1_path, $v2_epochs, $ft, $mf, $m1) = @$res; + if ($code == 404) { + # any pre-manifest.js.gz instances running? Just fetch all + # existing ones and unconditionally try cloning the next + $v2_epochs = [ map { "$dir/git/$_.git" } @epochs ]; + if (@epochs) { + my $n = $epochs[-1] + 1; + push @$v2_epochs, "$dir/git/$n.git" if !$skip->{$n}; + } + } else { + $code == 200 or die "BUG unexpected code $code\n"; + } + my $mculled; + if ($ibx_ver == 2) { + defined($v1_path) and warn <<EOM; +E: got v1 `$v1_path' when expecting v2 epoch(s) in <$muri>, WTF? +EOM + @git_dir = map { "$dir/git/$_.git" } sort { $a <=> $b } map { + my ($nr) = (m!/([0-9]+)\.git\z!g); + $skip->{$nr} ? () : $nr; + } @$v2_epochs; + if ($m1 && scalar keys %$skip) { + my $re = join('|', keys %$skip); + my @del = grep(m!/git/$re\.git\z!, keys %$m1); + delete @$m1{@del}; + $mculled = 1; + } + } else { + $git_dir[0] = $dir; + } + # n.b. this expects all epochs are from the same host + my $torsocks = $lei->{curl}->torsocks($lei, $muri); + my $fp2 = $lei->{opt}->{'exit-code'} ? [] : undef; + my $xit = 127; + for my $d (@git_dir) { + my $cmd; + my $opt = {}; # for spawn + if (-d $d) { + $fp2->[0] = get_fingerprint2($d) if $fp2; + $cmd = [ @$torsocks, git_exe, "--git-dir=$d", + PublicInbox::LeiMirror::fetch_args($lei, $opt)]; + } else { + my $e_uri = $ibx_uri->clone; + my ($epath) = ($d =~ m!(/git/[0-9]+\.git)\z!); + defined($epath) or + die "BUG: $d is not an epoch to clone\n"; + $e_uri->path($ibx_uri->path.$epath); + $cmd = [ @$torsocks, + PublicInbox::LeiMirror::clone_cmd($lei, $opt), + $$e_uri, $d]; + push @new_epoch, substr($epath, 5, -4) + 0; + $xit = 0; + } + my $cerr = PublicInbox::LeiMirror::run_reap($lei, $cmd, $opt); + # do not bail on clone failure if we didn't have a manifest + if ($cerr && ($code == 200 || -d $d)) { + $lei->child_error($cerr, "@$cmd failed"); + return; + } + if ($fp2 && $xit) { + $fp2->[1] = get_fingerprint2($d); + $xit = 0 if $fp2->[0] ne $fp2->[1]; + } + } + for my $i (@new_epoch) { $mg->epoch_cfg_set($i) } + if ($ft) { + PublicInbox::LeiMirror::dump_manifest($m1 => $ft) if $mculled; + PublicInbox::LeiMirror::ft_rename($ft, $mf, 0666); + } + $lei->child_error($xit << 8) if $fp2 && $xit; +} + +1; diff --git a/lib/PublicInbox/Filter/Base.pm b/lib/PublicInbox/Filter/Base.pm index d54570fd..f6355e1b 100644 --- a/lib/PublicInbox/Filter/Base.pm +++ b/lib/PublicInbox/Filter/Base.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # base class for creating per-list or per-project filters diff --git a/lib/PublicInbox/Filter/Gmane.pm b/lib/PublicInbox/Filter/Gmane.pm index c326faca..a18b77d2 100644 --- a/lib/PublicInbox/Filter/Gmane.pm +++ b/lib/PublicInbox/Filter/Gmane.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Filter for importing some archives from gmane diff --git a/lib/PublicInbox/Filter/Mirror.pm b/lib/PublicInbox/Filter/Mirror.pm index 9f6dd342..fe915fc3 100644 --- a/lib/PublicInbox/Filter/Mirror.pm +++ b/lib/PublicInbox/Filter/Mirror.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Dumb filter for blindly accepting everything diff --git a/lib/PublicInbox/Filter/RubyLang.pm b/lib/PublicInbox/Filter/RubyLang.pm index 06e4ea75..57ebbe78 100644 --- a/lib/PublicInbox/Filter/RubyLang.pm +++ b/lib/PublicInbox/Filter/RubyLang.pm @@ -1,11 +1,10 @@ -# Copyright (C) 2017-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Filter for lists.ruby-lang.org trailers package PublicInbox::Filter::RubyLang; -use base qw(PublicInbox::Filter::Base); -use strict; -use warnings; +use v5.10.1; +use parent qw(PublicInbox::Filter::Base); use PublicInbox::MID qw(mids); my $l1 = qr/Unsubscribe:\s @@ -16,7 +15,7 @@ sub new { my ($class, %opts) = @_; my $altid = delete $opts{-altid}; my $self = $class->SUPER::new(%opts); - my $ibx = $self->{-inbox}; + my $ibx = $self->{ibx}; # altid = serial:ruby-core:file=msgmap.sqlite3 if (!$altid && $ibx && $ibx->{altid}) { $altid ||= $ibx->{altid}->[0]; @@ -56,16 +55,22 @@ sub scrub { my $hdr = $mime->header_obj; my $mids = mids($hdr); return $self->REJECT('Message-ID missing') unless (@$mids); - my @v = $hdr->header_raw('X-Mail-Count'); my $n; - foreach (@v) { - /\A\s*([0-9]+)\s*\z/ or next; - $n = $1; - last; - } - unless (defined $n) { - return $self->REJECT('X-Mail-Count not numeric'); + my @v = $hdr->header_raw('X-Mail-Count'); # old host only + if (@v) { + for (@v) { + /\A\s*([0-9]+)\s*\z/ or next; + $n = $1; + last; + } + } else { # new host: nue.mailmanlists.eu + for ($hdr->header_str('Subject')) { + /\A\[ruby-[^:]+:([0-9]+)\]/ or next; + $n = $1; + last; + } } + $n // return $self->REJECT('could not get count not numeric'); foreach my $mid (@$mids) { my $r = $altid->mm_alt->mid_set($n, $mid); next if $r == 0; diff --git a/lib/PublicInbox/Filter/SubjectTag.pm b/lib/PublicInbox/Filter/SubjectTag.pm index aca6688b..ecedf666 100644 --- a/lib/PublicInbox/Filter/SubjectTag.pm +++ b/lib/PublicInbox/Filter/SubjectTag.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2017-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Filter for various [tags] in subjects diff --git a/lib/PublicInbox/Filter/Vger.pm b/lib/PublicInbox/Filter/Vger.pm index 2c73738d..5b3c0277 100644 --- a/lib/PublicInbox/Filter/Vger.pm +++ b/lib/PublicInbox/Filter/Vger.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Filter for vger.kernel.org list trailer @@ -24,7 +24,7 @@ sub scrub { # the vger appender seems to only work on the raw string, # so in multipart (e.g. GPG-signed) messages, the list trailer # becomes invisible to MIME-aware email clients. - if ($s =~ s/$l0\n$l1\n$l2\n$l3\n($l4\n)?\z//os) { + if ($s =~ s/$l0\n$l1\n$l2\n$l3\n(?:$l4\n)?\n*\z//os) { $mime = PublicInbox::Eml->new(\$s); } $self->ACCEPT($mime); diff --git a/lib/PublicInbox/Gcf2.pm b/lib/PublicInbox/Gcf2.pm index 7983c841..78392990 100644 --- a/lib/PublicInbox/Gcf2.pm +++ b/lib/PublicInbox/Gcf2.pm @@ -1,87 +1,132 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # backend for a git-cat-file-workalike based on libgit2, # other libgit2 stuff may go here, too. package PublicInbox::Gcf2; -use strict; -use PublicInbox::Spawn qw(which popen_rd); -use Fcntl qw(LOCK_EX); +use v5.12; +use PublicInbox::Spawn qw(which run_qx); # may set PERL_INLINE_DIRECTORY +use Fcntl qw(SEEK_SET); +use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); use IO::Handle; # autoflush -my (%CFG, $c_src, $lockfh); +use PublicInbox::Git qw($ck_unlinked_packs); +use PublicInbox::Lock; +use autodie qw(close open seek truncate); + BEGIN { + my (%CFG, $c_src); # PublicInbox::Spawn will set PERL_INLINE_DIRECTORY - # to ~/.cache/public-inbox/inline-c if it exists + # to ~/.cache/public-inbox/inline-c if it exists and Inline::C works my $inline_dir = $ENV{PERL_INLINE_DIRECTORY} // die 'PERL_INLINE_DIRECTORY not defined'; - my $f = "$inline_dir/.public-inbox.lock"; - open $lockfh, '>', $f or die "failed to open $f: $!\n"; - my $pc = which($ENV{PKG_CONFIG} // 'pkg-config'); - my ($dir) = (__FILE__ =~ m!\A(.+?)/[^/]+\z!); - my $rdr = {}; - open $rdr->{2}, '>', '/dev/null' or die "open /dev/null: $!"; - for my $x (qw(libgit2)) { - my $l = popen_rd([$pc, '--libs', $x], undef, $rdr); - $l = do { local $/; <$l> }; - next if $?; - my $c = popen_rd([$pc, '--cflags', $x], undef, $rdr); - $c = do { local $/; <$c> }; - next if $?; - # note: we name C source files .h to prevent - # ExtUtils::MakeMaker from automatically trying to - # build them. - my $f = "$dir/gcf2_$x.h"; - if (open(my $fh, '<', $f)) { - chomp($l, $c); - local $/; - $c_src = <$fh>; - $CFG{LIBS} = $l; - $CFG{CCFLAGSEX} = $c; - last; - } else { - die "E: $f: $!\n"; - } + # CentOS 7.x ships Inline 0.53, 0.64+ has built-in locking + my $lk = PublicInbox::Lock->new("$inline_dir/.public-inbox.lock"); + my $fh = $lk->lock_acquire; + + my $pc = which($ENV{PKG_CONFIG} // 'pkg-config') // + die "pkg-config missing for libgit2"; + my ($dir) = (__FILE__ =~ m!\A(.+?)/[^/]+\z!); + my $vals = {}; + my $rdr = { 2 => \(my $err) }; + my @switches = qw(modversion cflags libs); + for my $k (@switches) { + chomp(my $val = run_qx([$pc, "--$k", 'libgit2'], undef, $rdr)); + die "E: libgit2 not installed: $err\n" if $?; + $vals->{$k} = $val; } - die "E: libgit2 not installed\n" unless $c_src; + my $f = "$dir/gcf2_libgit2.h"; + $c_src = PublicInbox::IO::try_cat $f or die "cat $f: $!"; + # append pkg-config results to the source to ensure Inline::C + # can rebuild if there's changes (it doesn't seem to detect + # $CFG{CCFLAGSEX} nor $CFG{CPPFLAGS} changes) + $c_src .= "/* $pc --$_ libgit2 => $vals->{$_} */\n" for @switches; + open my $oldout, '>&', \*STDOUT; + open my $olderr, '>&', \*STDERR; + open STDOUT, '>&', $fh; + open STDERR, '>&', $fh; + STDERR->autoflush(1); + STDOUT->autoflush(1); + $CFG{CCFLAGSEX} = $vals->{cflags}; + $CFG{LIBS} = $vals->{libs}; - # CentOS 7.x ships Inline 0.53, 0.64+ has built-in locking - flock($lockfh, LOCK_EX) or die "LOCK_EX failed on $f: $!\n"; + # we use Capitalized and ALLCAPS for compatibility with old Inline::C + CORE::eval <<'EOM'; +use Inline C => Config => %CFG, BOOT => q[git_libgit2_init();]; +use Inline C => $c_src, BUILD_NOISY => 1; +EOM + $err = $@; + open(STDERR, '>&', $olderr); + open(STDOUT, '>&', $oldout); + if ($err) { + seek($fh, 0, SEEK_SET); + my @msg = <$fh>; + truncate($fh, 0); + die "Inline::C Gcf2 build failed:\n", $err, "\n", @msg; + } } -# we use Capitalized and ALLCAPS for compatibility with old Inline::C -use Inline C => Config => %CFG, BOOT => 'git_libgit2_init();'; -use Inline C => $c_src; -undef $c_src; -undef %CFG; -undef $lockfh; +sub add_alt ($$) { + my ($gcf2, $git_dir) = @_; + my $objdir = PublicInbox::Git->new($git_dir)->git_path('objects'); -# Usage: $^X -MPublicInbox::Gcf2 -e 'PublicInbox::Gcf2::loop()' + # libgit2 (tested 0.27.7+dfsg.1-0.2 and 0.28.3+dfsg.1-1~bpo10+1 + # in Debian) doesn't handle relative epochs properly when nested + # multiple levels. Add all the absolute paths to workaround it, + # since $EXTINDEX_DIR/ALL.git/objects/info/alternates uses absolute + # paths to reference $V2INBOX_DIR/all.git/objects and + # $V2INBOX_DIR/all.git/objects/info/alternates uses relative paths + # to refer to $V2INBOX_DIR/git/$EPOCH.git/objects + # + # See https://bugs.debian.org/975607 + if (my $s = PublicInbox::IO::try_cat("$objdir/info/alternates")) { + $gcf2->add_alternate($_) for ($s =~ m!^(/[^\n]+)\n!gms); + } + $gcf2->add_alternate($objdir); + 1; +} + +# Usage: $^X -MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop [EXPIRE-TIMEOUT] # (see lib/PublicInbox/Gcf2Client.pm) -sub loop { +sub loop (;$) { + my $exp = $_[0] || $ARGV[0] || 60; # seconds my $gcf2 = new(); + my (%seen, $check_at); STDERR->autoflush(1); STDOUT->autoflush(1); + my $pid = $$; while (<STDIN>) { chomp; my ($oid, $git_dir) = split(/ /, $_, 2); - $gcf2->add_alternate("$git_dir/objects"); + $seen{$git_dir} //= add_alt($gcf2, $git_dir); if (!$gcf2->cat_oid(1, $oid)) { # retry once if missing. We only get unabbreviated OIDs # from SQLite or Xapian DBs, here, so malicious clients # can't trigger excessive retries: - warn "I: $$ $oid missing, retrying in $git_dir\n"; + warn "# $$ $oid missing, retrying in $git_dir\n"; $gcf2 = new(); - $gcf2->add_alternate("$git_dir/objects"); + %seen = ($git_dir => add_alt($gcf2, $git_dir)); + $check_at = clock_gettime(CLOCK_MONOTONIC) + $exp; if ($gcf2->cat_oid(1, $oid)) { - warn "I: $$ $oid found after retry\n"; + warn "# $$ $oid found after retry\n"; } else { warn "W: $$ $oid missing after retry\n"; print "$oid missing\n"; # mimic git-cat-file } + } else { # check expiry to deal with deleted pack files + my $now = clock_gettime(CLOCK_MONOTONIC); + $check_at //= $now + $exp; + if ($now > $check_at) { + undef $check_at; + if (!$ck_unlinked_packs || + $ck_unlinked_packs->($pid)) { + $gcf2 = new(); + %seen = (); + } + } } } } diff --git a/lib/PublicInbox/Gcf2Client.pm b/lib/PublicInbox/Gcf2Client.pm index 42ff1bf3..07ff7dcb 100644 --- a/lib/PublicInbox/Gcf2Client.pm +++ b/lib/PublicInbox/Gcf2Client.pm @@ -1,51 +1,48 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # connects public-inbox processes to PublicInbox::Gcf2::loop() package PublicInbox::Gcf2Client; -use strict; +use v5.12; use parent qw(PublicInbox::DS); use PublicInbox::Git; -use PublicInbox::Spawn qw(popen_rd); -use IO::Handle (); -use PublicInbox::Syscall qw(EPOLLONESHOT EPOLLOUT); -# fields: -# async_cat => GitAsyncCat ref (read-only pipe) -# sock => writable pipe to Gcf2::loop - -sub new { bless($_[0] // {}, __PACKAGE__) } +use PublicInbox::Gcf2; # fails if Inline::C or libgit2-dev isn't available +use PublicInbox::Spawn qw(spawn); +use Socket qw(AF_UNIX SOCK_STREAM); +use PublicInbox::Syscall qw(EPOLLIN); +use PublicInbox::IO; +use autodie qw(socketpair); -sub gcf2c_begin ($) { - my ($self) = @_; +# fields: +# sock => socket to Gcf2::loop +# The rest of these fields are compatible with what PublicInbox::Git +# uses code-sharing +# pid => PID of Gcf2::loop process +# pid.owner => process which spawned {pid} +# in => same as {sock}, for compatibility with PublicInbox::Git +# inflight => array (see PublicInbox::Git) +sub new { + my ($opt) = @_; + my $self = bless {}, __PACKAGE__; # ensure the child process has the same @INC we do: my $env = { PERL5LIB => join(':', @INC) }; - my ($out_r, $out_w); - pipe($out_r, $out_w) or die "pipe failed: $!"; - my $rdr = { 0 => $out_r, 2 => $self->{2} }; - my $cmd = [$^X, qw[-MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop()]]; - @$self{qw(in pid)} = popen_rd($cmd, $env, $rdr); - fcntl($out_w, 1031, 4096) if $^O eq 'linux'; # 1031: F_SETPIPE_SZ - $out_w->autoflush(1); - $out_w->blocking(0); - $self->SUPER::new($out_w, 0); # EPOLL_CTL_ADD (a bit wasteful :x) + socketpair(my $s1, my $s2, AF_UNIX, SOCK_STREAM, 0); + $s1->blocking(0); + $opt->{0} = $opt->{1} = $s2; + my $cmd = [$^X, $^W ? ('-w') : (), + qw[-MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop]]; $self->{inflight} = []; + PublicInbox::IO::attach_pid($s1, spawn($cmd, $env, $opt), + \&PublicInbox::Git::gcf_drain, $self->{inflight}); + $self->{epwatch} = \undef; # for Git->cleanup + $self->SUPER::new($s1, EPOLLIN); } -sub fail { - my $self = shift; - $self->close; # PublicInbox::DS::close - PublicInbox::Git::fail($self, @_); -} - -sub cat_async ($$$;$) { +sub gcf2_async ($$$;$) { my ($self, $req, $cb, $arg) = @_; - my $inflight = $self->{inflight} // gcf2c_begin($self); - - # rare, I hope: - cat_async_step($self, $inflight) if $self->{wbuf}; - - $self->write(\"$req\n") or $self->fail("gcf2c write: $!"); - push @$inflight, $req, $cb, $arg; + my $inflight = $self->gcf_inflight or return; + PublicInbox::Git::write_all($self, $req, \&cat_async_step, $inflight); + push @$inflight, \$req, $cb, $arg; # ref prevents Git.pm retries } # ensure PublicInbox::Git::cat_async_step never calls cat_async_retry @@ -53,10 +50,10 @@ sub alternates_changed {} no warnings 'once'; -# this is the write-only end of a pipe, DS->EventLoop will call this -*event_step = \&PublicInbox::DS::flush_write; - -# used by GitAsyncCat -*cat_async_step = \&PublicInbox::Git::cat_async_step; +*gcf_inflight = \&PublicInbox::Git::gcf_inflight; # for event_step +*cat_async_step = \&PublicInbox::Git::cat_async_step; # for event_step +*event_step = \&PublicInbox::Git::event_step; +*fail = \&PublicInbox::Git::fail; +*DESTROY = \&PublicInbox::Git::DESTROY; 1; diff --git a/lib/PublicInbox/GetlineBody.pm b/lib/PublicInbox/GetlineBody.pm deleted file mode 100644 index 988bc63f..00000000 --- a/lib/PublicInbox/GetlineBody.pm +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> -# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> - -# Wrap a pipe or file for PSGI streaming response bodies and calls the -# end callback when the object goes out-of-scope. -# This depends on rpipe being _blocking_ on getline. -# -# This is only used by generic PSGI servers and not public-inbox-httpd -package PublicInbox::GetlineBody; -use strict; -use warnings; - -sub new { - my ($class, $rpipe, $end, $end_arg, $buf, $filter) = @_; - bless { - rpipe => $rpipe, - end => $end, - end_arg => $end_arg, - initial_buf => $buf, - filter => $filter, - }, $class; -} - -# close should always be called after getline returns undef, -# but a client aborting a connection can ruin our day; so lets -# hope our underlying PSGI server does not leak references, here. -sub DESTROY { $_[0]->close } - -sub getline { - my ($self) = @_; - my $rpipe = $self->{rpipe} or return; # EOF was set on previous call - my $buf = delete($self->{initial_buf}) // $rpipe->getline; - delete($self->{rpipe}) unless defined $buf; # set EOF for next call - if (my $filter = $self->{filter}) { - $buf = $filter->translate($buf); - } - $buf; -} - -sub close { - my ($self) = @_; - my ($end, $end_arg) = delete @$self{qw(end end_arg)}; - $end->($end_arg) if $end; -} - -1; diff --git a/lib/PublicInbox/GetlineResponse.pm b/lib/PublicInbox/GetlineResponse.pm new file mode 100644 index 00000000..290cce74 --- /dev/null +++ b/lib/PublicInbox/GetlineResponse.pm @@ -0,0 +1,40 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# For generic PSGI servers (not public-inbox-httpd/netd) which assumes their +# getline response bodies can be backpressure-aware for slow clients +# This depends on rpipe being _blocking_ on getline. +package PublicInbox::GetlineResponse; +use v5.12; + +sub response { + my ($qsp) = @_; + my ($res, $rbuf); + do { # read header synchronously + sysread($qsp->{rpipe}, $rbuf, 65536); + $res = $qsp->parse_hdr_done($rbuf); # fills $bref + } until defined($res); + my ($wcb, $filter) = $qsp->yield_pass(undef, $res) or return; + my $self = $res->[2] = bless { + qsp => $qsp, + filter => $filter, + }, __PACKAGE__; + my ($bref) = @{delete $qsp->{yield_parse_hdr}}; + $self->{rbuf} = $$bref if $$bref ne ''; + $wcb->($res); +} + +sub getline { + my ($self) = @_; + my $rpipe = $self->{qsp}->{rpipe} // do { + delete($self->{qsp})->finish; + return; # EOF was set on previous call + }; + my $buf = delete($self->{rbuf}) // $rpipe->getline; + $buf // delete($self->{qsp}->{rpipe}); # set EOF for next call + $self->{filter} ? $self->{filter}->translate($buf) : $buf; +} + +sub close {} + +1; diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index 2323cecc..a9a821ad 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: GPLv2 or later <https://www.gnu.org/licenses/gpl-2.0.txt> # # Used to read files from a git repository without excessive forking. @@ -9,23 +9,35 @@ package PublicInbox::Git; use strict; use v5.10.1; -use parent qw(Exporter); +use parent qw(Exporter PublicInbox::DS); +use PublicInbox::DS qw(now); +use autodie qw(socketpair read); use POSIX (); -use IO::Handle; # ->autoflush -use Errno qw(EINTR); +use Socket qw(AF_UNIX SOCK_STREAM); +use PublicInbox::Syscall qw(EPOLLIN EPOLLET); +use Errno qw(EAGAIN); use File::Glob qw(bsd_glob GLOB_NOSORT); -use Time::HiRes qw(stat); -use PublicInbox::Spawn qw(popen_rd); +use File::Spec (); +use PublicInbox::Spawn qw(spawn popen_rd run_qx which); +use PublicInbox::IO qw(read_all try_cat); use PublicInbox::Tmpfile; -use Carp qw(croak); -our @EXPORT_OK = qw(git_unquote git_quote); -our $PIPE_BUFSIZ = 65536; # Linux default +use Carp qw(croak carp); +use PublicInbox::SHA qw(sha_all); +our %HEXLEN2SHA = (40 => 1, 64 => 256); +our %OFMT2HEXLEN = (sha1 => 40, sha256 => 64); +our @EXPORT_OK = qw(git_unquote git_quote %HEXLEN2SHA %OFMT2HEXLEN + $ck_unlinked_packs git_exe); our $in_cleanup; +our $async_warn; # true in read-only daemons -use constant MAX_INFLIGHT => - (($^O eq 'linux' ? 4096 : POSIX::_POSIX_PIPE_BUF()) * 3) - / - 65; # SHA-256 hex size + "\n" in preparation for git using non-SHA1 +# committerdate:unix is git 2.9.4+ (2017-05-05), so using raw instead +my @MODIFIED_DATE = qw[for-each-ref --sort=-committerdate + --format=%(committerdate:raw) --count=1]; + +use constant { + MAX_INFLIGHT => 18, # arbitrary, formerly based on PIPE_BUF + BATCH_CMD_VER => v2.36.0, # git 2.36+ +}; my %GIT_ESC = ( a => "\a", @@ -40,19 +52,44 @@ my %GIT_ESC = ( ); my %ESC_GIT = map { $GIT_ESC{$_} => $_ } keys %GIT_ESC; +my $EXE_ST = ''; # pack('dd', st_dev, st_ino); # no `q' in some 32-bit builds +my ($GIT_EXE, $GIT_VER); + +sub git_exe () { + my $now = now; + state $next_check = $now - 10; + return $GIT_EXE if $now < $next_check; + $next_check = $now + 10; + $GIT_EXE = which('git') // die "git not found in $ENV{PATH}"; + my @st = stat(_) or die "stat($GIT_EXE): $!"; # can't do HiRes w/ _ + my $st = pack('dd', $st[0], $st[1]); + if ($st ne $EXE_ST) { + my $v = run_qx([ $GIT_EXE, '--version' ]); + die "$GIT_EXE --version: \$?=$?" if $?; + $v =~ /\b([0-9]+(?:\.[0-9]+){2})/ or die + "$GIT_EXE --version output: $v # unparseable"; + $GIT_VER = eval("v$1") // die "BUG: bad vstring: $1 ($v)"; + $EXE_ST = $st; + } + $GIT_EXE; +} + +sub git_version () { + git_exe; + $GIT_VER; +} # unquote pathnames used by git, see quote.c::unquote_c_style.c in git.git sub git_unquote ($) { return $_[0] unless ($_[0] =~ /\A"(.*)"\z/); $_[0] = $1; - $_[0] =~ s/\\([\\"abfnrtv])/$GIT_ESC{$1}/g; - $_[0] =~ s/\\([0-7]{1,3})/chr(oct($1))/ge; + $_[0] =~ s!\\([\\"abfnrtv]|[0-3][0-7]{2})!$GIT_ESC{$1}//chr(oct($1))!ge; $_[0]; } sub git_quote ($) { if ($_[0] =~ s/([\\"\a\b\f\n\r\t\013]|[^[:print:]])/ - '\\'.($ESC_GIT{$1}||sprintf("%0o",ord($1)))/egs) { + '\\'.($ESC_GIT{$1}||sprintf("%03o",ord($1)))/egs) { return qq{"$_[0]"}; } $_[0]; @@ -60,230 +97,302 @@ sub git_quote ($) { sub new { my ($class, $git_dir) = @_; + $git_dir .= '/'; + $git_dir =~ tr!/!/!s; + chop $git_dir; # may contain {-tmp} field for File::Temp::Dir - bless { git_dir => $git_dir, alt_st => '', -git_path => {} }, $class + my %dedupe = ($git_dir => undef); + bless { git_dir => (keys %dedupe)[0] }, $class } sub git_path ($$) { my ($self, $path) = @_; - $self->{-git_path}->{$path} ||= do { - local $/ = "\n"; - chomp(my $str = $self->qx(qw(rev-parse --git-path), $path)); - - # git prior to 2.5.0 did not understand --git-path - if ($str eq "--git-path\n$path") { - $str = "$self->{git_dir}/$path"; + $self->{-git_path}->{$path} //= do { + my $d = "$self->{git_dir}/$path"; + if (-e $d) { + $d; + } else { + local $/ = "\n"; + my $rdr = { 2 => \my $err }; + my $s = $self->qx([qw(rev-parse --git-path), $path], + undef, $rdr); + chomp $s; + + # git prior to 2.5.0 did not understand --git-path + $s eq "--git-path\n$path" ? $d : $s; } - $str; }; } sub alternates_changed { my ($self) = @_; my $alt = git_path($self, 'objects/info/alternates'); + use Time::HiRes qw(stat); my @st = stat($alt) or return 0; # can't rely on 'q' on some 32-bit builds, but `d' works my $st = pack('dd', $st[10], $st[7]); # 10: ctime, 7: size - return 0 if $self->{alt_st} eq $st; + return 0 if ($self->{alt_st} // '') eq $st; $self->{alt_st} = $st; # always a true value } +sub object_format { + $_[0]->{object_format} //= do { + my $fmt = $_[0]->qx(qw(config extensions.objectformat)); + $fmt eq "sha256\n" ? \'sha256' : \undef; + } +} + sub last_check_err { my ($self) = @_; - my $fh = $self->{err_c} or return; - sysseek($fh, 0, 0) or fail($self, "sysseek failed: $!"); - defined(sysread($fh, my $buf, -s $fh)) or - fail($self, "sysread failed: $!"); + my $fh = $self->{err_c} or return ''; + sysseek($fh, 0, 0) or $self->fail("sysseek: $!"); + my $size = -s $fh or return ''; + sysread($fh, my $buf, $size) // $self->fail("sysread: $!"); + truncate($fh, 0) or $self->fail("truncate: $!"); $buf; } -sub _bidi_pipe { - my ($self, $batch, $in, $out, $pid, $err) = @_; - if ($self->{$pid}) { - if (defined $err) { # "err_c" - my $fh = $self->{$err}; - sysseek($fh, 0, 0) or fail($self, "sysseek failed: $!"); - truncate($fh, 0) or fail($self, "truncate failed: $!"); - } - return; +sub gcf_drain { # awaitpid cb + my ($pid, $inflight, $bc) = @_; + while (@$inflight) { + my ($req, $cb, $arg) = splice(@$inflight, 0, 3); + $req = $$req if ref($req); + $bc and $req =~ s/\A(?:contents|info) //; + $req =~ s/ .*//; # drop git_dir for Gcf2Client + eval { $cb->(undef, $req, undef, undef, $arg) }; + warn "E: (in abort) $req: $@" if $@; } - my ($out_r, $out_w); - pipe($out_r, $out_w) or fail($self, "pipe failed: $!"); - my @cmd = (qw(git), "--git-dir=$self->{git_dir}", - qw(-c core.abbrev=40 cat-file), $batch); - my $redir = { 0 => $out_r }; - if ($err) { - my $id = "git.$self->{git_dir}$batch.err"; - my $fh = tmpfile($id) or fail($self, "tmpfile($id): $!"); - $self->{$err} = $fh; - $redir->{2} = $fh; - } - my ($in_r, $p) = popen_rd(\@cmd, undef, $redir); - $self->{$pid} = $p; - $out_w->autoflush(1); - if ($^O eq 'linux') { # 1031: F_SETPIPE_SZ - fcntl($out_w, 1031, 4096); - fcntl($in_r, 1031, 4096) if $batch eq '--batch-check'; - } - $self->{$out} = $out_w; - $self->{$in} = $in_r; -} - -sub my_read ($$$) { - my ($fh, $rbuf, $len) = @_; - my $left = $len - length($$rbuf); - my $r; - while ($left > 0) { - $r = sysread($fh, $$rbuf, $PIPE_BUFSIZ, length($$rbuf)); - if ($r) { - $left -= $r; - } else { - next if (!defined($r) && $! == EINTR); - return $r; - } - } - \substr($$rbuf, 0, $len, ''); } -sub my_readline ($$) { - my ($fh, $rbuf) = @_; - while (1) { - if ((my $n = index($$rbuf, "\n")) >= 0) { - return substr($$rbuf, 0, $n + 1, ''); - } - my $r = sysread($fh, $$rbuf, $PIPE_BUFSIZ, length($$rbuf)); - next if $r || (!defined($r) && $! == EINTR); - return defined($r) ? '' : undef; # EOF or error +sub _sock_cmd { + my ($self, $batch, $err_c) = @_; + $self->{sock} and Carp::confess('BUG: {sock} exists'); + socketpair(my $s1, my $s2, AF_UNIX, SOCK_STREAM, 0); + $s1->blocking(0); + my $opt = { pgid => 0, 0 => $s2, 1 => $s2 }; + my $gd = $self->{git_dir}; + if ($gd =~ s!/([^/]+/[^/]+)\z!/!) { + $opt->{-C} = $gd; + $gd = $1; } + + # git 2.31.0+ supports -c core.abbrev=no, don't bother with + # core.abbrev=64 since not many releases had SHA-256 prior to 2.31 + my $abbr = git_version lt v2.31.0 ? 40 : 'no'; + my @cmd = ($GIT_EXE, "--git-dir=$gd", '-c', "core.abbrev=$abbr", + 'cat-file', "--$batch"); + if ($err_c) { + my $id = "git.$self->{git_dir}.$batch.err"; + $self->{err_c} = $opt->{2} = tmpfile($id, undef, 1) or + $self->fail("tmpfile($id): $!"); + } + my $inflight = []; # TODO consider moving this into the IO object + my $pid = spawn(\@cmd, undef, $opt); + $self->{sock} = PublicInbox::IO::attach_pid($s1, $pid, + \&gcf_drain, $inflight, $self->{-bc}); + $self->{inflight} = $inflight; } -sub cat_async_retry ($$$$$) { - my ($self, $inflight, $req, $cb, $arg) = @_; +sub cat_async_retry ($$) { + my ($self, $old_inflight) = @_; # {inflight} may be non-existent, but if it isn't we delete it # here to prevent cleanup() from waiting: - delete $self->{inflight}; - cleanup($self); + my ($sock, $epwatch) = delete @$self{qw(sock epwatch inflight)}; + $self->SUPER::close if $epwatch; + my $new_inflight = batch_prepare($self); + + while (my ($oid, $cb, $arg) = splice(@$old_inflight, 0, 3)) { + write_all($self, $oid."\n", \&cat_async_step, $new_inflight); + $oid = \$oid if !@$new_inflight; # to indicate oid retried + push @$new_inflight, $oid, $cb, $arg; + } + $sock->close if $sock; # only safe once old_inflight is empty + cat_async_step($self, $new_inflight); # take one step +} - $self->{inflight} = $inflight; - batch_prepare($self); - my $buf = "$req\n"; - for (my $i = 0; $i < @$inflight; $i += 3) { - $buf .= "$inflight->[$i]\n"; +sub gcf_inflight ($) { + my ($self) = @_; + # FIXME: the first {sock} check can succeed but Perl can complain + # about an undefined value. Not sure why or how this happens but + # t/imapd.t can complain about it, sometimes. + if ($self->{sock}) { + if (eval { $self->{sock}->can_reap }) { + return $self->{inflight}; + } elsif ($@) { + no warnings 'uninitialized'; + warn "E: $self sock=$self->{sock}: can_reap failed: ". + "$@ (continuing...)"; + } + delete @$self{qw(sock inflight)}; + } else { + $self->close; } - print { $self->{out} } $buf or fail($self, "write error: $!"); - unshift(@$inflight, \$req, $cb, $arg); # \$ref to indicate retried + undef; +} - cat_async_step($self, $inflight); # take one step +# returns true if prefetch is successful +sub async_prefetch { + my ($self, $oid, $cb, $arg) = @_; + my $inflight = gcf_inflight($self) or return; + return if @$inflight; + substr($oid, 0, 0) = 'contents ' if $self->{-bc}; + write_all($self, "$oid\n", \&cat_async_step, $inflight); + push(@$inflight, $oid, $cb, $arg); } sub cat_async_step ($$) { my ($self, $inflight) = @_; - die 'BUG: inflight empty or odd' if scalar(@$inflight) < 3; - my ($req, $cb, $arg) = splice(@$inflight, 0, 3); - my $rbuf = delete($self->{cat_rbuf}) // \(my $new = ''); + croak 'BUG: inflight empty or odd' if scalar(@$inflight) < 3; + my ($req, $cb, $arg) = @$inflight[0, 1, 2]; my ($bref, $oid, $type, $size); - my $head = my_readline($self->{in}, $rbuf); + my $head = $self->{sock}->my_readline; + my $cmd = ref($req) ? $$req : $req; # ->fail may be called via Gcf2Client.pm + my $info = $self->{-bc} && substr($cmd, 0, 5) eq 'info '; if ($head =~ /^([0-9a-f]{40,}) (\S+) ([0-9]+)$/) { ($oid, $type, $size) = ($1, $2, $3 + 0); - $bref = my_read($self->{in}, $rbuf, $size + 1) or - $self->fail(defined($bref) ? 'read EOF' : "read: $!"); - chop($$bref) eq "\n" or $self->fail('LF missing after blob'); + unless ($info) { # --batch-command + $bref = $self->{sock}->my_bufread($size + 1) or + $self->fail(defined($bref) ? + 'read EOF' : "read: $!"); + chop($$bref) eq "\n" or + $self->fail('LF missing after blob'); + } + } elsif ($info && $head =~ / (missing|ambiguous)\n/) { + $type = $1; + $oid = substr($cmd, 5); # remove 'info ' } elsif ($head =~ s/ missing\n//s) { $oid = $head; # ref($req) indicates it's already been retried # -gcf2 retries internally, so it never hits this path: if (!ref($req) && !$in_cleanup && $self->alternates_changed) { - return cat_async_retry($self, $inflight, - $req, $cb, $arg); + return cat_async_retry($self, $inflight); } $type = 'missing'; - $oid = ref($req) ? $$req : $req if $oid eq ''; + if ($oid eq '') { + $oid = $cmd; + $oid =~ s/\A(?:contents|info) // if $self->{-bc}; + } } else { - $self->fail("Unexpected result from async git cat-file: $head"); + my $err = $! ? " ($!)" : ''; + $self->fail("bad result from async cat-file: $head$err"); } + splice(@$inflight, 0, 3); # don't retry $cb on ->fail eval { $cb->($bref, $oid, $type, $size, $arg) }; - $self->{cat_rbuf} = $rbuf if $$rbuf ne ''; - warn "E: $oid: $@\n" if $@; + async_err($self, $req, $oid, $@, $info ? 'check' : 'cat') if $@; } sub cat_async_wait ($) { my ($self) = @_; - my $inflight = delete $self->{inflight} or return; - while (scalar(@$inflight)) { - cat_async_step($self, $inflight); - } + my $inflight = gcf_inflight($self) or return; + cat_async_step($self, $inflight) while (scalar(@$inflight)); } sub batch_prepare ($) { - _bidi_pipe($_[0], qw(--batch in out pid)); + my ($self) = @_; + if (git_version ge BATCH_CMD_VER) { + $self->{-bc} = 1; + _sock_cmd($self, 'batch-command', 1); + } else { + _sock_cmd($self, 'batch'); + } } sub _cat_file_cb { - my ($bref, undef, undef, $size, $result) = @_; - @$result = ($bref, $size); + my ($bref, $oid, $type, $size, $result) = @_; + @$result = ($bref, $oid, $type, $size); } sub cat_file { - my ($self, $oid, $sizeref) = @_; + my ($self, $oid) = @_; my $result = []; cat_async($self, $oid, \&_cat_file_cb, $result); cat_async_wait($self); - $$sizeref = $result->[1] if $sizeref; - $result->[0]; + wantarray ? @$result : $result->[0]; } sub check_async_step ($$) { - my ($self, $inflight_c) = @_; - die 'BUG: inflight empty or odd' if scalar(@$inflight_c) < 3; - my ($req, $cb, $arg) = splice(@$inflight_c, 0, 3); - my $rbuf = delete($self->{rbuf_c}) // \(my $new = ''); - chomp(my $line = my_readline($self->{in_c}, $rbuf)); + my ($ck, $inflight) = @_; + croak 'BUG: inflight empty or odd' if scalar(@$inflight) < 3; + my ($req, $cb, $arg) = @$inflight[0, 1, 2]; + chomp(my $line = $ck->{sock}->my_readline); my ($hex, $type, $size) = split(/ /, $line); - # Future versions of git.git may have type=ambiguous, but for now, - # we must handle 'dangling' below (and maybe some other oddball - # stuff): + # git <2.21 would show `dangling' (2.21+ shows `ambiguous') # https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/ - if ($hex eq 'dangling' || $hex eq 'notdir' || $hex eq 'loop') { - my $ret = my_read($self->{in_c}, $rbuf, $type + 1); - fail($self, defined($ret) ? 'read EOF' : "read: $!") if !$ret; + if ($hex eq 'dangling') { + my $ret = $ck->{sock}->my_bufread($type + 1); + $ck->fail(defined($ret) ? 'read EOF' : "read: $!") if !$ret; } - eval { $cb->($hex, $type, $size, $arg, $self) }; - warn "E: check($req) $@\n" if $@; - $self->{rbuf_c} = $rbuf if $$rbuf ne ''; + splice(@$inflight, 0, 3); # don't retry $cb on ->fail + eval { $cb->(undef, $hex, $type, $size, $arg) }; + async_err($ck, $req, $hex, $@, 'check') if $@; } sub check_async_wait ($) { my ($self) = @_; - my $inflight_c = delete $self->{inflight_c} or return; - while (scalar(@$inflight_c)) { - check_async_step($self, $inflight_c); - } + return cat_async_wait($self) if $self->{-bc}; + my $ck = $self->{ck} or return; + my $inflight = gcf_inflight($ck) or return; + check_async_step($ck, $inflight) while (scalar(@$inflight)); +} + +# git <2.36 +sub ck { + $_[0]->{ck} //= bless { git_dir => $_[0]->{git_dir} }, + 'PublicInbox::GitCheck'; } sub check_async_begin ($) { my ($self) = @_; cleanup($self) if alternates_changed($self); - _bidi_pipe($self, qw(--batch-check in_c out_c pid_c err_c)); - die 'BUG: already in async check' if $self->{inflight_c}; - $self->{inflight_c} = []; + if (git_version ge BATCH_CMD_VER) { + $self->{-bc} = 1; + _sock_cmd($self, 'batch-command', 1); + } else { + _sock_cmd($self = ck($self), 'batch-check', 1); + } +} + +sub write_all { + my ($self, $buf, $read_step, $inflight) = @_; + $self->{sock} // Carp::confess 'BUG: no {sock}'; + Carp::confess('BUG: not an array') if ref($inflight) ne 'ARRAY'; + $read_step->($self, $inflight) while @$inflight >= MAX_INFLIGHT; + do { + my $w = syswrite($self->{sock}, $buf); + if (defined $w) { + return if $w == length($buf); + substr($buf, 0, $w, ''); # sv_chop + } elsif ($! != EAGAIN) { + $self->fail("write: $!"); + } + $read_step->($self, $inflight); + } while (1); } sub check_async ($$$$) { my ($self, $oid, $cb, $arg) = @_; - my $inflight_c = $self->{inflight_c} // check_async_begin($self); - if (scalar(@$inflight_c) >= MAX_INFLIGHT) { - check_async_step($self, $inflight_c); + my $inflight; + if ($self->{-bc}) { # likely as time goes on +batch_command: + $inflight = gcf_inflight($self) // cat_async_begin($self); + substr($oid, 0, 0) = 'info '; + write_all($self, "$oid\n", \&cat_async_step, $inflight); + } else { # accounts for git upgrades while we're running: + my $ck = $self->{ck}; # undef OK, maybe set in check_async_begin + $inflight = ($ck ? gcf_inflight($ck) : undef) + // check_async_begin($self); + goto batch_command if $self->{-bc}; + write_all($self->{ck}, "$oid\n", \&check_async_step, $inflight); } - print { $self->{out_c} } $oid, "\n" or fail($self, "write error: $!"); - push(@$inflight_c, $oid, $cb, $arg); + push(@$inflight, $oid, $cb, $arg); } sub _check_cb { # check_async callback - my ($hex, $type, $size, $result) = @_; + my (undef, $hex, $type, $size, $result) = @_; @$result = ($hex, $type, $size); } @@ -294,75 +403,87 @@ sub check { check_async_wait($self); my ($hex, $type, $size) = @$result; - # Future versions of git.git may show 'ambiguous', but for now, - # we must handle 'dangling' below (and maybe some other oddball - # stuff): + # git <2.21 would show `dangling' (2.21+ shows `ambiguous') # https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/ - return if $type eq 'missing' || $type eq 'ambiguous'; - return if $hex eq 'dangling' || $hex eq 'notdir' || $hex eq 'loop'; + return if $type =~ /\A(?:missing|ambiguous)\z/ || $hex eq 'dangling'; ($hex, $type, $size); } -sub _destroy { - my ($self, $rbuf, $in, $out, $pid, $err) = @_; - delete @$self{($rbuf, $in, $out)}; - delete $self->{$err} if $err; # `err_c' +sub fail { + my ($self, $msg) = @_; + $self->close; + croak(ref($self) . ' ' . ($self->{git_dir} // '') . ": $msg"); +} - # GitAsyncCat::event_step may delete {pid} - my $p = delete $self->{$pid} or return; +sub async_err ($$$$$) { + my ($self, $req, $oid, $err, $action) = @_; + $req = $$req if ref($req); # retried + my $msg = "E: $action $req ($oid): $err"; + $async_warn ? carp($msg) : $self->fail($msg); +} - # PublicInbox::DS may not be loaded - eval { PublicInbox::DS::dwaitpid($p, undef, undef) }; - waitpid($p, 0) if $@; # wait synchronously if not in event loop +sub cmd { + my $self = shift; + [ git_exe(), "--git-dir=$self->{git_dir}", @_ ] } -sub cat_async_abort ($) { - my ($self) = @_; - if (my $inflight = delete $self->{inflight}) { - while (@$inflight) { - my ($req, $cb, $arg) = splice(@$inflight, 0, 3); - $req =~ s/ .*//; # drop git_dir for Gcf2Client - eval { $cb->(undef, $req, undef, undef, $arg) }; - warn "E: $req: $@ (in abort)\n" if $@; - } +# $git->popen(qw(show f00)); # or +# $git->popen(qw(show f00), { GIT_CONFIG => ... }, { 2 => ... }); +sub popen { + my ($self, $cmd) = splice(@_, 0, 2); + $cmd = $self->cmd(ref($cmd) ? @$cmd : + ($cmd, grep { defined && !ref } @_)); + popen_rd($cmd, grep { !defined || ref } @_); # env and opt +} + +# same args as popen above +sub qx { + my $fh = popen(@_); + if (wantarray) { + my @ret = <$fh>; + $fh->close; # caller should check $? + @ret; + } else { + local $/; + my $ret = <$fh>; + $fh->close; # caller should check $? + $ret; } - cleanup($self); } -sub fail { - my ($self, $msg) = @_; - cat_async_abort($self); - croak(ref($self) . ' ' . ($self->{git_dir} // '') . ": $msg"); +sub date_parse { + my $self = shift; + map { + substr($_, length('--max-age='), -1) + } $self->qx('rev-parse', map { "--since=$_" } @_); } -sub popen { - my ($self, @cmd) = @_; - @cmd = ('git', "--git-dir=$self->{git_dir}", @cmd); - popen_rd(\@cmd); +sub _active ($) { + scalar(@{gcf_inflight($_[0]) // []}) || + ($_[0]->{ck} && scalar(@{gcf_inflight($_[0]->{ck}) // []})) } -sub qx { - my ($self, @cmd) = @_; - my $fh = $self->popen(@cmd); - local $/ = "\n"; - return <$fh> if wantarray; - local $/; - <$fh> +# check_async and cat_async may trigger the other, so ensure they're +# both completely done by using this: +sub async_wait_all ($) { + my ($self) = @_; + while (_active($self)) { + check_async_wait($self); + cat_async_wait($self); + } } # returns true if there are pending "git cat-file" processes sub cleanup { - my ($self) = @_; + my ($self, $lazy) = @_; + ($lazy && _active($self)) and + return $self->{epwatch} ? watch_async($self) : 1; local $in_cleanup = 1; - delete $self->{async_cat}; - check_async_wait($self); - cat_async_wait($self); - _destroy($self, qw(cat_rbuf in out pid)); - _destroy($self, qw(chk_rbuf in_c out_c pid_c err_c)); - !!($self->{pid} || $self->{pid_c}); + async_wait_all($self); + $_->close for ($self, (delete($self->{ck}) // ())); + undef; } - # assuming a well-maintained repo, this should be a somewhat # accurate estimation of its size # TODO: show this in the WWW UI as a hint to potential cloners @@ -370,98 +491,193 @@ sub packed_bytes { my ($self) = @_; my $n = 0; my $pack_dir = git_path($self, 'objects/pack'); - foreach my $p (bsd_glob("$pack_dir/*.pack", GLOB_NOSORT)) { - $n += -s $p; - } + $n += (-s $_ // 0) for (bsd_glob("$pack_dir/*.pack", GLOB_NOSORT)); $n } -sub DESTROY { cleanup(@_) } +sub DESTROY { cleanup($_[0]) } sub local_nick ($) { - my ($self) = @_; - my $ret = '???'; # don't show full FS path, basename should be OK: - if ($self->{git_dir} =~ m!/([^/]+)(?:/\.git)?\z!) { - $ret = "/path/to/$1"; - } - wantarray ? ($ret) : $ret; + $_[0]->{nick} // ($_[0]->{git_dir} =~ m!/([^/]+?)(?:/*\.git/*)?\z! ? + "$1.git" : undef); } sub host_prefix_url ($$) { my ($env, $url) = @_; return $url if index($url, '//') >= 0; - my $scheme = $env->{'psgi.url_scheme'}; my $host_port = $env->{HTTP_HOST} // "$env->{SERVER_NAME}:$env->{SERVER_PORT}"; - "$scheme://$host_port". ($env->{SCRIPT_NAME} || '/') . $url; + my $sn = $env->{SCRIPT_NAME} // ''; + "$env->{'psgi.url_scheme'}://\L$host_port\E$sn/$url"; +} + +sub base_url { # for coderepos, PSGI-only + my ($self, $env) = @_; # env - PSGI env + my $nick = $self->{nick} // return undef; + my $url = host_prefix_url($env, ''); + # for mount in Plack::Builder + $url .= '/' if substr($url, -1, 1) ne '/'; + $url . $nick . '/'; } +sub isrch {} # TODO + sub pub_urls { my ($self, $env) = @_; if (my $urls = $self->{cgit_url}) { - return map { host_prefix_url($env, $_) } @$urls; + map { host_prefix_url($env, $_) } @$urls; + } else { + (base_url($self, $env) // '???'); } - local_nick($self); } sub cat_async_begin { my ($self) = @_; cleanup($self) if $self->alternates_changed; - $self->batch_prepare; - die 'BUG: already in async' if $self->{inflight}; - $self->{inflight} = []; + die 'BUG: already in async' if gcf_inflight($self); + batch_prepare($self); } sub cat_async ($$$;$) { my ($self, $oid, $cb, $arg) = @_; - my $inflight = $self->{inflight} // cat_async_begin($self); - if (scalar(@$inflight) >= MAX_INFLIGHT) { - cat_async_step($self, $inflight); - } - - print { $self->{out} } $oid, "\n" or fail($self, "write error: $!"); + my $inflight = gcf_inflight($self) // cat_async_begin($self); + substr($oid, 0, 0) = 'contents ' if $self->{-bc}; + write_all($self, $oid."\n", \&cat_async_step, $inflight); push(@$inflight, $oid, $cb, $arg); } -sub async_prefetch { - my ($self, $oid, $cb, $arg) = @_; - if (my $inflight = $self->{inflight}) { - # we could use MAX_INFLIGHT here w/o the halving, - # but lets not allow one client to monopolize a git process - if (scalar(@$inflight) < int(MAX_INFLIGHT/2)) { - print { $self->{out} } $oid, "\n" or - fail($self, "write error: $!"); - return push(@$inflight, $oid, $cb, $arg); +# returns the modified time of a git repo, same as the "modified" field +# of a grokmirror manifest +sub modified ($;$) { + my $fh = $_[1] // popen($_[0], @MODIFIED_DATE); + (split(/ /, <$fh> // time))[0] + 0; # integerize for JSON +} + +sub cat_desc ($) { + my $desc = try_cat($_[0]); + chomp $desc; + utf8::decode($desc); + $desc =~ s/\s+/ /smg; + $desc eq '' ? undef : $desc; +} + +sub description { + cat_desc("$_[0]->{git_dir}/description") // 'Unnamed repository'; +} + +sub cloneurl { + my ($self, $env) = @_; + $self->{cloneurl} // do { + my @urls = split(/\s+/s, try_cat("$self->{git_dir}/cloneurl")); + scalar(@urls) ? ($self->{cloneurl} = \@urls) : undef; + } // [ substr(base_url($self, $env), 0, -1) ]; +} + +# for grokmirror, which doesn't read gitweb.description +# templates/hooks--update.sample and git-multimail in git.git +# only match "Unnamed repository", not the full contents of +# templates/this--description in git.git +sub manifest_entry { + my ($self, $epoch, $default_desc) = @_; + my $gd = $self->{git_dir}; + my @git = (git_exe, "--git-dir=$gd"); + my $sr = popen_rd([@git, 'show-ref']); + my $own = popen_rd([@git, qw(config gitweb.owner)]); + my $mod = popen_rd([@git, @MODIFIED_DATE]); + my $buf = description($self); + if (defined $epoch && index($buf, 'Unnamed repository') == 0) { + $buf = "$default_desc [epoch $epoch]"; + } + my $ent = { description => $buf, reference => undef }; + if (open(my $alt, '<', "$gd/objects/info/alternates")) { + # n.b.: GitPython doesn't seem to handle comments or C-quoted + # strings like native git does; and we don't for now, either. + local $/ = "\n"; + chomp(my @alt = <$alt>); + + # grokmirror only supports 1 alternate for "reference", + if (scalar(@alt) == 1) { + $buf = File::Spec->rel2abs($alt[0], "$gd/objects"); + $buf =~ s!/[^/]+/?\z!!; # basename + $ent->{reference} = $buf; } } - undef; + $ent->{fingerprint} = sha_all(1, $sr)->hexdigest; + $sr->close or return; # empty, uninitialized git repo + $ent->{modified} = modified(undef, $mod); + chomp($buf = <$own> // ''); + utf8::decode($buf); + $ent->{owner} = $buf eq '' ? undef : $buf; + $ent; +} + +our $ck_unlinked_packs = $^O eq 'linux' ? sub { + # FIXME: port gcf2-like over to git.git so we won't need to + # deal with libgit2 + my $s = try_cat "/proc/$_[0]/maps"; + $s =~ /\.(?:idx|pack) \(deleted\)/s ? 1 : undef; +} : undef; + +# returns true if there are pending cat-file processes +sub cleanup_if_unlinked { + my ($self) = @_; + $ck_unlinked_packs or return cleanup($self, 1); + # Linux-specific /proc/$PID/maps access + # TODO: support this inside git.git + my $nr_live = 0; + for my $obj ($self, ($self->{ck} // ())) { + my $sock = $obj->{sock} // next; + my $pid = $sock->attached_pid // next; + $ck_unlinked_packs->($pid) and return cleanup($self, 1); + ++$nr_live; + } + $nr_live; } -sub extract_cmt_time { - my ($bref, undef, undef, undef, $modified) = @_; - - if ($$bref =~ /^committer .*?> ([0-9]+) [\+\-]?[0-9]+/sm) { - my $cmt_time = $1 + 0; - $$modified = $cmt_time if $cmt_time > $$modified; +sub event_step { + my ($self) = @_; + my $inflight = gcf_inflight($self); + if ($inflight && @$inflight) { + $self->cat_async_step($inflight); + return $self->close unless $self->{sock}; + # don't loop here to keep things fair, but we must requeue + # if there's already-read data in pi_io_rbuf + $self->requeue if $self->{sock}->has_rbuf; } } -# returns the modified time of a git repo, same as the "modified" field -# of a grokmirror manifest -sub modified ($) { +sub schedule_cleanup { + my ($self) = @_; + PublicInbox::DS::add_uniq_timer($self+0, 30, \&cleanup, $self, 1); +} + +# idempotently registers with DS epoll/kqueue/select/poll +sub watch_async ($) { my ($self) = @_; - my $modified = 0; - my $fh = popen($self, qw(rev-parse --branches)); - local $/ = "\n"; - while (my $oid = <$fh>) { - chomp $oid; - cat_async($self, $oid, \&extract_cmt_time, \$modified); + schedule_cleanup($self); + $self->{epwatch} //= do { + $self->SUPER::new($self->{sock}, EPOLLIN); + \undef; } - cat_async_wait($self); - $modified || time; } +sub close { + my ($self) = @_; + my $sock = $self->{sock}; + delete @$self{qw(-bc err_c inflight)}; + delete($self->{epwatch}) ? $self->SUPER::close : delete($self->{sock}); + $sock->close if $sock; # calls gcf_drain via awaitpid +} + +package PublicInbox::GitCheck; # only for git <2.36 +use v5.12; +our @ISA = qw(PublicInbox::Git); +no warnings 'once'; + +# for event_step +*cat_async_step = \&PublicInbox::Git::check_async_step; + 1; __END__ =pod diff --git a/lib/PublicInbox/GitAsyncCat.pm b/lib/PublicInbox/GitAsyncCat.pm index b9dbe0cc..f57e0336 100644 --- a/lib/PublicInbox/GitAsyncCat.pm +++ b/lib/PublicInbox/GitAsyncCat.pm @@ -1,89 +1,51 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# -# internal class used by PublicInbox::Git + PublicInbox::DS -# This parses the output pipe of "git cat-file --batch" -# -# Note: this does NOT set the non-blocking flag, we expect `git cat-file' -# to be a local process, and git won't start writing a blob until it's -# fully read. So minimize context switching and read as much as possible -# and avoid holding a buffer in our heap any longer than it has to live. package PublicInbox::GitAsyncCat; -use strict; -use parent qw(PublicInbox::DS Exporter); -use POSIX qw(WNOHANG); -use PublicInbox::Syscall qw(EPOLLIN EPOLLET); -our @EXPORT = qw(git_async_cat git_async_prefetch); -use PublicInbox::Git (); +use v5.12; +use parent qw(Exporter); +our @EXPORT = qw(ibx_async_cat ibx_async_prefetch async_check); our $GCF2C; # singleton PublicInbox::Gcf2Client -sub close { - my ($self) = @_; - - if (my $gitish = delete $self->{gitish}) { - PublicInbox::Git::cat_async_abort($gitish); - } - $self->SUPER::close; # PublicInbox::DS::close -} - -sub event_step { - my ($self) = @_; - my $gitish = $self->{gitish} or return; - return $self->close if ($gitish->{in} // 0) != ($self->{sock} // 1); - my $inflight = $gitish->{inflight}; - if ($inflight && @$inflight) { - $gitish->cat_async_step($inflight); - - # child death? - if (($gitish->{in} // 0) != ($self->{sock} // 1)) { - $self->close; - } elsif (@$inflight || exists $gitish->{cat_rbuf}) { - # ok, more to do, requeue for fairness - $self->requeue; - } - } elsif ((my $pid = waitpid($gitish->{pid}, WNOHANG)) > 0) { - # May happen if the child process is killed by a BOFH - # (or segfaults) - delete $gitish->{pid}; - warn "E: gitish $pid exited with \$?=$?\n"; - $self->close; +sub ibx_async_cat ($$$$) { + my ($ibx, $oid, $cb, $arg) = @_; + my $isrch = $ibx->{isrch}; + my $git = $isrch ? $isrch->{es}->git : ($ibx->{git} // $ibx->git); + # {topdir} means ExtSearch (likely [extindex "all"]) with potentially + # 100K alternates. git v2.33+ can handle 100k alternates fairly well. + if (!$isrch && !defined($ibx->{topdir}) && !defined($git->{-tmp}) && + ($GCF2C //= eval { + require PublicInbox::Gcf2Client; + PublicInbox::Gcf2Client::new(); + } // 0)) { # 0: do not retry if libgit2 or Inline::C are missing + $GCF2C->gcf2_async("$oid $git->{git_dir}\n", $cb, $arg); + \undef; + } else { # read-only end of git-cat-file pipe + $git->cat_async($oid, $cb, $arg); + $git->watch_async; } } -sub git_async_cat ($$$$) { - my ($git, $oid, $cb, $arg) = @_; - my $gitish = $GCF2C; - if ($gitish) { - $oid .= " $git->{git_dir}"; - } else { - $gitish = $git; - } - $gitish->cat_async($oid, $cb, $arg); - $gitish->{async_cat} //= do { - my $self = bless { gitish => $gitish }, __PACKAGE__; - $self->SUPER::new($gitish->{in}, EPOLLIN|EPOLLET); - \undef; # this is a true ref() - }; +sub async_check ($$$$) { + my ($ibx, $oidish, $cb, $arg) = @_; # $ibx may be $ctx + my $git = $ibx->{git} // $ibx->git; + $git->check_async($oidish, $cb, $arg); + ($git->{ck} // $git)->watch_async; } # this is safe to call inside $cb, but not guaranteed to enqueue -# returns true if successful, undef if not. -sub git_async_prefetch { - my ($git, $oid, $cb, $arg) = @_; - if ($GCF2C) { - if ($GCF2C->{async_cat} && !$GCF2C->{wbuf}) { - $oid .= " $git->{git_dir}"; - return $GCF2C->cat_async($oid, $cb, $arg); - } - } elsif ($git->{async_cat} && (my $inflight = $git->{inflight})) { - # we could use MAX_INFLIGHT here w/o the halving, - # but lets not allow one client to monopolize a git process - if (@$inflight < int(PublicInbox::Git::MAX_INFLIGHT/2)) { - print { $git->{out} } $oid, "\n" or - $git->fail("write error: $!"); - return push(@$inflight, $oid, $cb, $arg); +# returns true if successful, undef if not. For fairness, we only +# prefetch if there's no in-flight requests. +sub ibx_async_prefetch { + my ($ibx, $oid, $cb, $arg) = @_; + my $git = $ibx->git; + if (!defined($ibx->{topdir}) && $GCF2C) { + if (!@{$GCF2C->gcf_inflight // []}) { + $oid .= " $git->{git_dir}\n"; + return $GCF2C->gcf2_async($oid, $cb, $arg); # true } + } elsif ($git->{epwatch}) { + return $git->async_prefetch($oid, $cb, $arg); } undef; } diff --git a/lib/PublicInbox/GitCredential.pm b/lib/PublicInbox/GitCredential.pm index c6da6a09..bb225ff3 100644 --- a/lib/PublicInbox/GitCredential.pm +++ b/lib/PublicInbox/GitCredential.pm @@ -1,32 +1,44 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# git-credential wrapper with built-in .netrc fallback package PublicInbox::GitCredential; -use strict; +use v5.12; use PublicInbox::Spawn qw(popen_rd); +use autodie qw(close pipe); -sub run ($$) { - my ($self, $op) = @_; - my ($in_r, $in_w); - pipe($in_r, $in_w) or die "pipe: $!"; - my $out_r = popen_rd([qw(git credential), $op], undef, { 0 => $in_r }); - close $in_r or die "close in_r: $!"; +sub run ($$;$) { + my ($self, $op, $lei) = @_; + my ($in_r, $in_w, $out_r); + my $cmd = [ qw(git credential), $op ]; + pipe($in_r, $in_w); + if ($lei) { # we'll die if disconnected: + pipe($out_r, my $out_w); + $lei->send_exec_cmd([ $in_r, $out_w ], $cmd, {}); + } else { + $out_r = popen_rd($cmd, undef, { 0 => $in_r }); + } + close $in_r; my $out = ''; for my $k (qw(url protocol host username password)) { - defined(my $v = $self->{$k}) or next; + my $v = $self->{$k} // next; die "`$k' contains `\\n' or `\\0'\n" if $v =~ /[\n\0]/; $out .= "$k=$v\n"; } - $out .= "\n"; - print $in_w $out or die "print (git credential $op): $!"; - close $in_w or die "close (git credential $op): $!"; + say $in_w $out; + close $in_w; return $out_r if $op eq 'fill'; <$out_r> and die "unexpected output from `git credential $op'\n"; - close $out_r or die "`git credential $op' failed: \$!=$! \$?=$?\n"; + $out_r->close or die "`git credential $op' failed: \$!=$! \$?=$?\n"; } -sub check_netrc ($) { - my ($self) = @_; +sub check_netrc { + my ($self, $lei) = @_; + + # n.b. lei doesn't load ~/.netrc by default, public-inbox-watch does, + # which may've been a mistake, but we have to live with it. + return if ($lei && !$lei->{opt}->{netrc}); # part of the standard library, but distributions may split it out eval { require Net::Netrc }; @@ -41,15 +53,16 @@ sub check_netrc ($) { } sub fill { - my ($self) = @_; - my $out_r = run($self, 'fill'); + my ($self, $lei) = @_; + my $out_r = run($self, 'fill', $lei); while (<$out_r>) { chomp; return if $_ eq ''; /\A([^=]+)=(.*)\z/ or die "bad line: $_\n"; $self->{$1} = $2; } - close $out_r or die "git credential fill failed: \$!=$! \$?=$?\n"; + $out_r->close or die "git credential fill failed: \$!=$! \$?=$?\n"; + $self->{filled} = 1; } 1; diff --git a/lib/PublicInbox/GitHTTPBackend.pm b/lib/PublicInbox/GitHTTPBackend.pm index fd2e00dd..396aa783 100644 --- a/lib/PublicInbox/GitHTTPBackend.pm +++ b/lib/PublicInbox/GitHTTPBackend.pm @@ -1,20 +1,22 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # when no endpoints match, fallback to this and serve a static file # or smart HTTP. This is our wrapper for git-http-backend(1) package PublicInbox::GitHTTPBackend; use strict; -use warnings; +use v5.10.1; use Fcntl qw(:seek); use IO::Handle; # ->flush use HTTP::Date qw(time2str); +use PublicInbox::Limiter; use PublicInbox::Qspawn; use PublicInbox::Tmpfile; use PublicInbox::WwwStatic qw(r @NO_CACHE); +use Carp (); # 32 is same as the git-daemon connection limit -my $default_limiter = PublicInbox::Qspawn::Limiter->new(32); +my $default_limiter = PublicInbox::Limiter->new(32); # n.b. serving "description" and "cloneurl" should be innocuous enough to # not cause problems. serving "config" might... @@ -22,13 +24,10 @@ my @text = qw[HEAD info/refs info/attributes objects/info/(?:http-alternates|alternates|packs) cloneurl description]; -my @binary = qw! - objects/[a-f0-9]{2}/[a-f0-9]{38} - objects/pack/pack-[a-f0-9]{40}\.(?:pack|idx) - !; +my @binary = ('objects/[a-f0-9]{2}/[a-f0-9]{38,62}', + 'objects/pack/pack-[a-f0-9]{40,64}\.(?:pack|idx)'); our $ANY = join('|', @binary, @text, 'git-upload-pack'); -my $BIN = join('|', @binary); my $TEXT = join('|', @text); sub serve { @@ -45,10 +44,7 @@ sub serve { serve_dumb($env, $git, $path); } -sub err ($@) { - my ($env, @msg) = @_; - $env->{'psgi.errors'}->print(@msg, "\n"); -} +sub ucarp { Carp::carp(@_); undef } my $prev = 0; my $exp; @@ -64,13 +60,13 @@ sub serve_dumb { my $h = []; my $type; - if ($path =~ m!\Aobjects/[a-f0-9]{2}/[a-f0-9]{38}\z!) { + if ($path =~ m!\Aobjects/[a-f0-9]{2}/[a-f0-9]{38,62}\z!) { $type = 'application/x-git-loose-object'; cache_one_year($h); - } elsif ($path =~ m!\Aobjects/pack/pack-[a-f0-9]{40}\.pack\z!) { + } elsif ($path =~ m!\Aobjects/pack/pack-[a-f0-9]{40,64}\.pack\z!) { $type = 'application/x-git-packed-objects'; cache_one_year($h); - } elsif ($path =~ m!\Aobjects/pack/pack-[a-f0-9]{40}\.idx\z!) { + } elsif ($path =~ m!\Aobjects/pack/pack-[a-f0-9]{40,64}\.idx\z!) { $type = 'application/x-git-packed-objects-toc'; cache_one_year($h); } elsif ($path =~ /\A(?:$TEXT)\z/o) { @@ -83,10 +79,10 @@ sub serve_dumb { PublicInbox::WwwStatic::response($env, $h, $path, $type); } -sub git_parse_hdr { # {parse_hdr} for Qspawn - my ($r, $bref, $dumb_args) = @_; +sub ghb_parse_hdr { # header parser for Qspawn + my ($r, $bref, @dumb_args) = @_; my $res = parse_cgi_headers($r, $bref) or return; # incomplete - $res->[0] == 403 ? serve_dumb(@$dumb_args) : $res; + $res->[0] == 403 ? serve_dumb(@dumb_args) : $res; } # returns undef if 403 so it falls back to dumb HTTP @@ -98,6 +94,7 @@ sub serve_smart { foreach my $name (qw(QUERY_STRING REMOTE_USER REMOTE_ADDR HTTP_CONTENT_ENCODING + HTTP_GIT_PROTOCOL CONTENT_TYPE SERVER_PROTOCOL REQUEST_METHOD)) { @@ -108,8 +105,9 @@ sub serve_smart { $env{GIT_HTTP_EXPORT_ALL} = '1'; $env{PATH_TRANSLATED} = "$git->{git_dir}/$path"; my $rdr = input_prepare($env) or return r(500); + $rdr->{quiet} = 1; my $qsp = PublicInbox::Qspawn->new([qw(git http-backend)], \%env, $rdr); - $qsp->psgi_return($env, $limiter, \&git_parse_hdr, [$env, $git, $path]); + $qsp->psgi_yield($env, $limiter, \&ghb_parse_hdr, $env, $git, $path); } sub input_prepare { @@ -117,42 +115,23 @@ sub input_prepare { my $input = $env->{'psgi.input'}; my $fd = eval { fileno($input) }; - if (defined $fd && $fd >= 0) { - return { 0 => $fd }; - } + return { 0 => $fd } if (defined $fd && $fd >= 0); my $id = "git-http.input.$env->{REMOTE_ADDR}:$env->{REMOTE_PORT}"; - my $in = tmpfile($id); - unless (defined $in) { - err($env, "could not open temporary file: $!"); - return; - } + my $in = tmpfile($id) // return ucarp("tmpfile: $!"); my $buf; while (1) { - my $r = $input->read($buf, 8192); - unless (defined $r) { - err($env, "error reading input: $!"); - return; - } + my $r = $input->read($buf, 8192) // return ucarp("read $!"); last if $r == 0; - unless (print $in $buf) { - err($env, "error writing temporary file: $!"); - return; - } + print $in $buf // return ucarp("print: $!"); } # ensure it's visible to git-http-backend(1): - unless ($in->flush) { - err($env, "error writing temporary file: $!"); - return; - } - unless (defined(sysseek($in, 0, SEEK_SET))) { - err($env, "error seeking temporary file: $!"); - return; - } + $in->flush // return ucarp("flush: $!"); + sysseek($in, 0, SEEK_SET) // return ucarp($env, "seek: $!"); { 0 => $in }; } -sub parse_cgi_headers { - my ($r, $bref) = @_; +sub parse_cgi_headers { # {parse_hdr} for Qspawn + my ($r, $bref, $ctx) = @_; return r(500) unless defined $r && $r >= 0; $$bref =~ s/\A(.*?)\r?\n\r?\n//s or return $r == 0 ? r(500) : undef; my $h = $1; @@ -166,7 +145,18 @@ sub parse_cgi_headers { push @h, $k, $v; } } - [ $code, \@h ] + + # fallback to WwwCoderepo if cgit 404s + if ($code == 404 && $ctx->{www} && !$ctx->{_coderepo_tried}++) { + my $wcb = delete $ctx->{env}->{'qspawn.wcb'}; + $ctx->{env}->{'plack.skip-deflater'} = 1; # prevent 2x gzip + $ctx->{env}->{'qspawn.fallback'} = $code; + my $res = $ctx->{www}->coderepo->srv($ctx); + $ctx->{env}->{'qspawn.wcb'} = $wcb; + $res; # CODE or ARRAY ref + } else { + [ $code, \@h ] + } } 1; diff --git a/lib/PublicInbox/GzipFilter.pm b/lib/PublicInbox/GzipFilter.pm index 20030433..8b630f25 100644 --- a/lib/PublicInbox/GzipFilter.pm +++ b/lib/PublicInbox/GzipFilter.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # In public-inbox <=1.5.0, public-inbox-httpd favored "getline" @@ -18,6 +18,7 @@ use Compress::Raw::Zlib qw(Z_OK); use PublicInbox::CompressNoop; use PublicInbox::Eml; use PublicInbox::GitAsyncCat; +use Carp qw(carp); our @EXPORT_OK = qw(gzf_maybe); my %OPT = (-WindowBits => 15 + 16, -AppendOutput => 1); @@ -46,18 +47,20 @@ sub gz_or_noop { sub gzf_maybe ($$) { bless { gz => gz_or_noop(@_) }, __PACKAGE__ } sub psgi_response { + # $code may be an HTTP response code (e.g. 200) or a CODE ref (mbox_hdr) my ($self, $code, $res_hdr) = @_; - my $env = $self->{env}; - $self->{gz} //= gz_or_noop($res_hdr, $env); - if ($env->{'pi-httpd.async'}) { - my $http = $env->{'psgix.io'}; # PublicInbox::HTTP + if ($self->{env}->{'pi-httpd.async'}) { + my $http = $self->{env}->{'psgix.io'}; # PublicInbox::HTTP $http->{forward} = $self; sub { my ($wcb) = @_; # -httpd provided write callback - $self->{http_out} = $wcb->([$code, $res_hdr]); + $self->{wcb_args} = [ $code, $res_hdr, $wcb ]; $self->can('async_next')->($http); # start stepping }; } else { # generic PSGI code path + ref($code) eq 'CODE' and + ($code, $res_hdr) = @{$code->($self)}; + $self->{gz} //= gz_or_noop($res_hdr, $self->{env}); [ $code, $res_hdr, $self ]; } } @@ -84,104 +87,132 @@ sub gzip_or_die () { $gz; } -# for GetlineBody (via Qspawn) when NOT using $env->{'pi-httpd.async'} +sub gone { # what: search/over/mm + my ($ctx, $what) = @_; + warn "W: `$ctx->{ibx}->{name}' $what went away unexpectedly\n"; + undef; +} + +# for GetlineResponse (via Qspawn) when NOT using $env->{'pi-httpd.async'} # Also used for ->getline callbacks -sub translate ($$) { - my $self = $_[0]; # $_[1] => input +sub translate { + my $self = shift; # $_[1] => input # allocate the zlib context lazily here, instead of in ->new. # Deflate contexts are memory-intensive and this object may # be sitting in the Qspawn limiter queue for a while. - my $gz = $self->{gz} //= gzip_or_die(); - my $zbuf = delete($self->{zbuf}); - if (defined $_[1]) { # my $buf = $_[1]; - my $err = $gz->deflate($_[1], $zbuf); - die "gzip->deflate: $err" if $err != Z_OK; - return $zbuf if length($zbuf) >= 8192; - - $self->{zbuf} = $zbuf; - ''; + $self->{gz} //= gzip_or_die(); + if (defined $_[0]) { # my $buf = $_[1]; + zmore($self, @_); + length($self->{zbuf}) >= 8192 ? delete($self->{zbuf}) : ''; } else { # undef == EOF - my $err = $gz->flush($zbuf); - die "gzip->flush: $err" if $err != Z_OK; - $zbuf; + $self->zflush; } } +# returns PublicInbox::HTTP::{Chunked,Identity} +sub http_out ($) { + my ($self) = @_; + $self->{http_out} // do { + my $args = delete $self->{wcb_args} // return undef; + my $wcb = pop @$args; # from PublicInbox:HTTP async + # $args->[0] may be \&mbox_hdr or similar + $args = $args->[0]->($self) if ref($args->[0]) eq 'CODE'; + $self->{gz} //= gz_or_noop($args->[1], $self->{env}); + $self->{http_out} = $wcb->($args); # $wcb->([$code, $hdr_ary]) + }; +} + +# returns undef if HTTP client disconnected, may return 0 +# because ->translate can return '' sub write { - # my $ret = bytes::length($_[1]); # XXX does anybody care? - $_[0]->{http_out}->write(translate($_[0], $_[1])); + my $self = shift; + http_out($self)->write($self->translate(@_)); +} + +sub zfh { + $_[0]->{zfh} // do { + open($_[0]->{zfh}, '>>', \($_[0]->{pbuf} //= '')) or + die "open: $!"; + $_[0]->{zfh} + }; } # similar to ->translate; use this when we're sure we know we have # more data to buffer after this sub zmore { - my $self = $_[0]; # $_[1] => input - my $err = $self->{gz}->deflate($_[1], $self->{zbuf}); - die "gzip->deflate: $err" if $err != Z_OK; - undef; + my $self = shift; + my $zfh = delete $self->{zfh}; + if (@_ > 1 || $zfh) { + print { $zfh // zfh($self) } @_; + @_ = (delete $self->{pbuf}); + delete $self->{zfh}; + }; + http_out($self); + my $err; + ($err = $self->{gz}->deflate($_[0], $self->{zbuf})) == Z_OK or + die "gzip->deflate: $err"; } # flushes and returns the final bit of gzipped data -sub zflush ($;$) { - my $self = $_[0]; # $_[1] => final input (optional) - my $zbuf = delete $self->{zbuf}; - my $gz = delete $self->{gz}; +sub zflush ($;@) { + my $self = shift; # $_[1..Inf] => final input (optional) + zmore($self, @_) if scalar(@_) || $self->{zfh}; + # not a bug, recursing on DS->write failure + my $gz = delete $self->{gz} // return ''; my $err; - if (defined $_[1]) { - $err = $gz->deflate($_[1], $zbuf); - die "gzip->deflate: $err" if $err != Z_OK; - } - $err = $gz->flush($zbuf); - die "gzip->flush: $err" if $err != Z_OK; + my $zbuf = delete $self->{zbuf}; + ($err = $gz->flush($zbuf)) == Z_OK or die "gzip->flush: $err"; $zbuf; } sub close { my ($self) = @_; - if (my $http_out = delete $self->{http_out}) { - $http_out->write(zflush($self)); - $http_out->close; - } + my $http_out = http_out($self) // return; + $http_out->write($self->zflush); + (delete($self->{http_out}) // return)->close; } -sub bail { +sub bail { my $self = shift; - if (my $env = $self->{env}) { - eval { $env->{'psgi.errors'}->print(@_, "\n") }; - warn("E: error printing to psgi.errors: $@", @_) if $@; - my $http = $env->{'psgix.io'} or return; # client abort - eval { $http->close }; # should hit our close - warn "E: error in http->close: $@" if $@; - eval { $self->close }; # just in case... - warn "E: error in self->close: $@" if $@; - } else { - warn @_, "\n"; - } + carp @_; + my $env = $self->{env} or return; + my $http = $env->{'psgix.io'} or return; # client abort + eval { $http->close }; # should hit our close + carp "E: error in http->close: $@" if $@; + eval { $self->close }; # just in case... + carp "E: error in self->close: $@" if $@; } # this is public-inbox-httpd-specific sub async_blob_cb { # git->cat_async callback my ($bref, $oid, $type, $size, $self) = @_; - my $http = $self->{env}->{'psgix.io'}; + my $http = $self->{env}->{'psgix.io'}; # PublicInbox::HTTP $http->{forward} or return; # client aborted - my $smsg = $self->{smsg} or bail($self, 'BUG: no smsg'); - if (!defined($oid)) { + my $smsg = $self->{smsg} or return bail($self, 'BUG: no smsg'); + $type // return + bail($self, "abort: $smsg->{blob} $self->{ibx}->{inboxdir}"); + if ($type ne 'blob') { # it's possible to have TOCTOU if an admin runs # public-inbox-(edit|purge), just move onto the next message - warn "E: $smsg->{blob} missing in $self->{-inbox}->{inboxdir}\n"; + warn "E: $smsg->{blob} $type in $self->{ibx}->{inboxdir}\n"; return $http->next_step($self->can('async_next')); } - $smsg->{blob} eq $oid or bail($self, "BUG: $smsg->{blob} != $oid"); + $smsg->{blob} eq $oid or return + bail($self, "BUG: $smsg->{blob} != $oid"); eval { $self->async_eml(PublicInbox::Eml->new($bref)) }; - bail($self, "E: async_eml: $@") if $@; - $http->next_step($self->can('async_next')); + return bail($self, "E: async_eml: $@") if $@; + if ($self->{-low_prio}) { # run via PublicInbox::WWW::event_step + push(@{$self->{www}->{-low_prio_q}}, $self) == 1 and + PublicInbox::DS::requeue($self->{www}); + } else { + $http->next_step($self->can('async_next')); + } } sub smsg_blob { my ($self, $smsg) = @_; - git_async_cat($self->{-inbox}->git, $smsg->{blob}, - \&async_blob_cb, $self); + ibx_async_cat($self->{ibx}, $smsg->{blob}, \&async_blob_cb, $self); } 1; diff --git a/lib/PublicInbox/HTTP.pm b/lib/PublicInbox/HTTP.pm index 5844ef44..7162732e 100644 --- a/lib/PublicInbox/HTTP.pm +++ b/lib/PublicInbox/HTTP.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Generic PSGI server for convenience. It aims to provide @@ -21,13 +21,11 @@ package PublicInbox::HTTP; use strict; use parent qw(PublicInbox::DS); -use bytes (); # only for bytes::length use Fcntl qw(:seek); use Plack::HTTPParser qw(parse_http_request); # XS or pure Perl use Plack::Util; use HTTP::Status qw(status_message); use HTTP::Date qw(time2str); -use IO::Handle; # ->write use PublicInbox::DS qw(msg_more); use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); use PublicInbox::Tmpfile; @@ -39,23 +37,19 @@ use constant { }; use Errno qw(EAGAIN); -my $pipelineq = []; -sub process_pipelineq () { - my $q = $pipelineq; - $pipelineq = []; - foreach (@$q) { - next unless $_->{sock}; - rbuf_process($_); - } -} - # Use the same configuration parameter as git since this is primarily # a slow-client sponge for git-http-backend # TODO: support per-respository http.maxRequestBuffer somehow... our $MAX_REQUEST_BUFFER = $ENV{GIT_HTTP_MAX_REQUEST_BUFFER} || (10 * 1024 * 1024); -open(my $null_io, '<', '/dev/null') or die "failed to open /dev/null: $!"; +open(my $null_io, '<', '/dev/null') or die "open /dev/null: $!"; +{ + my @n = stat($null_io) or die "stat(/dev/null): $!"; + my @i = stat(STDIN) or die "stat(STDIN): $!"; + $null_io = *STDIN{IO} if "@n[0, 1]" eq "@i[0, 1]"; +} + my $http_date; my $prev = 0; sub http_date () { @@ -64,13 +58,13 @@ sub http_date () { } sub new ($$$) { - my ($class, $sock, $addr, $httpd) = @_; - my $self = bless { httpd => $httpd }, $class; + my ($class, $sock, $addr, $srv_env) = @_; + my $self = bless { srv_env => $srv_env }, $class; my $ev = EPOLLIN; my $wbuf; if ($sock->can('accept_SSL') && !$sock->accept_SSL) { - return CORE::close($sock) if $! != EAGAIN; - $ev = PublicInbox::TLS::epollbit(); + return $sock->close if $! != EAGAIN; + $ev = PublicInbox::TLS::epollbit() or return $sock->close; $wbuf = [ \&PublicInbox::DS::accept_tls_step ]; } $self->{wbuf} = $wbuf if $wbuf; @@ -81,52 +75,36 @@ sub new ($$$) { sub event_step { # called by PublicInbox::DS my ($self) = @_; - - return unless $self->flush_write && $self->{sock}; + local $SIG{__WARN__} = $self->{srv_env}->{'pi-httpd.warn_cb'}; + return unless $self->flush_write && $self->{sock} && !$self->{forward}; # only read more requests if we've drained the write buffer, # otherwise we can be buffering infinitely w/o backpressure return read_input($self) if ref($self->{env}); - my $rbuf = $self->{rbuf} // (\(my $x = '')); - $self->do_read($rbuf, 8192, bytes::length($$rbuf)) or return; - rbuf_process($self, $rbuf); -} -sub rbuf_process { - my ($self, $rbuf) = @_; - $rbuf //= $self->{rbuf} // (\(my $x = '')); - - my %env = %{$self->{httpd}->{env}}; # full hash copy - my $r = parse_http_request($$rbuf, \%env); - - # We do not support Trailers in chunked requests, for now - # (they are rarely-used and git (as of 2.7.2) does not use them) - if ($r == -1 || $env{HTTP_TRAILER} || - # this length-check is necessary for PURE_PERL=1: - ($r == -2 && bytes::length($$rbuf) > 0x4000)) { - return quit($self, 400); - } - if ($r < 0) { # incomplete - $self->rbuf_idle($rbuf); - return $self->requeue; + my $rbuf = $self->{rbuf} // (\(my $x = '')); + my %env = %{$self->{srv_env}}; # full hash copy + my $r; + while (($r = parse_http_request($$rbuf, \%env)) < 0) { + # We do not support Trailers in chunked requests, for + # now (they are rarely-used and git (as of 2.7.2) does + # not use them). + # this length-check is necessary for PURE_PERL=1: + if ($r == -1 || $env{HTTP_TRAILER} || + ($r == -2 && length($$rbuf) > 0x4000)) { + return quit($self, 400); + } + $self->do_read($rbuf, 8192, length($$rbuf)) or return; } + return quit($self, 400) if grep(/\s/, keys %env); # stop smugglers $$rbuf = substr($$rbuf, $r); - my $len = input_prepare($self, \%env); - defined $len or return write_err($self, undef); # EMFILE/ENFILE + my $len = input_prepare($self, \%env) // + return write_err($self, undef); # EMFILE/ENFILE $len ? read_input($self, $rbuf) : app_dispatch($self, undef, $rbuf); } -# IO::Handle::write returns boolean, this returns bytes written: -sub xwrite ($$$) { - my ($fh, $rbuf, $max) = @_; - my $w = bytes::length($$rbuf); - $w = $max if $w > $max; - $fh->write($$rbuf, $w) or return; - $w; -} - sub read_input ($;$) { my ($self, $rbuf) = @_; $rbuf //= $self->{rbuf} // (\(my $x = '')); @@ -139,7 +117,7 @@ sub read_input ($;$) { while ($len > 0) { if ($$rbuf ne '') { - my $w = xwrite($input, $rbuf, $len); + my $w = syswrite($input, $$rbuf, $len); return write_err($self, $len) unless $w; $len -= $w; die "BUG: $len < 0 (w=$w)" if $len < 0; @@ -163,7 +141,7 @@ sub app_dispatch { $env->{REMOTE_ADDR} = $self->{remote_addr}; $env->{REMOTE_PORT} = $self->{remote_port}; if (defined(my $host = $env->{HTTP_HOST})) { - $host =~ s/:([0-9]+)\z// and $env->{SERVER_PORT} = $1; + $host =~ s/:([0-9]+)\z// and $env->{SERVER_PORT} = $1 + 0; $env->{SERVER_NAME} = $host; } if (defined $input) { @@ -173,7 +151,7 @@ sub app_dispatch { # note: NOT $self->{sock}, we want our close (+ PublicInbox::DS::close), # to do proper cleanup: $env->{'psgix.io'} = $self; # for ->close or async_pass - my $res = Plack::Util::run_app($self->{httpd}->{app}, $env); + my $res = Plack::Util::run_app($env->{'pi-httpd.app'}, $env); eval { if (ref($res) eq 'CODE') { $res->(sub { response_write($self, $env, $_[0]) }); @@ -182,7 +160,7 @@ sub app_dispatch { } }; if ($@) { - err($self, "response_write error: $@"); + warn "response_write error: $@"; $self->close; } } @@ -213,6 +191,7 @@ sub response_header_write { my $alive; if (!$term && $prot_persist) { # auto-chunk $chunked = $alive = 2; + $alive = 3 if $env->{REQUEST_METHOD} eq 'HEAD'; $h .= "Transfer-Encoding: chunked\r\n"; # no need for "Connection: keep-alive" with HTTP/1.1 } elsif ($term && ($prot_persist || ($conn =~ /\bkeep-alive\b/i))) { @@ -236,7 +215,7 @@ sub response_header_write { sub chunked_write ($$) { my $self = $_[0]; return if $_[1] eq ''; - msg_more($self, sprintf("%x\r\n", bytes::length($_[1]))); + msg_more($self, sprintf("%x\r\n", length($_[1]))); msg_more($self, $_[1]); # use $self->write(\"\n\n") if you care about real-time @@ -249,22 +228,19 @@ sub identity_write ($$) { $self->write(\($_[1])) if $_[1] ne ''; } -sub next_request ($) { - my ($self) = @_; - if ($self->{rbuf}) { - # avoid recursion for pipelined requests - PublicInbox::DS::requeue(\&process_pipelineq) if !@$pipelineq; - push @$pipelineq, $self; - } else { # wait for next request - $self->requeue; - } -} - sub response_done { my ($self, $alive) = @_; + if (my $forward = delete $self->{forward}) { # avoid recursion + eval { $forward->close }; + if ($@) { + warn "response forward->close error: $@"; + return $self->close; # idempotent + } + } delete $self->{env}; # we're no longer busy + # HEAD requests set $alive = 3 so we don't send "0\r\n\r\n"; $self->write(\"0\r\n\r\n") if $alive == 2; - $self->write($alive ? \&next_request : \&close); + $self->write($alive ? $self->can('requeue') : \&close); } sub getline_pull { @@ -274,7 +250,7 @@ sub getline_pull { # limit our own running time for fairness with other # clients and to avoid buffering too much: my $buf = eval { - local $/ = \8192; + local $/ = \65536; $forward->getline; } if $forward; @@ -296,17 +272,9 @@ sub getline_pull { return; # likely } } elsif ($@) { - err($self, "response ->getline error: $@"); + warn "response ->getline error: $@"; $self->close; } - # avoid recursion - if (delete $self->{forward}) { - eval { $forward->close }; - if ($@) { - err($self, "response ->close error: $@"); - $self->close; # idempotent - } - } response_done($self, delete $self->{alive}); } @@ -327,19 +295,13 @@ sub response_write { getline_pull($self); # kick-off! } # these are returned to the calling application: - } elsif ($alive == 2) { + } elsif ($alive >= 2) { bless [ $self, $alive ], 'PublicInbox::HTTP::Chunked'; } else { bless [ $self, $alive ], 'PublicInbox::HTTP::Identity'; } } -sub input_tmpfile ($) { - my $input = tmpfile('http.input', $_[0]->{sock}) or return; - $input->autoflush(1); - $input; -} - sub input_prepare { my ($self, $env) = @_; my ($input, $len); @@ -355,39 +317,33 @@ sub input_prepare { return quit($self, 400) if $hte !~ /\Achunked\z/i; $len = CHUNK_START; - $input = input_tmpfile($self); + $input = tmpfile('http.input', $self->{sock}); } else { $len = $env->{CONTENT_LENGTH}; if (defined $len) { # rfc7230 3.3.3.4 return quit($self, 400) if $len !~ /\A[0-9]+\z/; - return quit($self, 413) if $len > $MAX_REQUEST_BUFFER; - $input = $len ? input_tmpfile($self) : $null_io; + $input = $len ? tmpfile('http.input', $self->{sock}) + : $null_io; } else { $input = $null_io; } } # TODO: expire idle clients on ENFILE / EMFILE - return unless $input; - - $env->{'psgi.input'} = $input; + $env->{'psgi.input'} = $input // return; $self->{env} = $env; $self->{input_left} = $len || 0; } sub env_chunked { ($_[0]->{HTTP_TRANSFER_ENCODING} // '') =~ /\Achunked\z/i } -sub err ($$) { - eval { $_[0]->{httpd}->{env}->{'psgi.errors'}->print($_[1]."\n") }; -} - sub write_err { my ($self, $len) = @_; my $msg = $! || '(zero write)'; $msg .= " ($len bytes remaining)" if defined $len; - err($self, "error buffering to input: $msg"); + warn "error buffering to input: $msg"; quit($self, 500); } @@ -396,7 +352,7 @@ sub recv_err { if ($! == EAGAIN) { # epoll/kevent watch already set by do_read $self->{input_left} = $len; } else { - err($self, "error reading input: $! ($len bytes remaining)"); + warn "error reading input: $! ($len bytes remaining)"; } } @@ -411,12 +367,12 @@ sub read_input_chunked { # unlikely... $$rbuf =~ s/\A\r\n//s and return app_dispatch($self, $input, $rbuf); - return quit($self, 400) if bytes::length($$rbuf) > 2; + return quit($self, 400) if length($$rbuf) > 2; } if ($len == CHUNK_END) { if ($$rbuf =~ s/\A\r\n//s) { $len = CHUNK_START; - } elsif (bytes::length($$rbuf) > 2) { + } elsif (length($$rbuf) > 2) { return quit($self, 400); } } @@ -426,14 +382,14 @@ sub read_input_chunked { # unlikely... if (($len + -s $input) > $MAX_REQUEST_BUFFER) { return quit($self, 413); } - } elsif (bytes::length($$rbuf) > CHUNK_MAX_HDR) { + } elsif (length($$rbuf) > CHUNK_MAX_HDR) { return quit($self, 400); } # will break from loop since $len >= 0 } if ($len < 0) { # chunk header is trickled, read more - $self->do_read($rbuf, 8192, bytes::length($$rbuf)) or + $self->do_read($rbuf, 8192, length($$rbuf)) or return recv_err($self, $len); # (implicit) goto chunk_start if $r > 0; } @@ -442,7 +398,7 @@ sub read_input_chunked { # unlikely... # drain the current chunk until ($len <= 0) { if ($$rbuf ne '') { - my $w = xwrite($input, $rbuf, $len); + my $w = syswrite($input, $$rbuf, $len); return write_err($self, "$len chunk") if !$w; $len -= $w; if ($len == 0) { @@ -477,15 +433,14 @@ sub close { my $self = $_[0]; if (my $forward = delete $self->{forward}) { eval { $forward->close }; - err($self, "forward ->close error: $@") if $@; + warn "forward ->close error: $@" if $@; } $self->SUPER::close; # PublicInbox::DS::close } -# for graceful shutdown in PublicInbox::Daemon: -sub busy () { +sub busy { # for graceful shutdown in PublicInbox::Daemon: my ($self) = @_; - ($self->{rbuf} || exists($self->{env}) || $self->{wbuf}); + defined($self->{rbuf}) || exists($self->{env}) || defined($self->{wbuf}) } # runs $cb on the next iteration of the event loop at earliest @@ -499,11 +454,12 @@ sub next_step { # They may be exposed to the PSGI application when the PSGI app # returns a CODE ref for "push"-based responses package PublicInbox::HTTP::Chunked; -use strict; +use v5.12; sub write { # ([$http], $buf) = @_; - PublicInbox::HTTP::chunked_write($_[0]->[0], $_[1]) + PublicInbox::HTTP::chunked_write($_[0]->[0], $_[1]); + $_[0]->[0]->{sock} ? length($_[1]) : undef; } sub close { @@ -512,12 +468,13 @@ sub close { } package PublicInbox::HTTP::Identity; -use strict; +use v5.12; our @ISA = qw(PublicInbox::HTTP::Chunked); sub write { # ([$http], $buf) = @_; PublicInbox::HTTP::identity_write($_[0]->[0], $_[1]); + $_[0]->[0]->{sock} ? length($_[1]) : undef; } 1; diff --git a/lib/PublicInbox/HTTPD.pm b/lib/PublicInbox/HTTPD.pm index a9f55ff6..6a6347d8 100644 --- a/lib/PublicInbox/HTTPD.pm +++ b/lib/PublicInbox/HTTPD.pm @@ -1,29 +1,33 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # wraps a listen socket for HTTP and links it to the PSGI app in # public-inbox-httpd package PublicInbox::HTTPD; +use v5.10.1; use strict; -use warnings; -use Plack::Util; -use PublicInbox::HTTPD::Async; -use PublicInbox::Daemon; +use Plack::Util (); +use Plack::Builder; +use PublicInbox::HTTP; -sub pi_httpd_async { PublicInbox::HTTPD::Async->new(@_) } +# we have a different env for ever listener socket for +# SERVER_NAME, SERVER_PORT and psgi.url_scheme +# envs: listener FD => PSGI env +sub new { bless { envs => {}, err => \*STDERR }, __PACKAGE__ } -sub new { - my ($class, $sock, $app) = @_; - my $n = getsockname($sock) or die "not a socket: $sock $!\n"; +# this becomes {srv_env} in PublicInbox::HTTP +sub env_for ($$$) { + my ($self, $srv, $client) = @_; + my $n = getsockname($srv) or die "not a socket: $srv $!\n"; my ($host, $port) = PublicInbox::Daemon::host_with_port($n); - - my %env = ( + { SERVER_NAME => $host, SERVER_PORT => $port, SCRIPT_NAME => '', 'psgi.version' => [ 1, 1 ], - 'psgi.errors' => \*STDERR, - 'psgi.url_scheme' => 'http', + 'psgi.errors' => $self->{err}, + 'psgi.url_scheme' => $client->can('accept_SSL') ? + 'https' : 'http', 'psgi.nonblocking' => Plack::Util::TRUE, 'psgi.streaming' => Plack::Util::TRUE, 'psgi.run_once' => Plack::Util::FALSE, @@ -37,13 +41,48 @@ sub new { # XXX unstable API!, only GitHTTPBackend needs # this to limit git-http-backend(1) parallelism. # We also check for the truthiness of this to - # detect when to use git_async_cat for slow blobs - 'pi-httpd.async' => \&pi_httpd_async - ); - bless { - app => $app, - env => \%env - }, $class; + # detect when to use async paths for slow blobs + 'pi-httpd.async' => 1, + 'pi-httpd.app' => $self->{app}, + 'pi-httpd.warn_cb' => $self->{warn_cb}, + } +} + +sub refresh_groups { + my ($self) = @_; + my $app; + $self->{psgi} //= $main::ARGV[0] if @main::ARGV; + if ($self->{psgi}) { + eval { $app = Plack::Util::load_psgi($self->{psgi}) }; + die $@, <<EOM if $@; +$0 runs in /, command-line paths must be absolute +EOM + } else { + require PublicInbox::WWW; + my $www = PublicInbox::WWW->new; + $www->preload; + $app = builder { + eval { enable 'ReverseProxy' }; + $@ and warn <<EOM; +Plack::Middleware::ReverseProxy missing, +URL generation for redirects may be wrong if behind a reverse proxy +EOM + enable 'Head'; + sub { $www->call(@_) }; + }; + } + $_->{'pi-httpd.app'} = $app for values %{$self->{envs}}; + $self->{app} = $app; +} + +sub post_accept_cb { # for Listener->{post_accept} + my ($self) = @_; + sub { + my ($client, $addr, $srv) = @_; # $_[4] - tls_wrap (unused) + PublicInbox::HTTP->new($client, $addr, + $self->{envs}->{fileno($srv)} //= + env_for($self, $srv, $client)); + } } 1; diff --git a/lib/PublicInbox/HTTPD/Async.pm b/lib/PublicInbox/HTTPD/Async.pm deleted file mode 100644 index 87a6a5f9..00000000 --- a/lib/PublicInbox/HTTPD/Async.pm +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> -# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# -# XXX This is a totally unstable API for public-inbox internal use only -# This is exposed via the 'pi-httpd.async' key in the PSGI env hash. -# The name of this key is not even stable! -# Currently intended for use with read-only pipes with expensive -# processes such as git-http-backend(1), cgit(1) -# -# fields: -# http: PublicInbox::HTTP ref -# fh: PublicInbox::HTTP::{Identity,Chunked} ref (can ->write + ->close) -# cb: initial read callback -# arg: arg for {cb} -# end_obj: CODE or object which responds to ->event_step when ->close is called -package PublicInbox::HTTPD::Async; -use strict; -use parent qw(PublicInbox::DS); -use Errno qw(EAGAIN); -use PublicInbox::Syscall qw(EPOLLIN EPOLLET); - -# This is called via: $env->{'pi-httpd.async'}->() -# $io is a read-only pipe ($rpipe) for now, but may be a -# bidirectional socket in the future. -sub new { - my ($class, $io, $cb, $arg, $end_obj) = @_; - - # no $io? call $cb at the top of the next event loop to - # avoid recursion: - unless (defined($io)) { - PublicInbox::DS::requeue($cb ? $cb : $arg); - die '$end_obj unsupported w/o $io' if $end_obj; - return; - } - my $self = bless { - cb => $cb, # initial read callback - arg => $arg, # arg for $cb - end_obj => $end_obj, # like END{}, can ->event_step - }, $class; - IO::Handle::blocking($io, 0); - $self->SUPER::new($io, EPOLLIN | EPOLLET); -} - -sub event_step { - my ($self) = @_; - if (my $cb = delete $self->{cb}) { - # this may call async_pass when headers are done - $cb->(delete $self->{arg}); - } elsif (my $sock = $self->{sock}) { - my $http = $self->{http}; - # $self->{sock} is a read pipe for git-http-backend or cgit - # and 65536 is the default Linux pipe size - my $r = sysread($sock, my $buf, 65536); - if ($r) { - $self->{fh}->write($buf); # may call $http->close - if ($http->{sock}) { # !closed - $self->requeue; - # let other clients get some work done, too - return; - } - - # else: fall through to close below... - } elsif (!defined $r && $! == EAGAIN) { - return; # EPOLLET means we'll be notified - } - - # Done! Error handling will happen in $self->{fh}->close - # called by end_obj->event_step handler - delete $http->{forward}; - $self->close; # queues end_obj->event_step to be called - } # else { # we may've been requeued but closed by $http -} - -# once this is called, all data we read is passed to the -# to the PublicInbox::HTTP instance ($http) via $fh->write -sub async_pass { - my ($self, $http, $fh, $bref) = @_; - # In case the client HTTP connection ($http) dies, it - # will automatically close this ($self) object. - $http->{forward} = $self; - - # write anything we overread when we were reading headers - $fh->write($$bref); # PublicInbox:HTTP::{chunked,identity}_wcb - - # we're done with this, free this memory up ASAP since the - # calls after this may use much memory: - $$bref = undef; - - $self->{http} = $http; - $self->{fh} = $fh; - - # either hit EAGAIN or ->requeue to keep EPOLLET happy - event_step($self); -} - -# may be called as $forward->close in PublicInbox::HTTP or EOF (event_step) -sub close { - my $self = $_[0]; - $self->SUPER::close; # DS::close - - # we defer this to the next timer loop since close is deferred - if (my $end_obj = delete $self->{end_obj}) { - # this calls $end_obj->event_step - # (likely PublicInbox::Qspawn::event_step, - # NOT PublicInbox::HTTPD::Async::event_step) - PublicInbox::DS::requeue($end_obj); - } -} - -1; diff --git a/lib/PublicInbox/HlMod.pm b/lib/PublicInbox/HlMod.pm index de285fc2..f42ece80 100644 --- a/lib/PublicInbox/HlMod.pm +++ b/lib/PublicInbox/HlMod.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # I have no idea how stable or safe this is for handling untrusted @@ -14,7 +14,7 @@ # wrapper for SWIG-generated highlight.pm bindings package PublicInbox::HlMod; use strict; -use warnings; +use v5.10.1; use highlight; # SWIG-generated stuff use PublicInbox::Hval qw(src_escape ascii_html); my $hl; @@ -54,8 +54,7 @@ sub _parse_filetypes ($) { (\%ext2lang, \@shebang); } -# We only need one instance, so we don't need to do -# highlight::CodeGenerator::deleteInstance +# We only need one instance sub new { my ($class) = @_; $hl ||= do { @@ -95,33 +94,26 @@ sub do_hl { sub do_hl_lang { my ($self, $str, $lang) = @_; - my $dir = $self->{-dir}; my $langpath; - if (defined $lang) { - $langpath = $dir->getLangPath("$lang.lang") or return; - $lang = undef unless -f $langpath - } - unless (defined $lang) { - $lang = _shebang2lang($self, $str) or return; - $langpath = $dir->getLangPath("$lang.lang") or return; - return unless -f $langpath + $langpath = $self->{-dir}->getLangPath("$lang.lang") or return; + undef $lang unless -f $langpath; } - my $gen = $self->{$langpath} ||= do { - my $g = highlight::CodeGenerator::getInstance($highlight::HTML); - $g->setFragmentCode(1); # generate html fragment + $lang //= _shebang2lang($self, $str) // return; + $langpath = $self->{-dir}->getLangPath("$lang.lang") or return; + return unless -f $langpath; - # whatever theme works - my $themepath = $dir->getThemePath('print.theme'); - $g->initTheme($themepath); - $g->loadLanguage($langpath); - $g->setEncoding('utf-8'); - $g; - }; + my $g = highlight::CodeGenerator::getInstance($highlight::HTML); + $g->setFragmentCode(1); # generate html fragment + # whatever theme works + $g->initTheme($self->{-dir}->getThemePath('print.theme')); + $g->loadLanguage($langpath); + $g->setEncoding('utf-8'); # we assume $$str is valid UTF-8, but the SWIG binding doesn't # know that, so ensure it's marked as UTF-8 even if it isnt... - my $out = $gen->generateString($$str); + my $out = $g->generateString($$str); + highlight::CodeGenerator::deleteInstance($g); utf8::decode($out); src_escape($out); \$out; diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm index fb21041a..963dbb71 100644 --- a/lib/PublicInbox/Hval.pm +++ b/lib/PublicInbox/Hval.pm @@ -1,18 +1,19 @@ -# Copyright (C) 2014-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2014-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # represents a header value in various forms. Used for HTML generation # in our web interface(s) package PublicInbox::Hval; +use v5.10.1; # be careful about unicode_strings in v5.12; use strict; -use warnings; use Encode qw(find_encoding); use PublicInbox::MID qw/mid_clean mid_escape/; use base qw/Exporter/; our @EXPORT_OK = qw/ascii_html obfuscate_addrs to_filename src_escape - to_attr prurl mid_href fmt_ts ts2str/; + to_attr prurl mid_href fmt_ts ts2str utf8_maybe/; use POSIX qw(strftime); my $enc_ascii = find_encoding('us-ascii'); +use File::Spec; # safe-ish acceptable filename pattern for portability our $FN = '[a-zA-Z0-9][a-zA-Z0-9_\-\.]+[a-zA-Z0-9]'; # needs \z anchor @@ -34,7 +35,7 @@ my %escape_sequence = ( "\x7f" => '\\x7f', # DEL ); -my %xhtml_map = ( +our %xhtml_map = ( '"' => '"', '&' => '&', "'" => ''', @@ -69,7 +70,16 @@ sub prurl ($$) { $u = $host_match[0] // $u->[0]; # fall through to below: } - index($u, '//') == 0 ? "$env->{'psgi.url_scheme'}:$u" : $u; + my $dslash = index($u, '//'); + if ($dslash == 0) { + "$env->{'psgi.url_scheme'}:$u" + } elsif ($dslash < 0 && substr($u, 0, 1) ne '/' && + substr(my $path = $env->{PATH_INFO}, 0, 1) eq '/') { + # this won't touch the FS at all: + File::Spec->abs2rel("/$u", $path); + } else { + $u; + } } # for misguided people who believe in this stuff, give them a @@ -82,15 +92,22 @@ sub obfuscate_addrs ($$;$) { my $repl = $_[2] // '•'; my $re = $ibx->{-no_obfuscate_re}; # regex of domains my $addrs = $ibx->{-no_obfuscate}; # { $address => 1 } - $_[1] =~ s/(([\w\.\+=\-]+)\@([\w\-]+\.[\w\.\-]+))/ - my ($addr, $user, $domain) = ($1, $2, $3); - if ($addrs->{$addr} || ((defined $re && $domain =~ $re))) { - $addr; + $_[1] =~ s#(\S+)\@([\w\-]+\.[\w\.\-]+)# + my ($pfx, $domain) = ($1, $2); + if (index($pfx, '://') > 0 || $pfx !~ s/([\w\.\+=\-]+)\z//) { + "$pfx\@$domain"; } else { - $domain =~ s!([^\.]+)\.!$1$repl!; - $user . '@' . $domain + my $user = $1; + my $addr = "$user\@$domain"; + if ($addrs->{$addr} || ((defined($re) && + $domain =~ $re))) { + $pfx.$addr; + } else { + $domain =~ s!([^\.]+)\.!$1$repl!; + $pfx . $user . '@' . $domain + } } - /sge; + #sge; } # like format_sanitized_subject in git.git pretty.c with '%f' format string @@ -111,7 +128,7 @@ $ESCAPES{'/'} = ':'; # common sub to_attr ($) { my ($str) = @_; - # git would never do this to us: + # git would never do this to us, mail diff uses // to prevent anchors: return if index($str, '//') >= 0; my $first = ''; @@ -128,6 +145,15 @@ sub to_attr ($) { sub ts2str ($) { strftime('%Y%m%d%H%M%S', gmtime($_[0])) }; # human-friendly format -sub fmt_ts ($) { strftime('%Y-%m-%d %k:%M', gmtime($_[0])) } +sub fmt_ts ($) { + # strftime %k is not portable and leading zeros in %H slow me down + my (undef, $M, $H, $d, $m, $Y) = gmtime $_[0]; + sprintf '%u-%02u-%02u % 2u:%02u', $Y + 1900, $m + 1, $d, $H, $M; +} + +sub utf8_maybe ($) { + utf8::decode($_[0]); + utf8::valid($_[0]) or utf8::encode($_[0]); # non-UTF-8 data exists +} 1; diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm index a861282f..b12533cb 100644 --- a/lib/PublicInbox/IMAP.pm +++ b/lib/PublicInbox/IMAP.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Each instance of this represents an IMAP client connected to @@ -36,18 +36,10 @@ use parent qw(PublicInbox::DS); use PublicInbox::Eml; use PublicInbox::EmlContentFoo qw(parse_content_disposition); use PublicInbox::DS qw(now); -use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); use PublicInbox::GitAsyncCat; use Text::ParseWords qw(parse_line); use Errno qw(EAGAIN); -use PublicInbox::IMAPsearchqp; - -my $Address; -for my $mod (qw(Email::Address::XS Mail::Address)) { - eval "require $mod" or next; - $Address = $mod and last; -} -die "neither Email::Address::XS nor Mail::Address loaded: $@" if !$Address; +use PublicInbox::Address; sub LINE_MAX () { 8000 } # RFC 2683 3.2.1.5 @@ -99,33 +91,15 @@ undef %FETCH_NEED; my $valid_range = '[0-9]+|[0-9]+:[0-9]+|[0-9]+:\*'; $valid_range = qr/\A(?:$valid_range)(?:,(?:$valid_range))*\z/; -# RFC 3501 5.4. Autologout Timer needs to be >= 30min -$PublicInbox::DS::EXPTIME = 60 * 30; - -sub greet ($) { +sub do_greet { my ($self) = @_; my $capa = capa($self); $self->write(\"* OK [$capa] public-inbox-imapd ready\r\n"); } -sub new ($$$) { - my ($class, $sock, $imapd) = @_; - my $self = bless { imapd => $imapd }, 'PublicInbox::IMAP_preauth'; - my $ev = EPOLLIN; - my $wbuf; - if ($sock->can('accept_SSL') && !$sock->accept_SSL) { - return CORE::close($sock) if $! != EAGAIN; - $ev = PublicInbox::TLS::epollbit(); - $wbuf = [ \&PublicInbox::DS::accept_tls_step, \&greet ]; - } - $self->SUPER::new($sock, $ev | EPOLLONESHOT); - if ($wbuf) { - $self->{wbuf} = $wbuf; - } else { - greet($self); - } - $self->update_idle_time; - $self; +sub new { + my (undef, $sock, $imapd) = @_; + (bless { imapd => $imapd }, 'PublicInbox::IMAP_preauth')->greet($sock) } sub logged_in { 1 } @@ -140,7 +114,7 @@ sub capa ($) { $capa .= ' COMPRESS=DEFLATE'; } else { if (!($self->{sock} // $self)->can('accept_SSL') && - $self->{imapd}->{accept_tls}) { + $self->{imapd}->{ssl_ctx_opt}) { $capa .= ' STARTTLS'; } $capa .= ' AUTH=ANONYMOUS'; @@ -157,6 +131,7 @@ sub login_success ($$) { sub auth_challenge_ok ($) { my ($self) = @_; my $tag = delete($self->{-login_tag}) or return; + $self->{anon} = 1; login_success($self, $tag); } @@ -195,14 +170,14 @@ sub cmd_capability ($$) { # but uo2m_hibernate can compact and deduplicate it sub uo2m_ary_new ($;$) { my ($self, $exists) = @_; - my $base = $self->{uid_base}; - my $uids = $self->{ibx}->over->uid_range($base + 1, $base + UID_SLICE); + my $ub = $self->{uid_base}; + my $uids = $self->{ibx}->over(1)->uid_range($ub + 1, $ub + UID_SLICE); # convert UIDs to offsets from {base} my @tmp; # [$UID_OFFSET] => $MSN my $msn = 0; - ++$base; - $tmp[$_ - $base] = ++$msn for @$uids; + ++$ub; + $tmp[$_ - $ub] = ++$msn for @$uids; $$exists = $msn if $exists; \@tmp; } @@ -243,7 +218,7 @@ sub uo2m_extend ($$;$) { # need to extend the current range: my $base = $self->{uid_base}; ++$beg; - my $uids = $self->{ibx}->over->uid_range($beg, $base + UID_SLICE); + my $uids = $self->{ibx}->over(1)->uid_range($beg, $base + UID_SLICE); return $uo2m if !scalar(@$uids); my @tmp; # [$UID_OFFSET] => $MSN my $write_method = $_[2] // 'msg_more'; @@ -316,20 +291,18 @@ sub on_inbox_unlock { } } -# called every X minute(s) or so by PublicInbox::DS::later -my $IDLERS = {}; -my $idle_timer; +# called every minute or so by PublicInbox::DS::later +my $IDLERS; # fileno($obj->{sock}) => PublicInbox::IMAP sub idle_tick_all { my $old = $IDLERS; - $IDLERS = {}; + $IDLERS = undef; for my $i (values %$old) { next if ($i->{wbuf} || !exists($i->{-idle_tag})); - $i->update_idle_time or next; $IDLERS->{fileno($i->{sock})} = $i; $i->write(\"* OK Still here\r\n"); } - $idle_timer = scalar keys %$IDLERS ? - PublicInbox::DS::later(\&idle_tick_all) : undef; + $IDLERS and + PublicInbox::DS::add_uniq_timer('idle', 60, \&idle_tick_all); } sub cmd_idle ($$) { @@ -342,11 +315,11 @@ sub cmd_idle ($$) { my $fd = fileno($sock); $self->{-idle_tag} = $tag; # only do inotify on most recent slice - if ($ibx->over->max < $uid_end) { + if ($ibx->over(1)->max < $uid_end) { $ibx->subscribe_unlock($fd, $self); $self->{imapd}->idler_start; } - $idle_timer //= PublicInbox::DS::later(\&idle_tick_all); + PublicInbox::DS::add_uniq_timer('idle', 60, \&idle_tick_all); $IDLERS->{$fd} = $self; \"+ idling\r\n" } @@ -371,21 +344,18 @@ sub idle_done ($$) { "$idle_tag OK Idle done\r\n"; } -sub ensure_slices_exist ($$$) { - my ($imapd, $ibx, $max) = @_; - defined(my $mb_top = $ibx->{newsgroup}) or return; +sub ensure_slices_exist ($$) { + my ($imapd, $ibx) = @_; + my $mb_top = $ibx->{newsgroup} // return; my $mailboxes = $imapd->{mailboxes}; - my @created; - for (my $i = int($max/UID_SLICE); $i >= 0; --$i) { + my $list = $imapd->{mailboxlist}; # may be undef, just autoviv + noop + for (my $i = int($ibx->art_max/UID_SLICE); $i >= 0; --$i) { my $sub_mailbox = "$mb_top.$i"; last if exists $mailboxes->{$sub_mailbox}; $mailboxes->{$sub_mailbox} = $ibx; $sub_mailbox =~ s/\Ainbox\./INBOX./i; # more familiar to users - push @created, $sub_mailbox; + push @$list, qq[* LIST (\\HasNoChildren) "." $sub_mailbox\r\n] } - return unless @created; - my $l = $imapd->{inboxlist} or return; - push @$l, map { qq[* LIST (\\HasNoChildren) "." $_\r\n] } @created; } sub inbox_lookup ($$;$) { @@ -393,22 +363,23 @@ sub inbox_lookup ($$;$) { my ($ibx, $exists, $uidmax, $uid_base) = (undef, 0, 0, 0); $mailbox = lc $mailbox; $ibx = $self->{imapd}->{mailboxes}->{$mailbox} or return; - my $over = $ibx->over; + my $over = $ibx->over(1); if ($over != $ibx) { # not a dummy $mailbox =~ /\.([0-9]+)\z/ or die "BUG: unexpected dummy mailbox: $mailbox\n"; $uid_base = $1 * UID_SLICE; - # ->num_highwater caches for writers, so use ->meta_accessor - $uidmax = $ibx->mm->meta_accessor('num_highwater') // 0; + $uidmax = $ibx->mm->num_highwater // 0; if ($examine) { $self->{uid_base} = $uid_base; $self->{ibx} = $ibx; $self->{uo2m} = uo2m_ary_new($self, \$exists); } else { - $exists = $over->imap_exists; + my $uid_end = $uid_base + UID_SLICE; + $exists = $over->imap_exists($uid_base, $uid_end); } - ensure_slices_exist($self->{imapd}, $ibx, $over->max); + delete $ibx->{-art_max}; + ensure_slices_exist($self->{imapd}, $ibx); } else { if ($examine) { $self->{uid_base} = $uid_base; @@ -417,8 +388,9 @@ sub inbox_lookup ($$;$) { } # if "INBOX.foo.bar" is selected and "INBOX.foo.bar.0", # check for new UID ranges (e.g. "INBOX.foo.bar.1") - if (my $z = $self->{imapd}->{mailboxes}->{"$mailbox.0"}) { - ensure_slices_exist($self->{imapd}, $z, $z->over->max); + if (my $ibx = $self->{imapd}->{mailboxes}->{"$mailbox.0"}) { + delete $ibx->{-art_max}; + ensure_slices_exist($self->{imapd}, $ibx); } } ($ibx, $exists, $uidmax + 1, $uid_base); @@ -447,8 +419,10 @@ sub _esc ($) { if (!defined($v)) { 'NIL'; } elsif ($v =~ /[{"\r\n%*\\\[]/) { # literal string + utf8::encode($v); '{' . length($v) . "}\r\n" . $v; } else { # quoted string + utf8::encode($v); qq{"$v"} } } @@ -458,7 +432,7 @@ sub addr_envelope ($$;$) { my $v = $eml->header_raw($x) // ($y ? $eml->header_raw($y) : undef) // return 'NIL'; - my @x = $Address->parse($v) or return 'NIL'; + my @x = PublicInbox::Address::objects($v) or return 'NIL'; '(' . join('', map { '(' . join(' ', _esc($_->name), 'NIL', @@ -499,7 +473,7 @@ sub body_disposition ($) { my $cd = $eml->header_raw('Content-Disposition') or return 'NIL'; $cd = parse_content_disposition($cd); my $buf = '('._esc($cd->{type}); - $buf .= ' ' . _esc_hash(delete $cd->{attributes}); + $buf .= ' ' . _esc_hash($cd->{attributes}); $buf .= ')'; } @@ -511,7 +485,7 @@ sub body_leaf ($$;$) { my $ct = $eml->ct; $buf .= '('._esc($ct->{type}).' '; $buf .= _esc($ct->{subtype}); - $buf .= ' ' . _esc_hash(delete $ct->{attributes}); + $buf .= ' ' . _esc_hash($ct->{attributes}); $buf .= ' ' . _esc($eml->header_raw('Content-ID')); $buf .= ' ' . _esc($eml->header_raw('Content-Description')); my $cte = $eml->header_raw('Content-Transfer-Encoding') // '7bit'; @@ -540,7 +514,7 @@ sub body_parent ($$$) { $buf .= @$hold ? join('', @$hold) : 'NIL'; $buf .= ' '._esc($ct->{subtype}); if ($structure) { - $buf .= ' '._esc_hash(delete $ct->{attributes}); + $buf .= ' '._esc_hash($ct->{attributes}); $buf .= ' '.body_disposition($eml); $buf .= ' '._esc($eml->header_raw('Content-Language')); $buf .= ' '._esc($eml->header_raw('Content-Location')); @@ -583,22 +557,6 @@ sub fetch_body ($;$) { join('', @hold); } -sub requeue_once ($) { - my ($self) = @_; - # COMPRESS users all share the same DEFLATE context. - # Flush it here to ensure clients don't see - # each other's data - $self->zflush; - - # no recursion, schedule another call ASAP, - # but only after all pending writes are done. - # autovivify wbuf: - my $new_size = push(@{$self->{wbuf}}, \&long_step); - - # wbuf may be populated by $cb, no need to rearm if so: - $self->requeue if $new_size == 1; -} - sub fetch_run_ops { my ($self, $smsg, $bref, $ops, $partial) = @_; my $uid = $smsg->{num}; @@ -612,25 +570,37 @@ sub fetch_run_ops { $self->msg_more(")\r\n"); } -sub fetch_blob_cb { # called by git->cat_async via git_async_cat +sub requeue { # overrides PublicInbox::DS::requeue + my ($self) = @_; + if ($self->{anon}) { # AUTH=ANONYMOUS gets high priority + $self->SUPER::requeue; + } else { # low priority + push(@{$self->{imapd}->{-authed_q}}, $self) == 1 and + PublicInbox::DS::requeue($self->{imapd}); + } +} + +sub fetch_blob_cb { # called by git->cat_async via ibx_async_cat my ($bref, $oid, $type, $size, $fetch_arg) = @_; my ($self, undef, $msgs, $range_info, $ops, $partial) = @$fetch_arg; + my $ibx = $self->{ibx} or return $self->close; # client disconnected my $smsg = shift @$msgs or die 'BUG: no smsg'; - if (!defined($oid)) { + if (!defined($type)) { + warn "E: git aborted on $oid / $smsg->{blob} $ibx->{inboxdir}"; + return $self->close; + } elsif ($type ne 'blob') { # it's possible to have TOCTOU if an admin runs # public-inbox-(edit|purge), just move onto the next message - warn "E: $smsg->{blob} missing in $self->{ibx}->{inboxdir}\n"; - return requeue_once($self); - } else { - $smsg->{blob} eq $oid or die "BUG: $smsg->{blob} != $oid"; + warn "E: $smsg->{blob} $type in $ibx->{inboxdir}\n"; + return $self->requeue_once; } + $smsg->{blob} eq $oid or die "BUG: $smsg->{blob} != $oid"; my $pre; - if (!$self->{wbuf} && (my $nxt = $msgs->[0])) { - $pre = git_async_prefetch($self->{ibx}->git, $nxt->{blob}, - \&fetch_blob_cb, $fetch_arg); - } + ($self->{anon} && !$self->{wbuf} && $msgs->[0]) and + $pre = ibx_async_prefetch($ibx, $msgs->[0]->{blob}, + \&fetch_blob_cb, $fetch_arg); fetch_run_ops($self, $smsg, $bref, $ops, $partial); - $pre ? $self->zflush : requeue_once($self); + $pre ? $self->dflush : $self->requeue_once; } sub emit_rfc822 { @@ -688,7 +658,7 @@ sub op_eml_new { $_[4] = PublicInbox::Eml->new($_[3]) } # s/From / fixes old bug from import (pre-a0c07cba0e5d8b6a) sub to_crlf_full { ${$_[0]} =~ s/(?<!\r)\n/\r\n/sg; - ${$_[0]} =~ s/\A[\r\n]*From [^\r\n]*\r\n//s; + PublicInbox::Eml::strip_from(${$_[0]}); } sub op_crlf_bref { to_crlf_full($_[3]) } @@ -721,7 +691,7 @@ sub range_step ($$) { uid_clamp($self, \$beg, \$end); } elsif ($range =~ /\A([0-9]+):\*\z/) { $beg = $1 + 0; - $end = $self->{ibx}->over->max; + $end = $self->{ibx}->over(1)->max; $end = $uid_end if $end > $uid_end; $beg = $end if $beg > $end; uid_clamp($self, \$beg, \$end); @@ -739,7 +709,7 @@ sub range_step ($$) { sub refill_range ($$$) { my ($self, $msgs, $range_info) = @_; my ($beg, $end, $range_csv) = @$range_info; - if (scalar(@$msgs = @{$self->{ibx}->over->query_xover($beg, $end)})) { + if (scalar(@$msgs = @{$self->{ibx}->over(1)->query_xover($beg, $end)})){ $range_info->[0] = $msgs->[-1]->{num} + 1; return; } @@ -759,7 +729,7 @@ sub fetch_blob { # long_response } } uo2m_extend($self, $msgs->[-1]->{num}); - git_async_cat($self->{ibx}->git, $msgs->[0]->{blob}, + ibx_async_cat($self->{ibx}, $msgs->[0]->{blob}, \&fetch_blob_cb, \@_); } @@ -780,7 +750,7 @@ sub fetch_smsg { # long_response sub refill_uids ($$$;$) { my ($self, $uids, $range_info, $sql) = @_; my ($beg, $end, $range_csv) = @$range_info; - my $over = $self->{ibx}->over; + my $over = $self->{ibx}->over(1); while (1) { if (scalar(@$uids = @{$over->uid_range($beg, $end, $sql)})) { $range_info->[0] = $uids->[-1] + 1; # update $beg @@ -848,7 +818,7 @@ sub cmd_status ($$$;@) { my %patmap = ('*' => '.*', '%' => '[^\.]*'); sub cmd_list ($$$$) { my ($self, $tag, $refname, $wildcard) = @_; - my $l = $self->{imapd}->{inboxlist}; + my $l = $self->{imapd}->{mailboxlist}; if ($refname eq '' && $wildcard eq '') { # request for hierarchy delimiter $l = [ qq[* LIST (\\Noselect) "." ""\r\n] ]; @@ -876,12 +846,12 @@ sub eml_index_offs_i { # PublicInbox::Eml::each_part callback # prepares an index for BODY[$SECTION_IDX] fetches sub eml_body_idx ($$) { my ($eml, $section_idx) = @_; - my $idx = $eml->{imap_all_parts} //= do { + my $idx = $eml->{imap_all_parts} // do { my $all = {}; $eml->each_part(\&eml_index_offs_i, $all, 0, 1); # top-level of multipart, BODY[0] not allowed (nz-number) delete $all->{0}; - $all; + $eml->{imap_all_parts} = $all; }; $idx->{$section_idx}; } @@ -1032,7 +1002,7 @@ sub fetch_compile ($) { # stabilize partial order for consistency and ease-of-debugging: if (scalar keys %partial) { $need |= NEED_BLOB; - $r[2] = [ map { [ $_, @{$partial{$_}} ] } sort keys %partial ]; + @{$r[2]} = map { [ $_, @{$partial{$_}} ] } sort keys %partial; } push @op, $OP_EML_NEW if ($need & (EML_HDR|EML_BDY)); @@ -1055,7 +1025,7 @@ sub fetch_compile ($) { # r[1] = [ $key1, $cb1, $key2, $cb2, ... ] use sort 'stable'; # makes output more consistent - $r[1] = [ map { ($_->[2], $_->[1]) } sort { $a->[0] <=> $b->[0] } @op ]; + @{$r[1]} = map { ($_->[2], $_->[1]) } sort { $a->[0] <=> $b->[0] } @op; @r; } @@ -1070,7 +1040,7 @@ sub cmd_uid_fetch ($$$$;@) { my $range_info = range_step($self, \$range_csv); return "$tag $range_info\r\n" if !ref($range_info); uo2m_hibernate($self) if $cb == \&fetch_blob; # slow, save RAM - long_response($self, $cb, $tag, [], $range_info, $ops, $partial); + $self->long_response($cb, $tag, [], $range_info, $ops, $partial); } sub cmd_fetch ($$$$;@) { @@ -1085,7 +1055,7 @@ sub cmd_fetch ($$$$;@) { my $range_info = range_step($self, \$range_csv); return "$tag $range_info\r\n" if !ref($range_info); uo2m_hibernate($self) if $cb == \&fetch_blob; # slow, save RAM - long_response($self, $cb, $tag, [], $range_info, $ops, $partial); + $self->long_response($cb, $tag, [], $range_info, $ops, $partial); } sub msn_convert ($$) { @@ -1109,11 +1079,12 @@ sub search_uid_range { # long_response 1; # more } -sub parse_query ($$) { +sub parse_imap_query ($$) { my ($self, $query) = @_; + # IMAPsearchqp gets loaded in IMAPD->refresh_groups my $q = PublicInbox::IMAPsearchqp::parse($self, $query); if (ref($q)) { - my $max = $self->{ibx}->over->max; + my $max = $self->{ibx}->over(1)->max; my $beg = 1; uid_clamp($self, \$beg, \$max); $q->{range_info} = [ $beg, $max ]; @@ -1121,49 +1092,28 @@ sub parse_query ($$) { $q; } -sub refill_xap ($$$$) { - my ($self, $uids, $range_info, $q) = @_; - my ($beg, $end) = @$range_info; - my $srch = $self->{ibx}->search; - my $opt = { mset => 2, limit => 1000 }; - my $mset = $srch->mset("$q uid:$beg..$end", $opt); - @$uids = @{$srch->mset_to_artnums($mset)}; - if (@$uids) { - $range_info->[0] = $uids->[-1] + 1; # update $beg - return; # possibly more - } - 0; # all done -} - -sub search_xap_range { # long_response - my ($self, $tag, $q, $range_info, $want_msn) = @_; - my $uids = []; - if (defined(my $err = refill_xap($self, $uids, $range_info, $q))) { - $err ||= 'OK Search done'; - $self->write("\r\n$tag $err\r\n"); - return; - } - msn_convert($self, $uids) if $want_msn; - $self->msg_more(join(' ', '', @$uids)); - 1; # more -} - sub search_common { my ($self, $tag, $query, $want_msn) = @_; my $ibx = $self->{ibx} or return "$tag BAD No mailbox selected\r\n"; - my $q = parse_query($self, $query); + my $q = parse_imap_query($self, $query); return "$tag $q\r\n" if !ref($q); my ($sql, $range_info) = delete @$q{qw(sql range_info)}; if (!scalar(keys %$q)) { # overview.sqlite3 $self->msg_more('* SEARCH'); - long_response($self, \&search_uid_range, + $self->long_response(\&search_uid_range, $tag, $sql, $range_info, $want_msn); } elsif ($q = $q->{xap}) { - $self->{ibx}->search or + my $srch = $self->{ibx}->isrch or return "$tag BAD search not available for mailbox\r\n"; - $self->msg_more('* SEARCH'); - long_response($self, \&search_xap_range, - $tag, $q, $range_info, $want_msn); + my $opt = { + relevance => -1, + limit => UID_SLICE, + uid_range => $range_info + }; + my $mset = $srch->mset($q, $opt); + my $uids = $srch->mset_to_artnums($mset, $opt); + msn_convert($self, $uids) if scalar(@$uids) && $want_msn; + "* SEARCH @$uids\r\n$tag OK Search done\r\n"; } else { "$tag BAD Error\r\n"; } @@ -1179,15 +1129,6 @@ sub cmd_search ($$$;) { search_common($self, $tag, $query, 1); } -sub args_ok ($$) { # duplicated from PublicInbox::NNTP - my ($cb, $argc) = @_; - my $tot = prototype $cb; - my ($nreq, undef) = split(';', $tot); - $nreq = ($nreq =~ tr/$//) - 1; - $tot = ($tot =~ tr/$//) - 1; - ($argc <= $tot && $argc >= $nreq); -} - # returns 1 if we can continue, 0 if not due to buffered writes or disconnect sub process_line ($$) { my ($self, $l) = @_; @@ -1222,48 +1163,11 @@ sub process_line ($$) { my $err = $@; if ($err && $self->{sock}) { $l =~ s/\r?\n//s; - err($self, 'error from: %s (%s)', $l, $err); + warn("error from: $l ($err)\n"); $tag //= '*'; - $res = "$tag BAD program fault - command not performed\r\n"; + $res = \"$tag BAD program fault - command not performed\r\n"; } - return 0 unless defined $res; - $self->write($res); -} - -sub long_step { - my ($self) = @_; - # wbuf is unset or empty, here; {long} may add to it - my ($fd, $cb, $t0, @args) = @{$self->{long_cb}}; - my $more = eval { $cb->($self, @args) }; - if ($@ || !$self->{sock}) { # something bad happened... - delete $self->{long_cb}; - my $elapsed = now() - $t0; - if ($@) { - err($self, - "%s during long response[$fd] - %0.6f", - $@, $elapsed); - } - out($self, " deferred[$fd] aborted - %0.6f", $elapsed); - $self->close; - } elsif ($more) { # $self->{wbuf}: - $self->update_idle_time; - - # control passed to git_async_cat if $more == \undef - requeue_once($self) if !ref($more); - } else { # all done! - delete $self->{long_cb}; - my $elapsed = now() - $t0; - my $fd = fileno($self->{sock}); - out($self, " deferred[$fd] done - %0.6f", $elapsed); - my $wbuf = $self->{wbuf}; # do NOT autovivify - - $self->requeue unless $wbuf && @$wbuf; - } -} - -sub err ($$;@) { - my ($self, $fmt, @args) = @_; - printf { $self->{imapd}->{err} } $fmt."\n", @args; + defined($res) ? $self->write($res) : 0; } sub out ($$;@) { @@ -1271,25 +1175,12 @@ sub out ($$;@) { printf { $self->{imapd}->{out} } $fmt."\n", @args; } -sub long_response ($$;@) { - my ($self, $cb, @args) = @_; # cb returns true if more, false if done - - my $sock = $self->{sock} or return; - # make sure we disable reading during a long response, - # clients should not be sending us stuff and making us do more - # work while we are stream a response to them - $self->{long_cb} = [ fileno($sock), $cb, now(), @args ]; - long_step($self); # kick off! - undef; -} - # callback used by PublicInbox::DS for any (e)poll (in/out/hup/err) sub event_step { my ($self) = @_; - + local $SIG{__WARN__} = $self->{imapd}->{warn_cb}; return unless $self->flush_write && $self->{sock} && !$self->{long_cb}; - $self->update_idle_time; # only read more requests if we've drained the write buffer, # otherwise we can be buffering infinitely w/o backpressure @@ -1315,17 +1206,12 @@ sub event_step { return $self->close if $r < 0; $self->rbuf_idle($rbuf); - $self->update_idle_time; # maybe there's more pipelined data, or we'll have # to register it for socket-readiness notifications $self->requeue unless $pending; } -sub compressed { undef } - -sub zflush {} # overridden by IMAPdeflate - # RFC 4978 sub cmd_compress ($$$) { my ($self, $tag, $alg) = @_; @@ -1335,33 +1221,33 @@ sub cmd_compress ($$$) { # CRIME made TLS compression obsolete # return "$tag NO [COMPRESSIONACTIVE]\r\n" if $self->tls_compressed; - PublicInbox::IMAPdeflate->enable($self, $tag); + PublicInbox::IMAPdeflate->enable($self) or return + \"$tag BAD failed to activate compression\r\n"; + PublicInbox::DS::write($self, \"$tag OK DEFLATE active\r\n"); $self->requeue; undef } sub cmd_starttls ($$) { my ($self, $tag) = @_; - my $sock = $self->{sock} or return; - if ($sock->can('stop_SSL') || $self->compressed) { + (($self->{sock} // return)->can('stop_SSL') || $self->compressed) and return "$tag BAD TLS or compression already enabled\r\n"; - } - my $opt = $self->{imapd}->{accept_tls} or + $self->{imapd}->{ssl_ctx_opt} or return "$tag BAD can not initiate TLS negotiation\r\n"; $self->write(\"$tag OK begin TLS negotiation now\r\n"); - $self->{sock} = IO::Socket::SSL->start_SSL($sock, %$opt); + PublicInbox::TLS::start($self->{sock}, $self->{imapd}); $self->requeue if PublicInbox::DS::accept_tls_step($self); undef; } -# for graceful shutdown in PublicInbox::Daemon: -sub busy { - my ($self, $now) = @_; +sub busy { # for graceful shutdown in PublicInbox::Daemon: + my ($self) = @_; if (defined($self->{-idle_tag})) { $self->write(\"* BYE server shutting down\r\n"); return; # not busy anymore } - ($self->{rbuf} || $self->{wbuf} || $self->not_idle_long($now)); + defined($self->{rbuf}) || defined($self->{wbuf}) || + !$self->write(\"* BYE server shutting down\r\n"); } sub close { @@ -1381,4 +1267,8 @@ our @ISA = qw(PublicInbox::IMAP); sub logged_in { 0 } +package PublicInbox::IMAPdeflate; +use PublicInbox::DSdeflate; +our @ISA = qw(PublicInbox::DSdeflate PublicInbox::IMAP); + 1; diff --git a/lib/PublicInbox/IMAPClient.pm b/lib/PublicInbox/IMAPClient.pm index 33deee9e..56001517 100644 --- a/lib/PublicInbox/IMAPClient.pm +++ b/lib/PublicInbox/IMAPClient.pm @@ -4,17 +4,19 @@ # # The license for this file differs from the rest of public-inbox. # -# Workaround some bugs in upstream Mail::IMAPClient when +# Workaround some bugs in upstream Mail::IMAPClient <= 3.42 when # compression is enabled: # - reference cycle: https://rt.cpan.org/Ticket/Display.html?id=132654 # - read starvation: https://rt.cpan.org/Ticket/Display.html?id=132720 package PublicInbox::IMAPClient; use strict; use parent 'Mail::IMAPClient'; -use Errno qw(EAGAIN); +unless (eval('use Mail::IMAPClient 3.43')) { +require Errno; +no warnings 'once'; # RFC4978 COMPRESS -sub compress { +*compress = sub { my ($self) = @_; # BUG? strict check on capability commented out for now... @@ -101,7 +103,7 @@ sub compress { # I/O readiness notifications (select, poll). Refactoring # callers will be needed in the unlikely case somebody wants # to use edge-triggered notifications (EV_CLEAR, EPOLLET). - $! = EAGAIN; + $! = Errno::EAGAIN(); return undef; } @@ -114,6 +116,7 @@ sub compress { }; return $self; -} +}; +} # $Mail::IMAPClient::VERSION < 3.43 1; diff --git a/lib/PublicInbox/IMAPD.pm b/lib/PublicInbox/IMAPD.pm index 3c211ee1..42dc2a9f 100644 --- a/lib/PublicInbox/IMAPD.pm +++ b/lib/PublicInbox/IMAPD.pm @@ -1,12 +1,11 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# represents an IMAPD (currently a singleton), -# see script/public-inbox-imapd for how it is used +# represents an IMAPD, see script/public-inbox-imapd for how it is used package PublicInbox::IMAPD; use strict; +use v5.10.1; use PublicInbox::Config; -use PublicInbox::ConfigIter; use PublicInbox::InboxIdle; use PublicInbox::IMAP; use PublicInbox::DummyInbox; @@ -15,102 +14,84 @@ my $dummy = bless { uidvalidity => 0 }, 'PublicInbox::DummyInbox'; sub new { my ($class) = @_; bless { - mailboxes => {}, + # mailboxes => {}, err => \*STDERR, out => \*STDOUT, - # accept_tls => { SSL_server => 1, ..., SSL_reuse_ctx => ... } - # pi_config => PublicInbox::Config + # ssl_ctx_opt => { SSL_cert_file => ..., SSL_key_file => ... } + # pi_cfg => PublicInbox::Config # idler => PublicInbox::InboxIdle }, $class; } -sub imapd_refresh_ibx { # pi_config->each_inbox cb - my ($ibx, $imapd) = @_; - my $ngname = $ibx->{newsgroup} or return; - if (ref $ngname) { - warn 'multiple newsgroups not supported: '. - join(', ', @$ngname). "\n"; - return; - } elsif ($ngname =~ m![^a-z0-9/_\.\-\~\@\+\=:]! || - $ngname =~ /\.[0-9]+\z/) { - warn "mailbox name invalid: newsgroup=`$ngname'\n"; - return; - } - $ibx->over or return; - $ibx->{over} = undef; - my $mm = $ibx->mm or return; - $ibx->{mm} = undef; - - # RFC 3501 2.3.1.1 - "A good UIDVALIDITY value to use in - # this case is a 32-bit representation of the creation - # date/time of the mailbox" - defined($ibx->{uidvalidity} = $mm->created_at) or return; - PublicInbox::IMAP::ensure_slices_exist($imapd, $ibx, $mm->max // 0); - - # preload to avoid fragmentation: - $ibx->description; - $ibx->base_url; - - # ensure dummies are selectable - my $dummies = $imapd->{dummies}; - do { - $dummies->{$ngname} = $dummy; - } while ($ngname =~ s/\.[^\.]+\z//); -} +sub _refresh_ibx { # pi_cfg->each_inbox cb + my ($ibx, $imapd, $cache, $dummies) = @_; + my $ngname = $ibx->{newsgroup} // return; -sub imapd_refresh_finalize { - my ($imapd, $pi_config) = @_; - my $mailboxes; - if (my $next = delete $imapd->{imapd_next}) { - $imapd->{mailboxes} = delete $next->{mailboxes}; - $mailboxes = delete $next->{dummies}; - } else { - $mailboxes = delete $imapd->{dummies}; - } - %$mailboxes = (%$mailboxes, %{$imapd->{mailboxes}}); - $imapd->{mailboxes} = $mailboxes; - $imapd->{inboxlist} = [ - map { - my $no = $mailboxes->{$_} == $dummy ? '' : 'No'; - my $u = $_; # capitalize "INBOX" for user-familiarity - $u =~ s/\Ainbox(\.|\z)/INBOX$1/i; - qq[* LIST (\\Has${no}Children) "." $u\r\n] - } keys %$mailboxes - ]; - $imapd->{pi_config} = $pi_config; - if (my $idler = $imapd->{idler}) { - $idler->refresh($pi_config); + if ($ngname =~ /\.[0-9]+\z/) { # don't confuse with 50K slices + warn "E: mailbox name invalid: newsgroup=`$ngname' (ignored)\n"; + return; } -} - -sub imapd_refresh_step { # pi_config->iterate_start cb - my ($pi_config, $section, $imapd) = @_; - if (defined($section)) { - return if $section !~ m!\Apublicinbox\.([^/]+)\z!; - my $ibx = $pi_config->lookup_name($1) or return; - imapd_refresh_ibx($ibx, $imapd->{imapd_next}); - } else { # undef == "EOF" - imapd_refresh_finalize($imapd, $pi_config); + my $ce = $cache->{$ngname}; + %$ibx = (%$ibx, %$ce) if $ce; + # only valid if msgmap and over works: + if (defined($ibx->uidvalidity)) { + # fill ->{mailboxes}: + PublicInbox::IMAP::ensure_slices_exist($imapd, $ibx); + # preload to avoid fragmentation: + $ibx->description; + # ensure dummies are selectable: + do { + $dummies->{$ngname} = $dummy; + } while ($ngname =~ s/\.[^\.]+\z//); } + delete @$ibx{qw(mm over)}; } sub refresh_groups { my ($self, $sig) = @_; - my $pi_config = PublicInbox::Config->new; - if ($sig) { # SIGHUP is handled through the event loop - $self->{imapd_next} = { dummies => {}, mailboxes => {} }; - my $iter = PublicInbox::ConfigIter->new($pi_config, - \&imapd_refresh_step, $self); - $iter->event_step; - } else { # initial start is synchronous - $self->{dummies} = {}; - $pi_config->each_inbox(\&imapd_refresh_ibx, $self); - imapd_refresh_finalize($self, $pi_config); + my $pi_cfg = PublicInbox::Config->new; + require PublicInbox::IMAPsearchqp; + $self->{mailboxes} = $pi_cfg->{-imap_mailboxes} // do { + my $mailboxes = $self->{mailboxes} = {}; + my $cache = eval { $pi_cfg->ALL->misc->nntpd_cache_load } // {}; + my $dummies = {}; + $pi_cfg->each_inbox(\&_refresh_ibx, $self, $cache, $dummies); + %$mailboxes = (%$dummies, %$mailboxes); + @{$pi_cfg->{-imap_mailboxlist}} = map { $_->[2] } + sort { $a->[0] cmp $b->[0] || $a->[1] <=> $b->[1] } + map { + # capitalize "INBOX" for user-familiarity + my $u = $_; + $u =~ s/\Ainbox(\.|\z)/INBOX$1/i; + if ($mailboxes->{$_} == $dummy) { + [ $u, -1, + qq[* LIST (\\HasChildren) "." $u\r\n]] + } else { + $u =~ /\A(.+)\.([0-9]+)\z/ or die +"BUG: `$u' has no slice digit(s)"; + [ $1, $2 + 0, '* LIST '. + qq[(\\HasNoChildren) "." $u\r\n] ] + } + } keys %$mailboxes; + $pi_cfg->{-imap_mailboxes} = $mailboxes; + }; + $self->{mailboxlist} = $pi_cfg->{-imap_mailboxlist} // + die 'BUG: no mailboxlist'; + $self->{pi_cfg} = $pi_cfg; + if (my $idler = $self->{idler}) { + $idler->refresh($pi_cfg); } } sub idler_start { - $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_config}); + $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_cfg}); +} + +sub event_step { # called vai requeue for low-priority IMAP clients + my ($self) = @_; + my $imap = shift(@{$self->{-authed_q}}) // return; + PublicInbox::DS::requeue($self) if scalar(@{$self->{-authed_q}}); + $imap->event_step; # PublicInbox::IMAP::event_step } 1; diff --git a/lib/PublicInbox/IMAPTracker.pm b/lib/PublicInbox/IMAPTracker.pm index be9caf76..4efa8a7e 100644 --- a/lib/PublicInbox/IMAPTracker.pm +++ b/lib/PublicInbox/IMAPTracker.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::IMAPTracker; use strict; @@ -65,7 +65,7 @@ sub new { my ($class, $url) = @_; # original name for compatibility with old setups: - my $dbname = PublicInbox::Config->config_dir() . "/imap.sqlite3"; + my $dbname = PublicInbox::Config->config_dir() . '/imap.sqlite3'; # use the new XDG-compliant name for new setups: if (!-f $dbname) { @@ -75,8 +75,11 @@ sub new { } if (!-f $dbname) { require File::Path; - require File::Basename; - File::Path::mkpath(File::Basename::dirname($dbname)); + require PublicInbox::Syscall; + my ($dir) = ($dbname =~ m!(.*?/)[^/]+\z!); + File::Path::mkpath($dir); + PublicInbox::Syscall::nodatacow_dir($dir); + open my $fh, '+>>', $dbname or die "failed to open $dbname: $!"; } my $self = bless { lock_path => "$dbname.lock", url => $url }, $class; $self->lock_acquire; diff --git a/lib/PublicInbox/IMAPdeflate.pm b/lib/PublicInbox/IMAPdeflate.pm deleted file mode 100644 index b98a069d..00000000 --- a/lib/PublicInbox/IMAPdeflate.pm +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> -# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# TODO: reduce duplication from PublicInbox::NNTPdeflate - -# RFC 4978 -package PublicInbox::IMAPdeflate; -use strict; -use warnings; -use 5.010_001; -use base qw(PublicInbox::IMAP); -use Compress::Raw::Zlib; - -my %IN_OPT = ( - -Bufsize => 1024, - -WindowBits => -15, # RFC 1951 - -AppendOutput => 1, -); - -# global deflate context and buffer -my $zbuf = \(my $buf = ''); -my $zout; -{ - my $err; - ($zout, $err) = Compress::Raw::Zlib::Deflate->new( - # nnrpd (INN) and Compress::Raw::Zlib favor MemLevel=9, - # the zlib C library and git use MemLevel=8 as the default - # -MemLevel => 9, - -Bufsize => 65536, # same as nnrpd - -WindowBits => -15, # RFC 1951 - -AppendOutput => 1, - ); - $err == Z_OK or die "Failed to initialize zlib deflate stream: $err"; -} - -sub enable { - my ($class, $self, $tag) = @_; - my ($in, $err) = Compress::Raw::Zlib::Inflate->new(%IN_OPT); - if ($err != Z_OK) { - $self->err("Inflate->new failed: $err"); - $self->write(\"$tag BAD failed to activate compression\r\n"); - return; - } - $self->write(\"$tag OK DEFLATE active\r\n"); - bless $self, $class; - $self->{zin} = $in; -} - -# overrides PublicInbox::NNTP::compressed -sub compressed { 1 } - -sub do_read ($$$$) { - my ($self, $rbuf, $len, $off) = @_; - - my $zin = $self->{zin} or return; # closed - my $doff; - my $dbuf = delete($self->{dbuf}) // ''; - $doff = length($dbuf); - my $r = PublicInbox::DS::do_read($self, \$dbuf, $len, $doff) or return; - - # Workaround inflate bug appending to OOK scalars: - # <https://rt.cpan.org/Ticket/Display.html?id=132734> - # We only have $off if the client is pipelining, and pipelining - # is where our substr() OOK optimization in event_step makes sense. - if ($off) { - my $copy = $$rbuf; - undef $$rbuf; - $$rbuf = $copy; - } - - # assert(length($$rbuf) == $off) as far as NNTP.pm is concerned - # -ConsumeInput is true, so $dbuf is automatically emptied - my $err = $zin->inflate($dbuf, $rbuf); - if ($err == Z_OK) { - $self->{dbuf} = $dbuf if $dbuf ne ''; - $r = length($$rbuf) and return $r; - # nothing ready, yet, get more, later - $self->requeue; - } else { - delete $self->{zin}; - $self->close; - } - 0; -} - -# override PublicInbox::DS::msg_more -sub msg_more ($$) { - my $self = $_[0]; - - # $_[1] may be a reference or not for ->deflate - my $err = $zout->deflate($_[1], $zbuf); - $err == Z_OK or die "->deflate failed $err"; - 1; -} - -sub zflush ($) { - my ($self) = @_; - - my $deflated = $zbuf; - $zbuf = \(my $next = ''); - - my $err = $zout->flush($deflated, Z_FULL_FLUSH); - $err == Z_OK or die "->flush failed $err"; - - # We can still let the lower socket layer do buffering: - PublicInbox::DS::msg_more($self, $$deflated); -} - -# compatible with PublicInbox::DS::write, so $_[1] may be a reference or not -sub write ($$) { - my $self = $_[0]; - return PublicInbox::DS::write($self, $_[1]) if ref($_[1]) eq 'CODE'; - - my $deflated = $zbuf; - $zbuf = \(my $next = ''); - - # $_[1] may be a reference or not for ->deflate - my $err = $zout->deflate($_[1], $deflated); - $err == Z_OK or die "->deflate failed $err"; - $err = $zout->flush($deflated, Z_FULL_FLUSH); - $err == Z_OK or die "->flush failed $err"; - - # We can still let the socket layer do buffering: - PublicInbox::DS::write($self, $deflated); -} - -1; diff --git a/lib/PublicInbox/IMAPsearchqp.pm b/lib/PublicInbox/IMAPsearchqp.pm index 190fefb9..0c37220c 100644 --- a/lib/PublicInbox/IMAPsearchqp.pm +++ b/lib/PublicInbox/IMAPsearchqp.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # IMAP search query parser. cf RFC 3501 @@ -124,7 +124,7 @@ sub ON { my ($self, $item) = @_; my $ts = yyyymmdd($item); my $end = $ts + 86399; # no leap day - push @{$self->{xap}}, "ts:$ts..$end"; + push @{$self->{xap}}, "rt:$ts..$end"; my $sql = $self->{sql} or return 1; $$sql .= " AND ts >= $ts AND ts <= $end"; } @@ -132,7 +132,7 @@ sub ON { sub BEFORE { my ($self, $item) = @_; my $ts = yyyymmdd($item); - push @{$self->{xap}}, "ts:..$ts"; + push @{$self->{xap}}, "rt:..$ts"; my $sql = $self->{sql} or return 1; $$sql .= " AND ts <= $ts"; } @@ -140,7 +140,7 @@ sub BEFORE { sub SINCE { my ($self, $item) = @_; my $ts = yyyymmdd($item); - push @{$self->{xap}}, "ts:$ts.."; + push @{$self->{xap}}, "rt:$ts.."; my $sql = $self->{sql} or return 1; $$sql .= " AND ts >= $ts"; } @@ -165,7 +165,7 @@ sub msn_set { # things that should not match sub impossible { my ($self) = @_; - push @{$self->{xap}}, 'bytes:..0'; + push @{$self->{xap}}, 'z:..0'; my $sql = $self->{sql} or return 1; $$sql .= ' AND num < 0'; } @@ -217,8 +217,8 @@ BEFORE_date : 'BEFORE' date { $q->BEFORE(\%item) } MSN_set : sequence_set { $q->msn_set($item{sequence_set}) } UID_set : "UID" sequence_set { $q->uid_set($item{sequence_set}) } -LARGER_number : "LARGER" number { $q->xap_only("bytes:$item{number}..") } -SMALLER_number : "SMALLER" number { $q->xap_only("bytes:..$item{number}") } +LARGER_number : "LARGER" number { $q->xap_only("z:$item{number}..") } +SMALLER_number : "SMALLER" number { $q->xap_only("z:..$item{number}") } DELETED : "DELETED" { $q->impossible } OLD : "OLD" { $q->impossible } @@ -279,6 +279,8 @@ sub parse { my $sql = ''; %$q = (sql => \$sql, imap => $imap); # imap = PublicInbox::IMAP obj # $::RD_TRACE = 1; + local $::RD_ERRORS = undef; + local $::RD_WARN = undef; my $res = eval { $prd->search_key(uc($query)) }; return $@ if $@ && $@ =~ /\A(?:BAD|NO) /; return 'BAD unexpected result' if !$res || $res != $q; diff --git a/lib/PublicInbox/IO.pm b/lib/PublicInbox/IO.pm new file mode 100644 index 00000000..8640f112 --- /dev/null +++ b/lib/PublicInbox/IO.pm @@ -0,0 +1,152 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# supports reaping of children tied to a pipe or socket +package PublicInbox::IO; +use v5.12; +use parent qw(IO::Handle Exporter); +use PublicInbox::DS qw(awaitpid); +our @EXPORT_OK = qw(poll_in read_all try_cat write_file); +use Carp qw(croak); +use IO::Poll qw(POLLIN); +use Errno qw(EINTR EAGAIN); +use PublicInbox::OnDestroy; +# don't autodie in top-level for Perl 5.16.3 (and maybe newer versions) +# we have our own ->close, so we scope autodie into each sub + +sub waitcb { # awaitpid callback + my ($pid, $errref, $cb, @args) = @_; + $$errref = $?; # sets .cerr for _close + $cb->($pid, @args) if $cb; # may clobber $? +} + +sub attach_pid { + my ($io, $pid, @cb_arg) = @_; + bless $io, __PACKAGE__; + # we share $err (and not $self) with awaitpid to avoid a ref cycle + my $e = \(my $err); + ${*$io}{pi_io_reap} = [ $PublicInbox::OnDestroy::fork_gen, $pid, $e ]; + awaitpid($pid, \&waitcb, $e, @cb_arg); + $io; +} + +sub attached_pid { + my ($io) = @_; + ${${*$io}{pi_io_reap} // []}[1]; +} + +sub can_reap { + my ($io) = @_; + ${${*$io}{pi_io_reap} // [-1]}[0] == $PublicInbox::OnDestroy::fork_gen; +} + +# caller cares about error result if they call close explicitly +# reap->[2] may be set before this is called via waitcb +sub close { + my ($io) = @_; + my $ret = $io->SUPER::close; + my $reap = delete ${*$io}{pi_io_reap}; + return $ret if ($reap->[0] // -1) != $PublicInbox::OnDestroy::fork_gen; + if (defined ${$reap->[2]}) { # reap_pids already reaped asynchronously + $? = ${$reap->[2]}; + } else { # wait synchronously + my $w = awaitpid($reap->[1]); + } + $? ? '' : $ret; +} + +sub DESTROY { + my ($io) = @_; + my $reap = delete ${*$io}{pi_io_reap}; + if (($reap->[0] // -1) == $PublicInbox::OnDestroy::fork_gen) { + $io->SUPER::close; + ${$reap->[2]} // awaitpid($reap->[1]); + } + $io->SUPER::DESTROY; +} + +sub write_file ($$@) { # mode, filename, LIST (for print) + use autodie qw(open close); + open(my $fh, shift, shift); + print $fh @_; + defined(wantarray) && !wantarray ? $fh : close $fh; +} + +sub poll_in ($;$) { + IO::Poll::_poll($_[1] // -1, fileno($_[0]), my $ev = POLLIN); +} + +sub read_all ($;$$$) { # pass $len=0 to read until EOF for :utf8 handles + use autodie qw(read); + my ($io, $len, $bref, $off) = @_; + $bref //= \(my $buf); + $off //= 0; + my $r = 0; + if (my $left = $len //= -s $io) { # known size (binmode :raw/:unix) + do { # retry for binmode :unix + $r = read($io, $$bref, $left, $off += $r) or croak( + "read($io) premature EOF ($left/$len remain)"); + } while ($left -= $r); + } else { # read until EOF + while (($r = read($io, $$bref, 65536, $off += $r))) {} + } + wantarray ? split(/^/sm, $$bref) : $$bref +} + +sub try_cat ($) { + my ($path) = @_; + open(my $fh, '<', $path) or return ''; + read_all $fh; +} + +# TODO: move existing HTTP/IMAP/NNTP/POP3 uses of rbuf here +sub my_bufread { + my ($io, $len) = @_; + my $rbuf = ${*$io}{pi_io_rbuf} //= \(my $new = ''); + my $left = $len - length($$rbuf); + my $r; + while ($left > 0) { + $r = sysread($io, $$rbuf, $left, length($$rbuf)); + if ($r) { + $left -= $r; + } elsif (defined($r)) { # EOF + return 0; + } else { + next if ($! == EAGAIN and poll_in($io)); + next if $! == EINTR; # may be set by sysread or poll_in + return; # unrecoverable error + } + } + my $no_pad = substr($$rbuf, 0, $len, ''); + delete(${*$io}{pi_io_rbuf}) if $$rbuf eq ''; + \$no_pad; +} + +# always uses "\n" +sub my_readline { + my ($io) = @_; + my $rbuf = ${*$io}{pi_io_rbuf} //= \(my $new = ''); + while (1) { + if ((my $n = index($$rbuf, "\n")) >= 0) { + my $ret = substr($$rbuf, 0, $n + 1, ''); + delete(${*$io}{pi_io_rbuf}) if $$rbuf eq ''; + return $ret; + } + my $r = sysread($io, $$rbuf, 65536, length($$rbuf)); + if (!defined($r)) { + next if ($! == EAGAIN and poll_in($io)); + next if $! == EINTR; # may be set by sysread or poll_in + return; # unrecoverable error + } elsif ($r == 0) { # return whatever's left on EOF + delete(${*$io}{pi_io_rbuf}); + return $$rbuf; + } # else { continue + } +} + +sub has_rbuf { + my ($io) = @_; + defined(${*$io}{pi_io_rbuf}); +} + +1; diff --git a/lib/PublicInbox/IPC.pm b/lib/PublicInbox/IPC.pm new file mode 100644 index 00000000..ed6d27fd --- /dev/null +++ b/lib/PublicInbox/IPC.pm @@ -0,0 +1,452 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# base class for remote IPC calls and workqueues, requires Storable or Sereal +# - ipc_do and ipc_worker_* is for a single worker/producer and uses pipes +# - wq_io_do and wq_worker* is for a single producer and multiple workers, +# using SOCK_SEQPACKET for work distribution +# use ipc_do when you need work done on a certain process +# use wq_io_do when your work can be done on any idle worker +package PublicInbox::IPC; +use v5.12; +use parent qw(Exporter); +use autodie qw(close pipe read socketpair sysread); +use Carp qw(croak); +use PublicInbox::DS qw(awaitpid); +use PublicInbox::Spawn; +use PublicInbox::OnDestroy; +use PublicInbox::WQWorker; +use Socket qw(AF_UNIX SOCK_STREAM SOCK_SEQPACKET); +my $MY_MAX_ARG_STRLEN = 4096 * 33; # extra 4K for serialization +our @EXPORT_OK = qw(ipc_freeze ipc_thaw nproc_shards); +my ($enc, $dec); +# ->imports at BEGIN turns sereal_*_with_object into custom ops on 5.14+ +# and eliminate method call overhead +BEGIN { + eval { + require Sereal::Encoder; + require Sereal::Decoder; + Sereal::Encoder->import('sereal_encode_with_object'); + Sereal::Decoder->import('sereal_decode_with_object'); + ($enc, $dec) = (Sereal::Encoder->new, Sereal::Decoder->new); + }; +}; + +if ($enc && $dec) { # should be custom ops + *ipc_freeze = sub ($) { sereal_encode_with_object $enc, $_[0] }; + *ipc_thaw = sub ($) { sereal_decode_with_object $dec, $_[0], my $ret }; +} else { + require Storable; + *ipc_freeze = \&Storable::freeze; + *ipc_thaw = \&Storable::thaw; +} + +our $recv_cmd = PublicInbox::Spawn->can('recv_cmd4'); +our $send_cmd = PublicInbox::Spawn->can('send_cmd4') // do { + require PublicInbox::CmdIPC4; + $recv_cmd //= PublicInbox::CmdIPC4->can('recv_cmd4'); + PublicInbox::CmdIPC4->can('send_cmd4'); +} // do { + require PublicInbox::Syscall; + $recv_cmd //= PublicInbox::Syscall->can('recv_cmd4'); + PublicInbox::Syscall->can('send_cmd4'); +}; + +sub _get_rec ($) { + my ($r) = @_; + my $len = <$r> // return; + chop($len) eq "\n" or croak "no LF byte in $len"; + my $n = read($r, my $buf, $len); + $n == $len or croak "short read: $n != $len"; + ipc_thaw($buf); +} + +sub _send_rec ($$) { + my ($w, $ref) = @_; + my $buf = ipc_freeze($ref); + print $w length($buf), "\n", $buf or croak "print: $!"; +} + +sub ipc_return ($$$) { + my ($w, $ret, $exc) = @_; + _send_rec($w, $exc ? bless(\$exc, 'PublicInbox::IPC::Die') : $ret); +} + +sub ipc_worker_loop ($$$) { + my ($self, $r_req, $w_res) = @_; + my ($rec, $wantarray, $sub, @args); + local $/ = "\n"; + while ($rec = _get_rec($r_req)) { + ($wantarray, $sub, @args) = @$rec; + # no waiting if client doesn't care, + # this is the overwhelmingly likely case + if (!defined($wantarray)) { + eval { $self->$sub(@args) }; + warn "$$ die: $@ (from nowait $sub)\n" if $@; + } elsif ($wantarray) { + my @ret = eval { $self->$sub(@args) }; + ipc_return($w_res, \@ret, $@); + } else { # '' => wantscalar + my $ret = eval { $self->$sub(@args) }; + ipc_return($w_res, \$ret, $@); + } + } +} + +sub exit_exception { exit(!!$@) } + +# starts a worker if Sereal or Storable is installed +sub ipc_worker_spawn { + my ($self, $ident, $oldset, $fields, @cb_args) = @_; + return if ($self->{-ipc_ppid} // -1) == $$; # idempotent + delete(@$self{qw(-ipc_req -ipc_res -ipc_ppid -ipc_pid)}); + pipe(my $r_req, my $w_req); + pipe(my $r_res, my $w_res); + my $sigset = $oldset // PublicInbox::DS::block_signals(); + $self->ipc_atfork_prepare; + my $pid = PublicInbox::DS::fork_persist; + if ($pid == 0) { + delete @$self{qw(-wq_s1 -wq_s2 -wq_workers -wq_ppid)}; + $w_req = $r_res = undef; + $w_res->autoflush(1); + $SIG{$_} = 'IGNORE' for (qw(TERM INT QUIT)); + local $0 = $ident; + # ensure we properly exit even if warn() dies: + my $end = on_destroy \&exit_exception; + eval { + $fields //= {}; + local @$self{keys %$fields} = values(%$fields); + my $on_destroy = $self->ipc_atfork_child; + local @SIG{keys %SIG} = values %SIG; + PublicInbox::DS::sig_setmask($sigset); + ipc_worker_loop($self, $r_req, $w_res); + }; + warn "worker $ident PID:$$ died: $@\n" if $@; + undef $end; # trigger exit + } + PublicInbox::DS::sig_setmask($sigset) unless $oldset; + $r_req = $w_res = undef; + $w_req->autoflush(1); + $self->{-ipc_req} = $w_req; + $self->{-ipc_res} = $r_res; + $self->{-ipc_ppid} = $$; + awaitpid($pid, \&ipc_worker_reap, $self, @cb_args); + $self->{-ipc_pid} = $pid; +} + +sub ipc_worker_reap { # awaitpid callback + my ($pid, $self, $cb, @args) = @_; + delete $self->{-wq_workers}->{$pid}; + return $cb->($pid, $self, @args) if $cb; + return if !$?; + my $s = $? & 127; + # TERM(15) is our default exit signal, PIPE(13) is likely w/ pager + warn "$self->{-wq_ident} PID:$pid died \$?=$?\n" if $s != 15 && $s != 13 +} + +# for base class, override in sub classes +sub ipc_atfork_prepare {} + +sub wq_atexit_child {} + +sub ipc_atfork_child { + my ($self) = @_; + my $io = delete($self->{-ipc_atfork_child_close}) or return; + close($_) for @$io; + undef; +} + +# idempotent, can be called regardless of whether worker is active or not +sub ipc_worker_stop { + my ($self) = @_; + my ($pid, $ppid) = delete(@$self{qw(-ipc_pid -ipc_ppid)}); + my ($w_req, $r_res) = delete(@$self{qw(-ipc_req -ipc_res)}); + if (!$w_req && !$r_res) { + die "unexpected PID:$pid without IPC pipes" if $pid; + return; # idempotent + } + die 'no PID with IPC pipes' unless $pid; + $w_req = $r_res = undef; + awaitpid($pid) if $$ == $ppid; # for non-event loop +} + +sub _wait_return ($$) { + my ($r_res, $sub) = @_; + my $ret = _get_rec($r_res) // die "no response on $sub"; + die $$ret if ref($ret) eq 'PublicInbox::IPC::Die'; + wantarray ? @$ret : $$ret; +} + +# call $self->$sub(@args), on a worker if ipc_worker_spawn was used +sub ipc_do { + my ($self, $sub, @args) = @_; + if (my $w_req = $self->{-ipc_req}) { # run in worker + if (defined(wantarray)) { + my $r_res = $self->{-ipc_res} or die 'no ipc_res'; + _send_rec($w_req, [ wantarray, $sub, @args ]); + _wait_return($r_res, $sub); + } else { # likely, fire-and-forget into pipe + _send_rec($w_req, [ undef , $sub, @args ]); + } + } else { # run locally + $self->$sub(@args); + } +} + +# needed when there's multiple IPC workers and the parent forking +# causes newer siblings to inherit older siblings sockets +sub ipc_sibling_atfork_child { + my ($self) = @_; + my ($pid, undef) = delete(@$self{qw(-ipc_pid -ipc_ppid)}); + delete(@$self{qw(-ipc_req -ipc_res)}); + $pid == $$ and die "BUG: $$ ipc_atfork_child called on itself"; +} + +sub recv_and_run { + my ($self, $s2, $len, $full_stream) = @_; + my @fds = $recv_cmd->($s2, my $buf, $len // $MY_MAX_ARG_STRLEN); + return if scalar(@fds) && !defined($fds[0]); + my $n = length($buf) or return 0; + my $nfd = 0; + for my $fd (@fds) { + open(my $cmdfh, '+<&=', $fd); + $self->{$nfd++} = $cmdfh; + $cmdfh->autoflush(1); + } + while ($full_stream && $n < $len) { + my $r = sysread($s2, $buf, $len - $n, $n); + croak "read EOF after $n/$len bytes" if $r == 0; + $n = length($buf); + } + # Sereal dies on truncated data, Storable returns undef + my $args = ipc_thaw($buf) // die "thaw error on buffer of size: $n"; + undef $buf; + my $sub = shift @$args; + eval { $self->$sub(@$args) }; + warn "$$ $0 wq_worker: $sub: $@" if $@; + delete @$self{0..($nfd-1)}; + $n; +} + +sub sock_defined { # PublicInbox::DS::post_loop_do CB + my ($wqw) = @_; + defined($wqw->{sock}); +} + +sub wq_worker_loop ($$$) { + my ($self, $bcast2, $oldset) = @_; + my $wqw = PublicInbox::WQWorker->new($self, $self->{-wq_s2}); + PublicInbox::WQWorker->new($self, $bcast2) if $bcast2; + local @PublicInbox::DS::post_loop_do = (\&sock_defined, $wqw); + my $sig = delete($self->{wq_sig}); + $sig->{CHLD} //= \&PublicInbox::DS::enqueue_reap; + PublicInbox::DS::event_loop($sig, $oldset); + PublicInbox::DS->Reset; +} + +sub do_sock_stream { # via wq_io_do, for big requests + my ($self, $len) = @_; + recv_and_run($self, my $s2 = delete $self->{0}, $len, 1); +} + +sub wq_broadcast { + my ($self, $sub, @args) = @_; + my $wkr = $self->{-wq_workers} or Carp::confess('no -wq_workers'); + my $buf = ipc_freeze([$sub, @args]); + for my $bcast1 (values %$wkr) { + my $sock = $bcast1 // $self->{-wq_s1} // next; + send($sock, $buf, 0) // croak "send: $!"; + # XXX shouldn't have to deal with EMSGSIZE here... + } +} + +sub stream_in_full ($$$) { + my ($s1, $fds, $buf) = @_; + socketpair(my $r, my $w, AF_UNIX, SOCK_STREAM, 0); + my $n = $send_cmd->($s1, [ fileno($r) ], + ipc_freeze(['do_sock_stream', length($buf)]), + 0) // croak "sendmsg: $!"; + undef $r; + $n = $send_cmd->($w, $fds, $buf, 0) // croak "sendmsg: $!"; + print $w substr($buf, $n) if $n < length($buf); # need > 2G on Linux + close $w; # autodies +} + +sub wq_io_do { # always async + my ($self, $sub, $ios, @args) = @_; + my $s1 = $self->{-wq_s1} or Carp::confess('no -wq_s1'); + my $fds = [ map { fileno($_) } @$ios ]; + my $buf = ipc_freeze([$sub, @args]); + if (length($buf) > $MY_MAX_ARG_STRLEN) { + stream_in_full($s1, $fds, $buf); + } else { + my $n = $send_cmd->($s1, $fds, $buf, 0); + return if defined($n); # likely + $!{ETOOMANYREFS} and croak "sendmsg: $! (check RLIMIT_NOFILE)"; + $!{EMSGSIZE} ? stream_in_full($s1, $fds, $buf) : + croak("sendmsg: $!"); + } +} + +sub wq_sync_run { + my ($self, $wantarray, $sub, @args) = @_; + if ($wantarray) { + my @ret = eval { $self->$sub(@args) }; + ipc_return($self->{0}, \@ret, $@); + } else { # '' => wantscalar + my $ret = eval { $self->$sub(@args) }; + ipc_return($self->{0}, \$ret, $@); + } +} + +sub wq_do { + my ($self, $sub, @args) = @_; + if (defined(wantarray)) { + pipe(my $r, my $w); + wq_io_do($self, 'wq_sync_run', [ $w ], wantarray, $sub, @args); + undef $w; + _wait_return($r, $sub); + } else { + wq_io_do($self, $sub, [], @args); + } +} + +sub prepare_nonblock { + ($_[0]->{-wq_s1} // die 'BUG: no {-wq_s1}')->blocking(0); + require PublicInbox::WQBlocked; +} + +sub wq_nonblock_do { # always async + my ($self, $sub, @args) = @_; + my $buf = ipc_freeze([$sub, @args]); + if ($self->{wqb}) { # saturated once, assume saturated forever + $self->{wqb}->flush_send($buf); + } else { + $send_cmd->($self->{-wq_s1}, [], $buf, 0) // + ($!{EAGAIN} ? PublicInbox::WQBlocked->new($self, $buf) + : croak("sendmsg: $!")); + } +} + +sub _wq_worker_start { + my ($self, $oldset, $fields, $one, @cb_args) = @_; + my ($bcast1, $bcast2); + $one or socketpair($bcast1, $bcast2, AF_UNIX, SOCK_SEQPACKET, 0); + my $pid = PublicInbox::DS::fork_persist; + if ($pid == 0) { + undef $bcast1; + delete @$self{qw(-wq_s1 -wq_ppid)}; + $self->{-wq_worker_nr} = + keys %{delete($self->{-wq_workers}) // {}}; + $SIG{$_} = 'DEFAULT' for (qw(TTOU TTIN TERM QUIT INT CHLD)); + local $0 = $one ? $self->{-wq_ident} : + "$self->{-wq_ident} $self->{-wq_worker_nr}"; + # ensure we properly exit even if warn() dies: + my $end = on_destroy \&exit_exception; + eval { + $fields //= {}; + local @$self{keys %$fields} = values(%$fields); + my $on_destroy = $self->ipc_atfork_child; + local @SIG{keys %SIG} = values %SIG; + wq_worker_loop($self, $bcast2, $oldset); + }; + warn "worker $self->{-wq_ident} PID:$$ died: $@" if $@; + undef $end; # trigger exit + } else { + $self->{-wq_workers}->{$pid} = $bcast1; + awaitpid($pid, \&ipc_worker_reap, $self, @cb_args); + } +} + +# starts workqueue workers if Sereal or Storable is installed +sub wq_workers_start { + my ($self, $ident, $nr_workers, $oldset, $fields, @cb_args) = @_; + ($send_cmd && $recv_cmd) or return; + return if $self->{-wq_s1}; # idempotent + socketpair($self->{-wq_s1}, $self->{-wq_s2},AF_UNIX, SOCK_SEQPACKET, 0); + $self->ipc_atfork_prepare; + $nr_workers //= $self->{-wq_nr_workers}; # was set earlier + my $sigset = $oldset // PublicInbox::DS::block_signals(); + $self->{-wq_workers} = {}; + $self->{-wq_ident} = $ident; + my $one = $nr_workers == 1; + $self->{-wq_nr_workers} = $nr_workers; + for (1..$nr_workers) { + _wq_worker_start($self, $sigset, $fields, $one, @cb_args); + } + PublicInbox::DS::sig_setmask($sigset) unless $oldset; + $self->{-wq_ppid} = $$; +} + +sub wq_close { + my ($self) = @_; + if (my $wqb = delete $self->{wqb}) { + $wqb->enq_close; + } + delete @$self{qw(-wq_s1 -wq_s2)} or return; + return if ($self->{-wq_ppid} // -1) != $$; + awaitpid($_) for keys %{$self->{-wq_workers}}; +} + +sub wq_kill { + my ($self, $sig) = @_; + kill($sig // 'TERM', keys %{$self->{-wq_workers}}); +} + +sub DESTROY { + my ($self) = @_; + my $ppid = $self->{-wq_ppid}; + wq_kill($self) if $ppid && $ppid == $$; + wq_close($self); + ipc_worker_stop($self); +} + +# _SC_NPROCESSORS_ONLN = 84 on both Linux glibc and musl, +# emitted using: $^X devel/sysdefs-list +my %NPROCESSORS_ONLN = ( + linux => 84, + freebsd => 58, + dragonfly => 58, + openbsd => 503, + netbsd => 1002 +); + +sub detect_nproc () { + my $n = $NPROCESSORS_ONLN{$^O}; + return POSIX::sysconf($n) if defined $n; + + # getconf(1) is POSIX, but *NPROCESSORS* vars are not even if + # glibc, {Free,Net,Open}BSD all support them. + for (qw(_NPROCESSORS_ONLN NPROCESSORS_ONLN)) { + `getconf $_ 2>/dev/null` =~ /^(\d+)$/ and return $1; + } + # note: GNU nproc(1) checks CPU affinity, which is nice but + # isn't remotely portable + undef +} + +# SATA storage lags behind what CPUs are capable of, so relying on +# nproc(1) can be misleading and having extra Xapian shards is a +# waste of FDs and space. It can also lead to excessive IO latency +# and slow things down. Users on NVME or other fast storage can +# use the NPROC env or switches in our script/public-inbox-* programs +# to increase Xapian shards +our $NPROC_MAX_DEFAULT = 4; + +sub nproc_shards ($) { + my ($creat_opt) = @_; + my $n = $creat_opt->{nproc} if ref($creat_opt) eq 'HASH'; + $n //= $ENV{NPROC}; + if (!$n) { + # assume 2 cores if not detectable or zero + state $NPROC_DETECTED = PublicInbox::IPC::detect_nproc() || 2; + $n = $NPROC_DETECTED; + $n = $NPROC_MAX_DEFAULT if $n > $NPROC_MAX_DEFAULT; + } + + # subtract for the main process and git-fast-import + $n -= 1; + $n < 1 ? 1 : $n; +} + +1; diff --git a/lib/PublicInbox/IdxStack.pm b/lib/PublicInbox/IdxStack.pm index ce75b46a..7681ee6f 100644 --- a/lib/PublicInbox/IdxStack.pm +++ b/lib/PublicInbox/IdxStack.pm @@ -1,25 +1,35 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # temporary stack for public-inbox-index +# FIXME: needs to support multi-hash in the same repo once git itself can package PublicInbox::IdxStack; -use v5.10.1; -use strict; +use v5.12; use Fcntl qw(:seek); -use constant FMT => eval { pack('Q', 1) } ? 'A1QQH*' : 'A1IIH*'; +use constant PACK_FMT => eval { pack('Q', 1) } ? 'A1QQH*H*' : 'A1IIH*H*'; +use autodie qw(open seek); +use PublicInbox::IO qw(read_all); # start off in write-only mode sub new { - open(my $io, '+>', undef) or die "open: $!"; + open(my $io, '+>', undef); + # latest_cmt is still useful when the newest revision is a `d'(elete), + # otherwise we favor $sync->{latest_cmt} for checkpoints and {quit} bless { wr => $io, latest_cmt => $_[1] }, __PACKAGE__ } # file_char = [d|m] sub push_rec { - my ($self, $file_char, $at, $ct, $blob_oid) = @_; - my $rec = pack(FMT, $file_char, $at, $ct, $blob_oid); - $self->{rec_size} //= length($rec); - print { $self->{wr} } $rec or die "print: $!"; + my ($self, $file_char, $at, $ct, $blob_oid, $cmt_oid) = @_; + my $rec = pack(PACK_FMT, $file_char, $at, $ct, $blob_oid, $cmt_oid); + $self->{unpack_fmt} // do { + my $len = length($cmt_oid); + my $fmt = PACK_FMT; + $fmt =~ s/H\*/H$len/g; + $self->{rec_size} = length($rec); + $self->{unpack_fmt} = $fmt; + }; + print { $self->{wr} } $rec; $self->{tot_size} += length($rec); } @@ -41,12 +51,8 @@ sub pop_rec { my $sz = $self->{rec_size} or return; my $rec_pos = $self->{tot_size} -= $sz; return if $rec_pos < 0; - my $io = $self->{rd}; - seek($io, $rec_pos, SEEK_SET) or die "seek: $!"; - my $r = read($io, my $buf, $sz); - defined($r) or die "read: $!"; - $r == $sz or die "read($r != $sz)"; - unpack(FMT, $buf); + seek($self->{rd}, $rec_pos, SEEK_SET); + unpack($self->{unpack_fmt}, read_all($self->{rd}, $sz)); } 1; diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 1a226cc7..fefc282a 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # git fast-import-based ssoma-mda MDA replacement @@ -6,10 +6,9 @@ # and public-inbox-watch. Not the WWW or NNTP code which only # requires read-only access. package PublicInbox::Import; -use strict; +use v5.12; use parent qw(PublicInbox::Lock); -use v5.10.1; -use PublicInbox::Spawn qw(spawn popen_rd); +use PublicInbox::Spawn qw(run_die run_qx spawn); use PublicInbox::MID qw(mids mid2path); use PublicInbox::Address; use PublicInbox::Smsg; @@ -18,14 +17,27 @@ use PublicInbox::ContentHash qw(content_digest); use PublicInbox::MDA; use PublicInbox::Eml; use POSIX qw(strftime); +use autodie qw(socketpair); +use Carp qw(croak); +use Socket qw(AF_UNIX SOCK_STREAM); +use PublicInbox::IO qw(read_all); + +sub default_branch () { + state $default_branch = do { + my $h = run_qx([qw(git config --global init.defaultBranch)], + { GIT_CONFIG => undef }); + chomp $h; + $h eq '' ? 'refs/heads/master' : "refs/heads/$h"; + } +} sub new { # we can't change arg order, this is documented in POD # and external projects may rely on it: my ($class, $git, $name, $email, $ibx) = @_; - my $ref = 'refs/heads/master'; + my $ref; if ($ibx) { - $ref = $ibx->{ref_head} // 'refs/heads/master'; + $ref = $ibx->{ref_head}; $name //= $ibx->{name}; $email //= $ibx->{-primary_address}; $git //= $ibx->git; @@ -34,7 +46,7 @@ sub new { git => $git, ident => "$name <$email>", mark => 1, - ref => $ref, + ref => $ref // default_branch, ibx => $ibx, path_type => '2/38', # or 'v2' lock_path => "$git->{git_dir}/ssoma.lock", # v2 changes this @@ -45,41 +57,36 @@ sub new { # idempotent start function sub gfi_start { my ($self) = @_; - - return ($self->{in}, $self->{out}) if $self->{pid}; - - my (@ret, $out_r, $out_w); - pipe($out_r, $out_w) or die "pipe failed: $!"; + my $io = $self->{io}; + return $io if $io; + socketpair($io, my $s2, AF_UNIX, SOCK_STREAM, 0); + $io->autoflush(1); $self->lock_acquire; eval { my ($git, $ref) = @$self{qw(git ref)}; local $/ = "\n"; chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $ref)); + die "fatal: rev-parse --revs-only $ref: \$?=$?" if $?; if ($self->{path_type} ne '2/38' && $self->{tip}) { - local $/ = "\0"; - my @t = $git->qx(qw(ls-tree -r -z --name-only), $ref); - chomp @t; - $self->{-tree} = { map { $_ => 1 } @t }; + my $t = $git->qx(qw(ls-tree -r -z --name-only), $ref); + die "fatal: ls-tree -r -z --name-only $ref: \$?=$?" if $?; + $self->{-tree} = { map { $_ => 1 } split(/\0/, $t) }; } - my @cmd = ('git', "--git-dir=$git->{git_dir}", - qw(fast-import --quiet --done --date-format=raw)); - my ($in_r, $pid) = popen_rd(\@cmd, undef, { 0 => $out_r }); - $out_w->autoflush(1); - $self->{in} = $in_r; - $self->{out} = $out_w; - $self->{pid} = $pid; + my $gfi = $git->cmd(qw(fast-import + --quiet --done --date-format=raw)); + my $pid = spawn($gfi, undef, { 0 => $s2, 1 => $s2 }); $self->{nchg} = 0; - @ret = ($in_r, $out_w); + $self->{io} = PublicInbox::IO::attach_pid($io, $pid); }; if ($@) { $self->lock_release; die $@; } - @ret; + $self->{io}; } -sub wfail () { die "write to fast-import failed: $!" } +sub wfail () { croak "write to fast-import failed: $!" } sub now_raw () { time . ' +0000' } @@ -91,60 +98,43 @@ sub norm_body ($) { } # only used for v1 (ssoma) inboxes -sub _check_path ($$$$) { - my ($r, $w, $tip, $path) = @_; +sub _check_path ($$$) { + my ($io, $tip, $path) = @_; return if $tip eq ''; - print $w "ls $tip $path\n" or wfail; + print $io "ls $tip $path\n" or wfail; local $/ = "\n"; - defined(my $info = <$r>) or die "EOF from fast-import: $!"; + my $info = <$io> // die "EOF from fast-import: $!"; $info =~ /\Amissing / ? undef : $info; } -sub _cat_blob ($$$) { - my ($r, $w, $oid) = @_; - print $w "cat-blob $oid\n" or wfail; +sub _cat_blob ($$) { + my ($io, $oid) = @_; + print $io "cat-blob $oid\n" or wfail; local $/ = "\n"; - my $info = <$r>; - defined $info or die "EOF from fast-import / cat-blob: $!"; + my $info = <$io> // die "EOF from fast-import / cat-blob: $!"; $info =~ /\A[a-f0-9]{40,} blob ([0-9]+)\n\z/ or return; - my $left = $1; - my $offset = 0; - my $buf = ''; - my $n; - while ($left > 0) { - $n = read($r, $buf, $left, $offset); - defined($n) or die "read cat-blob failed: $!"; - $n == 0 and die 'fast-export (cat-blob) died'; - $left -= $n; - $offset += $n; - } - $n = read($r, my $lf, 1); - defined($n) or die "read final byte of cat-blob failed: $!"; - die "bad read on final byte: <$lf>" if $lf ne "\n"; - - # fixup some bugginess in old versions: - $buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + my $buf = read_all($io, my $len = $1 + 1); + my $lf = chop $buf; + croak "bad read on final byte: <$lf>" if $lf ne "\n"; \$buf; } sub cat_blob { my ($self, $oid) = @_; - my ($r, $w) = $self->gfi_start; - _cat_blob($r, $w, $oid); + _cat_blob($self->{io} // return, $oid); } sub check_remove_v1 { - my ($r, $w, $tip, $path, $mime) = @_; + my ($io, $tip, $path, $mime) = @_; - my $info = _check_path($r, $w, $tip, $path) or return ('MISSING',undef); + my $info = _check_path($io, $tip, $path) or return ('MISSING',undef); $info =~ m!\A100644 blob ([a-f0-9]{40,})\t!s or die "not blob: $info"; my $oid = $1; - my $msg = _cat_blob($r, $w, $oid) or die "BUG: cat-blob $1 failed"; - my $cur = PublicInbox::Eml->new($msg); - my $cur_s = $cur->header('Subject'); - $cur_s = '' unless defined $cur_s; - my $cur_m = $mime->header('Subject'); - $cur_m = '' unless defined $cur_m; + my $bref = _cat_blob($io, $oid) or die "BUG: cat-blob $1 failed"; + PublicInbox::Eml::strip_from($$bref); + my $cur = PublicInbox::Eml->new($bref); + my $cur_s = $cur->header('Subject') // ''; + my $cur_m = $mime->header('Subject') // ''; if ($cur_s ne $cur_m || norm_body($cur) ne norm_body($mime)) { return ('MISMATCH', $cur); } @@ -153,16 +143,15 @@ sub check_remove_v1 { sub checkpoint { my ($self) = @_; - return unless $self->{pid}; - print { $self->{out} } "checkpoint\n" or wfail; + print { $self->{io} // return } "checkpoint\n" or wfail; undef; } sub progress { my ($self, $msg) = @_; - return unless $self->{pid}; - print { $self->{out} } "progress $msg\n" or wfail; - readline($self->{in}) eq "progress $msg\n" or die + my $io = $self->{io} or return; + print $io "progress $msg\n" or wfail; + readline($io) eq "progress $msg\n" or die "progress $msg not received\n"; undef; } @@ -172,14 +161,14 @@ sub _update_git_info ($$) { # for compatibility with existing ssoma installations # we can probably remove this entirely by 2020 my $git_dir = $self->{git}->{git_dir}; - my @cmd = ('git', "--git-dir=$git_dir"); + my @cmd = @{$self->{git}->cmd}; my $index = "$git_dir/ssoma.index"; if (-e $index && !$ENV{FAST}) { my $env = { GIT_INDEX_FILE => $index }; run_die([@cmd, qw(read-tree -m -v -i), $self->{ref}], $env); } - eval { run_die([@cmd, 'update-server-info']) }; my $ibx = $self->{ibx}; + eval { run_die([@cmd, 'update-server-info']) } if $ibx; if ($ibx && $ibx->version == 1 && -d "$ibx->{inboxdir}/public-inbox" && eval { require PublicInbox::SearchIdx }) { eval { @@ -188,7 +177,10 @@ sub _update_git_info ($$) { }; warn "$ibx->{inboxdir} index failed: $@\n" if $@; } - eval { run_die([@cmd, qw(gc --auto)]) } if $do_gc; + if ($do_gc) { + my @quiet = (-t STDERR ? () : '-q'); + eval { run_die([@cmd, qw(gc --auto), @quiet]) } + } } sub barrier { @@ -209,10 +201,9 @@ sub barrier { # used for v2 sub get_mark { my ($self, $mark) = @_; - die "not active\n" unless $self->{pid}; - my ($r, $w) = $self->gfi_start; - print $w "get-mark $mark\n" or wfail; - defined(my $oid = <$r>) or die "get-mark failed, need git 2.6.0+\n"; + my $io = $self->{io} or croak "not active\n"; + print $io "get-mark $mark\n" or wfail; + my $oid = <$io> // die "get-mark failed, need git 2.6.0+\n"; chomp($oid); $oid; } @@ -229,11 +220,11 @@ sub remove { my $path_type = $self->{path_type}; my ($path, $err, $cur, $blob); - my ($r, $w) = $self->gfi_start; + my $io = gfi_start($self); my $tip = $self->{tip}; if ($path_type eq '2/38') { $path = mid2path(v1_mid0($mime)); - ($err, $cur) = check_remove_v1($r, $w, $tip, $path, $mime); + ($err, $cur) = check_remove_v1($io, $tip, $path, $mime); return ($err, $cur) if $err; } else { my $sref; @@ -245,7 +236,7 @@ sub remove { } my $len = length($$sref); $blob = $self->{mark}++; - print $w "blob\nmark :$blob\ndata $len\n", + print $io "blob\nmark :$blob\ndata $len\n", $$sref, "\n" or wfail; } @@ -253,22 +244,22 @@ sub remove { my $commit = $self->{mark}++; my $parent = $tip =~ /\A:/ ? $tip : undef; unless ($parent) { - print $w "reset $ref\n" or wfail; + print $io "reset $ref\n" or wfail; } my $ident = $self->{ident}; my $now = now_raw(); $msg //= 'rm'; my $len = length($msg) + 1; - print $w "commit $ref\nmark :$commit\n", + print $io "commit $ref\nmark :$commit\n", "author $ident $now\n", "committer $ident $now\n", "data $len\n$msg\n\n", 'from ', ($parent ? $parent : $tip), "\n" or wfail; if (defined $path) { - print $w "D $path\n\n" or wfail; + print $io "D $path\n\n" or wfail; } else { - clean_tree_v2($self, $w, 'd'); - print $w "M 100644 :$blob d\n\n" or wfail; + clean_tree_v2($self, $io, 'd'); + print $io "M 100644 :$blob d\n\n" or wfail; } $self->{nchg}++; (($self->{tip} = ":$commit"), $cur); @@ -328,11 +319,40 @@ sub extract_cmt_info ($;$) { } # kill potentially confusing/misleading headers +our @UNWANTED_HEADERS = (qw(Bytes Lines Content-Length), + qw(Status X-Status)); +our $DROP_UNIQUE_UNSUB; sub drop_unwanted_headers ($) { - my ($mime) = @_; + my ($eml) = @_; + for (@UNWANTED_HEADERS, @PublicInbox::MDA::BAD_HEADERS) { + $eml->header_set($_); + } - $mime->header_set($_) for qw(Bytes Lines Content-Length Status); - $mime->header_set($_) for @PublicInbox::MDA::BAD_HEADERS; + # We don't want public-inbox readers to be able to unsubcribe the + # address which does archiving. WARNING: this breaks DKIM if the + # mailing list sender follows RFC 8058, section 4; but breaking DKIM + # (or have senders ignore RFC 8058 sec. 4) is preferable to having + # saboteurs unsubscribing independent archivists: + if ($DROP_UNIQUE_UNSUB && grep(/\AList-Unsubscribe=One-Click\z/, + $eml->header_raw('List-Unsubscribe-Post'))) { + for (qw(List-Unsubscribe-Post List-Unsubscribe)) { + $eml->header_set($_) + } + } +} + +sub load_config ($;$) { + my ($cfg, $do_exit) = @_; + my $v = $cfg->{lc 'publicinboxImport.dropUniqueUnsubscribe'}; + if (defined $v) { + $DROP_UNIQUE_UNSUB = $cfg->git_bool($v) // do { + warn <<EOM; +E: publicinboxImport.dropUniqueUnsubscribe=$v in $cfg->{-f} is not boolean +EOM + $do_exit //= \&CORE::exit; + $do_exit->(78); # EX_CONFIG + }; + } } # used by V2Writable, too @@ -356,11 +376,11 @@ sub v1_mid0 ($) { $mids->[0]; } sub clean_tree_v2 ($$$) { - my ($self, $w, $keep) = @_; + my ($self, $io, $keep) = @_; my $tree = $self->{-tree} or return; #v2 only delete $tree->{$keep}; foreach (keys %$tree) { - print $w "D $_\n" or wfail; + print $io "D $_\n" or wfail; } %$tree = ($keep => 1); } @@ -379,10 +399,10 @@ sub add { $path = 'm'; } - my ($r, $w) = $self->gfi_start; + my $io = gfi_start($self); my $tip = $self->{tip}; if ($path_type eq '2/38') { - _check_path($r, $w, $tip, $path) and return; + _check_path($io, $tip, $path) and return; } drop_unwanted_headers($mime); @@ -396,48 +416,51 @@ sub add { my $raw_email = $mime->{-public_inbox_raw} // $mime->as_string; my $n = length($raw_email); $self->{bytes_added} += $n; - print $w "blob\nmark :$blob\ndata ", $n, "\n" or wfail; - print $w $raw_email, "\n" or wfail; + print $io "blob\nmark :$blob\ndata $n\n", $raw_email, "\n" or wfail; # v2: we need this for Xapian if ($smsg) { $smsg->{blob} = $self->get_mark(":$blob"); - $smsg->{raw_bytes} = $n; - $smsg->{-raw_email} = \$raw_email; + $smsg->set_bytes($raw_email, $n); + if (my $oidx = delete $smsg->{-oidx}) { # used by LeiStore + my $eidx_git = delete $smsg->{-eidx_git}; + + # we need this sharedkv to dedupe blobs added in the + # same fast-import transaction + my $u = $self->{uniq_skv} //= do { + require PublicInbox::SharedKV; + my $x = PublicInbox::SharedKV->new; + $x->dbh; + $x; + }; + return if !$u->set_maybe($smsg->oidbin, 1); + return if (!$oidx->vivify_xvmd($smsg) && + $eidx_git->check($smsg->{blob})); + } } my $ref = $self->{ref}; my $commit = $self->{mark}++; my $parent = $tip =~ /\A:/ ? $tip : undef; unless ($parent) { - print $w "reset $ref\n" or wfail; + print $io "reset $ref\n" or wfail; } - print $w "commit $ref\nmark :$commit\n", + print $io "commit $ref\nmark :$commit\n", "author $author $at\n", - "committer $self->{ident} $ct\n" or wfail; - print $w "data ", (length($subject) + 1), "\n", + "committer $self->{ident} $ct\n", + "data ", (length($subject) + 1), "\n", $subject, "\n\n" or wfail; if ($tip ne '') { - print $w 'from ', ($parent ? $parent : $tip), "\n" or wfail; + print $io 'from ', ($parent ? $parent : $tip), "\n" or wfail; } - clean_tree_v2($self, $w, $path); - print $w "M 100644 :$blob $path\n\n" or wfail; + clean_tree_v2($self, $io, $path); + print $io "M 100644 :$blob $path\n\n" or wfail; $self->{nchg}++; $self->{tip} = ":$commit"; } -sub run_die ($;$$) { - my ($cmd, $env, $rdr) = @_; - my $pid = spawn($cmd, $env, $rdr); - waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish'; - $? == 0 or die join(' ', @$cmd) . " failed: $?\n"; -} - -my @INIT_FILES = ('HEAD' => "ref: refs/heads/master\n", - 'description' => <<EOD, -Unnamed repository; edit this file 'description' to name the repository. -EOD +my @INIT_FILES = ('HEAD' => undef, # filled in at runtime 'config' => <<EOC); [core] repositoryFormatVersion = 0 @@ -448,32 +471,37 @@ EOD EOC sub init_bare { - my ($dir) = @_; # or self + my ($dir, $head, $fmt) = @_; # or self $dir = $dir->{git}->{git_dir} if ref($dir); require File::Path; - File::Path::mkpath([ map { "$dir/$_" } qw(objects/info refs/heads) ]); - for (my $i = 0; $i < @INIT_FILES; $i++) { - my $f = $dir.'/'.$INIT_FILES[$i++]; + File::Path::make_path(map { $dir.$_ } qw(/objects/info /refs/heads)); + $INIT_FILES[1] //= 'ref: '.default_branch."\n"; + my @fn_contents = @INIT_FILES; + $fn_contents[1] = "ref: refs/heads/$head\n" if defined $head; + $fn_contents[3] = <<EOM if defined($fmt) && $fmt ne 'sha1'; +[core] + repositoryFormatVersion = 1 + filemode = true + bare = true +[extensions] + objectFormat = $fmt +EOM + while (my ($fn, $contents) = splice(@fn_contents, 0, 2)) { + my $f = $dir.'/'.$fn; next if -f $f; - open my $fh, '>', $f or die "open $f: $!"; - print $fh $INIT_FILES[$i] or die "print $f: $!"; - close $fh or die "close $f: $!"; + PublicInbox::IO::write_file '>', $f, $contents; } } # true if locked and active -sub active { !!$_[0]->{out} } +sub active { !!$_[0]->{io} } sub done { my ($self) = @_; - my $w = delete $self->{out} or return; + my $io = delete $self->{io} or return; eval { - my $r = delete $self->{in} or die 'BUG: missing {in} when done'; - print $w "done\n" or wfail; - my $pid = delete $self->{pid} or - die 'BUG: missing {pid} when done'; - waitpid($pid, 0) == $pid or die 'fast-import did not finish'; - $? == 0 or die "fast-import failed: $?"; + print $io "done\n" or wfail; + $io->close or croak "close fast-import \$?=$?"; # reaps }; my $wait_err = $@; my $nchg = delete $self->{nchg}; @@ -486,16 +514,10 @@ sub done { die $wait_err if $wait_err; } -sub atfork_child { - my ($self) = @_; - foreach my $f (qw(in out)) { - next unless defined($self->{$f}); - close $self->{$f} or die "failed to close import[$f]: $!\n"; - } -} +sub atfork_child { (delete($_[0]->{io}) // return)->close } -sub digest2mid ($$) { - my ($dig, $hdr) = @_; +sub digest2mid ($$;$) { + my ($dig, $hdr, $fallback_time) = @_; my $b64 = $dig->clone->b64digest; # Make our own URLs nicer: # See "Base 64 Encoding with URL and Filename Safe Alphabet" in RFC4648 @@ -504,7 +526,7 @@ sub digest2mid ($$) { # Add a date prefix to prevent a leading '-' in case that trips # up some tools (e.g. if a Message-ID were a expected as a # command-line arg) - my $dt = msg_datestamp($hdr); + my $dt = msg_datestamp($hdr, $fallback_time); $dt = POSIX::strftime('%Y%m%d%H%M%S', gmtime($dt)); "$dt.$b64" . '@z'; } @@ -545,7 +567,7 @@ sub replace_oids { my $git = $self->{git}; my @export = (qw(fast-export --no-data --use-done-feature), $old); my $rd = $git->popen(@export); - my ($r, $w) = $self->gfi_start; + my $io = gfi_start($self); my @buf; my $nreplace = 0; my @oids; @@ -556,17 +578,14 @@ sub replace_oids { push @buf, "reset $tmp\n"; } elsif (/^commit (?:.+)/) { if (@buf) { - print $w @buf or wfail; + print $io @buf or wfail; @buf = (); } push @buf, "commit $tmp\n"; } elsif (/^data ([0-9]+)/) { # only commit message, so $len is small: - my $len = $1; # + 1 for trailing "\n" push @buf, $_; - my $n = read($rd, my $buf, $len) or die "read: $!"; - $len == $n or die "short read ($n < $len)"; - push @buf, $buf; + push @buf, read_all($rd, my $len = $1); } elsif (/^M 100644 ([a-f0-9]+) (\w+)/) { my ($oid, $path) = ($1, $2); $tree->{$path} = 1; @@ -593,7 +612,7 @@ sub replace_oids { rewrite_commit($self, \@oids, \@buf, $mime); $nreplace++; } - print $w @buf, "\n" or wfail; + print $io @buf, "\n" or wfail; @buf = (); } elsif ($_ eq "done\n") { $done = 1; @@ -604,15 +623,15 @@ sub replace_oids { push @buf, $_; } } - close $rd or die "close fast-export failed: $?"; + $rd->close or die "E: git @export (\$?=$?)"; if (@buf) { - print $w @buf or wfail; + print $io @buf or wfail; } die 'done\n not seen from fast-export' unless $done; chomp(my $cmt = $self->get_mark(":$mark")) if $nreplace; $self->{nchg} = 0; # prevent _update_git_info until update-ref: $self->done; - my @git = ('git', "--git-dir=$git->{git_dir}"); + my @git = @{$git->cmd}; run_die([@git, qw(update-ref), $old, $tmp]) if $nreplace; diff --git a/lib/PublicInbox/In2Tie.pm b/lib/PublicInbox/In2Tie.pm index 7dee3627..3689432b 100644 --- a/lib/PublicInbox/In2Tie.pm +++ b/lib/PublicInbox/In2Tie.pm @@ -1,10 +1,10 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # used to ensure PublicInbox::DS can call fileno() as a function # on Linux::Inotify2 objects package PublicInbox::In2Tie; -use strict; +use v5.12; use Symbol qw(gensym); sub io { diff --git a/lib/PublicInbox/In3Event.pm b/lib/PublicInbox/In3Event.pm new file mode 100644 index 00000000..f93dc0da --- /dev/null +++ b/lib/PublicInbox/In3Event.pm @@ -0,0 +1,24 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# duck-type compatible with Linux::Inotify2::Event for pure Perl +# PublicInbox::Inotify3 w/o callback support +package PublicInbox::In3Event; +use v5.12; + +sub w { $_[0]->[2] } # PublicInbox::In3Watch +sub mask { $_[0]->[0] } +sub name { $_[0]->[1] } + +sub fullname { + my ($name, $wname) = ($_[0]->[1], $_[0]->[2]->name); + length($name) ? "$wname/$name" : $wname; +} + +my $buf = ''; +while (my ($sym, $mask) = each %PublicInbox::Inotify3::events) { + $buf .= "sub $sym { \$_[0]->[0] & $mask }\n"; +} +eval $buf; + +1; diff --git a/lib/PublicInbox/In3Watch.pm b/lib/PublicInbox/In3Watch.pm new file mode 100644 index 00000000..bdb91869 --- /dev/null +++ b/lib/PublicInbox/In3Watch.pm @@ -0,0 +1,20 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# duck-type compatible with Linux::Inotify2::Watch for pure Perl +# PublicInbox::Inotify3 for our needs, only +package PublicInbox::In3Watch; +use v5.12; + +sub mask { $_[0]->[1] } +sub name { $_[0]->[2] } + +sub cancel { + my ($self) = @_; + my ($wd, $in3) = @$self[0, 3]; + $in3 or return 1; # already canceled + pop @$self; + $in3->rm_watch($wd); +} + +1; diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index b0894a7d..dd689221 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -1,102 +1,53 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Represents a public-inbox (which may have multiple mailing addresses) package PublicInbox::Inbox; use strict; -use warnings; +use v5.10.1; use PublicInbox::Git; use PublicInbox::MID qw(mid2path); use PublicInbox::Eml; +use List::Util qw(max); +use Carp qw(croak); +use PublicInbox::Compat qw(uniqstr); -# Long-running "git-cat-file --batch" processes won't notice -# unlinked packs, so we need to restart those processes occasionally. -# Xapian and SQLite file handles are mostly stable, but sometimes an -# admin will attempt to replace them atomically after compact/vacuum -# and we need to be prepared for that. -my $cleanup_timer; -my $cleanup_avail = -1; # 0, or 1 -my $have_devel_peek; -my $CLEANUP = {}; # string(inbox) -> inbox - -sub git_cleanup ($) { - my ($self) = @_; - my $git = $self->{git} or return; - $git->cleanup; +# in case DBs get replaced (Xapcmd does it for v1) +sub check_inodes ($) { + for (qw(over mm)) { $_[0]->{$_}->check_inodes if $_[0]->{$_} } } -sub cleanup_task () { - $cleanup_timer = undef; - my $next = {}; - for my $ibx (values %$CLEANUP) { - my $again; - if ($have_devel_peek) { - foreach my $f (qw(search)) { - # we bump refcnt by assigning tmp, here: - my $tmp = $ibx->{$f} or next; - next if Devel::Peek::SvREFCNT($tmp) > 2; - delete $ibx->{$f}; - # refcnt is zero when tmp is out-of-scope - } - } - git_cleanup($ibx); - if (my $gits = $ibx->{-repo_objs}) { - foreach my $git (@$gits) { - $again = 1 if $git->cleanup; - } - } - check_inodes($ibx); - if ($have_devel_peek) { - $again ||= !!$ibx->{search}; - } - $next->{"$ibx"} = $ibx if $again; - } - $CLEANUP = $next; +# search/over/mm hold onto FDs and description+cloneurl may get updated. +# creating long-lived allocations in the same phase as short-lived +# allocations also leads to fragmentation, so we don't want some stuff +# living too long. +sub do_cleanup { + my ($ibx) = @_; + my ($srch) = delete @$ibx{qw(search over mm description cloneurl)}; + $srch //= $ibx; # extsearch + delete @$srch{qw(xdb qp)}; } -sub cleanup_possible () { +sub _cleanup_later ($) { # no need to require DS, here, if it were enabled another # module would've require'd it, already - eval { PublicInbox::DS::in_loop() } or return 0; - - eval { - require Devel::Peek; # needs separate package in Fedora - $have_devel_peek = 1; - }; - 1; -} - -sub _cleanup_later ($) { - my ($self) = @_; - $cleanup_avail = cleanup_possible() if $cleanup_avail < 0; - return if $cleanup_avail != 1; - $cleanup_timer //= PublicInbox::DS::later(\&cleanup_task); - $CLEANUP->{"$self"} = $self; -} - -sub _set_uint ($$$) { - my ($opts, $field, $default) = @_; - my $val = $opts->{$field}; - if (defined $val) { - $val = $val->[-1] if ref($val) eq 'ARRAY'; - $val = undef if $val !~ /\A[0-9]+\z/; - } - $opts->{$field} = $val || $default; + eval { PublicInbox::DS::in_loop() } and + PublicInbox::DS::add_uniq_timer($_[0]+0, 30, \&do_cleanup, @_) } sub _set_limiter ($$$) { - my ($self, $pi_config, $pfx) = @_; + my ($self, $pi_cfg, $pfx) = @_; my $lkey = "-${pfx}_limiter"; - $self->{$lkey} ||= do { + $self->{$lkey} //= do { # full key is: publicinbox.$NAME.httpbackendmax my $mkey = $pfx.'max'; my $val = $self->{$mkey} or return; my $lim; if ($val =~ /\A[0-9]+\z/) { - require PublicInbox::Qspawn; - $lim = PublicInbox::Qspawn::Limiter->new($val); + require PublicInbox::Limiter; + $lim = PublicInbox::Limiter->new($val); } elsif ($val =~ /\A[a-z][a-z0-9]*\z/) { - $lim = $pi_config->limiter($val); + $lim = $pi_cfg->limiter($val); warn "$mkey limiter=$val not found\n" if !$lim; } else { warn "$mkey limiter=$val not understood\n"; @@ -110,48 +61,46 @@ sub new { my $v = $opts->{address} ||= [ 'public-inbox@example.com' ]; my $p = $opts->{-primary_address} = ref($v) eq 'ARRAY' ? $v->[0] : $v; $opts->{domain} = ($p =~ /\@(\S+)\z/) ? $1 : 'localhost'; - my $pi_config = delete $opts->{-pi_config}; - _set_limiter($opts, $pi_config, 'httpbackend'); - _set_uint($opts, 'feedmax', 25); - $opts->{nntpserver} ||= $pi_config->{'publicinbox.nntpserver'}; - my $dir = $opts->{inboxdir}; - if (defined $dir && -f "$dir/inbox.lock") { - $opts->{version} = 2; + my $pi_cfg = delete $opts->{-pi_cfg}; + _set_limiter($opts, $pi_cfg, 'httpbackend'); + my $fmax = $opts->{feedmax}; + if (defined($fmax) && $fmax =~ /\A[0-9]+\z/) { + $opts->{feedmax} += 0; + } else { + delete $opts->{feedmax}; } - # allow any combination of multi-line or comma-delimited hide entries - my $hide = {}; - if (defined(my $h = $opts->{hide})) { - foreach my $v (@$h) { - $hide->{$_} = 1 foreach (split(/\s*,\s*/, $v)); - } - $opts->{-hide} = $hide; + for $v (@{delete($opts->{hide}) // []}) { + $opts->{-'hide_'.$_} = 1 for split(/\s*,\s*/, $v); } bless $opts, $class; } -sub version { $_[0]->{version} // 1 } +sub version { + $_[0]->{version} //= -f "$_[0]->{inboxdir}/inbox.lock" ? 2 : 1 +} sub git_epoch { - my ($self, $epoch) = @_; - $self->version == 2 or return; - $self->{"$epoch.git"} ||= do { + my ($self, $epoch) = @_; # v2-only, callers always supply $epoch + $self->{"$epoch.git"} //= do { my $git_dir = "$self->{inboxdir}/git/$epoch.git"; + return unless -d $git_dir; my $g = PublicInbox::Git->new($git_dir); - $g->{-httpbackend_limiter} = $self->{-httpbackend_limiter}; - # no cleanup needed, we never cat-file off this, only clone + my $lim = $self->{-httpbackend_limiter}; + $g->{-httpbackend_limiter} = $lim if $lim; + # caller must manually cleanup when done $g; }; } sub git { my ($self) = @_; - $self->{git} ||= do { + $self->{git} //= do { my $git_dir = $self->{inboxdir}; $git_dir .= '/all.git' if $self->version == 2; my $g = PublicInbox::Git->new($git_dir); - $g->{-httpbackend_limiter} = $self->{-httpbackend_limiter}; - _cleanup_later($self); + my $lim = $self->{-httpbackend_limiter}; + $g->{-httpbackend_limiter} = $lim if $lim; $g; }; } @@ -160,160 +109,167 @@ sub max_git_epoch { my ($self) = @_; return if $self->version < 2; my $cur = $self->{-max_git_epoch}; - my $changed = git($self)->alternates_changed; - if (!defined($cur) || $changed) { - git_cleanup($self) if $changed; + my $changed; + if (!defined($cur) || ($changed = git($self)->alternates_changed)) { + $self->{git}->cleanup if $changed; my $gits = "$self->{inboxdir}/git"; if (opendir my $dh, $gits) { - my $max = -1; - while (defined(my $git_dir = readdir($dh))) { - $git_dir =~ m!\A([0-9]+)\.git\z! or next; - $max = $1 if $1 > $max; - } - $cur = $self->{-max_git_epoch} = $max if $max >= 0; - } else { - warn "opendir $gits failed: $!\n"; + my $max = max(map { + substr($_, 0, -4) + 0; # drop ".git" suffix + } grep(/\A[0-9]+\.git\z/, readdir($dh))) // return; + $cur = $self->{-max_git_epoch} = $max; } } $cur; } -sub mm { +sub mm_file { my ($self) = @_; - $self->{mm} ||= eval { + my $d = $self->{inboxdir}; + ($self->version >= 2 ? $d : "$d/public-inbox").'/msgmap.sqlite3'; +} + +sub mm { + my ($self, $req) = @_; + $self->{mm} //= eval { require PublicInbox::Msgmap; - my $dir = $self->{inboxdir}; - if ($self->version >= 2) { - PublicInbox::Msgmap->new_file("$dir/msgmap.sqlite3"); - } else { - PublicInbox::Msgmap->new($dir); - } - }; + _cleanup_later($self); + PublicInbox::Msgmap->new_file($self); + } // ($req ? croak("E: $@") : undef); } -sub search ($;$$) { - my ($self, $over_only, $ctx) = @_; - my $srch = $self->{search} ||= eval { +sub search { + my ($self) = @_; + $self->{search} // eval { _cleanup_later($self); require PublicInbox::Search; - PublicInbox::Search->new($self); - }; - ($over_only || eval { $srch->xdb }) ? $srch : do { - $ctx and $ctx->{env}->{'psgi.errors'}->print(<<EOF); -`$self->{name}' search went away unexpectedly -EOF - undef; + my $srch = PublicInbox::Search->new($self); + (eval { $srch->xdb }) ? ($self->{search} = $srch) : undef; }; } +# isrch is preferred for read-only interfaces if available since it +# reduces kernel cache and FD overhead +sub isrch { $_[0]->{isrch} // search($_[0]) } + sub over { - $_[0]->{over} //= eval { - my $srch = search($_[0], 1) or return; + my ($self, $req) = @_; + $self->{over} // eval { + my $srch = $self->{search} // do { + require PublicInbox::Search; + PublicInbox::Search->new($self); + }; + _cleanup_later($self); my $over = PublicInbox::Over->new("$srch->{xpfx}/over.sqlite3"); $over->dbh; # may fail - $over; - }; -} - -sub try_cat { - my ($path) = @_; - my $rv = ''; - if (open(my $fh, '<', $path)) { - local $/; - $rv = <$fh>; - } - $rv; + $self->{over} = $over; + } // ($req ? croak("E: $@") : undef); } sub description { my ($self) = @_; - ($self->{description} //= do { - my $desc = try_cat("$self->{inboxdir}/description"); - local $/ = "\n"; - chomp $desc; - utf8::decode($desc); - $desc =~ s/\s+/ /smg; - $desc eq '' ? undef : $desc; - }) // '($INBOX_DIR/description missing)'; + ($self->{description} //= + PublicInbox::Git::cat_desc("$self->{inboxdir}/description")) // + '($INBOX_DIR/description missing)'; } sub cloneurl { my ($self) = @_; - ($self->{cloneurl} //= do { - my $s = try_cat("$self->{inboxdir}/cloneurl"); - my @urls = split(/\s+/s, $s); - scalar(@urls) ? \@urls : undef - }) // []; + $self->{cloneurl} // do { + my @urls = split(/\s+/s, + PublicInbox::IO::try_cat "$self->{inboxdir}/cloneurl"); + scalar(@urls) ? ($self->{cloneurl} = \@urls) : undef; + } // []; } sub base_url { my ($self, $env) = @_; # env - PSGI env - if ($env) { + if ($env && $env->{'psgi.url_scheme'}) { my $url = PublicInbox::Git::host_prefix_url($env, ''); # for mount in Plack::Builder $url .= '/' if $url !~ m!/\z!; return $url .= $self->{name} . '/'; } - # called from a non-PSGI environment (e.g. NNTP/POP3): - $self->{-base_url} ||= do { - my $url = $self->{url}->[0] or return undef; - # expand protocol-relative URLs to HTTPS if we're - # not inside a web server - $url = "https:$url" if $url =~ m!\A//!; - $url .= '/' if $url !~ m!/\z!; - $url; - }; + # called from a non-PSGI environment or cross-inbox environment + # where multiple inboxes can have different domains + my $url = $self->{url} // return undef; + $url = $url->[0] // return undef; + # expand protocol-relative URLs to HTTPS if we're + # not inside a web server + substr($url, 0, 0, 'https:') if substr($url, 0, 2) eq '//'; + $url .= '/' if substr($url, -1, 1) ne '/'; + $url; +} + +# imapserver, nntpserver configs are used here: +sub _x_url ($$$) { + my ($self, $x, $ctx) = @_; # $x is "imap" or "nntp" + # no checking for nntp_usable here, we can point entirely + # to non-local servers or users run by a different user + my $ns = $self->{"${x}server"} // + $ctx->{www}->{pi_cfg}->get_all("publicinbox.${x}server"); + my $group = $self->{newsgroup}; + my @urls; + if ($ns && $group) { + @urls = map { + my $u = m!\A${x}s?://! ? $_ : "$x://$_"; + $u .= '/' if $u !~ m!/\z!; + $u.$group; + } @$ns; + } + if (my $mirrors = $self->{"${x}mirror"}) { + my @m; + for (@$mirrors) { + my $u = m!\A${x}s?://! ? $_ : "$x://$_"; + if ($u =~ m!\A${x}s?://[^/]+/?\z!) { + if ($group) { + $u .= '/' if $u !~ m!/\z!; + $u .= $group; + } else { # n.b. IMAP and POP3 use "newsgroup" + warn <<EOM; +publicinbox.$self->{name}.${x}mirror=$_ missing newsgroup name +EOM + } + } + # else: allow full URLs like: + # nntp://news.example.com/alt.example + push @m, $u; + } + @urls = uniqstr @urls, @m; + } + \@urls; } -sub nntp_url { - my ($self) = @_; - $self->{-nntp_url} ||= do { - # no checking for nntp_usable here, we can point entirely - # to non-local servers or users run by a different user - my $ns = $self->{nntpserver}; +# my ($self, $ctx) = @_; +sub imap_url { $_[0]->{-imap_url} //= _x_url($_[0], 'imap', $_[1]) } +sub nntp_url { $_[0]->{-nntp_url} //= _x_url($_[0], 'nntp', $_[1]) } + +sub pop3_url { + my ($self, $ctx) = @_; + $self->{-pop3_url} //= do { + my $ps = $self->{'pop3server'} // + $ctx->{www}->{pi_cfg}->get_all('publicinbox.pop3server'); my $group = $self->{newsgroup}; my @urls; - if ($ns && $group) { - $ns = [ $ns ] if ref($ns) ne 'ARRAY'; - @urls = map { - my $u = m!\Anntps?://! ? $_ : "nntp://$_"; - $u .= '/' if $u !~ m!/\z!; - $u.$group; - } @$ns; - } - - my $mirrors = $self->{nntpmirror}; - if ($mirrors) { - my @m; - foreach (@$mirrors) { - my $u = m!\Anntps?://! ? $_ : "nntp://$_"; - if ($u =~ m!\Anntps?://[^/]+/?\z!) { - if ($group) { - $u .= '/' if $u !~ m!/\z!; - $u .= $group; - } else { - warn -"publicinbox.$self->{name}.nntpmirror=$_ missing newsgroup name\n"; - } - } - # else: allow full URLs like: - # nntp://news.example.com/alt.example - push @m, $u; - } - - # List::Util::uniq requires Perl 5.26+, maybe we - # can use it by 2030 or so - my %seen; - @urls = grep { !$seen{$_}++ } (@urls, @m); + ($ps && $group) and + @urls = map { m!\Apop3?s?://! ? $_ : "pop3://$_" } @$ps; + if (my $mi = $self->{'pop3mirror'}) { + my @m = map { m!\Apop3?s?://! ? $_ : "pop3://$_" } @$mi; + @urls = uniqstr @urls, @m; } + my $n = 0; + for (@urls) { $n += s!/+\z!! } + warn <<EOM if $n; +W: pop3server and/or pop3mirror URLs should not end with trailing slash `/' +EOM \@urls; - }; + } } sub nntp_usable { my ($self) = @_; my $ret = mm($self) && over($self); - $self->{mm} = $self->{over} = $self->{search} = undef; + delete @$self{qw(mm over search)}; $ret; } @@ -328,53 +284,42 @@ sub msg_by_smsg ($$) { # ghosts may have undef smsg (from SearchThread.node) or # no {blob} field - return unless defined $smsg; - defined(my $blob = $smsg->{blob}) or return; - - git($self)->cat_file($blob); + $smsg // return; + $self->git->cat_file($smsg->{blob} // return); } sub smsg_eml { my ($self, $smsg) = @_; my $bref = msg_by_smsg($self, $smsg) or return; my $eml = PublicInbox::Eml->new($bref); - $smsg->populate($eml) unless exists($smsg->{num}); # v1 w/o SQLite + $smsg->{num} // $smsg->populate($eml); $eml; } -sub mid2num($$) { - my ($self, $mid) = @_; - my $mm = mm($self) or return; - $mm->num_for($mid); -} - sub smsg_by_mid ($$) { my ($self, $mid) = @_; - my $over = over($self) or return; - # favor the Message-ID we used for the NNTP article number: - defined(my $num = mid2num($self, $mid)) or return; - my $smsg = $over->get_art($num) or return; - PublicInbox::Smsg::psgi_cull($smsg); + my $over = $self->over or return; + my $smsg; + if (my $mm = $self->mm) { + # favor the Message-ID we used for the NNTP article number: + my $num = $mm->num_for($mid) // return; + $smsg = $over->get_art($num); + } else { + my ($id, $prev); + $smsg = $over->next_by_mid($mid, \$id, \$prev); + } + $smsg ? PublicInbox::Smsg::psgi_cull($smsg) : undef; } sub msg_by_mid ($$) { my ($self, $mid) = @_; - - over($self) or - return msg_by_path($self, mid2path($mid)); - my $smsg = smsg_by_mid($self, $mid); - $smsg ? msg_by_smsg($self, $smsg) : undef; -} - -sub recent { - my ($self, $opts, $after, $before) = @_; - over($self)->recent($opts, $after, $before); + $smsg ? msg_by_smsg($self, $smsg) : msg_by_path($self, mid2path($mid)); } sub modified { my ($self) = @_; - if (my $over = over($self)) { + if (my $over = $self->over) { my $msgs = $over->recent({limit => 1}); if (my $smsg = $msgs->[0]) { return $smsg->{ts}; @@ -388,7 +333,7 @@ sub modified { # (pathname is NOT public, but prefix is used for Xapian queries) sub altid_map ($) { my ($self) = @_; - $self->{-altid_map} //= eval { + eval { require PublicInbox::AltId; my $altid = $self->{altid} or return {}; my %h = map {; @@ -410,22 +355,43 @@ sub unsubscribe_unlock { delete $self->{unlock_subs}->{$ident}; } -sub check_inodes ($) { - my ($self) = @_; - for (qw(over mm)) { # TODO: search - $self->{$_}->check_inodes if $self->{$_}; - } -} - # called by inotify sub on_unlock { my ($self) = @_; - check_inodes($self); + check_inodes($self); # DB files may be replaced while holding lock my $subs = $self->{unlock_subs} or return; - for (values %$subs) { - eval { $_->on_inbox_unlock($self) }; + for my $obj (values %$subs) { + eval { $obj->on_inbox_unlock($self) }; warn "E: $@ ($self->{inboxdir})\n" if $@; } } +sub uidvalidity { $_[0]->{uidvalidity} //= eval { $_[0]->mm->created_at } } + +sub eidx_key { $_[0]->{newsgroup} // $_[0]->{inboxdir} } + +# only used by NNTP, so we need ->mm anyways +sub art_min { $_[0]->{-art_min} //= eval { $_[0]->mm(1)->min } } + +# used by IMAP, too, which tries to avoid ->mm (but ->{mm} is likely +# faster since it's smaller iff available) +sub art_max { + $_[0]->{-art_max} //= eval { $_[0]->{mm}->max } // + eval { $_[0]->over(1)->max }; +} + +sub mailboxid { # rfc 8474, 8620, 8621 + my ($self, $imap_slice) = @_; + my $pfx = defined($imap_slice) ? $self->{newsgroup} : $self->{name}; + utf8::encode($pfx); # to octets + # RFC 8620, 1.2 recommends not starting with dash or digits + # "A good solution to these issues is to prefix every id with a single + # alphabetical character." + 'M'.join('', map { sprintf('%02x', ord) } split(//, $pfx)) . + (defined($imap_slice) ? sprintf('-%x', $imap_slice) : '') . + sprintf('-%x', uidvalidity($self) // 0) +} + +sub thing_type { 'public inbox' } + 1; diff --git a/lib/PublicInbox/InboxIdle.pm b/lib/PublicInbox/InboxIdle.pm index 0cdd2e2a..3c4d4a68 100644 --- a/lib/PublicInbox/InboxIdle.pm +++ b/lib/PublicInbox/InboxIdle.pm @@ -1,20 +1,18 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # fields: -# pi_config: PublicInbox::Config ref # inot: Linux::Inotify2-like object # pathmap => { inboxdir => [ ibx, watch1, watch2, watch3... ] } mapping package PublicInbox::InboxIdle; -use strict; +use v5.12; use parent qw(PublicInbox::DS); -use Cwd qw(abs_path); -use PublicInbox::Syscall qw(EPOLLIN EPOLLET); +use PublicInbox::Syscall qw(EPOLLIN); my $IN_MODIFY = 0x02; # match Linux inotify my $ino_cls; -if ($^O eq 'linux' && eval { require Linux::Inotify2; 1 }) { - $IN_MODIFY = Linux::Inotify2::IN_MODIFY(); - $ino_cls = 'Linux::Inotify2'; +if ($^O eq 'linux' && eval { require PublicInbox::Inotify }) { + $IN_MODIFY = PublicInbox::Inotify::IN_MODIFY(); + $ino_cls = 'PublicInbox::Inotify'; } elsif (eval { require PublicInbox::KQNotify }) { $IN_MODIFY = PublicInbox::KQNotify::NOTE_WRITE(); $ino_cls = 'PublicInbox::KQNotify'; @@ -23,46 +21,57 @@ require PublicInbox::In2Tie if $ino_cls; sub in2_arm ($$) { # PublicInbox::Config::each_inbox callback my ($ibx, $self) = @_; - my $dir = abs_path($ibx->{inboxdir}); - if (!defined($dir)) { - warn "W: $ibx->{inboxdir} not watched: $!\n"; - return; - } + my $dir = $ibx->{inboxdir}; my $inot = $self->{inot}; my $cur = $self->{pathmap}->{$dir} //= []; + my $lock = "$dir/".($ibx->version >= 2 ? 'inbox.lock' : 'ssoma.lock'); # transfer old subscriptions to the current inbox, cancel the old watch - if (my $old_ibx = $cur->[0]) { - $ibx->{unlock_subs} and - die "BUG: $dir->{unlock_subs} should not exist"; + my $old_ibx = $cur->[0]; + $cur->[0] = $ibx; + if ($old_ibx) { + my $u = $ibx->{unlock_subs}; $ibx->{unlock_subs} = $old_ibx->{unlock_subs}; + %{$ibx->{unlock_subs}} = (%$u, %{$ibx->{unlock_subs}}) if $u; + + # *::Inotify*::Watch::name matches if watches are the + # same, no point in replacing a watch of the same name + if ($cur->[1]->name eq $lock) { + $self->{on_unlock}->{$lock} = $ibx; + return; + } + # rare, name changed (v1 inbox converted to v2) $cur->[1]->cancel; # Linux::Inotify2::Watch::cancel } - $cur->[0] = $ibx; - my $lock = "$dir/".($ibx->version >= 2 ? 'inbox.lock' : 'ssoma.lock'); if (my $w = $cur->[1] = $inot->watch($lock, $IN_MODIFY)) { $self->{on_unlock}->{$w->name} = $ibx; } else { warn "E: ".ref($inot)."->watch($lock, IN_MODIFY) failed: $!\n"; + warn <<"" if $!{ENOSPC} && $^O eq 'linux'; +# consider increasing /proc/sys/fs/inotify/max_user_watches + } # TODO: detect deleted packs (and possibly other files) } sub refresh { - my ($self, $pi_config) = @_; - $pi_config->each_inbox(\&in2_arm, $self); + my ($self, $pi_cfg) = @_; + $pi_cfg->each_inbox(\&in2_arm, $self); } +# internal API for ease-of-use +sub watch_inbox { in2_arm($_[1], $_[0]) }; + sub new { - my ($class, $pi_config) = @_; + my ($class, $pi_cfg) = @_; my $self = bless {}, $class; my $inot; if ($ino_cls) { $inot = $ino_cls->new or die "E: $ino_cls->new: $!"; my $io = PublicInbox::In2Tie::io($inot); - $self->SUPER::new($io, EPOLLIN | EPOLLET); + $self->SUPER::new($io, EPOLLIN); } else { require PublicInbox::FakeInotify; $inot = PublicInbox::FakeInotify->new; @@ -70,7 +79,7 @@ sub new { $self->{inot} = $inot; $self->{pathmap} = {}; # inboxdir => [ ibx, watch1, watch2, watch3...] $self->{on_unlock} = {}; # lock path => ibx - refresh($self, $pi_config); + refresh($self, $pi_cfg) if $pi_cfg; PublicInbox::FakeInotify::poll_once($self) if !$ino_cls; $self; } @@ -78,10 +87,11 @@ sub new { sub event_step { my ($self) = @_; eval { - my @events = $self->{inot}->read; # Linux::Inotify2::read + my @events = $self->{inot}->read; # PublicInbox::Inotify3::read my $on_unlock = $self->{on_unlock}; for my $ev (@events) { - if (my $ibx = $on_unlock->{$ev->fullname}) { + my $fn = $ev->fullname // next; # cancelled + if (my $ibx = $on_unlock->{$fn}) { $ibx->on_unlock; } } diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index 752f1997..8e95cb28 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -1,23 +1,17 @@ -# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Extends read-only Inbox for writing package PublicInbox::InboxWritable; use strict; use v5.10.1; -use parent qw(PublicInbox::Inbox Exporter); +use parent qw(PublicInbox::Inbox PublicInbox::Umask Exporter); use PublicInbox::Import; +use PublicInbox::IO qw(read_all); use PublicInbox::Filter::Base qw(REJECT); use Errno qw(ENOENT); -our @EXPORT_OK = qw(eml_from_path warn_ignore_cb); - -use constant { - PERM_UMASK => 0, - OLD_PERM_GROUP => 1, - OLD_PERM_EVERYBODY => 2, - PERM_GROUP => 0660, - PERM_EVERYBODY => 0664, -}; +our @EXPORT_OK = qw(eml_from_path); +use Fcntl qw(O_RDONLY O_NONBLOCK); sub new { my ($class, $ibx, $creat_opt) = @_; @@ -46,12 +40,13 @@ sub _init_v1 { require PublicInbox::Msgmap; my $sidx = PublicInbox::SearchIdx->new($self, 1); # just create $sidx->begin_txn_lazy; + my $mm = PublicInbox::Msgmap->new_file($self, 1); if (defined $skip_artnum) { - my $mm = PublicInbox::Msgmap->new($self->{inboxdir}, 1); $mm->{dbh}->begin_work; $mm->skip_artnum($skip_artnum); $mm->{dbh}->commit; } + undef $mm; # ->created_at set $sidx->commit_txn_lazy; } else { open my $fh, '>>', "$self->{inboxdir}/ssoma.lock" or @@ -64,7 +59,6 @@ sub init_inbox { if ($self->version == 1) { my $dir = assert_usable_dir($self); PublicInbox::Import::init_bare($dir); - $self->umask_prepare; $self->with_umask(\&_init_v1, $self, $skip_artnum); } else { my $v2w = importer($self); @@ -102,7 +96,7 @@ sub filter { $im->done; } - my @args = (-inbox => $self); + my @args = (ibx => $self); # basic line splitting, only # Perhaps we can have proper quote splitting one day... ($f, @args) = split(/\s+/, $f) if $f =~ /\s+/; @@ -118,184 +112,98 @@ sub filter { undef; } -sub is_maildir_basename ($) { - my ($bn) = @_; - return 0 if $bn !~ /\A[a-zA-Z0-9][\-\w:,=\.]+\z/; - if ($bn =~ /:2,([A-Z]+)\z/i) { - my $flags = $1; - return 0 if $flags =~ /[DT]/; # no [D]rafts or [T]rashed mail - } - 1; -} - -sub is_maildir_path ($) { - my ($path) = @_; - my @p = split(m!/+!, $path); - (is_maildir_basename($p[-1]) && -f $path) ? 1 : 0; -} - sub eml_from_path ($) { my ($path) = @_; - if (open my $fh, '<', $path) { - my $str = do { local $/; <$fh> } or return; - PublicInbox::Eml->new(\$str); + if (sysopen(my $fh, $path, O_RDONLY|O_NONBLOCK)) { + return unless -f $fh && -s _; # no FIFOs or directories + PublicInbox::Eml->new(\(my $str = read_all($fh, -s _))); } else { # ENOENT is common with Maildir warn "failed to open $path: $!\n" if $! != ENOENT; undef; } } +sub _each_maildir_eml { + my ($fn, $kw, $eml, $im, $self) = @_; + return if grep(/\Adraft\z/, @$kw); + if ($self && (my $filter = $self->filter($im))) { + my $ret = $filter->scrub($eml) or return; + return if $ret == REJECT(); + $eml = $ret; + } + $im->add($eml); +} + +# XXX does anybody use this? sub import_maildir { my ($self, $dir) = @_; - my $im = $self->importer(1); - foreach my $sub (qw(cur new tmp)) { -d "$dir/$sub" or die "$dir is not a Maildir (missing $sub)\n"; } - foreach my $sub (qw(cur new)) { - opendir my $dh, "$dir/$sub" or die "opendir $dir/$sub: $!\n"; - while (defined(my $fn = readdir($dh))) { - next unless is_maildir_basename($fn); - my $mime = eml_from_path("$dir/$fn") or next; - - if (my $filter = $self->filter($im)) { - my $ret = $filter->scrub($mime) or return; - return if $ret == REJECT(); - $mime = $ret; - } - $im->add($mime); - } - } + my $im = $self->importer(1); + my @self = $self->filter($im) ? ($self) : (); + require PublicInbox::MdirReader; + PublicInbox::MdirReader->new->maildir_each_eml($dir, + \&_each_maildir_eml, $im, @self); $im->done; } -# asctime: From example@example.com Fri Jun 23 02:56:55 2000 -my $from_strict = qr/^From \S+ +\S+ \S+ +\S+ [^:]+:[^:]+:[^:]+ [^:]+/; - -sub mb_add ($$$$) { - my ($im, $variant, $filter, $msg) = @_; - $$msg =~ s/(\r?\n)+\z/$1/s; - if ($variant eq 'mboxrd') { - $$msg =~ s/^>(>*From )/$1/gms; - } elsif ($variant eq 'mboxo') { - $$msg =~ s/^>From /From /gms; - } - my $mime = PublicInbox::Eml->new($msg); +sub _mbox_eml_cb { # MboxReader->mbox* callback + my ($eml, $im, $filter) = @_; if ($filter) { - my $ret = $filter->scrub($mime) or return; + my $ret = $filter->scrub($eml) or return; return if $ret == REJECT(); - $mime = $ret; + $eml = $ret; } - $im->add($mime) + $im->add($eml); } sub import_mbox { my ($self, $fh, $variant) = @_; - if ($variant !~ /\A(?:mboxrd|mboxo)\z/) { - die "variant must be 'mboxrd' or 'mboxo'\n"; - } + require PublicInbox::MboxReader; + my $cb = PublicInbox::MboxReader->reads($variant) or + die "$variant not supported\n"; my $im = $self->importer(1); - my $prev = undef; - my $msg = ''; - my $filter = $self->filter; - while (defined(my $l = <$fh>)) { - if ($l =~ /$from_strict/o) { - if (!defined($prev) || $prev =~ /^\r?$/) { - mb_add($im, $variant, $filter, \$msg) if $msg; - $msg = ''; - $prev = $l; - next; - } - warn "W[$.] $l\n"; - } - $prev = $l; - $msg .= $l; - } - mb_add($im, $variant, $filter, \$msg) if $msg; + $cb->(undef, $fh, \&_mbox_eml_cb, $im, $self->filter); $im->done; } -sub _read_git_config_perm { - my ($self) = @_; - chomp(my $perm = $self->git->qx('config', 'core.sharedRepository')); - $perm; -} - -sub _git_config_perm { - my $self = shift; - my $perm = scalar @_ ? $_[0] : _read_git_config_perm($self); - return PERM_UMASK if (!defined($perm) || $perm eq ''); - return PERM_UMASK if ($perm eq 'umask'); - return PERM_GROUP if ($perm eq 'group'); - if ($perm =~ /\A(?:all|world|everybody)\z/) { - return PERM_EVERYBODY; - } - return PERM_GROUP if ($perm =~ /\A(?:true|yes|on|1)\z/); - return PERM_UMASK if ($perm =~ /\A(?:false|no|off|0)\z/); - - my $i = oct($perm); - return PERM_UMASK if ($i == PERM_UMASK); - return PERM_GROUP if ($i == OLD_PERM_GROUP); - return PERM_EVERYBODY if ($i == OLD_PERM_EVERYBODY); - - if (($i & 0600) != 0600) { - die "core.sharedRepository mode invalid: ". - sprintf('%.3o', $i) . "\nOwner must have permissions\n"; - } - ($i & 0666); -} - -sub _umask_for { - my ($perm) = @_; # _git_config_perm return value - my $rv = $perm; - return umask if $rv == 0; - - # set +x bit if +r or +w were set - $rv |= 0100 if ($rv & 0600); - $rv |= 0010 if ($rv & 0060); - $rv |= 0001 if ($rv & 0006); - (~$rv & 0777); +sub cleanup ($) { + delete @{$_[0]}{qw(over mm git search)}; } -sub with_umask { - my ($self, $cb, @arg) = @_; - my $old = umask $self->{umask}; - my $rv = eval { $cb->(@arg) }; - my $err = $@; - umask $old; - die $err if $err; - $rv; +# v2+ only, XXX: maybe we can just rely on ->max_git_epoch and remove +sub git_dir_latest { + my ($self, $max) = @_; + defined($$max = $self->max_git_epoch) ? + "$self->{inboxdir}/git/$$max.git" : undef; } -sub umask_prepare { - my ($self) = @_; - my $perm = _git_config_perm($self); - my $umask = _umask_for($perm); - $self->{umask} = $umask; -} +# for unconfigured inboxes +sub detect_indexlevel ($) { + my ($ibx) = @_; -sub cleanup ($) { - delete @{$_[0]}{qw(over mm git search)}; -} + my $over = $ibx->over; + my $srch = $ibx->search; + delete @$ibx{qw(over search)}; # don't leave open FDs lying around -# warnings to ignore when handling spam mailboxes and maybe other places -sub warn_ignore { - my $s = "@_"; - # Email::Address::XS warnings - $s =~ /^Argument contains empty address at / - || $s =~ /^Element at index [0-9]+ contains / - # PublicInbox::MsgTime - || $s =~ /^bogus TZ offset: .+?, ignoring and assuming \+0000/ - || $s =~ /^bad Date: .+? in / -} + # brand new or never before indexed inboxes default to full + return 'full' unless $over; + my $l = 'basic'; + return $l unless $srch; + if (my $xdb = $srch->xdb) { + $l = 'full'; + my $m = $xdb->get_metadata('indexlevel'); + if ($m eq 'medium') { + $l = $m; + } elsif ($m ne '') { + warn <<""; +$ibx->{inboxdir} has unexpected indexlevel in Xapian: $m -# this expects to be RHS in this assignment: "local $SIG{__WARN__} = ..." -sub warn_ignore_cb { - my $cb = $SIG{__WARN__} // sub { print STDERR @_ }; - sub { - return if warn_ignore(@_); - $cb->(@_); + } + $ibx->{-skip_docdata} = 1 if $xdb->get_metadata('skip_docdata'); } + $l; } 1; diff --git a/lib/PublicInbox/Inotify.pm b/lib/PublicInbox/Inotify.pm new file mode 100644 index 00000000..c4f1ae84 --- /dev/null +++ b/lib/PublicInbox/Inotify.pm @@ -0,0 +1,47 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# wrap Linux::Inotify2 XS module, support pure Perl via `syscall' someday +package PublicInbox::Inotify; +use v5.12; +our @ISA; +BEGIN { # prefer pure Perl since it works out-of-the-box + my $isa; + for my $m (qw(PublicInbox::Inotify3 Linux::Inotify2)) { + eval "require $m"; + next if $@; + $isa = $m; + } + if ($isa) { + push @ISA, $isa; + my $buf = ''; + for (qw(IN_MOVED_TO IN_CREATE IN_DELETE IN_DELETE_SELF + IN_MOVE_SELF IN_MOVED_FROM IN_MODIFY)) { + $buf .= "*$_ = \\&PublicInbox::Inotify3::$_;\n"; + } + eval $buf; + die $@ if $@; + } else { + die <<EOM; +W: inotify syscall numbers unknown on your platform and +W: Linux::Inotify2 missing: $@ +W: public-inbox hackers welcome the plain-text output of ./devel/sysdefs-list +W: at meta\@public-inbox.org +EOM + } +}; + +sub new { + $_[0]->SUPER::new // do { + my $msg = $!{EMFILE} ? <<EOM : "$_[0]->new: $!\n"; +inotify_init/inotify_init1: $! +You may need to raise the `fs.inotify.max_user_instances' sysctl limit. +Consult your OS documentation and/or sysctl(8) + sysctl.conf(5) manpages. +EOM + $msg =~ s/^/E: /smg; + require Carp; + Carp::croak($msg); + } +} + +1; diff --git a/lib/PublicInbox/Inotify3.pm b/lib/PublicInbox/Inotify3.pm new file mode 100644 index 00000000..4f337a7a --- /dev/null +++ b/lib/PublicInbox/Inotify3.pm @@ -0,0 +1,115 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Implements most Linux::Inotify2 functionality we need in pure Perl +# Anonymous sub support isn't supported since it's expensive in the +# best case and likely leaky in older Perls (e.g. 5.16.3) +package PublicInbox::Inotify3; +use v5.12; +use autodie qw(open); +use PublicInbox::Syscall (); +use Carp; +use Scalar::Util (); + +# this fails if undefined no unsupported platforms +use constant $PublicInbox::Syscall::INOTIFY; +our %events; + +# extracted from devel/sysdefs-list output, these should be arch-independent +BEGIN { +%events = ( + IN_ACCESS => 0x1, + IN_ALL_EVENTS => 0xfff, + IN_ATTRIB => 0x4, + IN_CLOSE => 0x18, + IN_CLOSE_NOWRITE => 0x10, + IN_CLOSE_WRITE => 0x8, + IN_CREATE => 0x100, + IN_DELETE => 0x200, + IN_DELETE_SELF => 0x400, + IN_DONT_FOLLOW => 0x2000000, + IN_EXCL_UNLINK => 0x4000000, + IN_IGNORED => 0x8000, + IN_ISDIR => 0x40000000, + IN_MASK_ADD => 0x20000000, + IN_MODIFY => 0x2, + IN_MOVE => 0xc0, + IN_MOVED_FROM => 0x40, + IN_MOVED_TO => 0x80, + IN_MOVE_SELF => 0x800, + IN_ONESHOT => 0x80000000, + IN_ONLYDIR => 0x1000000, + IN_OPEN => 0x20, + IN_Q_OVERFLOW => 0x4000, + IN_UNMOUNT => 0x2000, +); +} # /BEGIN +use constant \%events; +require PublicInbox::In3Event; # uses %events +require PublicInbox::In3Watch; # uses SYS_inotify_rm_watch + +use constant autocancel => + (IN_IGNORED|IN_UNMOUNT|IN_ONESHOT|IN_DELETE_SELF); + +sub new { + open my $fh, '+<&=', syscall(SYS_inotify_init1, IN_CLOEXEC); + bless { fh => $fh }, __PACKAGE__; +} + +sub read { + my ($self) = @_; + my (@ret, $wd, $mask, $len, $name, $size, $buf); + my $r = sysread($self->{fh}, my $rbuf, 8192); + if ($r) { + while ($r) { + ($wd, $mask, undef, $len) = unpack('lLLL', $rbuf); + $size = 16 + $len; # 16: sizeof(struct inotify_event) + substr($rbuf, 0, 16, ''); + $name = $len ? unpack('Z*', substr($rbuf, 0, $len, '')) + : undef; + $r -= $size; + next if $self->{ignore}->{$wd}; + my $ev = bless [$mask, $name], 'PublicInbox::In3Event'; + push @ret, $ev; + if (my $w = $self->{w}->{$wd}) { + $ev->[2] = $w; + $w->cancel if $ev->mask & autocancel; + } elsif ($mask & IN_Q_OVERFLOW) { + carp 'E: IN_Q_OVERFLOW, too busy? (non-fatal)' + } else { + carp "BUG? wd:$wd unknown (non-fatal)"; + } + } + } elsif (defined($r) || ($!{EAGAIN} || $!{EINTR})) { + } else { + croak "inotify read: $!"; + } + delete $self->{ignore}; + @ret; +} + +sub fileno { CORE::fileno($_[0]->{fh}) } + +sub fh { $_[0]->{fh} } + +sub blocking { shift->{fh}->blocking(@_) } + +sub watch { + my ($self, $name, $mask, $cb) = @_; + croak "E: $cb not supported" if $cb; # too much memory + my $wd = syscall(SYS_inotify_add_watch, $self->fileno, $name, $mask); + return if $wd < 0; + my $w = bless [ $wd, $mask, $name, $self ], 'PublicInbox::In3Watch'; + $self->{w}->{$wd} = $w; + Scalar::Util::weaken($w->[3]); # ugh + $w; +} + +sub rm_watch { + my ($self, $wd) = @_; + delete $self->{w}->{$wd}; + $self->{ignore}->{$wd} = 1; # is this needed? + syscall(SYS_inotify_rm_watch, $self->fileno, $wd) < 0 ? undef : 1; +} + +1; diff --git a/lib/PublicInbox/InputPipe.pm b/lib/PublicInbox/InputPipe.pm new file mode 100644 index 00000000..ee5bda59 --- /dev/null +++ b/lib/PublicInbox/InputPipe.pm @@ -0,0 +1,52 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# for reading pipes, sockets, and TTYs off the DS event loop +package PublicInbox::InputPipe; +use v5.12; +use parent qw(PublicInbox::DS); +use PublicInbox::Syscall qw(EPOLLIN); + +sub consume { + my ($in, $cb, @args) = @_; + my $self = bless { cb => $cb, args => \@args }, __PACKAGE__; + eval { $self->SUPER::new($in, EPOLLIN) }; + if ($@) { # regular file (but not w/ select|IO::Poll backends) + $self->{-need_rq} = 1; + $self->requeue; + } elsif (-p _ || -S _) { # O_NONBLOCK for sockets and pipes + $in->blocking(0); + } + $self; +} + +sub close { # idempotent + my ($self) = @_; + $self->{-need_rq} ? delete($self->{sock}) : $self->SUPER::close +} + +sub event_step { + my ($self) = @_; + my $r = sysread($self->{sock} // return, my $rbuf, 65536); + eval { + if ($r) { + $self->{cb}->($self, @{$self->{args}}, $rbuf); + $self->requeue if $self->{-need_rq}; + } elsif (defined($r)) { # EOF + $self->{cb}->($self, @{$self->{args}}, ''); + $self->close + } elsif ($!{EAGAIN}) { # rely on EPOLLIN + } elsif ($!{EINTR}) { # rely on EPOLLIN for sockets/pipes + $self->requeue if $self->{-need_rq}; + } else { # another error + $self->{cb}->($self, @{$self->{args}}, undef); + $self->close; + } + }; + if ($@) { + warn "E: $@"; + $self->close; + } +} + +1; diff --git a/lib/PublicInbox/Isearch.pm b/lib/PublicInbox/Isearch.pm new file mode 100644 index 00000000..20808d6d --- /dev/null +++ b/lib/PublicInbox/Isearch.pm @@ -0,0 +1,141 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Provides everything the PublicInbox::Search object does; +# but uses global ExtSearch (->ALL) with an eidx_key query to +# emulate per-Inbox search using ->ALL. +package PublicInbox::Isearch; +use v5.12; +use PublicInbox::ExtSearch; +use PublicInbox::Search; + +sub new { + my (undef, $ibx, $es) = @_; + bless { es => $es, eidx_key => $ibx->eidx_key }, __PACKAGE__; +} + +sub _ibx_id ($) { + my ($self) = @_; + my $sth = $self->{es}->over->dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1 + + $sth->execute($self->{eidx_key}); + $sth->fetchrow_array // + die "E: `$self->{eidx_key}' not in $self->{es}->{topdir}\n"; +} + +sub query_approxidate { $_[0]->{es}->query_approxidate($_[1], $_[2]) } + +sub eidx_mset_prep ($$) { + my ($self, $opt) = @_; + my %opt = $opt ? %$opt : (); + $opt{eidx_key} = $self->{eidx_key}; + my $uid_range = $opt{uid_range} or return \%opt; + my ($beg, $end) = @$uid_range; + my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); + my $dbh = $self->{es}->over->dbh; + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT MIN(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ? + + $sth->execute($ibx_id, $beg, $end); + my @r = ($sth->fetchrow_array); + + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT MAX(docid) FROM xref3 WHERE ibx_id = ? AND xnum >= ? AND xnum <= ? + + $sth->execute($ibx_id, $beg, $end); + $r[1] = $sth->fetchrow_array; + if (defined($r[1]) && defined($r[0])) { + $opt{limit} = $r[1] - $r[0] + 1; + } else { + $r[1] //= $self->{es}->xdb->get_lastdocid; + $r[0] //= 0; + } + $opt{uid_range} = \@r; # these are fed to Xapian and SQLite + \%opt; +} + +sub mset { + my ($self, $str, $opt) = @_; + $self->{es}->mset($str, eidx_mset_prep $self, $opt); +} + +sub async_mset { + my ($self, $str, $opt, $cb, @args) = @_; + $opt = eidx_mset_prep $self, $opt; + $self->{es}->async_mset($str, $opt, $cb, @args); +} + +sub mset_to_artnums { + my ($self, $mset, $opt) = @_; + my $docids = PublicInbox::Search::mset_to_artnums($self->{es}, $mset); + my $ibx_id = $self->{-ibx_id} //= _ibx_id($self); + my $qmarks = join(',', map { '?' } @$docids); + if ($opt && ($opt->{relevance} // 0) == -1) { # -1 => ENQ_ASCENDING + my $range = ''; + my @r; + if (my $r = $opt->{uid_range}) { + $range = 'AND xnum >= ? AND xnum <= ?'; + @r = @$r; + } + return $self->{es}->over->dbh-> + selectcol_arrayref(<<"", undef, $ibx_id, @$docids, @r); +SELECT xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks) $range +ORDER BY xnum ASC + + } + + my $rows = $self->{es}->over->dbh-> + selectall_arrayref(<<"", undef, $ibx_id, @$docids); +SELECT docid,xnum FROM xref3 WHERE ibx_id = ? AND docid IN ($qmarks) + + my $i = -1; + my %order = map { $_ => ++$i } @$docids; + my @xnums; + for my $row (@$rows) { # @row = ($docid, $xnum) + my $idx = delete($order{$row->[0]}) // next; + $xnums[$idx] = $row->[1]; + } + if (scalar keys %order) { + warn "W: $self->{es}->{topdir} #", + join(', ', sort { $a <=> $b } keys %order), + " not mapped to `$self->{eidx_key}'\n"; + warn "W: $self->{es}->{topdir} may need to be reindexed\n"; + @xnums = grep { defined } @xnums; + } + \@xnums; +} + +sub mset_to_smsg { + my ($self, $ibx, $mset) = @_; # $ibx is a real inbox, not eidx + my $xnums = mset_to_artnums($self, $mset); + my $i = -1; + my %order = map { $_ => ++$i } @$xnums; + my $unordered = $ibx->over->get_all(@$xnums); + my @msgs; + for my $smsg (@$unordered) { + my $idx = delete($order{$smsg->{num}}) // do { + warn "W: $ibx->{inboxdir} #$smsg->{num}\n"; + next; + }; + $msgs[$idx] = $smsg; + } + if (scalar keys %order) { + warn "W: $ibx->{inboxdir} #", + join(', ', sort { $a <=> $b } keys %order), + " no longer valid\n"; + warn "W: $self->{es}->{topdir} may need to be reindexed\n"; + } + wantarray ? ($mset->get_matches_estimated, \@msgs) : \@msgs; +} + +sub has_threadid { 1 } + +sub help { $_[0]->{es}->help } + +sub xh_args { # prep getopt args to feed to xap_helper.h socket + my ($self, $opt) = @_; # TODO uid_range + ($self->{es}->xh_args, '-O', $self->{eidx_key}); +} + +1; diff --git a/lib/PublicInbox/KQNotify.pm b/lib/PublicInbox/KQNotify.pm index c7740df2..2efa887d 100644 --- a/lib/PublicInbox/KQNotify.pm +++ b/lib/PublicInbox/KQNotify.pm @@ -1,48 +1,35 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # implements the small subset of Linux::Inotify2 functionality we use # using IO::KQueue on *BSD systems. package PublicInbox::KQNotify; -use strict; +use v5.12; +use parent qw(PublicInbox::FakeInotify); use IO::KQueue; use PublicInbox::DSKQXS; # wraps IO::KQueue for fork-safe DESTROY -use PublicInbox::FakeInotify; -use Time::HiRes qw(stat); +use Errno qw(ENOENT); # NOTE_EXTEND detects rename(2), NOTE_WRITE detects link(2) sub MOVED_TO_OR_CREATE () { NOTE_EXTEND|NOTE_WRITE } sub new { my ($class) = @_; - bless { dskq => PublicInbox::DSKQXS->new, watch => {} }, $class; + bless { dskq => PublicInbox::DSKQXS->new }, $class; } sub watch { my ($self, $path, $mask) = @_; - my ($fh, $watch); - if (-d $path) { - opendir($fh, $path) or return; - my @st = stat($fh); - $watch = bless [ $fh, $path, $st[10] ], - 'PublicInbox::KQNotify::Watchdir'; - } else { - open($fh, '<', $path) or return; - $watch = bless [ $fh, $path ], - 'PublicInbox::KQNotify::Watch'; - } - my $ident = fileno($fh); - $self->{dskq}->{kq}->EV_SET($ident, # ident + my $dir_delete = $mask & NOTE_DELETE ? 1 : 0; + my $w = $self->watch_open($path, \$dir_delete) or return; + $w->[2] = pop @$w; # ctime is unused by this subclass + my $ident = fileno($w->[2]) // die "BUG: bad fileno $w->[2]: $!"; + $self->{dskq}->{kq}->EV_SET($ident, # ident (fd) EVFILT_VNODE, # filter EV_ADD | EV_CLEAR, # flags $mask, # fflags - 0, 0); # data, udata - if ($mask == NOTE_WRITE || $mask == MOVED_TO_OR_CREATE) { - $self->{watch}->{$ident} = $watch; - } else { - die "TODO Not implemented: $mask"; - } - $watch; + 0, $dir_delete); # data, udata + $self->{watch}->{$ident} = $w; } # emulate Linux::Inotify::fileno @@ -59,38 +46,31 @@ sub blocking {} # behave like Linux::Inotify2->read sub read { my ($self) = @_; - my @kevents = $self->{dskq}->{kq}->kevent(0); my $events = []; - for my $kev (@kevents) { + for my $kev ($self->{dskq}->{kq}->kevent(0)) { my $ident = $kev->[KQ_IDENT]; - my $mask = $kev->[KQ_FFLAGS]; - my ($dh, $path, $old_ctime) = @{$self->{watch}->{$ident}}; - if (!defined($old_ctime)) { - push @$events, - bless(\$path, 'PublicInbox::FakeInotify::Event') - } elsif ($mask & MOVED_TO_OR_CREATE) { - my @new_st = stat($path) or next; - $self->{watch}->{$ident}->[3] = $new_st[10]; # ctime - rewinddir($dh); - PublicInbox::FakeInotify::on_new_files($events, $dh, - $path, $old_ctime); + my $w = $self->{watch}->{$ident} or next; + if (!@$w) { # cancelled + delete($self->{watch}->{$ident}); + next; + } + my $dir_delete = $kev->[KQ_UDATA]; + my ($old_dev, $old_ino, $fh, $path) = @$w; + my @new_st = stat($path); + warn "W: stat($path): $!\n" if !@new_st && $! != ENOENT; + if (!@new_st || "$old_dev $old_ino" ne "@new_st[0,1]") { + push(@$events, $self->gone($ident, $path)); + next; + } + if (-d _) { + rewinddir($fh); + $self->on_dir_change($events, $fh, $path, $dir_delete); + } else { + push @$events, bless(\$path, + 'PublicInbox::FakeInotify::Event'); } } @$events; } -package PublicInbox::KQNotify::Watch; -use strict; - -sub name { $_[0]->[1] } - -sub cancel { close $_[0]->[0] or die "close: $!" } - -package PublicInbox::KQNotify::Watchdir; -use strict; - -sub name { $_[0]->[1] } - -sub cancel { closedir $_[0]->[0] or die "closedir: $!" } - 1; diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm new file mode 100644 index 00000000..e9a0de6c --- /dev/null +++ b/lib/PublicInbox/LEI.pm @@ -0,0 +1,1605 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Backend for `lei' (local email interface). Unlike the C10K-oriented +# PublicInbox::Daemon, this is designed exclusively to handle trusted +# local clients with read/write access to the FS and use as many +# system resources as the local user has access to. +package PublicInbox::LEI; +use v5.12; +use parent qw(PublicInbox::DS PublicInbox::LeiExternal + PublicInbox::LeiQuery); +use autodie qw(bind chdir open pipe socket socketpair syswrite unlink); +use Getopt::Long (); +use Socket qw(AF_UNIX SOCK_SEQPACKET pack_sockaddr_un); +use Errno qw(EPIPE EAGAIN ECONNREFUSED ENOENT ECONNRESET); +use Cwd qw(getcwd); +use POSIX qw(strftime); +use IO::Handle (); +use Fcntl qw(SEEK_SET); +use PublicInbox::Config; +use PublicInbox::Syscall qw(EPOLLIN); +use PublicInbox::Spawn qw(run_wait popen_rd run_qx); +use PublicInbox::Lock; +use PublicInbox::Eml; +use PublicInbox::Import; +use PublicInbox::ContentHash qw(git_sha); +use PublicInbox::OnDestroy; +use PublicInbox::IPC; +use Time::HiRes qw(stat); # ctime comparisons for config cache +use File::Path (); +use File::Spec; +use Carp qw(carp); +use Sys::Syslog qw(openlog syslog closelog); +our $quit = \&CORE::exit; +our ($current_lei, $errors_log, $listener, $oldset, $dir_idle); +my $GLP = Getopt::Long::Parser->new; +$GLP->configure(qw(gnu_getopt no_ignore_case auto_abbrev)); +my $GLP_PASS = Getopt::Long::Parser->new; +$GLP_PASS->configure(qw(gnu_getopt no_ignore_case auto_abbrev pass_through)); + +our (%PATH2CFG, # persistent for socket daemon +$MDIR2CFGPATH, # location => { /path/to/config => [ ino watches ] } +$OPT, # shared between optparse and opt_dash callback (for Getopt::Long) +$daemon_pid +); + +# TBD: this is a documentation mechanism to show a subcommand +# (may) pass options through to another command: +sub pass_through { $GLP_PASS } + +sub opt_dash ($$) { # callback runs inside optparse + my ($spec, $re_str) = @_; # 'limit|n=i', '([0-9]+)' + my ($key) = ($spec =~ m/\A([a-z]+)/g); + my $cb = sub { # Getopt::Long "<>" catch-all handler + my ($arg) = @_; + if ($arg =~ /\A-($re_str)\z/) { + $OPT->{$key} = $1; + } elsif ($arg eq '--') { # "--" arg separator, ignore first + push @{$OPT->{-argv}}, $arg if $OPT->{'--'}++; + # lone (single) dash is handled elsewhere + } elsif (substr($arg, 0, 1) eq '-') { + if ($OPT->{'--'}) { + push @{$OPT->{-argv}}, $arg; + } else { + die "bad argument: $arg\n"; + } + } else { + push @{$OPT->{-argv}}, $arg; + } + }; + ($spec, '<>' => $cb, $GLP_PASS) # for Getopt::Long +} + +# rel2abs preserves symlinks in parent, unlike abs_path +sub rel2abs { + my ($self, $p) = @_; + if (index($p, '/') == 0) { # already absolute + $p =~ tr!/!/!s; # squeeze redundant slashes + chop($p) if substr($p, -1, 1) eq '/'; + return $p; + } + my $pwd = $self->{env}->{PWD}; + if (defined $pwd) { + if (my @st_pwd = stat($pwd)) { + my @st_cwd = stat($self->{3}) or die "stat({3}): $!"; + "@st_pwd[1,0]" eq "@st_cwd[1,0]" or + $self->{env}->{PWD} = $pwd = undef; + } else { # PWD was invalid + $self->{env}->{PWD} = $pwd = undef; + } + } + $pwd //= $self->{env}->{PWD} = getcwd() // die "getcwd: $!"; + File::Spec->rel2abs($p, $pwd); +} + +# abs_path resolves symlinks in parent iff all parents exist +sub abs_path { Cwd::abs_path($_[1]) // rel2abs(@_) } + +sub canonpath_harder { + my $p = $_[-1]; # $_[0] may be self + $p = File::Spec->canonpath($p); + $p =~ m!(?:/*|\A)\.\.(?:/*|\z)! && -e $p ? Cwd::abs_path($p) : $p; +} + +sub share_path ($) { # $HOME/.local/share/lei/$FOO + my ($self) = @_; + rel2abs($self, ($self->{env}->{XDG_DATA_HOME} // + ($self->{env}->{HOME} // '/nonexistent').'/.local/share') + .'/lei'); +} + +sub store_path ($) { share_path($_[0]) . '/store' } + +sub _config_path ($) { + my ($self) = @_; + rel2abs($self, ($self->{env}->{XDG_CONFIG_HOME} // + ($self->{env}->{HOME} // '/nonexistent').'/.config') + .'/lei/config'); +} + +sub cache_dir ($) { + my ($self) = @_; + rel2abs($self, ($self->{env}->{XDG_CACHE_HOME} // + ($self->{env}->{HOME} // '/nonexistent').'/.cache') + .'/lei'); +} + +sub url_folder_cache { + my ($self) = @_; + require PublicInbox::SharedKV; # URI => updated_at_sec_ + PublicInbox::SharedKV->new(cache_dir($self).'/uri_folder'); +} + +sub ale { + my ($self) = @_; + $self->{ale} // do { + require PublicInbox::LeiALE; + my $cfg = $self->_lei_cfg(1); + $self->{ale} = $cfg->{ale} //= PublicInbox::LeiALE->new($self); + }; +} + +sub index_opt { + # TODO: drop underscore variants everywhere, they're undocumented + qw(fsync|sync! jobs|j=i indexlevel|L=s compact + max_size|max-size=s sequential-shard + batch_size|batch-size=s skip-docdata) +} + +my @c_opt = qw(c=s@ C=s@ quiet|q); +my @net_opt = (qw(no-torsocks torsocks=s), PublicInbox::LeiQuery::curl_opt()); +my @lxs_opt = qw(remote! local! external! include|I=s@ exclude=s@ only|O=s@ + import-remote!); + +# we don't support -C as an alias for --find-copies since it's already +# used for chdir +our @diff_opt = qw(unified|U=i output-indicator-new=s output-indicator-old=s + output-indicator-context=s indent-heuristic! + minimal patience histogram anchored=s@ diff-algorithm=s + color-moved:s color-moved-ws=s no-color-moved no-color-moved-ws + word-diff:s word-diff-regex=s color-words:s no-renames + rename-empty! check ws-error-highlight=s full-index binary + abbrev:i break-rewrites|B:s find-renames|M:s find-copies:s + find-copies-harder irreversible-delete|D l=i diff-filter=s + S=s G=s find-object=s pickaxe-all pickaxe-regex R + relative:s text|a ignore-cr-at-eol ignore-space-at-eol + ignore-space-change|b ignore-all-space|w ignore-blank-lines + inter-hunk-context=i function-context|W exit-code ext-diff + no-ext-diff textconv! src-prefix=s dst-prefix=s no-prefix + line-prefix=s); + +# we generate shell completion + help using %CMD and %OPTDESC, +# see lei__complete() and PublicInbox::LeiHelp +# command => [ positional_args, 1-line description, Getopt::Long option spec ] +our %CMD = ( # sorted in order of importance/use: +'q' => [ '--stdin|SEARCH_TERMS...', 'search for messages matching terms', + 'stdin|', # /|\z/ must be first for lone dash + @lxs_opt, @net_opt, + qw(save! output|mfolder|o=s format|f=s dedupe|d=s threads|t+ + thread-id|T=s + sort|s=s reverse|r offset=i pretty jobs|j=s globoff|g augment|a + import-before! lock=s@ rsyncable alert=s@ mua=s verbose|v+ + shared color! mail-sync!), @c_opt, opt_dash('limit|n=i', '[0-9]+') ], + +'up' => [ 'OUTPUT...|--all', 'update saved search', + qw(jobs|j=s lock=s@ alert=s@ mua=s verbose|v+ exclude=s@ + remote-fudge-time=s all:s remote! local! external!), @net_opt, @c_opt ], + +'lcat' => [ '--stdin|MSGID_OR_URL...', 'display local copy of message(s)', + 'stdin|', # /|\z/ must be first for lone dash + # some of these options are ridiculous for lcat + @lxs_opt, @net_opt, + qw(output|mfolder|o=s format|f=s dedupe|d=s threads|t+ + sort|s=s reverse|r offset=i jobs|j=s globoff|g augment|a + import-before! lock=s@ rsyncable alert=s@ mua=s verbose|v+ + color!), @c_opt, opt_dash('limit|n=i', '[0-9]+') ], + +'blob' => [ 'OID', 'show a git blob, reconstructing from mail if necessary', + qw(git-dir=s@ cwd! verbose|v+ mail! oid-a|A=s path-a|a=s path-b|b=s), + @lxs_opt, @net_opt, @c_opt ], + +'rediff' => [ '--stdin|LOCATION...', + 'regenerate a diff with different options', + 'stdin|', # /|\z/ must be first for lone dash + qw(git-dir=s@ cwd! verbose|v+ color:s no-color drq:1 dequote-only:1 + order-file=s), @diff_opt, @lxs_opt, @net_opt, @c_opt ], + +'mail-diff' => [ '--stdin|LOCATION...', 'diff the contents of emails', + 'stdin|', # /|\z/ must be first for lone dash + qw(verbose|v+ in-format|F=s color:s no-color raw-header), + @diff_opt, @net_opt, @c_opt ], + +'add-external' => [ 'LOCATION', + 'add/set priority of a publicinbox|extindex for extra matches', + qw(boost=i mirror=s inbox-version=i epoch=s verbose|v+), + @c_opt, index_opt(), @net_opt ], +'ls-external' => [ '[FILTER]', 'list publicinbox|extindex locations', + qw(format|f=s z|0 globoff|g invert-match|v local remote), @c_opt ], +'ls-label' => [ '', 'list labels', qw(z|0 stats:s), @c_opt ], +'ls-mail-sync' => [ '[FILTER]', 'list mail sync folders', + qw(z|0 globoff|g invert-match|v local remote), @c_opt ], +'ls-mail-source' => [ 'URL', 'list IMAP or NNTP mail source folders', + qw(z|0 ascii l pretty url), @net_opt, @c_opt ], +'forget-external' => [ 'LOCATION...|--prune', + 'exclude further results from a publicinbox|extindex', + qw(prune), @c_opt ], + +'ls-search' => [ '[PREFIX]', 'list saved search queries', + qw(format|f=s pretty l ascii z|0), @c_opt ], +'forget-search' => [ 'OUTPUT...|--prune', 'forget a saved search', + qw(verbose|v+ prune:s), @c_opt ], +'edit-search' => [ 'OUTPUT', "edit saved search via `git config --edit'", + @c_opt ], +'rm' => [ '--stdin|LOCATION...', + 'remove a message from the index and prevent reindexing', + 'stdin|', # /|\z/ must be first for lone dash + qw(in-format|F=s lock=s@ commit-delay=i), @net_opt, @c_opt ], +'plonk' => [ '--threads|--from=IDENT', + 'exclude mail matching From: or threads from non-Message-ID searches', + qw(stdin| threads|t from|f=s mid=s oid=s), @c_opt ], +tag => [ 'KEYWORDS... LOCATION...|--stdin', + 'set/unset keywords and/or labels on message(s)', + qw(stdin| in-format|F=s input|i=s@ oid=s@ mid=s@ commit-delay=i), + @net_opt, @c_opt, pass_through('-kw:foo for delete') ], + +'purge-mailsource' => [ 'LOCATION|--all', + 'remove imported messages from IMAP, Maildirs, and MH', + qw(exact! all jobs:i indexed), @c_opt ], + +'add-watch' => [ 'LOCATION... [LABELS...]', + 'watch for new messages and flag changes', + qw(poll-interval=s state=s recursive|r), @c_opt ], +'rm-watch' => [ 'LOCATION...', 'remove specified watch(es)', + qw(recursive|r), @c_opt ], +'ls-watch' => [ '[FILTER...]', 'list active watches with numbers and status', + qw(l z|0), @c_opt ], +'pause-watch' => [ '[WATCH_NUMBER_OR_FILTER]', qw(all local remote), @c_opt ], +'resume-watch' => [ '[WATCH_NUMBER_OR_FILTER]', qw(all local remote), @c_opt ], +'forget-watch' => [ '{WATCH_NUMBER|--prune}', 'stop and forget a watch', + qw(prune), @c_opt ], + +'reindex' => [ '', 'reindex all locally-indexed messages', @c_opt ], + +'index' => [ 'LOCATION... [LABELS...]', 'one-time index from URL or filesystem', + qw(in-format|F=s kw! offset=i recursive|r exclude=s include|I=s + verbose|v+ incremental!), @net_opt, # mainly for --proxy= + @c_opt ], +import => [ 'LOCATION...|--stdin [LABELS...]', + 'one-time import/update from URL or filesystem', + qw(stdin| offset=i recursive|r exclude=s include|I=s new-only + lock=s@ in-format|F=s kw! verbose|v+ incremental! mail-sync! + commit-delay=i sort|s:s@), + @net_opt, @c_opt ], +'forget-mail-sync' => [ 'LOCATION...', + 'forget sync information for a mail folder', @c_opt ], +'refresh-mail-sync' => [ 'LOCATION...|--all', + 'prune dangling sync data for a mail folder', 'all:s', + @net_opt, @c_opt ], +'export-kw' => [ 'LOCATION...|--all', + 'one-time export of keywords of sync sources', + qw(all:s mode=s), @net_opt, @c_opt ], +'convert' => [ 'LOCATION...|--stdin', + 'one-time conversion from URL or filesystem to another format', + qw(stdin| in-format|F=s out-format|f=s output|mfolder|o=s lock=s@ kw! + rsyncable sort|s:s@), + @net_opt, @c_opt ], +'p2q' => [ 'LOCATION_OR_COMMIT...|--stdin', + "use a patch to generate a query for `lei q --stdin'", + qw(stdin| in-format|F=s want|w=s@ uri debug), @net_opt, @c_opt ], +'config' => [ '[...]', sub { + 'git-config(1) wrapper for '._config_path($_[0]). "\n" . + '-l/--list and other common git-config uses are supported' + }, qw(config-file|system|global|file|f=s), # for conflict detection + qw(edit|e c=s@ C=s@), pass_through('git config') ], +'inspect' => [ 'ITEMS...|--stdin', 'inspect lei/store and/or local external', + qw(stdin| pretty ascii dir|d=s), @c_opt ], + +'init' => [ '[DIRNAME]', sub { + "initialize storage, default: ".store_path($_[0]); + }, @c_opt ], +'daemon-kill' => [ '[-SIGNAL]', 'signal the lei-daemon', + # "-C DIR" conflicts with -CHLD, here, and chdir makes no sense, here + opt_dash('signal|s=s', '[0-9]+|(?:[A-Z][A-Z0-9]+)') ], +'daemon-pid' => [ '', 'show the PID of the lei-daemon' ], +'help' => [ '[SUBCOMMAND]', 'show help' ], + +# TODO +#'reorder-local-store-and-break-history' => [ '[REFNAME]', +# 'rewrite git history in an attempt to improve compression', +# qw(gc!), @c_opt ], +#'fuse-mount' => [ 'PATHNAME', 'expose lei/store as Maildir(s)', @c_opt ], +# +# internal commands are prefixed with '_' +'_complete' => [ '[...]', 'internal shell completion helper', + pass_through('everything') ], +); # @CMD + +# switch descriptions, try to keep consistent across commands +# $spec: Getopt::Long option specification +# $spec => [@ALLOWED_VALUES (default is first), $description], +# $spec => $description +# "$SUB_COMMAND TAB $spec" => as above +my $stdin_formats = [ 'MAIL_FORMAT|eml|mboxrd|mboxcl2|mboxcl|mboxo', + 'specify message input format' ]; +my $ls_format = [ 'OUT|plain|json|null', 'listing output format' ]; +my $sort_out = [ 'VAL|received|relevance|docid', + "order of results is `--output'-dependent"]; +my $sort_in = [ 'sequence|mtime|size', 'sort input (format-dependent)' ]; + +# we use \x{a0} (non-breaking SP) to avoid wrapping in PublicInbox::LeiHelp +my %OPTDESC = ( +'help|h' => 'show this built-in help', +'c=s@' => [ 'NAME=VALUE', 'set config option' ], +'C=s@' => [ 'DIR', 'chdir to specify to directory' ], +'quiet|q' => 'be quiet', +'lock=s@' => [ 'METHOD|dotlock|fcntl|flock|none', + 'mbox(5) locking method(s) to use (default: fcntl,dotlock)' ], + +'incremental! import' => 'import already seen IMAP and NNTP articles', +'globoff|g' => "do not match locations using '*?' wildcards ". + "and\xa0'[]'\x{a0}ranges", +'invert-match|v' => 'select non-matching lines', +'color!' => 'disable color (for --format=text)', +'verbose|v+' => 'be more verbose', +'external!' => 'do not use externals', +'mail!' => 'do not look in mail storage for OID', +'cwd!' => 'do not look in git repo of current working directory', +'oid-a|A=s' => 'pre-image OID', +'path-a|a=s' => 'pre-image pathname associated with OID', +'path-b|b=s' => 'post-image pathname associated with OID', +'git-dir=s@' => 'additional git repository to scan', +'dir|d=s inspect' => + 'specify a inboxdir, extindex topdir or Xapian shard', +'proxy=s' => [ 'PROTO://HOST[:PORT]', # shared with curl(1) + "proxy for (e.g. `socks5h://0:9050')" ], +'torsocks=s' => ['VAL|auto|no|yes', + 'whether or not to wrap git and curl commands with torsocks'], +'no-torsocks' => 'alias for --torsocks=no', +'save!' => "do not save a search for `lei up'", +'import-remote!' => 'do not memoize remote messages into local store', +'import-before!' => 'do not import before writing to output (DANGEROUS)', + +'type=s' => [ 'any|mid|git', 'disambiguate type' ], + +'dedupe|d=s' => ['STRATEGY|content|oid|mid|none', + 'deduplication strategy'], +'threads|t+' => + 'return all messages in the same threads as the actual match(es)', + +'want|w=s@' => [ 'PREFIX|dfpost|dfn', # common ones in help... + 'search prefixes to extract (default: dfpost7)' ], +'uri p2q' => [ 'URI escape output' ], + +'alert=s@' => ['CMD,:WINCH,:bell,<any command>', + 'run command(s) or perform ops when done writing to output ' . + '(default: ":WINCH,:bell" with --mua and Maildir/IMAP output, ' . + 'nothing otherwise)' ], + +'augment|a' => 'augment --output destination instead of clobbering', + +'output|mfolder|o=s' => [ 'MFOLDER', + "destination (e.g.\xa0`/path/to/Maildir', ". + "or\xa0`-'\x{a0}for\x{a0}stdout)" ], +'mua=s' => [ 'CMD', + "MUA to run on --output Maildir or mbox (e.g.\xa0`mutt\xa0-f\xa0%f')" ], +'new-only import' => 'only import new messages from IMAP source', + +'inbox-version=i' => [ 'NUM|1|2', + 'force a public-inbox version with --mirror'], +'mirror=s' => [ 'URL', 'mirror a public-inbox'], + +# public-inbox-index options +'fsync!' => 'speed up indexing after --mirror, risk index corruption', +'compact' => 'run compact index after mirroring', +'indexlevel|L=s' => [ 'LEVEL|full|medium|basic', + "indexlevel with --mirror (default: full)" ], +'max_size|max-size=s' => [ 'SIZE', + 'do not index messages larger than SIZE (default: infinity)' ], +'batch_size|batch-size=s' => [ 'SIZE', + 'flush changes to OS after given number of bytes (default: 1m)' ], +'sequential-shard' => + 'index Xapian shards sequentially for slow storage', +'skip-docdata' => + 'drop compatibility w/ public-inbox <1.6 to save ~1.5% space', + +'format|f=s q' => [ + 'OUT|maildir|mboxrd|mboxcl2|mboxcl|mboxo|html|json|jsonl|concatjson', + 'specify output format, default depends on --output'], +'exclude=s@ q' => [ 'LOCATION', + 'exclude specified external(s) from search' ], +'include|I=s@ q' => [ 'LOCATION', + 'include specified external(s) in search' ], +'only|O=s@ q' => [ 'LOCATION', + 'only use specified external(s) for search' ], +'jobs|j=s' => [ 'JOBSPEC', + 'control number of query and writer jobs' . + "integers delimited by `,', either of which may be omitted" + ], +'jobs|j=i add-external' => 'set parallelism when indexing after --mirror', + +'in-format|F=s' => $stdin_formats, +'format|f=s ls-search' => ['OUT|json|jsonl|concatjson', + 'listing output format' ], +'l ls-search' => 'long listing format', +'l ls-watch' => 'long listing format', +'l ls-mail-source' => 'long listing format', +'url ls-mail-source' => 'show full URL of newsgroup or IMAP folder', +'format|f=s ls-external' => $ls_format, + +'prune:s forget-search' => + ['TYPE|local|remote', 'prune all, remote or local folders' ], + +'limit|n=i@' => ['NUM', 'limit on number of matches (default: 10000)' ], +'offset=i' => ['OFF', 'search result offset (default: 0)'], + +'sort|s=s q' => $sort_out, +'sort|s=s lcat' => $sort_out, +'sort|s:s@ convert' => $sort_in, +'sort|s:s@ import' => $sort_in, +'reverse|r' => 'reverse search results', # like sort(1) + +'boost=i' => 'increase/decrease priority of results (default: 0)', + +'local' => 'limit operations to the local filesystem', +'local!' => 'exclude results from the local filesystem', +'remote' => 'limit operations to those requiring network access', +'remote!' => 'prevent operations requiring network access', + +# up, refresh-mail-sync, export-kw +'all:s' => ['TYPE|local|remote', 'all remote or local folders' ], + +'remote-fudge-time=s' => [ 'INTERVAL', + 'look for mail INTERVAL older than the last successful query' ], + +'mid=s' => 'specify the Message-ID of a message', +'oid=s' => 'specify the git object ID of a message', + +'recursive|r' => 'scan directories/mailboxes/newsgroups recursively', +'exclude=s' => 'exclude mailboxes/newsgroups based on pattern', +'include=s' => 'include mailboxes/newsgroups based on pattern', + +'exact' => 'operate on exact header matches only', +'exact!' => 'rely on content match instead of exact header matches', + +'by-mid|mid:s' => [ 'MID', 'match only by Message-ID, ignoring contents' ], + +'kw!' => 'disable/enable importing keywords (aka "flags")', + +# xargs, env, use "-0", git(1) uses "-z". We support z|0 everywhere +'z|0' => 'use NUL \\0 instead of newline (CR) to delimit lines', + +'signal|s=s' => [ 'SIG', 'signal to send lei-daemon (default: TERM)' ], +'edit|e config' => 'open an editor to modify the lei config file', +); # %OPTDESC + +my %CONFIG_KEYS = ( + 'leistore.dir' => 'top-level storage location', +); + +my @WQ_KEYS = qw(lxs l2m ikw pmd wq1 lne v2w); # internal workers + +sub _drop_wq { + my ($self) = @_; + for my $wq (grep(defined, delete(@$self{@WQ_KEYS}))) { + $wq->wq_kill(-POSIX::SIGTERM()); + $wq->DESTROY; + } +} + +# pronounced "exit": x_it(1 << 8) => exit(1); x_it(13) => SIGPIPE +sub x_it ($$) { + my ($self, $code) = @_; + # make sure client sees stdout before exit + $self->{1}->autoflush(1) if $self->{1}; + stop_pager($self); + if ($self->{pkt_op_p}) { # worker => lei-daemon + $self->{pkt_op_p}->pkt_do('x_it', $code); + exit($code >> 8) if $$ != $daemon_pid; + } elsif ($self->{sock}) { # lei->daemon => lei(1) client + send($self->{sock}, "x_it $code", 0); + } elsif ($quit == \&CORE::exit) { # an admin (one-shot) command + exit($code >> 8); + } # else ignore if client disconnected + $self->dclose if $$ == $daemon_pid; +} + +sub err ($;@) { + my $self = shift; + my $err = $self->{2} // ($self->{pgr} // [])->[2] // *STDERR{GLOB}; + my @eor = (substr($_[-1]//'', -1, 1) eq "\n" ? () : ("\n")); + print $err @_, @eor and return; + my $old_err = delete $self->{2}; + $old_err->close if $! == EPIPE && $old_err; + $err = $self->{2} = ($self->{pgr} // [])->[2] // *STDERR{GLOB}; + print $err @_, @eor or print STDERR @_, @eor; +} + +sub qerr ($;@) { $_[0]->{opt}->{quiet} or err(shift, @_) } + +sub qfin { # show message on finalization (LeiFinmsg) + my ($lei, $msg) = @_; + return if $lei->{opt}->{quiet}; + $lei->{fmsg} ? push(@{$lei->{fmsg}}, "$msg\n") : qerr($lei, $msg); +} + +sub fail_handler ($;$$) { + my ($lei, $code, $io) = @_; + $io->close if $io; # needed to avoid warnings on SIGPIPE + _drop_wq($lei); + x_it($lei, $code // (1 << 8)); +} + +sub sigpipe_handler { # handles SIGPIPE from @WQ_KEYS workers + fail_handler($_[0], 13, delete $_[0]->{1}); +} + +sub fail ($;@) { + my ($lei, @msg) = @_; + my $exit_code = ($msg[0]//'') =~ /\A-?[0-9]+\z/ ? shift(@msg) : undef; + local $current_lei = $lei; + $lei->{failed}++; + if (@msg) { + push @msg, "\n" if substr($msg[-1], -1, 1); + warn @msg; + } + $lei->{pkt_op_p}->pkt_do('fail_handler') if $lei->{pkt_op_p}; + x_it($lei, $exit_code // (1 << 8)); + undef; +} + +sub out ($;@) { + my $self = shift; + return if print { $self->{1} // return } @_; # likely + return note_sigpipe($self, 1) if $! == EPIPE; + my $err = "error writing to output: $!"; + delete $self->{1}; + fail($self, $err); +} + +sub puts ($;@) { out(shift, map { "$_\n" } @_) } + +sub child_error { # passes non-fatal curl exit codes to user + my ($self, $child_error, $msg) = @_; # child_error is $? + local $current_lei = $self; + $child_error ||= 1 << 8; + warn(substr($msg, -1, 1) eq "\n" ? $msg : "$msg\n") if defined $msg; + $self->{child_error} ||= $child_error; + if ($self->{pkt_op_p}) { # to top lei-daemon + $self->{pkt_op_p}->pkt_do('child_error', $child_error); + } elsif ($self->{sock}) { # to lei(1) client + send($self->{sock}, "child_error $child_error", 0); + } # else noop if client disconnected +} + +sub note_sigpipe { # triggers sigpipe_handler + my ($self, $fd) = @_; + delete($self->{$fd})->close; # explicit close silences Perl warning + $self->{pkt_op_p}->pkt_do('sigpipe_handler') if $self->{pkt_op_p}; + x_it($self, 13); +} + +sub _lei_atfork_child { + my ($self, $persist) = @_; + # we need to explicitly close things which are on stack + my $cfg = $self->{cfg}; + delete @$cfg{qw(-watches -lei_note_event)}; + if ($persist) { + open $self->{3}, '<', '/'; + fchdir($self); + close($_) for (grep(defined, delete @$self{qw(0 1 2 sock)})); + delete $cfg->{-lei_store}; + } else { # worker, Net::NNTP (Net::Cmd) uses STDERR directly + open STDERR, '+>&='.fileno($self->{2}); # idempotent w/ fileno + STDERR->autoflush(1); + $self->{2} = \*STDERR; + POSIX::setpgid(0, $$) // die "setpgid(0, $$): $!"; + } + close($_) for (grep(defined, delete @$self{qw(old_1 au_done)})); + close($_) for (@{delete($self->{-socks}) // []}); + if (my $op_c = delete $self->{pkt_op_c}) { + close(delete $op_c->{sock}); + } + if (my $pgr = delete $self->{pgr}) { + close($_) for (@$pgr[1,2]); + } + close $listener if $listener; + undef $listener; + $dir_idle->force_close if $dir_idle; + undef $dir_idle; + %PATH2CFG = (); + $MDIR2CFGPATH = undef; + eval 'no warnings; undef $PublicInbox::LeiNoteEvent::to_flush'; + undef $errors_log; + $quit = \&CORE::exit; + if (!$self->{-eml_noisy}) { # only "lei import" sets this atm + my $cb = $SIG{__WARN__} // \&CORE::warn; + $SIG{__WARN__} = sub { + $cb->(@_) unless PublicInbox::Eml::warn_ignore(@_) + }; + } + $SIG{TERM} = sub { exit(128 + 15) }; + $current_lei = $persist ? undef : $self; # for SIG{__WARN__} +} + +sub _delete_pkt_op { # OnDestroy callback to prevent leaks on die + my ($self) = @_; + if (my $op = delete $self->{pkt_op_c}) { # in case of die + $op->close; # PublicInbox::PktOp::close + } + my $pkt_op_p = delete($self->{pkt_op_p}) or return; + close $pkt_op_p->{op_p}; +} + +sub pkt_op_pair { + my ($self) = @_; + require PublicInbox::PktOp; + my $end = on_destroy \&_delete_pkt_op, $self; + @$self{qw(pkt_op_c pkt_op_p)} = PublicInbox::PktOp->pair; + $end; +} + +sub incr { + my $lei = shift; + $lei->{incr_pid} = $$ if @_; + while (my ($f, $n) = splice(@_, 0, 2)) { $lei->{$f} += $n } +} + +sub pkt_ops { + my ($lei, $ops) = @_; + $ops->{fail_handler} = [ $lei ]; + $ops->{sigpipe_handler} = [ $lei ]; + $ops->{x_it} = [ $lei ]; + $ops->{child_error} = [ $lei ]; + $ops->{incr} = [ $lei ]; + $ops; +} + +sub workers_start { + my ($lei, $wq, $jobs, $ops, $flds) = @_; + $ops //= {}; + ($wq->can('net_merge_all_done') && $lei->{auth}) and + $lei->{auth}->op_merge($ops, $wq, $lei); + pkt_ops($lei, $ops); + $ops->{''} //= [ $wq->can('_lei_wq_eof') || \&wq_eof, $lei ]; + my $end = $lei->pkt_op_pair; + my $ident = $wq->{-wq_ident} // "lei-$lei->{cmd} worker"; + $flds->{lei} = $lei; + $wq->wq_workers_start($ident, $jobs, $lei->oldset, $flds, + $wq->can('_wq_done_wait') // \&wq_done_wait, $lei); + delete $lei->{pkt_op_p}; + my $op_c = delete $lei->{pkt_op_c}; + @$end = (); + $lei->event_step_init; + ($op_c, $ops); +} + +# call this when we're ready to wait on events and yield to other clients +sub wait_wq_events { + my ($lei, $op_c, $ops) = @_; + my $wq1 = $lei->{wq1}; + ($wq1 && $wq1->can('net_merge_all_done') && !$lei->{auth}) and + $wq1->net_merge_all_done; + for my $wq (grep(defined, @$lei{qw(ikw pmd)})) { # auxiliary WQs + $wq->wq_close; + } + $wq1->{lei_sock} = $lei->{sock} if $wq1; + $op_c->{ops} = $ops; +} + +sub wq1_start { + my ($lei, $wq, $jobs) = @_; + my ($op_c, $ops) = workers_start($lei, $wq, $jobs // 1); + $lei->{wq1} = $wq; + wait_wq_events($lei, $op_c, $ops); # net_merge_all_done if !{auth} +} + +sub _help { + require PublicInbox::LeiHelp; + PublicInbox::LeiHelp::call($_[0], $_[1], \%CMD, \%OPTDESC); +} + +sub optparse ($$$) { + my ($self, $cmd, $argv) = @_; + # allow _complete --help to complete, not show help + return 1 if substr($cmd, 0, 1) eq '_'; + $self->{cmd} = $cmd; + local $OPT = $self->{opt} //= {}; + my $info = $CMD{$cmd} // [ '[...]' ]; + my ($proto, undef, @spec) = @$info; + my $glp = ref($spec[-1]) eq ref($GLP) ? pop(@spec) : $GLP; + push @spec, qw(help|h); + my $lone_dash; + if ($spec[0] =~ s/\|\z//s) { # "stdin|" or "clear|" allows "-" alias + $lone_dash = $spec[0]; + $OPT->{$spec[0]} = \(my $var); + push @spec, '' => \$var; + } + $glp->getoptionsfromarray($argv, $OPT, @spec) or + return _help($self, "bad arguments or options for $cmd"); + return _help($self) if $OPT->{help}; + + push @$argv, @{$OPT->{-argv}} if defined($OPT->{-argv}); + + # "-" aliases "stdin" or "clear" + $OPT->{$lone_dash} = ${$OPT->{$lone_dash}} if defined $lone_dash; + + if ($proto =~ s/\s*\[?(?:KEYWORDS|LABELS)\.\.\.\]?\s*//g) { + require PublicInbox::LeiInput; + my @err = PublicInbox::LeiInput::vmd_mod_extract($self, $argv); + return $self->fail(join("\n", @err)) if @err; + } + + my $i = 0; + my $POS_ARG = '[A-Z][A-Z0-9_]+'; + my ($err, $inf); + my @args = split(/ /, $proto); + for my $var (@args) { + if ($var =~ /\A$POS_ARG\.\.\.\z/o) { # >= 1 args; + $inf = defined($argv->[$i]) and last; + $var =~ s/\.\.\.\z//; + $err = "$var not supplied"; + } elsif ($var =~ /\A$POS_ARG\z/o) { # required arg at $i + $argv->[$i++] // ($err = "$var not supplied"); + } elsif ($var =~ /\.\.\.\]\z/) { # optional args start + $inf = 1; + last; + } elsif ($var =~ /\A\[-?$POS_ARG\]\z/) { # one optional arg + $i++; + } elsif ($var =~ /\A.+?\|/) { # required FOO|--stdin + $inf = 1 if index($var, '...') > 0; + my @or = split(/\|/, $var); + my $ok; + for my $o (@or) { + if ($o =~ /\A--([a-z0-9\-]+)/) { + my $sw = $1; + # assume pipe/regular file on stdin + # w/o args means stdin + if ($sw eq 'stdin' && !@$argv && + (-p $self->{0} || + -f _)) { + $OPT->{stdin} //= 1; + } + $ok = defined($OPT->{$sw}) and last; + } elsif (defined($argv->[$i])) { + $ok = 1; + $i++; + last; + } # else continue looping + } + last if $ok; + my $last = pop @or; + $err = join(', ', @or) . " or $last must be set"; + } else { + warn "BUG: can't parse `$var' in $proto"; + } + last if $err; + } + if (!$inf && scalar(@$argv) > scalar(@args)) { + $err //= 'too many arguments'; + } + $err ? fail($self, "usage: lei $cmd $proto\nE: $err") : 1; +} + +sub lazy_cb ($$$) { # $pfx is _complete_ or lei_ + my ($self, $cmd, $pfx) = @_; + my $ucmd = $cmd; + $ucmd =~ tr/-/_/; + my $cb; + $cb = $self->can($pfx.$ucmd) and return $cb; + my $base = $ucmd; + $base =~ s/_([a-z])/\u$1/g; + my $pkg = "PublicInbox::Lei\u$base"; + ($INC{"PublicInbox/Lei\u$base.pm"} // eval("require $pkg")) ? + $pkg->can($pfx.$ucmd) : undef; +} + +sub do_env { + my $lei = shift; + fchdir($lei); + my $cb = shift // return ($lei, %{$lei->{env}}) ; + local ($current_lei, %ENV) = ($lei, %{$lei->{env}}); + $cb = $lei->can($cb) if !ref($cb); # $cb may be a scalar sub name + eval { $cb->($lei, @_) }; + $lei->fail($@) if $@; +} + +sub dispatch { + my ($self, $cmd, @argv) = @_; + local ($current_lei, %ENV) = do_env($self); + $self->{2}->autoflush(1); # keep stdout buffered until x_it|DESTROY + return _help($self, 'no command given') unless defined($cmd); + # do not support Getopt bundling for this + while ($cmd eq '-C' || $cmd eq '-c') { + my $v = shift(@argv) // return fail($self, $cmd eq '-C' ? + '-C DIRECTORY' : '-c <name>=<value>'); + push @{$self->{opt}->{substr($cmd, 1, 1)}}, $v; + $cmd = shift(@argv) // return _help($self, 'no command given'); + } + if (my $cb = lazy_cb(__PACKAGE__, $cmd, 'lei_')) { + optparse($self, $cmd, \@argv) or return; + if (my $chdir = $self->{opt}->{C}) { + for my $d (@$chdir) { + next if $d eq ''; # same as git(1) + chdir $d; + } + open($self->{3}, '<', '.'); + } + $cb->($self, @argv); + } elsif (grep(/\A-/, $cmd, @argv)) { # --help or -h only + $GLP->getoptionsfromarray([$cmd, @argv], {}, qw(help|h C=s@)) + or return _help($self, 'bad arguments or options'); + _help($self); + } else { + fail($self, "`$cmd' is not an lei command"); + } +} + +sub _lei_cfg ($;$) { + my ($self, $creat) = @_; + return $self->{cfg} if $self->{cfg}; + my $f = _config_path($self); + my @st = stat($f); + my $cur_st = @st ? pack('dd', $st[10], $st[7]) : ''; # 10:ctime, 7:size + my ($sto, $sto_dir, $watches, $lne, $cfg); + if ($cfg = $PATH2CFG{$f}) { # reuse existing object in common case + ($cur_st eq $cfg->{-st} && !$self->{opt}->{c}) and + return ($self->{cfg} = $cfg); + # reuse some fields below if they match: + ($sto, $sto_dir, $watches, $lne) = + @$cfg{qw(-lei_store leistore.dir -watches + -lei_note_event)}; + } + if (!@st) { + unless ($creat) { # any commands which write to cfg must creat + $cfg = PublicInbox::Config->git_config_dump( + '/dev/null', $self); + return ($self->{cfg} = $cfg); + } + my ($cfg_dir) = ($f =~ m!(.*?/)[^/]+\z!); + File::Path::mkpath($cfg_dir); + open my $fh, '>>', $f; + @st = stat($fh) or die "fstat($f): $!\n"; + $cur_st = pack('dd', $st[10], $st[7]); + qerr($self, "# $f created") if $self->{cmd} ne 'config'; + } + $cfg = PublicInbox::Config->git_config_dump($f, $self); + $cfg->{-st} = $cur_st; + if ($sto && canonpath_harder($sto_dir // store_path($self)) + eq canonpath_harder($cfg->{'leistore.dir'} // + store_path($self))) { + $cfg->{-lei_store} = $sto; + $cfg->{-lei_note_event} = $lne; + $cfg->{-watches} = $watches if $watches; + } + if (scalar(keys %PATH2CFG) > 5) { + # FIXME: use inotify/EVFILT_VNODE to detect unlinked configs + delete(@PATH2CFG{grep(!-f, keys %PATH2CFG)}); + } + $self->{cfg} = $self->{opt}->{c} ? $cfg : ($PATH2CFG{$f} = $cfg); + refresh_watches($self); + $cfg; +} + +sub _lei_store ($;$) { + my ($self, $creat) = @_; + my $cfg = _lei_cfg($self, $creat) // return; + $cfg->{-lei_store} //= do { + require PublicInbox::LeiStore; + my $dir = $cfg->{'leistore.dir'} // store_path($self); + return unless $creat || -d $dir; + PublicInbox::LeiStore->new($dir, { creat => $creat }); + }; +} + +# returns true on success, undef +# argv[0] eq `+e' means errors do not ->fail # (like `sh +e') +sub _config { + my ($self, @argv) = @_; + my $err_ok = ($argv[0] // '') eq '+e' ? shift(@argv) : undef; + my %env; + my %opt = map { $_ => $self->{$_} } (0..2); + my $cfg = _lei_cfg($self, 1); + my $opt_c = delete local $cfg->{-opt_c}; + my @file_arg; + if ($opt_c) { + my ($set, $get, $nondash); + for (@argv) { # order matters for git-config + if (!$nondash) { + if (/\A--(?:add|rename-section|remove-section| + replace-all| + unset-all|unset)\z/x) { + ++$set; + } elsif ($_ eq '-l' || $_ eq '--list' || + /\A--get/) { + ++$get; + } elsif (/\A-/) { # -z and such + } else { + ++$nondash; + } + } else { + ++$nondash; + } + } + if ($set || ($nondash//0) > 1 && !$get) { + @file_arg = ('-f', $cfg->{-f}); + $env{GIT_CONFIG} = $file_arg[1]; + } else { # OK, we can use `-c n=v' for read-only + $cfg->{-opt_c} = $opt_c; + $env{GIT_CONFIG} = undef; + } + } + my $cmd = $cfg->config_cmd(\%env, \%opt); + push @$cmd, @file_arg, @argv; + run_wait($cmd, \%env, \%opt) ? ($err_ok ? undef : fail($self, $?)) : 1; +} + +sub lei_daemon_pid { puts shift, $daemon_pid } + +sub lei_daemon_kill { + my ($self) = @_; + my $sig = $self->{opt}->{signal} // 'TERM'; + kill($sig, $$) or fail($self, "kill($sig, $$): $!"); +} + +# Shell completion helper. Used by lei-completion.bash and hopefully +# other shells. Try to do as much here as possible to avoid redundancy +# and improve maintainability. +sub lei__complete { + my ($self, @argv) = @_; # argv = qw(lei and any other args...) + shift @argv; # ignore "lei", the entire command is sent + @argv or return puts $self, grep(!/^_/, keys %CMD), qw(--help -h -C); + my $cmd = shift @argv; + my $info = $CMD{$cmd} // do { # filter matching commands + @argv or puts $self, grep(/\A\Q$cmd\E/, keys %CMD); + return; + }; + my ($proto, undef, @spec) = @$info; + my $cur = pop @argv; + my $re = defined($cur) ? qr/\A\Q$cur\E/ : qr/./; + if (substr(my $_cur = $cur // '-', 0, 1) eq '-') { # --switches + # gross special case since the only git-config options + # Consider moving to a table if we need more special cases + # we use Getopt::Long for are the ones we reject, so these + # are the ones we don't reject: + if ($cmd eq 'config') { + puts $self, grep(/$re/, keys %CONFIG_KEYS); + @spec = qw(add z|null get get-all unset unset-all + replace-all get-urlmatch + remove-section rename-section + name-only list|l edit|e + get-color-name get-colorbool); + # fall-through + } + # generate short/long names from Getopt::Long specs + puts $self, grep(/$re/, qw(--help -h -C), map { + if (s/[:=].+\z//) { # req/optional args, e.g output|o=i + } elsif (s/\+\z//) { # verbose|v+ + } elsif (s/!\z//) { + # negation: mail! => no-mail|mail + s/([\w\-]+)/$1|no-$1/g + } + map { + my $x = length > 1 ? "--$_" : "-$_"; + $x eq $_cur ? () : $x; + } grep(!/_/, split(/\|/, $_, -1)) # help|h + } grep { $OPTDESC{"$_\t$cmd"} || $OPTDESC{$_} } @spec); + } elsif ($cmd eq 'config' && !@argv && !$CONFIG_KEYS{$cur}) { + puts $self, grep(/$re/, keys %CONFIG_KEYS); + } + + # switch args (e.g. lei q -f mbox<TAB>) + if (($argv[-1] // $cur // '') =~ /\A--?([\w\-]+)\z/) { + my $opt = quotemeta $1; + puts $self, map { + my $v = $OPTDESC{$_}; + my @v = ref($v) ? split(/\|/, $v->[0]) : (); + # get rid of ALL CAPS placeholder (e.g "OUT") + # (TODO: completion for external paths) + shift(@v) if scalar(@v) && uc($v[0]) eq $v[0]; + @v; + } grep(/\A(?:[\w-]+\|)*$opt\b.*?(?:\t$cmd)?\z/, keys %OPTDESC); + } + if (my $cb = lazy_cb($self, $cmd, '_complete_')) { + puts $self, $cb->($self, @argv, $cur ? ($cur) : ()); + } + # TODO: URLs, pathnames, OIDs, MIDs, etc... See optparse() for + # proto parsing. +} + +sub exec_buf ($$) { + my ($argv, $env) = @_; + my $argc = scalar @$argv; + my $buf = 'exec '.join("\0", scalar(@$argv), @$argv); + while (my ($k, $v) = each %$env) { $buf .= "\0$k=$v" }; + $buf; +} + +sub start_mua { + my ($self) = @_; + if ($self->{ovv}->{fmt} =~ /\A(?:maildir)\z/) { # TODO: IMAP + refresh_watches($self); + } + my $mua = $self->{opt}->{mua} // return; + my $mfolder = $self->{ovv}->{dst}; + my (@cmd, $replaced); + if ($mua =~ /\A(?:mutt|mailx|mail|neomutt)\z/) { + @cmd = ($mua, '-f'); + # TODO: help wanted: other common FOSS MUAs + } else { + require Text::ParseWords; + @cmd = Text::ParseWords::shellwords($mua); + # mutt uses '%f' for open-hook with compressed mbox, we follow + @cmd = map { $_ eq '%f' ? ($replaced = $mfolder) : $_ } @cmd; + } + push @cmd, $mfolder unless defined($replaced); + if ($self->{sock}) { # lei(1) client process runs it + # restore terminal: echo $query | lei q --stdin --mua=... + my $io = []; + $io->[0] = $self->{1} if $self->{opt}->{stdin} && -t $self->{1}; + send_exec_cmd($self, $io, \@cmd, {}); + } + + # kick wait_startq: + syswrite($self->{au_done}, 'q') if $self->{lxs} && $self->{au_done}; + + return unless -t $self->{2}; # XXX how to determine non-TUI MUAs? + $self->{opt}->{quiet} = 1; + delete $self->{-progress}; + delete $self->{opt}->{verbose}; +} + +sub send_exec_cmd { # tell script/lei to execute a command + my ($self, $io, $cmd, $env) = @_; + $PublicInbox::IPC::send_cmd->( + $self->{sock} // die('lei client gone'), + [ map { fileno($_) } @$io ], + exec_buf($cmd, $env), 0) // + Carp::croak("sendmsg: $!"); +} + +sub poke_mua { # forces terminal MUAs to wake up and hopefully notice new mail + my ($self) = @_; + my $alerts = $self->{opt}->{alert} // return; + my $sock = $self->{sock}; + while (my $op = shift(@$alerts)) { + if ($op eq ':WINCH') { + # hit the process group that started the MUA + send($sock, '-WINCH', 0) if $sock; + } elsif ($op eq ':bell') { + out($self, "\a"); + } elsif ($op =~ /(?<!\\),/) { # bare ',' (not ',,') + push @$alerts, split(/(?<!\\),/, $op); + } elsif ($op =~ m!\A([/a-z0-9A-Z].+)!) { + my $cmd = $1; # run an arbitrary command + require Text::ParseWords; + $cmd = [ Text::ParseWords::shellwords($cmd) ]; + send($sock, exec_buf($cmd, {}), 0) if $sock; + } else { + warn("W: unsupported --alert=$op\n"); # non-fatal + } + } +} + +my %path_to_fd = ('/dev/stdin' => 0, '/dev/stdout' => 1, '/dev/stderr' => 2); +$path_to_fd{"/dev/fd/$_"} = $_ for (0..2); + +# this also normalizes the path +sub path_to_fd { + my ($self, $path) = @_; + $path = rel2abs($self, $path); + $path =~ tr!/!/!s; + $path_to_fd{$path} // ( + ($path =~ m!\A/(?:dev|proc/self)/fd/[0-9]+\z!) ? + fail($self, "cannot open $path from daemon") : -1 + ); +} + +# caller needs to "-t $self->{1}" to check if tty +sub start_pager { + my ($self, $new_env) = @_; + chomp(my $pager = run_qx([qw(git var GIT_PAGER)])); + warn "`git var PAGER' error: \$?=$?" if $?; + return if $pager eq 'cat' || $pager eq ''; + $new_env //= {}; + $new_env->{LESS} //= 'FRX'; + $new_env->{LV} //= '-c'; + $new_env->{MORE} = $new_env->{LESS} if $^O eq 'freebsd'; + my $rdr = { 1 => $self->{1}, 2 => $self->{2} }; + CORE::pipe($rdr->{0}, my $wpager) or return warn "pipe: $!"; + my $pgr = [ undef, @$rdr{1, 2} ]; + my $env = $self->{env}; + if ($self->{sock}) { # lei(1) process runs it + delete @$new_env{keys %$env}; # only set iff unset + send_exec_cmd($self, [ @$rdr{0..2} ], [$pager], $new_env); + } else { + die 'BUG: start_pager w/o socket'; + } + $self->{1} = $wpager; + $self->{2} = $wpager if -t $self->{2}; + $env->{GIT_PAGER_IN_USE} = 'true'; # we may spawn git + $self->{pgr} = $pgr; +} + +# display a message for user before spawning full-screen $VISUAL +sub pgr_err { + my ($self, @msg) = @_; + return warn(@msg) unless $self->{sock} && -t $self->{2}; + start_pager($self, { LESS => 'RX' }); # no 'F' so we prompt + say { $self->{2} } @msg, '# -quit pager to continue-'; + $self->{2}->autoflush(1); + stop_pager($self); + send($self->{sock}, 'wait', 0); # wait for user to quit pager +} + +sub stop_pager { + my ($self) = @_; + my $pgr = delete($self->{pgr}) or return; + $self->{2} = $pgr->[2]; + delete($self->{1})->close if $self->{1}; + $self->{1} = $pgr->[1]; +} + +sub accept_dispatch { # Listener {post_accept} callback + my ($sock) = @_; # ignore other + $sock->autoflush(1); + my $self = bless { sock => $sock }, __PACKAGE__; + vec(my $rvec = '', fileno($sock), 1) = 1; + select($rvec, undef, undef, 60) or + return send($sock, 'timed out waiting to recv FDs', 0); + # (4096 * 33) >MAX_ARG_STRLEN + my @fds = $PublicInbox::IPC::recv_cmd->($sock, my $buf, 4096 * 33) or + return; # EOF + if (!defined($fds[0])) { + warn(my $msg = "recv_cmd failed: $!"); + return send($sock, $msg, 0); + } else { + my $i = 0; + open($self->{$i++}, '+<&=', $_) for @fds; + $i == 4 or return send($sock, 'not enough FDs='.($i-1), 0) + } + # $ENV_STR = join('', map { "\0$_=$ENV{$_}" } keys %ENV); + # $buf = "$argc\0".join("\0", @ARGV).$ENV_STR."\0\0"; + substr($buf, -2, 2, '') eq "\0\0" or # s/\0\0\z// + return send($sock, 'request command truncated', 0); + my ($argc, @argv) = split(/\0/, $buf, -1); + undef $buf; + my %env = map { split(/=/, $_, 2) } splice(@argv, $argc); + $self->{env} = \%env; + eval { dispatch($self, @argv) }; + $self->fail($@) if $@; +} + +sub dclose { + my ($self) = @_; + local $current_lei = $self; + delete $self->{-progress}; + _drop_wq($self) if $self->{failed}; + $self->close if $self->{-event_init_done}; # PublicInbox::DS::close +} + +# for long-running results +sub event_step { + my ($self) = @_; + local %ENV = %{$self->{env}}; + local $current_lei = $self; + eval { + my @fds = $PublicInbox::IPC::recv_cmd->( + $self->{sock} // return, my $buf, 4096); + if (scalar(@fds) == 1 && !defined($fds[0])) { + return if $! == EAGAIN; + die "recvmsg: $!" if $! != ECONNRESET; + @fds = (); # for open loop below: + } + for (@fds) { open my $rfh, '+<&=', $_ } + if ($buf eq '') { + _drop_wq($self); # EOF, client disconnected + dclose($self); + $buf = 'TERM'; + } + if ($buf =~ /\A(?:STOP|CONT|TERM)\z/) { + my $sig = "-$buf"; + for my $wq (grep(defined, @$self{@WQ_KEYS})) { + $wq->wq_kill($sig); + } + } else { + die "unrecognized client signal: $buf"; + } + my $s = $self->{-socks} // []; # lei up --all + @$s = grep { send($_, $buf, 0) } @$s; + }; + if (my $err = $@) { + eval { $self->fail($err) }; + dclose($self); + } +} + +sub event_step_init { + my ($self) = @_; + my $sock = $self->{sock} or return; + $self->{-event_init_done} // do { # persist til $ops done + $sock->blocking(0); + $self->SUPER::new($sock, EPOLLIN); + $self->{-event_init_done} = $sock; + }; +} + +sub oldset { $oldset } + +sub dump_and_clear_log { + if (defined($errors_log) && -s STDIN && seek(STDIN, 0, SEEK_SET)) { + openlog('lei-daemon', 'pid,nowait,nofatal,ndelay', 'user'); + chomp(my @lines = <STDIN>); + truncate(STDIN, 0) or + syslog('warning', "ftruncate (%s): %m", $errors_log); + for my $l (@lines) { syslog('warning', '%s', $l) } + closelog(); # don't share across fork + } +} + +sub cfg2lei ($) { + my ($cfg) = @_; + my $lei = bless { env => { %{$cfg->{-env}} } }, __PACKAGE__; + open($lei->{0}, '<&', \*STDIN); + open($lei->{1}, '>>&', \*STDOUT); + open($lei->{2}, '>>&', \*STDERR); + open($lei->{3}, '<', '/'); + socketpair(my $x, my $y, AF_UNIX, SOCK_SEQPACKET, 0); + $lei->{sock} = $x; + require PublicInbox::LeiSelfSocket; + PublicInbox::LeiSelfSocket->new($y); # adds to event loop + $lei; +} + +sub note_event ($@) { # runs lei_note_event for a given config file + my ($cfg_f, @args) = @_; + my $cfg = $PATH2CFG{$cfg_f} // return; + eval { cfg2lei($cfg)->dispatch('note-event', @args) }; + carp "E: note-event $cfg_f: $@\n" if $@; +} + +sub dir_idle_handler ($) { # PublicInbox::DirIdle callback + my ($ev) = @_; # Linux::Inotify2::Event or duck type + my $fn = $ev->fullname; + if ($fn =~ m!\A(.+)/(new|cur)/([^/]+)\z!) { # Maildir file + my ($loc, $new_cur, $bn) = ("maildir:$1", $2, $3); + $new_cur = '' if $ev->IN_DELETE || $ev->IN_MOVED_FROM; + for my $cfg_f (keys %{$MDIR2CFGPATH->{$loc} // {}}) { + note_event($cfg_f, $loc, $new_cur, $bn, $fn); + } + } elsif ($fn =~ m!\A(.+)/([0-9]+)\z!) { # MH mail message file + my ($loc, $n, $new_cur) = ("mh:$1", $2, '+'); + $new_cur = '' if $ev->IN_DELETE || $ev->IN_MOVED_FROM; + for my $cfg_f (keys %{$MDIR2CFGPATH->{$loc} // {}}) { + note_event($cfg_f, $loc, $new_cur, $n, $fn); + } + } elsif ($fn =~ m!\A(.+)/\.mh_sequences\z!) { # reread flags + my $loc = "mh:$1"; + for my $cfg_f (keys %{$MDIR2CFGPATH->{$loc} // {}}) { + note_event($cfg_f, $loc, '.mh_sequences') + } + } # else we don't care + if ($ev->can('cancel') && ($ev->IN_IGNORE || $ev->IN_UNMOUNT)) { + $ev->cancel; + } + if ($fn =~ m!\A(.+)/(?:new|cur)\z! && !-e $fn) { + delete $MDIR2CFGPATH->{"maildir:$1"}; + } + if (!-e $fn) { # config file, Maildir, or MH dir gone + delete $_->{$fn} for values %$MDIR2CFGPATH; # config file + delete @$MDIR2CFGPATH{"maildir:$fn", "mh:$fn"}; + delete $PATH2CFG{$fn}; + } +} + +sub can_stay_alive { # PublicInbox::DS::post_loop_do cb + my ($path, $dev_ino_expect) = @_; + if (my @st = defined($$path) ? stat($$path) : ()) { + if ($dev_ino_expect ne pack('dd', $st[0], $st[1])) { + warn "$$path dev/ino changed, quitting\n"; + $$path = undef; + } + } elsif (defined($$path)) { # ENOENT is common + warn "stat($$path): $!, quitting ...\n" if $! != ENOENT; + undef $$path; + $quit->(); + } + return 1 if defined($$path); + my $n = PublicInbox::DS::close_non_busy() or do { + eval 'PublicInbox::LeiNoteEvent::flush_task()'; + # drop stores only if no clients + for my $cfg (values %PATH2CFG) { + my $lne = delete($cfg->{-lei_note_event}); + $lne->wq_close if $lne; + my $sto = delete($cfg->{-lei_store}) // next; + eval { $sto->wq_do('done') if $sto->{-wq_s1} }; + warn "E: $@ (dropping store for $cfg->{-f})" if $@; + $sto->wq_close; + } + }; + # returns true: continue, false: stop + $n + scalar(keys(%PublicInbox::DS::AWAIT_PIDS)); +} + +# lei(1) calls this when it can't connect +sub lazy_start { + my ($path, $errno, $narg) = @_; + local ($errors_log, $listener); + my ($sock_dir) = ($path =~ m!\A(.+?)/[^/]+\z!); + $errors_log = "$sock_dir/errors.log"; + my $addr = pack_sockaddr_un($path); + my $lk = PublicInbox::Lock->new($errors_log); + umask(077) // die("umask(077): $!"); + $lk->lock_acquire; + socket($listener, AF_UNIX, SOCK_SEQPACKET, 0); + if ($errno == ECONNREFUSED || $errno == ENOENT) { + return if connect($listener, $addr); # another process won + unlink($path) if $errno == ECONNREFUSED && -S $path; + } else { + $! = $errno; # allow interpolation to stringify in die + die "connect($path): $!"; + } + bind($listener, $addr); + $lk->lock_release; + undef $lk; + my @st = stat($path) or die "stat($path): $!"; + my $dev_ino_expect = pack('dd', $st[0], $st[1]); # dev+ino + local $oldset = PublicInbox::DS::block_signals(POSIX::SIGALRM); + die "incompatible narg=$narg" if $narg != 5; + $PublicInbox::IPC::send_cmd or die <<""; +(Socket::MsgHdr || Inline::C) missing/unconfigured (narg=$narg); + + require PublicInbox::Listener; + require PublicInbox::PktOp; + (-p STDOUT) or die "E: stdout must be a pipe\n"; + open(STDIN, '+>>', $errors_log); + STDIN->autoflush(1); + dump_and_clear_log(); + POSIX::setsid() > 0 or die "setsid: $!"; + my $pid = PublicInbox::OnDestroy::fork_tmp; + return if $pid; + $0 = "lei-daemon $path"; + local (%PATH2CFG, $MDIR2CFGPATH); + local $daemon_pid = $$; + $listener->blocking(0); + my $exit_code; + my $pil = PublicInbox::Listener->new($listener, \&accept_dispatch); + local $quit = do { + my (undef, $eof_p) = PublicInbox::PktOp->pair; + sub { + $exit_code //= eval("POSIX::SIG$_[0] + 128") if @_; + $dir_idle->close if $dir_idle; # EPOLL_CTL_DEL + $dir_idle = undef; # let RC take care of it + eval 'PublicInbox::LeiNoteEvent::flush_task()'; + my $lis = $pil or exit($exit_code // 0); + # closing eof_p triggers \&noop wakeup + $listener = $eof_p = $pil = $path = undef; + $lis->close; # DS::close + }; + }; + my $sig = { CHLD => \&PublicInbox::DS::enqueue_reap }; + $sig->{$_} = $quit for qw(QUIT INT TERM); + $sig->{$_} = \&PublicInbox::Config::noop for qw(HUP USR1 USR2); + require PublicInbox::DirIdle; + local $dir_idle = PublicInbox::DirIdle->new(sub { + # just rely on wakeup to hit post_loop_do + dir_idle_handler($_[0]) if $_[0]->fullname ne $path; + }); + $dir_idle->add_watches([$sock_dir]); + local @PublicInbox::DS::post_loop_do = (\&can_stay_alive, + \$path, $dev_ino_expect); + # STDIN was redirected to /dev/null above, closing STDERR and + # STDOUT will cause the calling `lei' client process to finish + # reading the <$daemon> pipe. + local $SIG{__WARN__} = sub { + $current_lei ? err($current_lei, @_) : warn( + strftime('%Y-%m-%dT%H:%M:%SZ', gmtime(time))," $$ ", @_); + }; + local $SIG{PIPE} = 'IGNORE'; + local $SIG{ALRM} = 'IGNORE'; + open STDERR, '>&STDIN'; + open STDOUT, '>&STDIN'; + # $daemon pipe to `lei' closed, main loop begins: + eval { PublicInbox::DS::event_loop($sig, $oldset) }; + warn "event loop error: $@\n" if $@; + dump_and_clear_log(); + exit($exit_code // 0); +} + +sub busy { 1 } # prevent daemon-shutdown if client is connected + +# ensures stdout hits the FS before sock disconnects so a client +# can immediately reread it +sub DESTROY { + my ($self) = @_; + if (defined($self->{incr_pid}) && $self->{incr_pid} == $$) { + for my $k (sort(grep(/\A-nr_/, keys %$self))) { + my $nr = $self->{$k}; + substr($k, 0, length('-nr_'), ''); + $self->child_error(0, "$nr $k messages"); + } + } + $self->{1}->autoflush(1) if $self->{1}; + stop_pager($self); + dump_and_clear_log(); + # preserve $? for ->fail or ->x_it code +} + +sub wq_done_wait { # awaitpid cb (via wq_eof) + my ($pid, $wq, $lei) = @_; + local $current_lei = $lei; + my $err_type = $lei->{-err_type}; + $? and $lei->child_error($?, + $err_type ? "$err_type errors during $lei->{cmd} \$?=$?" : ()); + $lei->dclose; +} + +sub fchdir { + my ($lei) = @_; + chdir($lei->{3} // die 'BUG: lei->{3} (CWD) gone'); +} + +sub wq_eof { # EOF callback for main daemon + my ($lei, $wq_fld) = @_; + local $current_lei = $lei; + my $wq = delete $lei->{$wq_fld // 'wq1'}; + $lei->sto_barrier_request($wq); + $wq // $lei->fail; # already failed +} + +sub watch_state_ok ($) { + my ($state) = $_[-1]; # $_[0] may be $self + $state =~ /\Apause|(?:import|index|tag)-(?:ro|rw)\z/; +} + +sub cancel_dir_watch ($$$) { + my ($type, $d, $cfg_f) = @_; + my $loc = "$type:".canonpath_harder($d); + my $w = delete $MDIR2CFGPATH->{$loc}->{$cfg_f}; + delete $MDIR2CFGPATH->{$loc} if !(keys %{$MDIR2CFGPATH->{$loc}}); + $_->cancel for @$w; +} + +sub add_dir_watch ($$$) { + my ($type, $d, $cfg_f) = @_; + $d = canonpath_harder($d); + my $loc = "$type:$d"; + my @dirs = $type eq 'mh' ? ($d) : ("$d/cur", "$d/new"); + if (!exists($MDIR2CFGPATH->{$loc}->{$cfg_f})) { + my @w = $dir_idle->add_watches(\@dirs, 1); + push @{$MDIR2CFGPATH->{$loc}->{$cfg_f}}, @w if @w; + } +} + +sub refresh_watches { + my ($lei) = @_; + $dir_idle or return; + my $cfg = _lei_cfg($lei) or return; + my $old = $cfg->{-watches}; + my $watches = $cfg->{-watches} //= {}; + my %seen; + my $cfg_f = $cfg->{'-f'}; + for my $w (grep(/\Awatch\..+\.state\z/, keys %$cfg)) { + my $loc = substr($w, length('watch.'), -length('.state')); + require PublicInbox::LeiWatch; + $watches->{$loc} //= PublicInbox::LeiWatch->new($loc); + $seen{$loc} = undef; + my $state = $cfg->get_1("watch.$loc.state"); + if (!watch_state_ok($state)) { + warn("watch.$loc.state=$state not supported\n"); + } elsif ($loc =~ /\A(maildir|mh):(.+)\z/i) { + my ($type, $d) = ($1, $2); + $state eq 'pause' ? + cancel_dir_watch($type, $d, $cfg_f) : + add_dir_watch($type, $d, $cfg_f); + } else { # TODO: imap/nntp/jmap + $lei->child_error(0, "E: watch $loc not supported, yet") + } + } + + # add all known Maildir folders as implicit watches + my $lms = $lei->lms; + if ($lms) { + $lms->lms_write_prepare; + for my $loc ($lms->folders(qr/\A(?:maildir|mh):/)) { + my $old = $loc; + my ($type, $d) = split /:/, $loc, 2; + # fixup old bugs while we're iterating: + $d = canonpath_harder($d); + $loc = "$type:$d"; + $lms->rename_folder($old, $loc) if $old ne $loc; + next if $watches->{$loc}; # may be set to pause + require PublicInbox::LeiWatch; + $watches->{$loc} = PublicInbox::LeiWatch->new($loc); + $seen{$loc} = undef; + add_dir_watch($type, $d, $cfg_f); + } + } + if ($old) { # cull old non-existent entries + for my $loc (keys %$old) { + next if exists $seen{$loc}; + delete $old->{$loc}; + if ($loc =~ /\A(maildir|mh):(.+)\z/i) { + cancel_dir_watch($1, $2, $cfg_f); + } else { # TODO: imap/nntp/jmap + $lei->child_error(0, "E: watch $loc TODO"); + } + } + } + if (scalar keys %$watches) { + $cfg->{-env} //= { %{$lei->{env}}, PWD => '/' }; # for cfg2lei + } else { + delete $cfg->{-watches}; + } +} + +# TODO: support SHA-256 +sub git_oid { + my $eml = $_[-1]; + $eml->header_set($_) for @PublicInbox::Import::UNWANTED_HEADERS; + git_sha(1, $eml); +} + +sub lms { + my ($lei, $creat) = @_; + my $sto = $lei->{sto} // _lei_store($lei) // return; + require PublicInbox::LeiMailSync; + my $f = "$sto->{priv_eidx}->{topdir}/mail_sync.sqlite3"; + (-f $f || $creat) ? PublicInbox::LeiMailSync->new($f) : undef; +} + +sub sto_barrier_request { + my ($lei, $wq) = @_; + return unless $lei->{sto} && $lei->{sto}->{-wq_s1}; + local $current_lei = $lei; + if (my $n = $lei->{opt}->{'commit-delay'}) { + eval { $lei->{sto}->wq_do('schedule_commit', $n) }; + } else { + my $s = ($wq ? $wq->{lei_sock} : undef) // $lei->{sock}; + my $errfh = $lei->{2} // *STDERR{GLOB}; + my @io = $s ? ($errfh, $s) : ($errfh); + eval { $lei->{sto}->wq_io_do('barrier', \@io, 1) }; + } + warn($@) if $@; +} + +sub cfg_dump ($$) { + my ($lei, $f) = @_; + my $ret = eval { PublicInbox::Config->git_config_dump($f, $lei) }; + return $ret if !$@; + warn($@); + undef; +} + +sub request_umask { + my ($lei) = @_; + my $s = $lei->{sock} // return; + send($s, 'umask', 0) // die "send: $!"; + vec(my $rvec = '', fileno($s), 1) = 1; + select($rvec, undef, undef, 2) or die 'timeout waiting for umask'; + recv($s, my $v, 5, 0) // die "recv: $!"; + (my $u, $lei->{client_umask}) = unpack('AV', $v); + $u eq 'u' or warn "E: recv $v has no umask"; +} + +sub _stdin_cb { # PublicInbox::InputPipe::consume callback for --stdin + my (undef, $lei, $cb) = @_; # $_[-1] = $rbuf + $_[1] // return $lei->fail("error reading stdin: $!"); + $lei->{stdin_buf} .= $_[-1]; + do_env($lei, $cb) if $_[-1] eq ''; +} + +sub slurp_stdin { + my ($lei, $cb) = @_; + require PublicInbox::InputPipe; + my $in = $lei->{0}; + if (-t $in) { # run cat via script/lei and read from it + $in = undef; + pipe($in, my $wr); + say { $lei->{2} } '# enter query, Ctrl-D when done'; + send_exec_cmd($lei, [ $lei->{0}, $wr ], ['cat'], {}); + } + PublicInbox::InputPipe::consume($in, \&_stdin_cb, $lei, $cb); +} + +1; diff --git a/lib/PublicInbox/LI2Wrap.pm b/lib/PublicInbox/LI2Wrap.pm new file mode 100644 index 00000000..d4792b25 --- /dev/null +++ b/lib/PublicInbox/LI2Wrap.pm @@ -0,0 +1,20 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Wrapper for Linux::Inotify2 < 2.3 which lacked ->fh and auto-close +# Remove this when supported LTS/enterprise distros are all +# Linux::Inotify2 >= 2.3 +package PublicInbox::LI2Wrap; +use v5.12; +our @ISA = qw(Linux::Inotify2); + +sub wrapclose { + my ($inot) = @_; + my $fd = $inot->fileno; + open my $fh, '<&=', $fd or die "open <&= $fd $!"; + bless $inot, __PACKAGE__; +} + +sub DESTROY {} # no-op + +1 diff --git a/lib/PublicInbox/LeiALE.pm b/lib/PublicInbox/LeiALE.pm new file mode 100644 index 00000000..ce03f5b4 --- /dev/null +++ b/lib/PublicInbox/LeiALE.pm @@ -0,0 +1,106 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# All Locals Ever: track lei/store + externals ever used as +# long as they're on an accessible FS. Includes "lei q" --include +# and --only targets that haven't been through "lei add-external". +# Typically: ~/.cache/lei/all_locals_ever.git +package PublicInbox::LeiALE; +use v5.12; +use parent qw(PublicInbox::LeiSearch PublicInbox::Lock); +use PublicInbox::Git; +use autodie qw(close open rename seek truncate); +use PublicInbox::Import; +use PublicInbox::OnDestroy; +use PublicInbox::LeiXSearch; +use Fcntl qw(SEEK_SET); + +sub _new { + my ($d) = @_; + PublicInbox::Import::init_bare($d, 'ale'); + bless { + git => PublicInbox::Git->new($d), + lock_path => "$d/lei_ale.state", # dual-duty lock + state + ibxish => [], # Inbox and ExtSearch (and LeiSearch) objects + }, __PACKAGE__ +} + +sub new { + my ($self, $lei) = @_; + ref($self) or $self = _new($lei->cache_dir . '/all_locals_ever.git'); + my $lxs = PublicInbox::LeiXSearch->new; + my $sto = $lei->_lei_store; + $lxs->prepare_external($sto->search) if $sto; + for my $loc ($lei->externals_each) { # locals only + $lxs->prepare_external($loc) if -d $loc; + } + $self->refresh_externals($lxs, $lei); + $self; +} + +sub over {} # undef for xoids_for + +sub overs_all { # for xoids_for (called only in lei workers?) + my ($self) = @_; + my $fgen = $PublicInbox::OnDestroy::fork_gen ; + if (($self->{fgen} // $fgen) != $fgen) { + delete($_->{over}) for @{$self->{ibxish}}; + } + $self->{fgen} = $fgen; + grep(defined, map { $_->over } @{$self->{ibxish}}); +} + +sub refresh_externals { + my ($self, $lxs, $lei) = @_; + $self->git->cleanup; + my $lk = $self->lock_for_scope; + my $cur_lxs = ref($lxs)->new; + my $orig = PublicInbox::IO::read_all $self->{lockfh}; + my $new = ''; + my $old = ''; + my $gone = 0; + my %seen_ibxish; # $dir => any-defined value + for my $dir (split(/\n/, $orig)) { + if (-d $dir && -r _ && $cur_lxs->prepare_external($dir)) { + $seen_ibxish{$dir} //= length($old .= "$dir\n"); + } else { + ++$gone; + } + } + my @ibxish = $cur_lxs->locals; + for my $x ($lxs->locals) { + my $d = $lei->canonpath_harder($x->{inboxdir} // $x->{topdir}); + $seen_ibxish{$d} //= do { + $new .= "$d\n"; + push @ibxish, $x; + }; + } + if ($new ne '' || $gone) { + $self->{lockfh}->autoflush(1); + if ($gone) { + seek($self->{lockfh}, 0, SEEK_SET); + truncate($self->{lockfh}, 0); + } else { + $old = ''; + } + print { $self->{lockfh} } $old, $new or die "print: $!"; + } + $new = ''; + my $f = $self->git->{git_dir}.'/objects/info/alternates'; + $old = PublicInbox::IO::try_cat $f; + for my $x (@ibxish) { + $new .= $lei->canonpath_harder($x->git->{git_dir})."/objects\n"; + } + $self->{ibxish} = \@ibxish; + return if $old eq $new; + + # this needs to be atomic since child processes may start + # git-cat-file at any time + my $tmp = "$f.$$.tmp"; + open my $fh, '>', $tmp; + print $fh $new; + close $fh; + rename($tmp, $f) +} + +1; diff --git a/lib/PublicInbox/LeiAddExternal.pm b/lib/PublicInbox/LeiAddExternal.pm new file mode 100644 index 00000000..5eef206c --- /dev/null +++ b/lib/PublicInbox/LeiAddExternal.pm @@ -0,0 +1,72 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei add-external" command +package PublicInbox::LeiAddExternal; +use strict; +use v5.10.1; + +sub _finish_add_external { + my ($lei, $location) = @_; + my $new_boost = $lei->{opt}->{boost} // 0; + my $key = "external.$location.boost"; + my $cur_boost = $lei->_lei_cfg(1)->{$key}; + return if defined($cur_boost) && $cur_boost == $new_boost; # idempotent + $lei->_config($key, $new_boost); +} + +sub lei_add_external { + my ($lei, $location) = @_; + my $mirror = $lei->{opt}->{mirror} // do { + my @fail; + for my $sw ($lei->index_opt, $lei->curl_opt, + qw(no-torsocks torsocks inbox-version)) { + my ($f) = (split(/|/, $sw, 2))[0]; + next unless defined $lei->{opt}->{$f}; + $f = length($f) == 1 ? "-$f" : "--$f"; + push @fail, $f; + } + if (scalar(@fail) == 1) { + return $lei->("@fail requires --mirror"); + } elsif (@fail) { + my $last = pop @fail; + my $fail = join(', ', @fail); + return $lei->("@fail and $last require --mirror"); + } + undef; + }; + $location = $lei->ext_canonicalize($location); + if (defined($mirror) && -d $location) { + $lei->fail(<<""); # TODO: did you mean "update-external?" +--mirror destination `$location' already exists + + } elsif (-d $location) { + index($location, "\n") >= 0 and + return $lei->fail("`\\n' not allowed in `$location'"); + } + if ($location !~ m!\Ahttps?://! && !-d $location) { + $mirror // return $lei->fail("$location not a directory"); + index($location, "\n") >= 0 and + return $lei->fail("`\\n' not allowed in `$location'"); + $mirror = $lei->ext_canonicalize($mirror); + require PublicInbox::LeiMirror; + PublicInbox::LeiMirror->start($lei, $mirror => $location); + } else { + _finish_add_external($lei, $location); + } +} + +sub _complete_add_external { # for bash, this relies on "compopt -o nospace" + my ($lei, @argv) = @_; + my $cfg = $lei->_lei_cfg or return (); + my $match_cb = $lei->complete_url_prepare(\@argv); + require URI; + map { + my $u = URI->new(substr($_, length('external.'))); + my ($base) = ($u->path =~ m!((?:/?.*)?/)[^/]+/?\z!); + $u->path($base); + $match_cb->($u->as_string); + } grep(m!\Aexternal\.https?://!, @{$cfg->{-section_order}}); +} + +1; diff --git a/lib/PublicInbox/LeiAddWatch.pm b/lib/PublicInbox/LeiAddWatch.pm new file mode 100644 index 00000000..e2be5cee --- /dev/null +++ b/lib/PublicInbox/LeiAddWatch.pm @@ -0,0 +1,42 @@ +# Copyright all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei add-watch" command +package PublicInbox::LeiAddWatch; +use strict; +use v5.10.1; +use parent qw(PublicInbox::LeiInput); + +sub lei_add_watch { + my ($lei, @argv) = @_; + my $cfg = $lei->_lei_cfg(1); + my $self = bless {}, __PACKAGE__; + $lei->{opt}->{'mail-sync'} = 1; # for prepare_inputs + my $state = $lei->{opt}->{'state'} // 'import-rw'; + $lei->watch_state_ok($state) or + return $lei->fail("invalid state: $state"); + $self->prepare_inputs($lei, \@argv) or return; + my @vmd; + while (my ($type, $vals) = each %{$lei->{vmd_mod}}) { + push @vmd, "$type:$_" for @$vals; + } + my $vmd0 = shift @vmd; + for my $w (@{$self->{inputs}}) { + # clobber existing, allow multiple + if (defined($vmd0)) { + $lei->_config("watch.$w.vmd", '--replace-all', $vmd0) + or return; + for my $v (@vmd) { + $lei->_config("watch.$w.vmd", $v) or return; + } + } + next if defined $cfg->{"watch.$w.state"}; + $lei->_config("watch.$w.state", $state) or return; + } + $lei->_lei_store(1); # create + $lei->lms(1)->lms_write_prepare->add_folders(@{$self->{inputs}}); + delete $lei->{cfg}; # force reload + $lei->refresh_watches; +} + +1; diff --git a/lib/PublicInbox/LeiAuth.pm b/lib/PublicInbox/LeiAuth.pm new file mode 100644 index 00000000..020dd125 --- /dev/null +++ b/lib/PublicInbox/LeiAuth.pm @@ -0,0 +1,72 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Authentication worker for anything that needs auth for read/write IMAP +# and read-only NNTP access +# +# timelines +# lei-daemon | LeiAuth worker #0 | other WQ workers +# ---------------------------------------------------------- +# spawns all workers ---->[ workers all start and run ipc_atfork_child ] +# | do_auth_atfork | wq_worker_loop sleep +# | # reads .netrc | +# | # queries git-credential| +# | send net_merge_continue | +# | | | +# | v | +# recv net_merge_continue <---------/ | +# | | | +# v | | +# broadcast net_merge_all [ all workers (including LeiAuth worker #0) ] +# [ LeiAuth worker #0 becomes just another WQ worker ] +# | +# call net_merge_all_done ->-> do per-WQ-class defined actions +package PublicInbox::LeiAuth; +use v5.12; + +sub do_auth_atfork { # used by IPC WQ workers + my ($self, $wq) = @_; + return if $wq->{-wq_worker_nr} != 0; # only first worker calls this + my $lei = $wq->{lei}; + my $net = $lei->{net}; + if ($net->{-auth_done}) { # from previous worker... (ugly) + $lei->{pkt_op_p}->pkt_do('net_merge_continue', $net) or + $lei->fail("pkt_do net_merge_continue: $!"); + return; + } + eval { # fill auth info (may prompt user or read netrc) + my $mics = $net->imap_common_init($lei); + my $nn = $net->nntp_common_init($lei); + # broadcast successful auth info to lei-daemon: + $net->{-auth_done} = 1; + $lei->{pkt_op_p}->pkt_do('net_merge_continue', $net) or + die "pkt_do net_merge_continue: $!"; + $net->{mics_cached} = $mics if $mics; + $net->{nn_cached} = $nn if $nn; + }; + $lei->fail($@) if $@; +} + +sub net_merge_all { # called in wq worker via wq_broadcast + my ($wq, $net_new) = @_; + my $net = $wq->{lei}->{net}; + %$net = (%$net, %$net_new); +} + +# called by top-level lei-daemon when first worker is done with auth +# passes updated net auth info to current workers +sub net_merge_continue { + my ($lei, $wq, $net_new) = @_; + $wq->{-net_new} = $net_new; # for "lei up" + $wq->wq_broadcast('PublicInbox::LeiAuth::net_merge_all', $net_new); + $wq->net_merge_all_done($lei); # defined per-WQ +} + +sub op_merge { # prepares PktOp->pair ops + my ($self, $ops, $wq, $lei) = @_; + $ops->{net_merge_continue} = [ \&net_merge_continue, $lei, $wq ]; +} + +sub new { bless \(my $x), __PACKAGE__ } + +1; diff --git a/lib/PublicInbox/LeiBlob.pm b/lib/PublicInbox/LeiBlob.pm new file mode 100644 index 00000000..7b2ea434 --- /dev/null +++ b/lib/PublicInbox/LeiBlob.pm @@ -0,0 +1,179 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei blob $OID" command +# TODO: this doesn't scan submodules, but maybe it should +package PublicInbox::LeiBlob; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC); +use PublicInbox::Spawn qw(run_wait run_qx which); +use PublicInbox::DS; +use PublicInbox::Eml; +use PublicInbox::Git; +use PublicInbox::IO qw(read_all); + +sub get_git_dir ($$) { + my ($lei, $d) = @_; + return $d if -d "$d/objects" && -d "$d/refs" && -e "$d/HEAD"; + + my $cmd = [ qw(git rev-parse --git-dir) ]; + my $opt = { '-C' => $d }; + if (defined($lei->{opt}->{cwd})) { # --cwd used, report errors + $opt->{2} = $lei->{2}; + } else { # implicit --cwd, quiet errors + open $opt->{2}, '>', '/dev/null' or die "open /dev/null: $!"; + } + chomp(my $git_dir = run_qx($cmd, {GIT_DIR => undef}, $opt)); + $? ? undef : $git_dir; +} + +sub solver_user_cb { # called by solver when done + my ($res, $self) = @_; + my $lei = $self->{lei}; + my $log_buf = delete $lei->{'log_buf'}; + $$log_buf =~ s/^/# /sgm; + ref($res) eq 'ARRAY' or return $lei->child_error(0, $$log_buf); + $lei->qerr($$log_buf); + my ($git, $oid, $type, $size, $di) = @$res; + + # don't try to support all the git-show(1) options for non-blob, + # this is just a convenience: + $type ne 'blob' and warn <<EOM; +# $oid is a $type of $size bytes in:\n#\t$git->{git_dir} +EOM + my $cmd = $git->cmd('show', $oid); + my $rdr = { 1 => $lei->{1}, 2 => $lei->{2} }; + run_wait($cmd, $lei->{env}, $rdr) and $lei->child_error($?); +} + +sub do_solve_blob { # via wq_do + my ($self) = @_; + my $lei = $self->{lei}; + my $git_dirs = $lei->{opt}->{'git-dir'}; + my $hints = {}; + for my $x (qw(oid-a path-a path-b)) { + my $v = $lei->{opt}->{$x} // next; + $x =~ tr/-/_/; + $hints->{$x} = $v; + } + open my $log, '+>', \(my $log_buf = '') or die "PerlIO::scalar: $!"; + $lei->{log_buf} = \$log_buf; + my $git = $lei->{ale}->git; + my @rmt = map { + PublicInbox::LeiRemote->new($lei, $_) + } $self->{lxs}->remotes; + my $solver = bless { + gits => [ map { + PublicInbox::Git->new($lei->rel2abs($_)) + } @$git_dirs ], + user_cb => \&solver_user_cb, + uarg => $self, + # -cur_di, -msg => temporary fields for Qspawn callbacks + inboxes => [ $self->{lxs}->locals, @rmt ], + }, 'PublicInbox::SolverGit'; + local $PublicInbox::DS::in_loop = 0; # waitpid synchronously + $solver->solve($lei->{env}, $log, $self->{oid_b}, $hints); +} + +sub cat_attach_i { # Eml->each_part callback + my ($part, $depth, $idx) = @{$_[0]}; + my $lei = $_[1]; + my $want = $lei->{-attach_idx} // return; + return if $idx ne $want; # [0-9]+(?:\.[0-9]+)+ + delete $lei->{-attach_idx}; + $lei->out($part->body); +} + +sub extract_attach ($$$) { + my ($lei, $blob, $bref) = @_; + my $eml = PublicInbox::Eml->new($bref); + $eml->each_part(\&cat_attach_i, $lei, 1); + my $idx = delete $lei->{-attach_idx}; + defined($idx) and return $lei->fail(<<EOM); +E: attachment $idx not found in $blob +EOM +} + +sub lei_blob { + my ($lei, $blob) = @_; + $lei->start_pager if -t $lei->{1}; + my $opt = $lei->{opt}; + my $has_hints = grep(defined, @$opt{qw(oid-a path-a path-b)}); + my $lxs; + if ($blob =~ s/:([0-9\.]+)\z//) { + $lei->{-attach_idx} = $1; + $opt->{mail} = 1; + } + + # first, see if it's a blob returned by "lei q" JSON output:k + if ($opt->{mail} // ($has_hints ? 0 : 1)) { + if (grep(defined, @$opt{qw(include only)})) { + $lxs = $lei->lxs_prepare; + $lei->ale->refresh_externals($lxs, $lei); + } + my $rdr = {}; + if ($opt->{mail}) { + open $rdr->{2}, '+>', undef or die "open: $!"; + } else { + open $rdr->{2}, '>', '/dev/null' or die "open: $!"; + } + my $cmd = $lei->ale->git->cmd('cat-file', 'blob', $blob); + my $cerr; + if (defined $lei->{-attach_idx}) { + my $buf = run_qx($cmd, $lei->{env}, $rdr); + return extract_attach($lei, $blob, \$buf) unless $?; + $cerr = $?; + } else { + $rdr->{1} = $lei->{1}; # write directly to client + $cerr = run_wait($cmd, $lei->{env}, $rdr) or return; + } + # fall back to unimported ('lei index') and inflight blobs + my $lms = $lei->lms; + my $bref = ($lms ? $lms->local_blob($blob, 1) : undef) // do { + my $sto = $lei->{sto} // $lei->_lei_store; + $sto && $sto->{-wq_s1} ? $sto->wq_do('cat_blob', $blob) + : undef; + }; + $bref and return $lei->{-attach_idx} ? + extract_attach($lei, $blob, $bref) : + $lei->out($$bref); + if ($opt->{mail}) { + seek($rdr->{2}, 0, 0); + return $lei->child_error($cerr, read_all($rdr->{2})); + } # else: fall through to solver below + } + + # maybe it's a non-email (code) blob from a coderepo + my $git_dirs = $opt->{'git-dir'} //= []; + if ($opt->{'cwd'} // 1) { + my $cgd = get_git_dir($lei, '.'); + unshift(@$git_dirs, $cgd) if defined $cgd; + } + return $lei->fail('no --git-dir to try') unless @$git_dirs; + unless ($lxs) { + $lxs = $lei->lxs_prepare or return; + $lei->ale->refresh_externals($lxs, $lei); + } + if ($lxs->remotes) { + require PublicInbox::LeiRemote; + $lei->{curl} //= which('curl') or return + $lei->fail('curl needed for '.join(', ',$lxs->remotes)); + $lei->_lei_store(1)->write_prepare($lei); + } + require PublicInbox::SolverGit; + my $self = bless { lxs => $lxs, oid_b => $blob }, __PACKAGE__; + my ($op_c, $ops) = $lei->workers_start($self, 1); + $lei->{wq1} = $self; + $self->wq_io_do('do_solve_blob', []); + $self->wq_close; + $lei->wait_wq_events($op_c, $ops); +} + +sub ipc_atfork_child { + my ($self) = @_; + $self->{lei}->_lei_atfork_child; + $self->SUPER::ipc_atfork_child; +} + +1; diff --git a/lib/PublicInbox/LeiConfig.pm b/lib/PublicInbox/LeiConfig.pm new file mode 100644 index 00000000..a50ff2b6 --- /dev/null +++ b/lib/PublicInbox/LeiConfig.pm @@ -0,0 +1,55 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +package PublicInbox::LeiConfig; # subclassed by LeiEditSearch +use v5.12; +use PublicInbox::PktOp; +use Fcntl qw(SEEK_SET); +use autodie qw(open seek); +use PublicInbox::IO qw(read_all); + +sub cfg_do_edit ($;$) { + my ($self, $reason) = @_; + my $lei = $self->{lei}; + $lei->pgr_err($reason) if defined $reason; + my $cmd = [ qw(git config --edit -f), $self->{-f} ]; + my $env = { GIT_CONFIG => $self->{-f} }; + $self->cfg_edit_begin if $self->can('cfg_edit_begin'); + # run in script/lei foreground + my ($op_c, $op_p) = PublicInbox::PktOp->pair; + # $op_p will EOF when $EDITOR is done + $op_c->{ops} = { '' => [\&cfg_edit_done, $lei, $self] }; + $lei->send_exec_cmd([ @$lei{qw(0 1 2)}, $op_p->{op_p} ], $cmd, $env); +} + +sub cfg_edit_done { # PktOp lei->do_env cb + my ($lei, $self) = @_; + open my $fh, '+>', undef; + my $cfg = do { + local $lei->{2} = $fh; + $lei->cfg_dump($self->{-f}); + } or do { + seek($fh, 0, SEEK_SET); + return cfg_do_edit($self, read_all($fh)); + }; + $self->cfg_verify($cfg) if $self->can('cfg_verify'); +} + +sub lei_config { + my ($lei, @argv) = @_; + $lei->{opt}->{'config-file'} and return $lei->fail( + "config file switches not supported by `lei config'"); + if ($lei->{opt}->{edit}) { + @argv and return $lei->fail( +'--edit must be used without other arguments'); + $lei->{opt}->{c} and return $lei->fail( +"`-c $lei->{opt}->{c}->[0]' not allowed with --edit"); + my $f = $lei->_lei_cfg(1)->{-f}; + cfg_do_edit(bless { lei => $lei, -f => $f }, __PACKAGE__); + } elsif (@argv) { # let git-config do error-checking + $lei->_config(@argv); + } else { + $lei->_help('no options given'); + } +} + +1; diff --git a/lib/PublicInbox/LeiConvert.pm b/lib/PublicInbox/LeiConvert.pm new file mode 100644 index 00000000..4d4fceb2 --- /dev/null +++ b/lib/PublicInbox/LeiConvert.pm @@ -0,0 +1,87 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# front-end for the "lei convert" sub-command +package PublicInbox::LeiConvert; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC PublicInbox::LeiInput); +use PublicInbox::LeiOverview; +use PublicInbox::DS; + +# /^input_/ subs are used by PublicInbox::LeiInput + +sub input_mbox_cb { # MboxReader callback + my ($eml, $self) = @_; + my $kw = PublicInbox::MboxReader::mbox_keywords($eml); + $eml->header_set($_) for qw(Status X-Status); + $self->{wcb}->(undef, { kw => $kw }, $eml); +} + +sub input_eml_cb { # used by PublicInbox::LeiInput::input_fh + my ($self, $eml) = @_; + $self->{wcb}->(undef, {}, $eml); +} + +sub input_maildir_cb { + my (undef, $kw, $eml, $self) = @_; # $_[0] $filename ignored + $self->{wcb}->(undef, { kw => $kw }, $eml); +} + +sub input_mh_cb { + my ($dn, $bn, $kw, $eml, $self) = @_; + $self->{wcb}->(undef, { kw => $kw }, $eml); +} + +sub process_inputs { # via wq_do + my ($self) = @_; + local $PublicInbox::DS::in_loop = 0; # force synchronous awaitpid + $self->SUPER::process_inputs; + my $lei = $self->{lei}; + my $l2m = delete $lei->{l2m}; + delete $self->{wcb}; # may close connections + $l2m->finish_output($lei) if $l2m; + if (my $v2w = delete $lei->{v2w}) { $v2w->done } # may die + my $nr_w = delete($l2m->{-nr_write}) // 0; + my $d = (delete($l2m->{-nr_seen}) // 0) - $nr_w; + $d = $d ? " ($d duplicates)" : ''; + $lei->qerr("# converted $nr_w messages$d"); +} + +sub lei_convert { # the main "lei convert" method + my ($lei, @inputs) = @_; + $lei->{opt}->{kw} //= 1; + $lei->{opt}->{dedupe} //= 'none'; + $lei->{input_opt}->{sort} = 1; # for LeiToMail conflict check + my $self = bless {}, __PACKAGE__; + my $ovv = PublicInbox::LeiOverview->new($lei, 'out-format'); + $lei->{l2m} or return + $lei->fail('--output unspecified or is not a mail destination'); + my $devfd = $lei->path_to_fd($ovv->{dst}) // return; + $lei->{opt}->{augment} = 1 if $devfd < 0; + $self->prepare_inputs($lei, \@inputs) or return; + # n.b. {net} {auth} is handled by l2m worker + my ($op_c, $ops) = $lei->workers_start($self, 1); + $lei->{wq1} = $self; + $self->wq_io_do('process_inputs', []); + $self->wq_close; + $lei->wait_wq_events($op_c, $ops); +} + +sub ipc_atfork_child { + my ($self) = @_; + my $lei = $self->{lei}; + $lei->_lei_atfork_child; + my $l2m = $lei->{l2m}; + if (my $net = $lei->{net}) { # may prompt user once + $net->{mics_cached} = $net->imap_common_init($lei); + $net->{nn_cached} = $net->nntp_common_init($lei); + } + $l2m->pre_augment($lei); + $l2m->do_augment($lei); + $l2m->post_augment($lei); + $self->{wcb} = $l2m->write_cb($lei); + $self->SUPER::ipc_atfork_child; +} + +1; diff --git a/lib/PublicInbox/LeiCurl.pm b/lib/PublicInbox/LeiCurl.pm new file mode 100644 index 00000000..48c66ee9 --- /dev/null +++ b/lib/PublicInbox/LeiCurl.pm @@ -0,0 +1,85 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# common option and torsocks(1) wrapping for curl(1) +# Eventually, we may support using libcurl via Inline::C and/or +# WWW::Curl; but curl(1) is most prevalent and widely-installed. +# n.b. curl may support a daemon/client model like lei someday: +# https://github.com/curl/curl/wiki/curl-tool-master-client +package PublicInbox::LeiCurl; +use v5.12; +use PublicInbox::Spawn qw(which); +use PublicInbox::Config; + +# Ensures empty strings are quoted, we don't need more +# sophisticated quoting than for empty strings: curl -d '' +use overload '""' => sub { + join(' ', map { $_ eq '' ? "''" : $_ } @{$_[0]}); +}; + +my %lei2curl = ( + 'curl-config=s@' => 'config|K=s@', +); + +# prepares a common command for curl(1) based on $lei command +sub new { + my ($cls, $lei, $curl) = @_; + $curl //= which('curl') // return $lei->fail('curl not found'); + my $opt = $lei->{opt}; + my @cmd = ($curl, qw(-gSf)); + $cmd[-1] .= 's' if $opt->{quiet}; # already the default for "lei q" + $cmd[-1] .= 'v' if $opt->{verbose}; # we use ourselves, too + for my $o ($lei->curl_opt) { + if (my $lei_spec = $lei2curl{$o}) { + $o = $lei_spec; + } + $o =~ s/\|[a-z0-9]\b//i; # remove single char short option + if ($o =~ s/=[is]@\z//) { + my $ary = $opt->{$o} or next; + push @cmd, map { ("--$o", $_) } @$ary; + } elsif ($o =~ s/=[is]\z//) { + my $val = $opt->{$o} // next; + push @cmd, "--$o", $val; + } elsif ($opt->{$o}) { + push @cmd, "--$o"; + } + } + push @cmd, '-v' if $opt->{verbose}; # lei uses this itself + bless \@cmd, $cls; +} + +sub torsocks { # useful for "git clone" and "git fetch", too + my ($self, $lei, $uri)= @_; + my $opt = $lei->{opt}; + $opt->{torsocks} = 'false' if $opt->{'no-torsocks'}; + my $torsocks = $opt->{torsocks} //= 'auto'; + if ($torsocks eq 'auto' && substr($uri->host, -6) eq '.onion' && + ($PublicInbox::Config::LD_PRELOAD//'') !~ m!/libtorsocks\b!) { + # "auto" continues anyways if torsocks is missing; + # a proxy may be specified via CLI, curlrc, + # environment variable, or even firewall rule + [ ($lei->{torsocks} //= which('torsocks')) // () ] + } elsif (PublicInbox::Config::git_bool($torsocks)) { + my $x = $lei->{torsocks} //= which('torsocks'); + $x or return $lei->fail(<<EOM); +--torsocks=yes specified but torsocks not found in PATH=$ENV{PATH} +EOM + [ $x ]; + } else { # the common case for current Internet :< + []; + } +} + +# completes the result of cmd() for $uri +sub for_uri { + my ($self, $lei, $uri, @opt) = @_; + my $pfx = torsocks($self, $lei, $uri) or return; # error + if ($uri->scheme =~ /\Ahttps?\z/i) { + my $cfg = $lei->_lei_cfg; + my $p = $cfg ? $cfg->urlmatch('http.Proxy', $$uri, 1) : undef; + push(@opt, '--proxy', $p) if defined($p); + } + bless [ @$pfx, @$self, @opt, $uri->as_string ], ref($self); +} + +1; diff --git a/lib/PublicInbox/LeiDedupe.pm b/lib/PublicInbox/LeiDedupe.pm new file mode 100644 index 00000000..eda54d79 --- /dev/null +++ b/lib/PublicInbox/LeiDedupe.pm @@ -0,0 +1,137 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +package PublicInbox::LeiDedupe; +use v5.12; +use PublicInbox::ContentHash qw(content_hash content_digest git_sha); +use PublicInbox::SHA qw(sha256); + +# n.b. mutt sets most of these headers not sure about Bytes +our @OID_IGNORE = qw(Status X-Status Content-Length Lines Bytes); + +# best-effort regeneration of OID when augmenting existing results +sub _regen_oid ($) { + my ($eml) = @_; + my @stash; # stash away headers we shouldn't have in git + for my $k (@OID_IGNORE) { + my @v = $eml->header_raw($k) or next; + push @stash, [ $k, \@v ]; + $eml->header_set($k); # restore below + } + my $dig = git_sha(1, $eml); + for my $kv (@stash) { # restore stashed headers + my ($k, @v) = @$kv; + $eml->header_set($k, @v); + } + $dig->digest; +} + +sub _oidbin ($) { defined($_[0]) ? pack('H*', $_[0]) : undef } + +sub smsg_hash ($) { + my ($smsg) = @_; + my $x = join("\0", @$smsg{qw(from to cc ds subject references mid)}); + utf8::encode($x); + sha256($x); +} + +# the paranoid option +sub dedupe_oid ($) { + my ($skv) = @_; + (sub { # may be called in a child process + my ($eml, $oidhex) = @_; + $skv->set_maybe(_oidbin($oidhex) // _regen_oid($eml), ''); + }, sub { + my ($smsg) = @_; + $skv->set_maybe(_oidbin($smsg->{blob}), ''); + }); +} + +# dangerous if there's duplicate messages with different Message-IDs +sub dedupe_mid ($) { + my ($skv) = @_; + (sub { # may be called in a child process + my ($eml, $oidhex) = @_; + # lei supports non-public drafts w/o Message-ID + my $mid = $eml->header_raw('Message-ID') // _oidbin($oidhex) // + content_hash($eml); + $skv->set_maybe($mid, ''); + }, sub { + my ($smsg) = @_; + my $mid = $smsg->{mid}; + $mid = undef if $mid eq ''; + $mid //= smsg_hash($smsg) // _oidbin($smsg->{blob}); + $skv->set_maybe($mid, ''); + }); +} + +# our default deduplication strategy (used by v2, also) +sub dedupe_content ($) { + my ($skv) = @_; + (sub { # may be called in a child process + my ($eml) = @_; # $oidhex = $_[1], ignored + + # we must account for Message-ID via hash_mids, since + # (unlike v2 dedupe) Message-ID is not accounted for elsewhere: + $skv->set_maybe(content_digest($eml, PublicInbox::SHA->new(256), + 1 # hash_mids + )->digest, ''); + }, sub { + my ($smsg) = @_; + $skv->set_maybe(smsg_hash($smsg), ''); + }); +} + +# no deduplication at all +sub true { 1 } +sub dedupe_none ($) { (\&true, \&true) } + +sub new { + my ($cls, $lei) = @_; + my $dd = $lei->{opt}->{dedupe} // 'content'; + my $dst = $lei->{ovv}->{dst}; + + # allow "none" to bypass Eml->new if writing to directory: + return if ($dd eq 'none' && substr($dst // '', -1) eq '/'); + my $m = "dedupe_$dd"; + $cls->can($m) or die "unsupported dedupe strategy: $dd\n"; + my $skv; + if ($dd ne 'none') { + require PublicInbox::SharedKV; + $skv = PublicInbox::SharedKV->new; + } + # [ $skv, $eml_cb, $smsg_cb, "dedupe_$dd" ] + bless [ $skv, undef, undef, $m ], $cls; +} + +# returns true on seen messages according to the deduplication strategy, +# returns false if unseen +sub is_dup { + my ($self, $eml, $smsg) = @_; + !$self->[1]->($eml, $smsg ? $smsg->{blob} : undef); +} + +sub is_smsg_dup { + my ($self, $smsg) = @_; + !$self->[2]->($smsg); +} + +sub prepare_dedupe { + my ($self) = @_; + my $skv = $self->[0]; + $self->[1] or @$self[1,2] = $self->can($self->[3])->($skv); + $skv ? $skv->dbh : undef; +} + +sub pause_dedupe { + my ($self) = @_; + my $skv = $self->[0] or return; + $skv->dbh_release; + delete($skv->{dbh}) if $skv; +} + +sub has_entries { + my $skv = $_[0]->[0] or return undef; + $skv->has_entries; +} + +1; diff --git a/lib/PublicInbox/LeiEditSearch.pm b/lib/PublicInbox/LeiEditSearch.pm new file mode 100644 index 00000000..bcf7c105 --- /dev/null +++ b/lib/PublicInbox/LeiEditSearch.pm @@ -0,0 +1,76 @@ +# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei edit-search" edit a saved search following "lei q --save" +package PublicInbox::LeiEditSearch; +use strict; +use v5.10.1; +use PublicInbox::LeiSavedSearch; +use PublicInbox::LeiUp; +use parent qw(PublicInbox::LeiConfig); + +sub cfg_edit_begin { + my ($self) = @_; + if (ref($self->{lss}->{-cfg}->{'lei.q.output'})) { + delete $self->{lss}->{-cfg}->{'lei.q.output'}; # invalid + $self->{lei}->pgr_err(<<EOM); +$self->{lss}->{-f} has multiple values of lei.q.output +please remove redundant ones +EOM + } +} + +sub cfg_verify { + my ($self, $cfg) = @_; + my $new_out = $cfg->{'lei.q.output'} // ''; + return $self->cfg_do_edit(<<EOM) if ref $new_out; +$self->{-f} has multiple values of lei.q.output +EOM + return $self->cfg_do_edit(<<EOM) if $new_out eq ''; +$self->{-f} needs lei.q.output +EOM + my $lss = $self->{lss}; + my $old_out = $lss->{-cfg}->{'lei.q.output'} // return; + return if $old_out eq $new_out; + my $lei = $self->{lei}; + my $old_path = $old_out; + my $new_path = $new_out; + s!$PublicInbox::LeiSavedSearch::LOCAL_PFX!! for ($old_path, $new_path); + my $dir_old = $lss->can('lss_dir_for')->($lei, \$old_path, 1); + my $dir_new = $lss->can('lss_dir_for')->($lei, \$new_path); + return if $dir_new eq $dir_old; + + ($old_out =~ m!\Av2:!i || $new_out =~ m!\Av2:!) and + return $self->cfg_do_edit(<<EOM); +conversions from/to v2 inboxes not supported at this time +EOM + return $self->cfg_do_edit(<<EOM) if -e $dir_new; +lei.q.output changed from `$old_out' to `$new_out' +However, $dir_new exists +EOM + # start the conversion asynchronously + my $old_sq = PublicInbox::Config::squote_maybe($old_out); + my $new_sq = PublicInbox::Config::squote_maybe($new_out); + $lei->puts("lei.q.output changed from $old_sq to $new_sq"); + $lei->qerr("# lei convert $old_sq -o $new_sq"); + my $v = !$lei->{opt}->{quiet}; + $lei->{opt} = { output => $new_out, verbose => $v }; + require PublicInbox::LeiConvert; + PublicInbox::LeiConvert::lei_convert($lei, $old_out); + + $lei->fail(<<EOM) if -e $dir_old && !rename($dir_old, $dir_new); +E: rename($dir_old, $dir_new) error: $! +EOM +} + +sub lei_edit_search { + my ($lei, $out) = @_; + my $lss = PublicInbox::LeiSavedSearch->up($lei, $out) or return; + my $f = $lss->{-f}; + my $self = bless { lei => $lei, lss => $lss, -f => $f }, __PACKAGE__; + $self->cfg_do_edit; +} + +*_complete_edit_search = \&PublicInbox::LeiUp::_complete_up; + +1; diff --git a/lib/PublicInbox/LeiExportKw.pm b/lib/PublicInbox/LeiExportKw.pm new file mode 100644 index 00000000..16f069da --- /dev/null +++ b/lib/PublicInbox/LeiExportKw.pm @@ -0,0 +1,147 @@ +# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# front-end for the "lei export-kw" sub-command +package PublicInbox::LeiExportKw; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC PublicInbox::LeiInput); +use Errno qw(EEXIST ENOENT); +use PublicInbox::Syscall qw(rename_noreplace); + +sub export_kw_md { # LeiMailSync->each_src callback + my ($oidbin, $id, $self, $mdir) = @_; + my $sto_kw = $self->{lse}->oidbin_keywords($oidbin) or return; + my $bn = $$id; + my ($md_kw, $unknown, @try); + if ($bn =~ s/:2,([a-zA-Z]*)\z//) { + ($md_kw, $unknown) = PublicInbox::MdirReader::flags2kw($1); + @try = qw(cur new); + } else { + $unknown = []; + @try = qw(new cur); + } + if ($self->{-merge_kw} && $md_kw) { # merging keywords is the default + @$sto_kw{keys %$md_kw} = values(%$md_kw); + } + $bn .= ':2,'. + PublicInbox::LeiToMail::kw2suffix([keys %$sto_kw], @$unknown); + return if $bn eq $$id; + my $dst = "$mdir/cur/$bn"; + my $lei = $self->{lei}; + for my $d (@try) { + my $src = "$mdir/$d/$$id"; + if (rename_noreplace($src, $dst)) { # success + $self->{lms}->mv_src("maildir:$mdir", + $oidbin, $id, $bn); + return; # success + } elsif ($! == EEXIST) { # lost race with lei/store? + return; + } elsif ($! != ENOENT) { + $lei->child_error(0, + "E: rename_noreplace($src -> $dst): $!"); + } # else loop @try + } + my $e = $!; + # both tries failed + my $oidhex = unpack('H*', $oidbin); + my $src = "$mdir/{".join(',', @try)."}/$$id"; + $lei->child_error(0, "rename_noreplace($src -> $dst) ($oidhex): $e"); + for (@try) { return if -e "$mdir/$_/$$id" } + $self->{lms}->clear_src("maildir:$mdir", $id); +} + +sub export_kw_imap { # LeiMailSync->each_src callback + my ($oidbin, $id, $self, $mic) = @_; + my $sto_kw = $self->{lse}->oidbin_keywords($oidbin) or return; + $self->{imap_mod_kw}->($self->{nwr}, $mic, $id, [ keys %$sto_kw ]); +} + +# overrides PublicInbox::LeiInput::input_path_url +sub input_path_url { + my ($self, $input, @args) = @_; + $self->{lms}->lms_write_prepare; + if ($input =~ /\Amaildir:(.+)/i) { + my $mdir = $1; + require PublicInbox::LeiToMail; # kw2suffix + $self->{lms}->each_src($input, \&export_kw_md, $self, $mdir); + } elsif ($input =~ m!\Aimaps?://!i) { + my $uri = PublicInbox::URIimap->new($input); + my $mic = $self->{nwr}->mic_for_folder($uri); + if ($mic && !$self->{nwr}->can_store_flags($mic)) { + my $m = "$input does not support PERMANENTFLAGS"; + if (defined $self->{lei}->{opt}->{all}) { + $self->{lei}->qerr("# $m"); + } else { # set error code if user explicitly requested + $self->{lei}->child_error(0, "E: $m"); + } + return; + } + if ($mic) { + $self->{lms}->each_src($$uri, \&export_kw_imap, + $self, $mic); + $mic->expunge; + } else { + $self->{lei}->child_error(0, "$input unavailable: $@"); + } + } else { die "BUG: $input not supported" } +} + +sub lei_export_kw { + my ($lei, @folders) = @_; + my $sto = $lei->_lei_store or return $lei->fail(<<EOM); +lei/store uninitialized, see lei-import(1) +EOM + my $lms = $lei->lms or return $lei->fail(<<EOM); +lei mail_sync uninitialized, see lei-import(1) +EOM + if (defined(my $all = $lei->{opt}->{all})) { # --all=<local|remote> + $lms->group2folders($lei, $all, \@folders) or return; + @folders = grep(/\A(?:maildir|imaps?):/i, @folders); + } else { + $lms->arg2folder($lei, \@folders); # may die + } + $lms->lms_pause; + my $self = bless { lse => $sto->search, lms => $lms }, __PACKAGE__; + $lei->{opt}->{'mail-sync'} = 1; # for prepare_inputs + $self->prepare_inputs($lei, \@folders) or return; + if (my @ro = grep(!/\A(?:maildir|imaps?):/i, @folders)) { + return $lei->fail("cannot export to read-only folders: @ro"); + } + my $m = $lei->{opt}->{mode} // 'merge'; + if ($m eq 'merge') { # default + $self->{-merge_kw} = 1; + } elsif ($m eq 'set') { + } else { + return $lei->fail(<<EOM); +--mode=$m not supported (`set' or `merge') +EOM + } + if (my $net = $lei->{net}) { + require PublicInbox::NetWriter; + $self->{nwr} = bless $net, 'PublicInbox::NetWriter'; + $self->{imap_mod_kw} = $net->can($self->{-merge_kw} ? + 'imap_add_kw' : 'imap_set_kw'); + $self->{nwr}->{-skip_creat} = 1; + } + $lei->{-err_type} = 'non-fatal'; + $lei->wq1_start($self); +} + +sub _complete_export_kw { + my ($lei, @argv) = @_; + my $lms = $lei->lms or return (); + my $match_cb = $lei->complete_url_prepare(\@argv); + # filter-out read-only sources: + my @k = grep(m!(?:maildir|imaps?):!, + $lms->folders($argv[-1] // undef, 1)); + my @m = map { $match_cb->($_) } @k; + @m ? @m : @k; +} + +no warnings 'once'; + +*ipc_atfork_child = \&PublicInbox::LeiInput::input_only_atfork_child; +*net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done; + +1; diff --git a/lib/PublicInbox/LeiExternal.pm b/lib/PublicInbox/LeiExternal.pm new file mode 100644 index 00000000..31b9bd1e --- /dev/null +++ b/lib/PublicInbox/LeiExternal.pm @@ -0,0 +1,119 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# *-external commands of lei +package PublicInbox::LeiExternal; +use strict; +use v5.10.1; +use PublicInbox::Config qw(glob2re); + +sub externals_each { + my ($self, $cb, @arg) = @_; + my $cfg = $self->_lei_cfg; + my %boost; + for my $sec (grep(/\Aexternal\./, @{$cfg->{-section_order}})) { + my $loc = substr($sec, length('external.')); + $boost{$loc} = $cfg->{"$sec.boost"}; + } + return \%boost if !wantarray && !$cb; + + # highest boost first, but stable for alphabetic tie break + use sort 'stable'; + my @order = sort { $boost{$b} <=> $boost{$a} } sort keys %boost; + if (ref($cb) eq 'CODE') { + for my $loc (@order) { + $cb->(@arg, $loc, $boost{$loc}); + } + } elsif (ref($cb) eq 'HASH') { + %$cb = %boost; + } + @order; # scalar or array +} + +sub ext_canonicalize { + my $location = $_[-1]; # $_[0] may be $lei + if ($location !~ m!\Ahttps?://!) { + PublicInbox::Config::rel2abs_collapsed($location); + } else { + require URI; + my $uri = URI->new($location)->canonical; + my $path = $uri->path . '/'; + $path =~ tr!/!/!s; # squeeze redundant '/' + $uri->path($path); + $uri->as_string; + } +} + +# get canonicalized externals list matching $loc +# $is_exclude denotes it's for --exclude +# otherwise it's for --only/--include is assumed +sub get_externals { + my ($self, $loc, $is_exclude) = @_; + return (ext_canonicalize($loc)) if -e $loc; + my @m; + my @cur = externals_each($self); + my $do_glob = !$self->{opt}->{globoff}; # glob by default + if ($do_glob && (my $re = glob2re($loc))) { + @m = grep(m!$re/?\z!, @cur); + return @m if scalar(@m); + } elsif (index($loc, '/') < 0) { # exact basename match: + @m = grep(m!/\Q$loc\E/?\z!, @cur); + return @m if scalar(@m) == 1; + } elsif ($is_exclude) { # URL, maybe: + my $canon = ext_canonicalize($loc); + @m = grep(m!\A\Q$canon\E\z!, @cur); + return @m if scalar(@m) == 1; + } else { # URL: + return (ext_canonicalize($loc)); + } + if (scalar(@m) == 0) { + die "`$loc' is unknown\n"; + } else { + die("`$loc' is ambiguous:\n", map { "\t$_\n" } @m, "\n"); + } +} + +sub canonicalize_excludes { + my ($lei, $excludes) = @_; + my %x; + for my $loc (@$excludes) { + my @l = get_externals($lei, $loc, 1); + $x{$_} = 1 for @l; + } + \%x; +} + +# returns an anonymous sub which returns an array of potential results +sub complete_url_prepare { + my $argv = $_[-1]; # $_[0] may be $lei + # Workaround bash default COMP_WORDBREAKS splitting URLs to + # ['https', ':', '//', ...]. COMP_WORDBREAKS is global for all + # completions loaded, not just ours, so we can't change it. + # cf. contrib/completion/lei-completion.bash + my ($pfx, $cur) = ('', pop(@$argv) // ''); + if (@$argv) { + my @x = @$argv; + if ($cur =~ /\A[:;=]\z/) { # COMP_WORDBREAKS + URL union + push @x, $cur; + $cur = ''; + } + while (@x && $pfx !~ m!\A(?: (?:[\+\-]?(?:L|kw):) | + (?:(?:imap|nntp|http)s?:) | + (?:--\w?\z)|(?:-\w?\z) )!x) { + $pfx = pop(@x).$pfx; + } + } + my $re = qr!\A\Q$pfx\E(\Q$cur\E.*)!; + my $match_cb = sub { + # the "//;" here (for AUTH=ANONYMOUS) interacts badly with + # bash tab completion, strip it out for now since our commands + # work w/o it. Not sure if there's a better solution... + $_[0] =~ s!//;AUTH=ANONYMOUS\@!//!i; + # only return the part specified on the CLI + # don't duplicate if already 100% completed + $_[0] =~ $re ? ($cur eq $1 ? () : $1) : () + }; + wantarray ? ($pfx, $cur, $match_cb) : $match_cb; +} + +1; diff --git a/lib/PublicInbox/LeiFinmsg.pm b/lib/PublicInbox/LeiFinmsg.pm new file mode 100644 index 00000000..7ed58c24 --- /dev/null +++ b/lib/PublicInbox/LeiFinmsg.pm @@ -0,0 +1,22 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Finalization messages, used to queue up a bunch of messages which +# only get written out on ->DESTROY +package PublicInbox::LeiFinmsg; +use strict; +use v5.10.1; + +sub new { + my ($cls, $lei) = @_; + bless [ @$lei{qw(2 sock)}, $$ ], $cls; +} + +sub DESTROY { + my ($self) = @_; + my ($stderr, $sock, $pid) = splice(@$self, 0, 3); + print $stderr @$self if $pid == $$; + # script/lei disconnects when $sock SvREFCNT drops to zero +} + +1; diff --git a/lib/PublicInbox/LeiForgetExternal.pm b/lib/PublicInbox/LeiForgetExternal.pm new file mode 100644 index 00000000..c8d1df38 --- /dev/null +++ b/lib/PublicInbox/LeiForgetExternal.pm @@ -0,0 +1,40 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei forget-external" command +package PublicInbox::LeiForgetExternal; +use strict; +use v5.10.1; + +sub lei_forget_external { + my ($lei, @locations) = @_; + my $cfg = $lei->_lei_cfg or + return $lei->fail('no externals configured'); + my %seen; + for my $loc (@locations) { + for my $l ($loc, $lei->ext_canonicalize($loc)) { + next if $seen{$l}++; + my $key = "external.$l.boost"; + delete($cfg->{$key}); + if ($lei->_config('+e', '--unset', $key)) { + $lei->qerr("# $l forgotten "); + } elsif (($? >> 8) == 5) { + warn("# $l not found\n"); + } else { + $lei->child_error($?, "# --unset $key error"); + } + } + } +} + +# shell completion helper called by lei__complete +sub _complete_forget_external { + my ($lei, @argv) = @_; + my $cfg = $lei->_lei_cfg or return (); + my ($pfx, $cur, $match_cb) = $lei->complete_url_prepare(\@argv); + map { + $match_cb->(substr($_, length('external.'))); + } grep(/\Aexternal\.\Q$pfx$cur/, @{$cfg->{-section_order}}); +} + +1; diff --git a/lib/PublicInbox/LeiForgetMailSync.pm b/lib/PublicInbox/LeiForgetMailSync.pm new file mode 100644 index 00000000..762910ed --- /dev/null +++ b/lib/PublicInbox/LeiForgetMailSync.pm @@ -0,0 +1,26 @@ +# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei forget-mail-sync" drop synchronization information +# TODO: figure out what to do about "lei index" users having +# dangling references. Perhaps just documenting "lei index" +# use being incompatible with "forget-mail-sync" use is +# sufficient. + +package PublicInbox::LeiForgetMailSync; +use strict; +use v5.10.1; +use PublicInbox::LeiRefreshMailSync; + +sub lei_forget_mail_sync { + my ($lei, @folders) = @_; + my $lms = $lei->lms or return; + $lms->lms_write_prepare; + $lms->arg2folder($lei, \@folders); # may die + $lms->forget_folders(@folders); +} + +*_complete_forget_mail_sync = + \&PublicInbox::LeiRefreshMailSync::_complete_refresh_mail_sync; + +1; diff --git a/lib/PublicInbox/LeiForgetSearch.pm b/lib/PublicInbox/LeiForgetSearch.pm new file mode 100644 index 00000000..dd358ae1 --- /dev/null +++ b/lib/PublicInbox/LeiForgetSearch.pm @@ -0,0 +1,79 @@ +# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei forget-search" forget/remove a saved search "lei q --save" +package PublicInbox::LeiForgetSearch; +use strict; +use v5.10.1; +use parent qw(PublicInbox::LeiUp); +use PublicInbox::LeiSavedSearch; +use File::Path (); +use SelectSaver; + +sub do_forget_search { + my ($lei, @outs) = @_; + my @dirs; # paths in ~/.local/share/lei/saved-search/ + my $cwd; + for my $o (@outs) { + my $d = PublicInbox::LeiSavedSearch::lss_dir_for($lei, \$o, 1); + if (-e $d) { + push @dirs, $d + } else { # keep going, like rm(1): + $cwd //= $lei->rel2abs('.'); + warn "--save was not used with $o cwd=$cwd\n"; + } + } + my $save; + my $opt = { safe => 1 }; + if ($lei->{opt}->{verbose}) { + $opt->{verbose} = 1; + $save = SelectSaver->new($lei->{2}); + } + File::Path::remove_tree(@dirs, $opt); + $lei->child_error(0) if defined $cwd; +} + +sub lei_forget_search { + my ($lei, @outs) = @_; + my $prune = $lei->{opt}->{prune}; + $prune // return do_forget_search($lei, @outs); + return $lei->fail("--prune and @outs incompatible") if @outs; + my @tmp = PublicInbox::LeiSavedSearch::list($lei); + my $self = bless { -mail_sync => 1 }, __PACKAGE__; + $self->filter_lss($lei, $prune) // return + $lei->fail("only --prune=$prune not understood"); + if ($self->{o_remote}) { # setup lei->{auth} + $self->prepare_inputs($lei, $self->{o_remote}) or return; + } + $lei->wq1_start($self); +} + +sub do_prune { + my ($self) = @_; + my $lei = $self->{lei}; + for my $o (@{$self->{o_local} // []}) { + next if -e $o; + $lei->qerr("# pruning $o"); + eval { do_forget_search($lei, $o) }; + $lei->child_error(0, "E: $@") if $@; + } + for my $o (@{$self->{o_remote} // []}) { + my $uri = PublicInbox::URIimap->new($o); + next if $lei->{net}->mic_for_folder($uri); + $lei->qerr("# pruning $uri"); + eval { do_forget_search($lei, $o) }; + $lei->child_error(0, "E: $@") if $@; + } +} + +# called in top-level lei-daemon when LeiAuth is done +sub net_merge_all_done { + my ($self) = @_; + $self->wq_do('do_prune'); + $self->wq_close; +} + +*_wq_done_wait = \&PublicInbox::LEI::wq_done_wait; +*_complete_forget_search = \&PublicInbox::LeiUp::_complete_up; + +1; diff --git a/lib/PublicInbox/LeiHelp.pm b/lib/PublicInbox/LeiHelp.pm new file mode 100644 index 00000000..fa0e7866 --- /dev/null +++ b/lib/PublicInbox/LeiHelp.pm @@ -0,0 +1,103 @@ +# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# -h/--help support for lei +package PublicInbox::LeiHelp; +use strict; +use v5.10.1; +use Text::Wrap qw(wrap); + +my %NOHELP = map { $_ => 1 } qw(mfolder); + +sub call { + my ($self, $errmsg, $CMD, $OPTDESC) = @_; + my $cmd = $self->{cmd} // 'COMMAND'; + my @info = @{$CMD->{$cmd} // [ '...', '...' ]}; + my @top = ($cmd, shift(@info) // ()); + my $cmd_desc = shift(@info); + $cmd_desc = $cmd_desc->($self) if ref($cmd_desc) eq 'CODE'; + $cmd_desc =~ s/default: /default:\xa0/; + my @opt_desc; + my $lpad = 2; + for my $sw (grep { !ref } @info) { # ("prio=s", "z", $GLP_PASS) + my $desc = $OPTDESC->{"$sw\t$cmd"} // $OPTDESC->{$sw} // next; + my $arg_vals = ''; + ($arg_vals, $desc) = @$desc if ref($desc) eq 'ARRAY'; + + # lower-case is a keyword (e.g. `content', `oid'), + # ALL_CAPS is a string description (e.g. `PATH') + if ($desc !~ /default/ && $arg_vals =~ /\b([a-z]+)[,\|]/) { + $desc .= " (default:\xa0`$1')"; + } else { + $desc =~ s/default: /default:\xa0/; + } + my (@vals, @s, @l); + my $x = $sw; + if ($x =~ s/!\z//) { # solve! => --no-solve + $x =~ s/(\A|\|)/$1no-/g + } elsif ($x =~ s/\+\z//) { # verbose|v+ + } elsif ($x =~ s/:.+//) { # optional args: $x = "mid:s" + @vals = (' [', undef, ']'); + } elsif ($x =~ s/=.+//) { # required arg: $x = "type=s" + @vals = (' ', undef); + } # else: no args $x = 'threads|t' + + # we support underscore options from public-inbox-* commands; + # but they've never been documented and will likely go away. + # $x = help|h + for (grep { !/_/ && !$NOHELP{$_} } split(/\|/, $x)) { + length($_) > 1 ? push(@l, "--$_") : push(@s, "-$_"); + } + if (!scalar(@vals)) { # no args 'threads|t' + } elsif ($arg_vals =~ s/\A([A-Z_=]+)\b//) { # "NAME" + $vals[1] = $1; + } else { + $vals[1] = uc(substr($l[0], 2)); # "--type" => "TYPE" + } + if ($arg_vals =~ /([,\|])/) { + my $sep = $1; + my @allow = split(/\Q$sep\E/, $arg_vals); + my $must = $sep eq '|' ? 'Must' : 'Can'; + @allow = map { length $_ ? "`$_'" : () } @allow; + my $last = pop @allow; + $desc .= "\n$must be one of: " . + join(', ', @allow) . " or $last"; + } + my $lhs = join(', ', @s, @l) . join('', @vals); + if ($x =~ /\|\z/) { # "stdin|" or "clear|" + $lhs =~ s/\A--/- , --/; + } else { + $lhs =~ s/\A--/ --/; # pad if no short options + } + $lpad = length($lhs) if length($lhs) > $lpad; + push @opt_desc, $lhs, $desc; + } + my $msg = $errmsg ? "E: $errmsg\n" : ''; + $msg .= <<EOF; +usage: lei @top +$cmd_desc + +EOF + $lpad += 2; + local $Text::Wrap::columns = 78 - $lpad; + # local $Text::Wrap::break = ; # don't break on nbsp (\xa0) + my $padding = ' ' x ($lpad + 2); + while (my ($lhs, $rhs) = splice(@opt_desc, 0, 2)) { + $msg .= ' '.pack("A$lpad", $lhs); + $rhs = wrap('', '', $rhs); + $rhs =~ s/\n/\n$padding/sg; # LHS pad continuation lines + $msg .= $rhs; + $msg .= "\n"; + } + my $fd = $errmsg ? 2 : 1; + $self->start_pager if -t $self->{$fd}; + $msg =~ s/\xa0/ /gs; # convert NBSP to SP + print { $self->{$fd} } $msg; + $self->x_it($errmsg ? (1 << 8) : 0); # stderr => failure + undef; +} + +# the "lei help" command +sub lei_help { $_[0]->_help } + +1; diff --git a/lib/PublicInbox/LeiImport.pm b/lib/PublicInbox/LeiImport.pm new file mode 100644 index 00000000..5521188c --- /dev/null +++ b/lib/PublicInbox/LeiImport.pm @@ -0,0 +1,163 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# front-end for the "lei import" sub-command +package PublicInbox::LeiImport; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC PublicInbox::LeiInput); +use PublicInbox::InboxWritable qw(eml_from_path); +use PublicInbox::Compat qw(uniqstr); + +# /^input_/ subs are used by (or override) PublicInbox::LeiInput superclass + +sub input_eml_cb { # used by PublicInbox::LeiInput::input_fh + my ($self, $eml, $vmd) = @_; + my $xoids = $self->{lei}->{ale}->xoids_for($eml); + if (my $all_vmd = $self->{all_vmd}) { + @$vmd{keys %$all_vmd} = values %$all_vmd; + } + $self->{lei}->{sto}->wq_do('set_eml', $eml, $vmd, $xoids); +} + +sub input_mbox_cb { # MboxReader callback + my ($eml, $self) = @_; + my $vmd; + if ($self->{-import_kw}) { + my $kw = PublicInbox::MboxReader::mbox_keywords($eml); + $vmd = { kw => $kw } if scalar(@$kw); + } + input_eml_cb($self, $eml, $vmd); +} + +sub pmdir_cb { # called via wq_io_do from LeiPmdir->each_mdir_fn + my ($self, $f, $fl) = @_; + my ($folder, $bn) = ($f =~ m!\A(.+?)/(?:new|cur)/([^/]+)\z!) or + die "BUG: $f was not from a Maildir?\n"; + my $kw = PublicInbox::MdirReader::flags2kw($fl); + substr($folder, 0, 0) = 'maildir:'; # add prefix + my $lse = $self->{lse} //= $self->{lei}->{sto}->search; + my $lms = $self->{-lms_rw} //= $self->{lei}->lms; # may be 0 or undef + my @oidbin = $lms ? $lms->name_oidbin($folder, $bn) : (); + @oidbin > 1 and warn("W: $folder/*/$$bn not unique:\n", + map { "\t".unpack('H*', $_)."\n" } @oidbin); + my @docids = sort { $a <=> $b } uniqstr + map { $lse->over->oidbin_exists($_) } @oidbin; + my $vmd = $self->{-import_kw} ? { kw => $kw } : undef; + if (scalar @docids) { + $lse->kw_changed(undef, $kw, \@docids) or return; + } + if (my $eml = eml_from_path($f)) { + $vmd->{sync_info} = [ $folder, \$bn ] if $self->{-mail_sync}; + $self->input_eml_cb($eml, $vmd); + } +} + +sub input_mh_cb { + my ($mhdir, $n, $kw, $eml, $self) = @_; + substr($mhdir, 0, 0) = 'mh:'; # add prefix + my $lse = $self->{lse} //= $self->{lei}->{sto}->search; + my $lms = $self->{-lms_rw} //= $self->{lei}->lms; # may be 0 or undef + my @oidbin = $lms ? $lms->num_oidbin($mhdir, $n) : (); + @oidbin > 1 and warn("W: $mhdir/$n not unique:\n", + map { "\t".unpack('H*', $_)."\n" } @oidbin); + my @docids = sort { $a <=> $b } uniqstr + map { $lse->over->oidbin_exists($_) } @oidbin; + if (scalar @docids) { + $lse->kw_changed(undef, $kw, \@docids) or return; + } + if (defined $eml) { + my $vmd = $self->{-import_kw} ? { kw => $kw } : undef; + $vmd->{sync_info} = [ $mhdir, $n + 0 ] if $self->{-mail_sync}; + $self->input_eml_cb($eml, $vmd); + } + # TODO: + # elsif (my $ikw = $self->{lei}->{ikw}) { # old message, kw only + # $ikw->wq_io_do('ck_update_kw', [], "mh:$dir", $uid, $kw); +} + +sub input_net_cb { # imap_each / nntp_each + my ($uri, $uid, $kw, $eml, $self) = @_; + if (defined $eml) { + my $vmd = $self->{-import_kw} ? { kw => $kw } : undef; + $vmd->{sync_info} = [ $$uri, $uid ] if $self->{-mail_sync}; + $self->input_eml_cb($eml, $vmd); + } elsif (my $ikw = $self->{lei}->{ikw}) { # old message, kw only + # we send $uri as a bare SCALAR and not a URIimap ref to + # reduce socket traffic: + $ikw->wq_io_do('ck_update_kw', [], $$uri, $uid, $kw); + } +} + +sub do_import_index ($$@) { + my ($self, $lei, @inputs) = @_; + my $sto = $lei->_lei_store(1); + $sto->write_prepare($lei); + $self->{-import_kw} = $lei->{opt}->{kw} // 1; + $self->{all_vmd} = $lei->{vmd_mod} if keys %{$lei->{vmd_mod}}; + $lei->ale; # initialize for workers to read (before LeiPmdir->new) + $self->{-mail_sync} = $lei->{opt}->{'mail-sync'} // 1; + $self->prepare_inputs($lei, \@inputs) or return; + + my $j = $lei->{opt}->{jobs} // 0; + $j =~ /\A([0-9]+),[0-9]+\z/ and $j = $1 + 0; + $j ||= scalar(@{$self->{inputs}}) || 1; + my $ikw; + my $net = $lei->{net}; + if ($net) { + # $j = $net->net_concurrency($j); TODO + if ($lei->{opt}->{incremental} // 1) { + $net->{incremental} = 1; + $net->{-lms_rw} = $lei->lms // 0; + if ($self->{-import_kw} && $net->{-lms_rw} && + !$lei->{opt}->{'new-only'} && + $net->{imap_order}) { + require PublicInbox::LeiImportKw; + $ikw = PublicInbox::LeiImportKw->new($lei); + $net->{each_old} = 1; + } + } + } else { + my $nproc = $self->detect_nproc; + $j = $nproc if $j > $nproc; + } + ($lei->{opt}->{'new-only'} && (!$net || !$net->{imap_order})) and + warn "# --new-only is only for IMAP\n"; + $lei->{-eml_noisy} = 1; + $lei->{-err_type} = 'non-fatal'; + $lei->wq1_start($self, $j); +} + +sub lei_import { # the main "lei import" method + my ($lei, @inputs) = @_; + my $self = bless {}, __PACKAGE__; + do_import_index($self, $lei, @inputs); +} + +sub _complete_import { + my ($lei, @argv) = @_; + my $has_arg = @argv; + my ($pfx, $cur, $match_cb) = $lei->complete_url_prepare(\@argv); + my @try = $has_arg ? ($pfx.$cur, $argv[-1]) : ($argv[-1]); + push(@try, undef) if defined $try[-1]; + my (@f, @k); + for (@try) { + @k = $lei->url_folder_cache->keys($_, 1) and last; + } + my @L = eval { $lei->_lei_store->search->all_terms('L') }; + push(@k, map { "+L:$_" } @L); + if (my $lms = $lei->lms) { + for (@try) { + @f = $lms->folders($_, 1) and last; + } + push @k, @f; + } + my @m = map { $match_cb->($_) } @k; + @m ? @m : @k; +} + +no warnings 'once'; +*ipc_atfork_child = \&PublicInbox::LeiInput::input_only_atfork_child; +*net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done; + +1; diff --git a/lib/PublicInbox/LeiImportKw.pm b/lib/PublicInbox/LeiImportKw.pm new file mode 100644 index 00000000..765e23cd --- /dev/null +++ b/lib/PublicInbox/LeiImportKw.pm @@ -0,0 +1,53 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# WQ worker for dealing with LeiImport IMAP flags on already-imported messages +# WQ key: {ikw} +package PublicInbox::LeiImportKw; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC); +use PublicInbox::Compat qw(uniqstr); + +sub new { + my ($cls, $lei) = @_; + my $self = bless { -wq_ident => 'lei import_kw worker' }, $cls; + my $j = $self->detect_nproc // 4; + $j = 4 if $j > 4; + my ($op_c, $ops) = $lei->workers_start($self, $j); + $op_c->{ops} = $ops; # for PktOp->event_step + $self->{lei_sock} = $lei->{sock}; + $lei->{ikw} = $self; +} + +sub ipc_atfork_child { + my ($self) = @_; + my $lei = $self->{lei}; + $lei->_lei_atfork_child; + my $net = delete $lei->{net} // die 'BUG: no lei->{net}'; + $self->{sto} = $lei->{sto} // die 'BUG: no lei->{sto}'; + $self->{verbose} = $lei->{opt}->{verbose}; + $self->{lse} = $self->{sto}->search; + $self->{over} = $self->{lse}->over; + $self->{-lms_rw} = $net->{-lms_rw} || die 'BUG: net->{-lms_rw} FALSE'; + $self->SUPER::ipc_atfork_child; +} + +sub ck_update_kw { # via wq_io_do + my ($self, $url, $uid, $kw) = @_; + my @oidbin = $self->{-lms_rw}->num_oidbin($url, $uid); + my $uid_url = index($url, 'mh:') == 0 ? $url.$uid : "$url/;UID=$uid"; + @oidbin > 1 and warn("W: $uid_url not unique:\n", + map { "\t".unpack('H*', $_)."\n" } @oidbin); + my @docids = sort { $a <=> $b } uniqstr + map { $self->{over}->oidbin_exists($_) } @oidbin; + $self->{lse}->kw_changed(undef, $kw, \@docids) or return; + $self->{verbose} and $self->{lei}->qerr("# $uid_url => @$kw\n"); + $self->{sto}->wq_do('set_eml_vmd', undef, { kw => $kw }, \@docids); +} + +sub _lei_wq_eof { # EOF callback for main lei daemon + $_[0]->wq_eof('ikw'); +} + +1; diff --git a/lib/PublicInbox/LeiIndex.pm b/lib/PublicInbox/LeiIndex.pm new file mode 100644 index 00000000..0e329e58 --- /dev/null +++ b/lib/PublicInbox/LeiIndex.pm @@ -0,0 +1,46 @@ +# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# front-end for the "lei index" sub-command, this is similar to +# "lei import" but doesn't put a git blob into ~/.local/share/lei/store +package PublicInbox::LeiIndex; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC PublicInbox::LeiInput); +use PublicInbox::LeiImport; + +# /^input_/ subs are used by (or override) PublicInbox::LeiInput superclass +sub input_eml_cb { # used by input_maildir_cb and input_net_cb + my ($self, $eml, $vmd) = @_; + my $xoids = $self->{lei}->{ale}->xoids_for($eml); + if (my $all_vmd = $self->{all_vmd}) { + @$vmd{keys %$all_vmd} = values %$all_vmd; + } + $self->{lei}->{sto}->wq_do('index_eml_only', $eml, $vmd, $xoids); +} + +sub input_fh { # overrides PublicInbox::LeiInput::input_fh + my ($self, $ifmt, $fh, $input, @args) = @_; + $self->{lei}->child_error(0, <<EOM); +$input ($ifmt) not yet supported, try `lei import' +EOM +} + +sub lei_index { + my ($lei, @argv) = @_; + $lei->{opt}->{'mail-sync'} = 1; + my $self = bless {}, __PACKAGE__; + PublicInbox::LeiImport::do_import_index($self, $lei, @argv); +} + +no warnings 'once'; +no strict 'refs'; +for my $m (qw(pmdir_cb input_net_cb input_mh_cb)) { + *$m = PublicInbox::LeiImport->can($m); +} + +*_complete_index = \&PublicInbox::LeiImport::_complete_import; +*ipc_atfork_child = \&PublicInbox::LeiInput::input_only_atfork_child; +*net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done; + +1; diff --git a/lib/PublicInbox/LeiInit.pm b/lib/PublicInbox/LeiInit.pm new file mode 100644 index 00000000..94897e61 --- /dev/null +++ b/lib/PublicInbox/LeiInit.pm @@ -0,0 +1,40 @@ +# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# for the "lei init" command, not sure if it's even needed... +package PublicInbox::LeiInit; +use v5.10.1; + +sub lei_init { + my ($self, $dir) = @_; + my $cfg = $self->_lei_cfg(1); + my $cur = $cfg->{'leistore.dir'}; + $dir //= $self->store_path; + $dir = $self->rel2abs($dir); + my @cur = stat($cur) if defined($cur); + $cur = $self->canonpath_harder($cur // $dir); + my @dir = stat($dir); + my $exists = "# leistore.dir=$cur already initialized" if @dir; + if (@cur) { + if ($cur eq $dir) { + $self->_lei_store(1)->done; + return $self->qerr($exists); + } + + # some folks like symlinks and bind mounts :P + if (@dir && "@cur[1,0]" eq "@dir[1,0]") { + $self->_config('leistore.dir', $dir) or return; + $self->_lei_store(1)->done; + return $self->qerr("$exists (as $cur)"); + } + return $self->fail(<<""); +E: leistore.dir=$cur already initialized and it is not $dir + + } + $self->_config('leistore.dir', $dir) or return; + $self->_lei_store(1)->done; + $exists //= "# leistore.dir=$dir newly initialized"; + $self->qerr($exists); +} + +1; diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm new file mode 100644 index 00000000..c388f7dc --- /dev/null +++ b/lib/PublicInbox/LeiInput.pm @@ -0,0 +1,544 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# parent class for LeiImport, LeiConvert, LeiIndex +package PublicInbox::LeiInput; +use v5.12; +use PublicInbox::DS; +use PublicInbox::Spawn qw(which popen_rd); +use PublicInbox::InboxWritable qw(eml_from_path); + +# JMAP RFC 8621 4.1.1 +# https://www.iana.org/assignments/imap-jmap-keywords/imap-jmap-keywords.xhtml +our @KW = (qw(seen answered flagged draft), # widely-compatible + qw(forwarded), # IMAP + Maildir + qw(phishing junk notjunk)); # rarely supported + +# note: RFC 8621 states "Users may add arbitrary keywords to an Email", +# but is it good idea? Stick to the system and reserved ones, for now. +# The widely-compatible ones map to IMAP system flags, Maildir flags +# and mbox Status/X-Status headers. +my %KW = map { $_ => 1 } @KW; +my $L_MAX = 244; # Xapian term limit - length('L') + +# RFC 8621, sec 2 (Mailboxes) a "label" for us is a JMAP Mailbox "name" +# "Servers MAY reject names that violate server policy" +my %ERR = ( + L => sub { + my ($label) = @_; + length($label) >= $L_MAX and + return "`$label' too long (must be <= $L_MAX)"; + $label =~ /[A-Z]/ and + return "`$label' must be lowercase"; + $label =~ m{\A[a-z0-9_](?:[a-z0-9_\-\./\@,]*[a-z0-9])?\z} ? + undef : "`$label' is invalid"; + }, + kw => sub { + my ($kw) = @_; + $KW{$kw} ? undef : <<EOM; +`$kw' is not one of: `seen', `flagged', `answered', `draft' +`junk', `notjunk', `phishing' or `forwarded' +EOM + } +); + +sub check_input_format ($;$) { + my ($lei, $files) = @_; + my $opt_key = 'in-format'; + my $fmt = $lei->{opt}->{$opt_key}; + if (!$fmt) { + my $err = $files ? "regular file(s):\n@$files" : '--stdin'; + return $lei->fail("--$opt_key unset for $err"); + } + return 1 if $fmt eq 'eml'; + require PublicInbox::MboxLock if $files; + require PublicInbox::MboxReader; + PublicInbox::MboxReader->reads($fmt) or + return $lei->fail("--$opt_key=$fmt unrecognized"); + 1; +} + +sub input_mbox_cb { # base MboxReader callback + my ($eml, $self) = @_; + $eml->header_set($_) for (qw(Status X-Status)); + $self->input_eml_cb($eml); +} + +sub input_maildir_cb { + my ($fn, $kw, $eml, $self) = @_; + $self->input_eml_cb($eml); +} + +sub input_mh_cb { + my ($dn, $n, $kw, $eml, $self) = @_; + $self->input_eml_cb($eml); +} + +sub input_net_cb { # imap_each, nntp_each cb + my ($url, $uid, $kw, $eml, $self) = @_; + $self->input_eml_cb($eml); +} + +# import a single file handle of $name +# Subclass must define ->input_eml_cb and ->input_mbox_cb +sub input_fh { + my ($self, $ifmt, $fh, $name, @args) = @_; + if ($ifmt eq 'eml') { + my $buf = eval { PublicInbox::IO::read_all $fh, 0 }; + my $e = $@; + return $self->{lei}->child_error($?, <<"") if !$fh->close || $e; +error reading $name: $! (\$?=$?) (\$@=$e) + + PublicInbox::Eml::strip_from($buf); + + # a user may feed just a body: git diff | lei rediff -U9 + if ($self->{-force_eml}) { + my $eml = PublicInbox::Eml->new($buf); + substr($buf, 0, 0) = "\n\n" if !$eml->{bdy}; + } + $self->input_eml_cb(PublicInbox::Eml->new(\$buf), @args); + } else { + # prepare_inputs already validated $ifmt + my $cb = PublicInbox::MboxReader->reads($ifmt) // + die "BUG: bad fmt=$ifmt"; + $cb->(undef, $fh, $self->can('input_mbox_cb'), $self, @args); + } +} + +# handles mboxrd endpoints described in Documentation/design_notes.txt +sub handle_http_input ($$@) { + my ($self, $url, @args) = @_; + my $lei = $self->{lei} or die 'BUG: {lei} missing'; + my $curl_opt = delete $self->{"-curl-$url"} or + die("BUG: $url curl options not prepared"); + my $uri = pop @$curl_opt; + my $curl = PublicInbox::LeiCurl->new($lei, $self->{curl}) or return; + push @$curl, '-s', @$curl_opt; + my $cmd = $curl->for_uri($lei, $uri); + $lei->qerr("# $cmd"); + my $fh = popen_rd($cmd, undef, { 2 => $lei->{2} }); + grep(/\A--compressed\z/, @$curl) or + $fh = IO::Uncompress::Gunzip->new($fh, + MultiStream => 1, AutoClose => 1); + eval { $self->input_fh('mboxrd', $fh, $url, @args) }; + my $err = $@ ? ": $@" : ''; + $lei->child_error($?, "@$cmd failed$err") if $err || $?; +} + +sub oid2eml { # git->cat_async cb + my ($bref, $oid, $type, $size, $self) = @_; + if ($type eq 'blob') { + $self->input_eml_cb(PublicInbox::Eml->new($bref)); + } else { + warn "W: $oid is type=$type\n"; + } +} + +sub each_ibx_eml_unindexed { + my ($self, $ibx, @args) = @_; + $ibx->isa('PublicInbox::Inbox') or return $self->{lei}->fail(<<EOM); +unindexed extindex $ibx->{topdir} not supported +EOM + require PublicInbox::SearchIdx; + my $n = $ibx->max_git_epoch; + my @g = defined($n) ? map { $ibx->git_epoch($_) } (0..$n) : ($ibx->git); + my $sync = { D => {}, ibx => $ibx }; # D => {} filters out deletes + my ($f, $at, $ct, $oid, $cmt); + for my $git (grep defined, @g) { + my $s = PublicInbox::SearchIdx::log2stack($sync, $git, 'HEAD'); + while (($f, $at, $ct, $oid, $cmt) = $s->pop_rec) { + $git->cat_async($oid, \&oid2eml, $self) if $f eq 'm'; + } + $git->cleanup; # wait all + } +} + +sub each_ibx_eml { + my ($self, $ibx, @args) = @_; # TODO: is @args used at all? + my $over = $ibx->over or return each_ibx_eml_unindexed(@_); + my $git = $ibx->git; + my $prev = 0; + my $smsg; + my $ids = $over->ids_after(\$prev); + while (@$ids) { + for (@$ids) { + $smsg = $over->get_art($_) // next; + $git->cat_async($smsg->{blob}, \&oid2eml, $self); + } + $ids = $over->ids_after(\$prev); + } + $git->cat_async_wait; +} + +sub input_path_url { + my ($self, $input, @args) = @_; + my $lei = $self->{lei}; + my $ifmt = lc($lei->{opt}->{'in-format'} // ''); + # TODO auto-detect? + if ($input =~ m!\Aimaps?://!i) { + $lei->{net}->imap_each($input, $self->can('input_net_cb'), + $self, @args); + return; + } elsif ($input =~ m!\A(?:nntps?|s?news)://!i) { + $lei->{net}->nntp_each($input, $self->can('input_net_cb'), + $self, @args); + return; + } elsif ($input =~ m!\Ahttps?://!i) { + handle_http_input($self, $input, @args); + return; + } + + # local-only below + my $ifmt_pfx = ''; + if ($input =~ s!\A([a-z0-9]+):!!i) { + $ifmt_pfx = "$1:"; + $ifmt = lc($1); + } elsif ($input =~ /\.(?:patch|eml)\z/i) { + $ifmt = 'eml'; + } elsif ($input =~ m{\A(?:.+)/(?:new|cur)/([^/]+)\z} && -f $input) { + my $bn = $1; + my $fl = PublicInbox::MdirReader::maildir_basename_flags($bn); + return if index($fl, 'T') >= 0; + return $self->pmdir_cb($input, $fl) if $self->can('pmdir_cb'); + my $eml = eml_from_path($input) or return + $lei->qerr("# $input not readable"); + my $kw = PublicInbox::MdirReader::flags2kw($fl); + $self->can('input_maildir_cb')->($input, $kw, $eml, $self); + return; + } + my $devfd = $lei->path_to_fd($input) // return; + if ($devfd >= 0) { + $self->input_fh($ifmt, $lei->{$devfd}, $input, @args); + } elsif ($devfd < 0 && $input =~ m{\A(.+/)([0-9]+)\z} && -f $input) { + my ($dn, $n) = ($1, $2); + my $mhr = PublicInbox::MHreader->new($dn, $lei->{3}); + $mhr->mh_read_one($n, $self->can('input_mh_cb'), $self); + } elsif (-f $input && $ifmt eq 'eml') { + open my $fh, '<', $input or + return $lei->fail("open($input): $!"); + $self->input_fh($ifmt, $fh, $input, @args); + } elsif (-f _) { + my $m = $lei->{opt}->{'lock'} // + PublicInbox::MboxLock->defaults; + my $mbl = PublicInbox::MboxLock->acq($input, 0, $m); + my $zsfx = PublicInbox::MboxReader::zsfx($input); + if ($zsfx) { + my $in = delete $mbl->{fh}; + $mbl->{fh} = + PublicInbox::MboxReader::zsfxcat($in, $zsfx, $lei); + } + local $PublicInbox::DS::in_loop = 0 if $zsfx; # awaitpid + $self->input_fh($ifmt, $mbl->{fh}, $input, @args); + } elsif (-d _ && $ifmt eq 'maildir') { + my $mdr = PublicInbox::MdirReader->new; + if (my $pmd = $self->{pmd}) { + $mdr->maildir_each_file($input, + $pmd->can('each_mdir_fn'), + $pmd, @args); + } else { + $mdr->maildir_each_eml($input, + $self->can('input_maildir_cb'), + $self, @args); + } + } elsif (-d _ && $ifmt eq 'mh') { + my $mhr = PublicInbox::MHreader->new($input.'/', $lei->{3}); + $mhr->{sort} = $lei->{opt}->{sort} // [ 'sequence']; + $mhr->mh_each_eml($self->can('input_mh_cb'), $self, @args); + } elsif (-d _ && $ifmt =~ /\A(?:v1|v2)\z/) { + my $ibx = PublicInbox::Inbox->new({inboxdir => $input}); + each_ibx_eml($self, $ibx, @args); + } elsif (-d _ && $ifmt eq 'extindex') { + my $esrch = PublicInbox::ExtSearch->new($input); + each_ibx_eml($self, $esrch, @args); + } elsif ($self->{missing_ok} && !-e $input) { # don't ->fail + if ($lei->{cmd} eq 'p2q') { + my $fp = [ qw(git format-patch --stdout -1), $input ]; + my $rdr = { 2 => $lei->{2} }; + my $fh = popen_rd($fp, undef, $rdr); + eval { $self->input_fh('eml', $fh, $input, @args) }; + my $err = $@ ? ": $@" : ''; + $lei->child_error($?, "@$fp failed$err") if $err || $?; + } else { + $self->folder_missing("$ifmt:$input"); + } + } else { + $lei->fail("$ifmt_pfx$input unsupported (TODO)"); + } +} + +# subclasses should overrride this (see LeiRefreshMailSync) +sub folder_missing { die "BUG: ->folder_missing undefined for $_[0]" } + +sub bad_http ($$;$) { + my ($lei, $url, $alt) = @_; + my $x = $alt ? "did you mean <$alt>?" : 'download and import manually'; + $lei->fail("E: <$url> not recognized, $x"); +} + +sub prepare_http_input ($$$) { + my ($self, $lei, $url) = @_; + require URI; + require PublicInbox::MboxReader; + require PublicInbox::LeiCurl; + require IO::Uncompress::Gunzip; + $self->{curl} //= which('curl') or + return $lei->fail("curl missing for <$url>"); + my $uri = URI->new($url); + my $path = $uri->path; + my %qf = $uri->query_form; + my @curl_opt; + if ($path =~ m!/(?:t\.mbox\.gz|all\.mbox\.gz)\z!) { + # OK + } elsif ($path =~ m!/raw\z!) { + push @curl_opt, '--compressed'; + # convert search query to mboxrd request since they require POST + # this is only intended for PublicInbox::WWW, and will false-positive + # on many other search engines... oh well + } elsif (defined $qf{'q'}) { + $qf{x} = 'm'; + $uri->query_form(\%qf); + push @curl_opt, '-d', ''; + $$uri ne $url and $lei->qerr(<<""); +# <$url> rewritten to <$$uri> with HTTP POST + + # try to provide hints for /$INBOX/$MSGID/T/ and /$INBOX/ + } elsif ($path =~ s!/[tT]/\z!/t.mbox.gz! || + $path =~ s!/t\.atom\z!/t.mbox.gz! || + $path =~ s!/([^/]+\@[^/]+)/\z!/$1/raw!) { + $uri->path($path); + return bad_http($lei, $url, $$uri); + } else { + return bad_http($lei, $url); + } + $self->{"-curl-$url"} = [ @curl_opt, $uri ]; # for handle_http_input +} + +sub add_dir ($$$$) { + my ($lei, $istate, $ifmt, $input) = @_; + if ($istate->{-may_sync}) { + $$input = "$ifmt:".$lei->abs_path($$input); + push @{$istate->{-sync}->{ok}}, $$input if $istate->{-sync}; + } else { + substr($$input, 0, 0) = "$ifmt:"; # prefix + } + push @{$istate->{$ifmt}}, $$input; +} + +sub prepare_inputs { # returns undef on error + my ($self, $lei, $inputs) = @_; + my $in_fmt = $lei->{opt}->{'in-format'}; + my $sync = $lei->{opt}->{'mail-sync'} ? {} : undef; # using LeiMailSync + my $may_sync = $sync || $self->{-mail_sync}; + if ($lei->{opt}->{stdin}) { + @$inputs and return + $lei->fail("--stdin and @$inputs do not mix"); + check_input_format($lei) or return; + push @$inputs, '/dev/stdin'; + push @{$sync->{no}}, '/dev/stdin' if $sync; + } + my $net = $lei->{net}; # NetWriter may be created by l2m + my @f; + my $istate = { -sync => $sync, -may_sync => $may_sync }; + # e.g. Maildir:/home/user/Mail/ or imaps://example.com/INBOX + for my $input (@$inputs) { + my $input_path = $input; + if ($input =~ m!\A(?:imaps?|nntps?|s?news)://!i) { + require PublicInbox::NetReader; + $net //= PublicInbox::NetReader->new; + $net->add_url($input, $self->{-ls_ok}); + push @{$sync->{ok}}, $input if $sync; + } elsif ($input_path =~ m!\Ahttps?://!i) { # mboxrd.gz + # TODO: how would we detect r/w JMAP? + push @{$sync->{no}}, $input if $sync; + prepare_http_input($self, $lei, $input_path) or return; + } elsif ($input_path =~ s/\A([a-z0-9]+)://is) { + my $ifmt = lc $1; + if (($in_fmt // $ifmt) ne $ifmt) { + return $lei->fail(<<""); +--in-format=$in_fmt and `$ifmt:' conflict + + } + ($sync && $ifmt !~ /\A(?:maildir|mh)\z/i) and + push(@{$sync->{no}}, $input); + my $devfd = $lei->path_to_fd($input_path) // return; + if ($devfd >= 0 || (-f $input_path || -p _)) { + require PublicInbox::MboxLock; + require PublicInbox::MboxReader; + PublicInbox::MboxReader->reads($ifmt) or return + $lei->fail("$ifmt not supported"); + } elsif (-d $input_path) { # TODO extindex + $ifmt =~ /\A(?:maildir|mh|v1|v2|extindex)\z/ or + return$lei->fail("$ifmt not supported"); + $input = $input_path; + add_dir $lei, $istate, $ifmt, \$input; + } elsif ($self->{missing_ok} && + $ifmt =~ /\A(?:maildir|mh)\z/ && + !-e $input_path) { + # for "lei rm-watch" on missing Maildir + $may_sync and $input = "$ifmt:". + $lei->abs_path($input_path); + } else { + my $m = "Unable to handle $input"; + $input =~ /\A(?:L|kw):/ and + $m .= ", did you mean +$input?"; + return $lei->fail($m); + } + } elsif ($input =~ /\.(?:eml|patch)\z/i && -f $input) { + lc($in_fmt//'eml') eq 'eml' or return $lei->fail(<<""); +$input is `eml', not --in-format=$in_fmt + + push @{$sync->{no}}, $input if $sync; + } elsif ($input =~ m{\A(.+)/(new|cur)/([^/]+)\z} && -f $input) { + # single file in a Maildir + my ($mdir, $nc, $bn) = ($1, $2, $3); + my $other = $mdir . ($nc eq 'new' ? '/cur' : '/new'); + return $lei->fail(<<EOM) if !-d $other; +No `$other' directory for `$input' +EOM + lc($in_fmt//'eml') eq 'eml' or return $lei->fail(<<""); +$input is `eml', not --in-format=$in_fmt + + if ($sync) { + $input = $lei->abs_path($mdir) . "/$nc/$bn"; + push @{$sync->{ok}}, $input; + } + require PublicInbox::MdirReader; + } else { + my $devfd = $lei->path_to_fd($input) // return; + if ($devfd < 0 && $input =~ m{\A(.+)/([0-9]+)\z} && + -f $input) { # single file in MH dir + my ($mh, $n) = ($1, $2); + lc($in_fmt//'eml') eq 'eml' or + return $lei->fail(<<""); +$input is `eml', not --in-format=$in_fmt + + if ($sync) { + $input = $lei->abs_path($mh)."/$n"; + push @{$sync->{ok}}, $input; + } + require PublicInbox::MHreader; + } elsif ($devfd >= 0 || -f $input || -p _) { + push @{$sync->{no}}, $input if $sync; + push @f, $input; + } elsif (-d "$input/new" && -d "$input/cur") { + add_dir $lei, $istate, 'maildir', \$input; + } elsif (-e "$input/inbox.lock") { + add_dir $lei, $istate, 'v2', \$input; + } elsif (-e "$input/ssoma.lock") { + add_dir $lei, $istate, 'v1', \$input; + } elsif (-e "$input/ei.lock") { + add_dir $lei, $istate, 'extindex', \$input; + } elsif (-f "$input/.mh_sequences") { + add_dir $lei, $istate, 'mh', \$input; + } elsif ($self->{missing_ok} && !-e $input) { + if ($lei->{cmd} eq 'p2q') { + # will run "git format-patch" + } elsif ($may_sync) { # for lei rm-watch + # FIXME: support MH, here + $input = 'maildir:'. + $lei->abs_path($input); + } + } else { + return $lei->fail("Unable to handle $input") + } + } + } + if (@f) { check_input_format($lei, \@f) or return } + if ($sync && $sync->{no}) { + return $lei->fail(<<"") if !$sync->{ok}; +--mail-sync specified but no inputs support it + + # non-fatal if some inputs support support sync + warn("# --mail-sync will only be used for @{$sync->{ok}}\n"); + warn("# --mail-sync is not supported for: @{$sync->{no}}\n"); + } + if ($net) { + $net->{-can_die} = 1; + if (my $err = $net->errors($lei)) { + return $lei->fail($err); + } + $net->{quiet} = $lei->{opt}->{quiet}; + require PublicInbox::LeiAuth; + $lei->{auth} //= PublicInbox::LeiAuth->new; + $lei->{net} //= $net; + } + if (my $md = $istate->{maildir}) { + require PublicInbox::MdirReader; + if ($self->can('pmdir_cb')) { + require PublicInbox::LeiPmdir; + $self->{pmd} = PublicInbox::LeiPmdir->new($lei, $self); + } + grep(!m!\Amaildir:/!i, @$md) and die "BUG: @$md (no pfx)"; + + # start watching Maildirs ASAP + if ($may_sync && $lei->{sto}) { + $lei->lms(1)->lms_write_prepare->add_folders(@$md); + $lei->refresh_watches; + } + } + if (my $mh = $istate->{mh}) { + require PublicInbox::MHreader; + grep(!m!\Amh:!i, @$mh) and die "BUG: @$mh (no pfx)"; + if ($may_sync && $lei->{sto}) { + $lei->lms(1)->lms_write_prepare->add_folders(@$mh); + # $lei->refresh_watches; TODO + } + } + require PublicInbox::ExtSearch if $istate->{extindex}; + $self->{inputs} = $inputs; +} + +sub process_inputs { + my ($self) = @_; + my $err; + for my $input (@{$self->{inputs}}) { + eval { $self->input_path_url($input) }; + next unless $@; + $err = "$input: $@"; + last; + } + # always commit first, even on error partial work is acceptable for + # lei <import|tag|convert> + $self->{lei}->sto_barrier_request; + $self->{lei}->fail($err) if $err; +} + +sub input_only_atfork_child { + my ($self) = @_; + my $lei = $self->{lei}; + $lei->_lei_atfork_child; + PublicInbox::IPC::ipc_atfork_child($self); + $lei->{auth}->do_auth_atfork($self) if $lei->{auth}; + undef; +} + +# alias this as "net_merge_all_done" to use as an LeiAuth callback +sub input_only_net_merge_all_done { + my ($self) = @_; + $self->wq_io_do('process_inputs'); + $self->wq_close; +} + +# like Getopt::Long, but for +kw:FOO and -kw:FOO to prepare +# for update_xvmd -> update_vmd +# returns something like { "+L" => [ @Labels ], ... } +sub vmd_mod_extract { + my ($lei, $argv) = @_; + my (@new_argv, @err); + for my $x (@$argv) { + if ($x =~ /\A(\+|\-)(kw|L):(.+)\z/) { + my ($op, $pfx, $val) = ($1, $2, $3); + if (my $err = $ERR{$pfx}->($val)) { + push @err, $err; + } else { # set "+kw", "+L", "-L", "-kw" + push @{$lei->{vmd_mod}->{$op.$pfx}}, $val; + } + } else { + push @new_argv, $x; + } + } + @$argv = @new_argv; + @err; +} + +1; diff --git a/lib/PublicInbox/LeiInspect.pm b/lib/PublicInbox/LeiInspect.pm new file mode 100644 index 00000000..576ab2c7 --- /dev/null +++ b/lib/PublicInbox/LeiInspect.pm @@ -0,0 +1,293 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei inspect" general purpose inspector for stuff in SQLite and +# Xapian. Will eventually be useful with plain public-inboxes, +# not just lei/store. This is totally half-baked at the moment +# but useful for testing. +package PublicInbox::LeiInspect; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC); +use PublicInbox::Config; +use PublicInbox::MID qw(mids); +use PublicInbox::NetReader qw(imap_uri nntp_uri); +use PublicInbox::LeiOverview; +*iso8601 = \&PublicInbox::LeiOverview::iso8601; + +sub _json_prep ($) { + my ($smsg) = @_; + $smsg->{$_} += 0 for qw(bytes lines); # integerize + $smsg->{dt} = iso8601($smsg->{ds}) if defined($smsg->{ds}); + $smsg->{rt} = iso8601($smsg->{ts}) if defined($smsg->{ts}); + +{ %$smsg } # unbless and scalarize +} + +sub inspect_blob ($$) { + my ($lei, $oidhex) = @_; + my $ent = {}; + if (my $lse = $lei->{lse}) { + my $oidbin = pack('H*', $oidhex); + my @docids = $lse ? $lse->over->oidbin_exists($oidbin) : (); + $ent->{'lei/store'} = \@docids if @docids; + my $lms = $lei->lms; + if (my $loc = $lms ? $lms->locations_for($oidbin) : undef) { + $ent->{'mail-sync'} = $loc; + } + } + $ent; +} + +sub inspect_imap_uid ($$) { + my ($lei, $uid_uri) = @_; + my $ent = {}; + my $lms = $lei->lms or return $ent; + my @oidhex = $lms->imap_oidhex($lei, $uid_uri); + $ent->{$$uid_uri} = @oidhex == 1 ? $oidhex[0] : + ((@oidhex == 0) ? undef : \@oidhex); + $ent; +} + +sub inspect_nntp_range { + my ($lei, $uri) = @_; + my ($ng, $beg, $end) = $uri->group; + $uri = $uri->clone; + $uri->group($ng); + my $ent = {}; + my $ret = { "$uri" => $ent }; + my $lms = $lei->lms or return $ret; + my $folders = [ $$uri ]; + eval { $lms->arg2folder($lei, $folders) }; + $lei->qerr("# no folders match $$uri (non-fatal)") if $@; + $end //= $beg; + for my $art ($beg..$end) { + my @oidhex = map { unpack('H*', $_) } + $lms->num_oidbin($folders->[0], $art); + $ent->{$art} = @oidhex == 1 ? $oidhex[0] : + ((@oidhex == 0) ? undef : \@oidhex); + } + $ret; +} + +sub inspect_sync_folder ($$) { + my ($lei, $folder) = @_; + my $ent = {}; + my $lms = $lei->lms or return $ent; + my $folders = [ $folder ]; + eval { $lms->arg2folder($lei, $folders) }; + $lei->qerr("# no folders match $folder (non-fatal)") if $@; + for my $f (@$folders) { + $ent->{$f} = $lms->location_stats($f); # may be undef + } + $ent +} + +sub _inspect_doc ($$) { + my ($ent, $doc) = @_; + my $data = $doc->get_data; + $ent->{data_length} = length($data); + $ent->{description} = $doc->get_description; + $ent->{$_} = $doc->$_ for (qw(termlist_count values_count)); + my $cur = $doc->termlist_begin; + my $end = $doc->termlist_end; + for (; $cur != $end; $cur++) { + my $tn = $cur->get_termname; + $tn =~ s/\A([A-Z]+)// or warn "$tn no prefix! (???)"; + my $term = ($1 // ''); + push @{$ent->{terms}->{$term}}, $tn; + } + $cur = $doc->values_begin; + $end = $doc->values_end; + for (; $cur != $end; $cur++) { + my $n = $cur->get_valueno; + my $v = $cur->get_value; + my $iv = PublicInbox::Search::sortable_unserialise($v); + $v = $iv + 0 if defined $iv; + # not using ->[$n] since we may have large gaps in $n + $ent->{'values'}->{$n} = $v; + } + $ent; +} + +sub inspect_docid ($$;$) { + my ($lei, $docid, $ent) = @_; + require PublicInbox::Search; + $ent //= {}; + my $xdb; + if ($xdb = delete $ent->{xdb}) { # from inspect_num + } elsif (defined(my $dir = $lei->{opt}->{dir})) { + no warnings 'once'; + $xdb = $PublicInbox::Search::X{Database}->new($dir); + } elsif ($lei->{lse}) { + $xdb = $lei->{lse}->xdb; + } + $xdb or return $lei->fail('no Xapian DB'); + my $doc = $xdb->get_document($docid); # raises + $ent->{docid} = $docid; + _inspect_doc($ent, $doc); +} + +sub dir2ibx ($$) { + my ($lei, $dir) = @_; + if (-f "$dir/ei.lock") { + require PublicInbox::ExtSearch; + PublicInbox::ExtSearch->new($dir); + } elsif (-f "$dir/inbox.lock" || -d "$dir/public-inbox") { + require PublicInbox::Inbox; # v2, v1 + bless { inboxdir => $dir }, 'PublicInbox::Inbox'; + } else { + $lei->fail("no (indexed) inbox or extindex at $dir"); + } +} + +sub inspect_num ($$) { + my ($lei, $num) = @_; + my ($docid, $ibx); + my $ent = { num => $num }; + if (defined(my $dir = $lei->{opt}->{dir})) { + $ibx = dir2ibx($lei, $dir) or return; + if (my $srch = $ibx->search) { + $ent->{xdb} = $srch->xdb and + $docid = $srch->num2docid($num); + } + } elsif ($lei->{lse}) { + $ibx = $lei->{lse}; + $lei->{lse}->xdb; # set {nshard} for num2docid + $docid = $lei->{lse}->num2docid($num); + } + if ($ibx && $ibx->over) { + my $smsg = $ibx->over->get_art($num); + $ent->{smsg} = _json_prep($smsg) if $smsg; + } + defined($docid) ? inspect_docid($lei, $docid, $ent) : $ent; +} + +sub inspect_mid ($$) { + my ($lei, $mid) = @_; + my $ibx; + my $ent = { mid => $mid }; + if (defined(my $dir = $lei->{opt}->{dir})) { + $ibx = dir2ibx($lei, $dir) + } else { + $ibx = $lei->{lse}; + } + if ($ibx && $ibx->over) { + my ($id, $prev); + while (my $smsg = $ibx->over->next_by_mid($mid, \$id, \$prev)) { + push @{$ent->{smsg}}, _json_prep($smsg); + } + } + if ($ibx && $ibx->search) { + my $mset = $ibx->search->mset(qq{mid:"$mid"}); + for (sort { $a->get_docid <=> $b->get_docid } $mset->items) { + my $tmp = { docid => $_->get_docid }; + _inspect_doc($tmp, $_->get_document); + push @{$ent->{xdoc}}, $tmp; + } + } + $ent; +} + +sub inspect1 ($$$) { + my ($lei, $item, $more) = @_; + my $ent; + if ($item =~ /\Ablob:(.+)/) { + $ent = inspect_blob($lei, $1); + } elsif ($item =~ m!\A(?:maildir|mh):!i || -d $item) { + $ent = inspect_sync_folder($lei, $item); + } elsif ($item =~ m!\Adocid:([0-9]+)\z!) { + $ent = inspect_docid($lei, $1 + 0); + } elsif ($item =~ m!\Anum:([0-9]+)\z!) { + $ent = inspect_num($lei, $1 + 0); + } elsif ($item =~ m!\A(?:mid|m):(.+)\z!) { + $ent = inspect_mid($lei, $1); + } elsif (my $iuri = imap_uri($item)) { + if (defined($iuri->uid)) { + $ent = inspect_imap_uid($lei, $iuri); + } else { + $ent = inspect_sync_folder($lei, $item); + } + } elsif (my $nuri = nntp_uri($item)) { + if (defined(my $mid = $nuri->message)) { + $ent = inspect_mid($lei, $mid); + } else { + my ($group, $beg, $end) = $nuri->group; + if (defined($beg)) { + $ent = inspect_nntp_range($lei, $nuri); + } else { + $ent = inspect_sync_folder($lei, $item); + } + } + } else { # TODO: more things + return $lei->fail("$item not understood"); + } + $lei->out($lei->{json}->encode($ent)); + $lei->out(',') if $more; + 1; +} + +sub inspect_argv { # via wq_do + my ($self) = @_; + my ($lei, $argv) = delete @$self{qw(lei argv)}; + my $multi = scalar(@$argv) > 1; + $lei->{1}->autoflush(0); + $lei->out('[') if $multi; + while (defined(my $x = shift @$argv)) { + eval { inspect1($lei, $x, scalar(@$argv)) or return }; + warn "E: $@\n" if $@; + } + $lei->out(']') if $multi; +} + +sub inspect_start ($$) { + my ($lei, $argv) = @_; + my $self = bless { lei => $lei, argv => $argv }, __PACKAGE__; + my ($op_c, $ops) = $lei->workers_start($self, 1); + $lei->{wq1} = $self; + $lei->wait_wq_events($op_c, $ops); + $self->wq_do('inspect_argv'); + $self->wq_close; +} + +sub do_inspect { # lei->do_env cb + my ($lei) = @_; + my $str = delete $lei->{stdin_buf}; + PublicInbox::Eml::strip_from($str); + my $eml = PublicInbox::Eml->new(\$str); + inspect_start($lei, [ 'blob:'.$lei->git_oid($eml)->hexdigest, + map { "mid:$_" } @{mids($eml)} ]); +} + +sub lei_inspect { + my ($lei, @argv) = @_; + $lei->{json} = ref(PublicInbox::Config::json())->new->utf8->canonical; + $lei->{lse} = ($lei->{opt}->{external} // 1) ? do { + my $sto = $lei->_lei_store; + $sto ? $sto->search : undef; + } : undef; + my $isatty = -t $lei->{1}; + $lei->{json}->pretty(1)->indent(2) if $lei->{opt}->{pretty} || $isatty; + $lei->start_pager if $isatty; + if ($lei->{opt}->{stdin}) { + return $lei->fail(<<'') if @argv; +no args allowed on command-line with --stdin + + $lei->slurp_stdin(\&do_inspect); + } else { + inspect_start($lei, \@argv); + } +} + +sub _complete_inspect { + require PublicInbox::LeiRefreshMailSync; + PublicInbox::LeiRefreshMailSync::_complete_refresh_mail_sync(@_); + # TODO: message-ids?, blobs? could get expensive... +} + +sub ipc_atfork_child { + my ($self) = @_; + $self->{lei}->_lei_atfork_child; + $self->SUPER::ipc_atfork_child; +} + +1; diff --git a/lib/PublicInbox/LeiLcat.pm b/lib/PublicInbox/LeiLcat.pm new file mode 100644 index 00000000..274a9605 --- /dev/null +++ b/lib/PublicInbox/LeiLcat.pm @@ -0,0 +1,160 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# lcat: local cat, display a local message by Message-ID or blob, +# extracting from URL necessary +# "lei lcat <URL|SPEC>" +package PublicInbox::LeiLcat; +use strict; +use v5.10.1; +use PublicInbox::LeiViewText; +use URI::Escape qw(uri_unescape); +use PublicInbox::MID qw($MID_EXTRACT); + +sub lcat_folder ($$;$$) { + my ($lei, $folder, $beg, $end) = @_; + my $lms = $lei->{-lms_rw} //= $lei->lms // return; + my $folders = [ $folder ]; + eval { $lms->arg2folder($lei, $folders) }; + return $lei->child_error(0, "# unknown folder: $folder") if $@; + my %range; + if (defined($beg)) { # NNTP article range + $range{min} = $beg; + $range{max} = $end // $beg; + } + for my $f (@$folders) { + my $fid = $lms->fid_for($f); + push @{$lei->{lcat_todo}}, { fid => $fid, %range }; + } +} + +sub lcat_imap_uri ($$) { + my ($lei, $uri) = @_; + # cf. LeiXSearch->lcat_dump + my $lms = $lei->{-lms_rw} //= $lei->lms // return; + if (defined $uri->uid) { + push @{$lei->{lcat_todo}}, $lms->imap_oidhex($lei, $uri); + } elsif (defined(my $fid = $lms->fid_for($$uri))) { + push @{$lei->{lcat_todo}}, { fid => $fid }; + } else { + lcat_folder($lei, $$uri); + } +} + +sub lcat_nntp_uri ($$) { + my ($lei, $uri) = @_; + my $mid = $uri->message; # already unescaped by URI::news + return "mid:$mid" if defined($mid); + my $lms = $lei->{-lms_rw} //= $lei->lms // return; + my ($ng, $beg, $end) = $uri->group; + $uri->group($ng); + lcat_folder($lei, $$uri, $beg, $end); + '""'; +} + +sub extract_1 ($$) { + my ($lei, $x) = @_; + if ($x =~ m!\b(maildir:.+)!i) { + lcat_folder($lei, $1); + '""'; # blank query, using {lcat_todo} + } elsif ($x =~ m!\b(([a-z]+)://\S+)!i) { + my ($u, $scheme) = ($1, $2); + $u =~ s/[\>\]\)\,\.\;]+\z//; + if ($scheme =~ m!\A(imaps?)\z!i) { + require PublicInbox::URIimap; + lcat_imap_uri($lei, PublicInbox::URIimap->new($u)); + return '""'; # blank query, using {lcat_todo} + } elsif ($scheme =~ m!\A(?:nntps?|s?news)\z!i) { + require PublicInbox::URInntps; + $u = PublicInbox::URInntps->new($u); + return lcat_nntp_uri($lei, $u); + } # http, or something else: + require URI; + $u = URI->new($u); + my $p = $u->path; + my $term; + if ($p =~ m!([^/]+\@[^/]+)!) { # common msgid pattern + $term = 'mid:'.uri_unescape($1); + + # is it a URL which returns the full thread? + if ($u->scheme =~ /\Ahttps?/i && + $p =~ m!/(?:T/?|t/?|t\.mbox\.gz|t\.atom)\b!) { + + $lei->{mset_opt}->{threads} = 1; + } + } elsif ($u->scheme =~ /\Ahttps?/i && + # some msgids don't have '@', see if it looks like + # a public-inbox URL: + $p =~ m!/([^/]+)/(raw|t/?|T/?| + t\.mbox\.gz|t\.atom)\z!x) { + $lei->{mset_opt}->{threads} = 1 if $2 && $2 ne 'raw'; + $term = 'mid:'.uri_unescape($1); + } + $term; + } elsif ($x =~ $MID_EXTRACT) { # <$MSGID> + "mid:$1"; + } elsif ($x =~ /\b((?:m|mid):\S+)/) { # our own prefixes (and mairix) + $1; + } elsif ($x =~ /\bid:(\S+)/) { # notmuch convention + "mid:$1"; + } elsif ($x =~ /\bblob:([0-9a-f]{7,})\b/) { + push @{$lei->{lcat_todo}}, $1; # cf. LeiToMail->wq_atexit_child + '""'; # blank query + } else { + undef; + } +} + +sub extract_all { + my ($lei, @argv) = @_; + my $strict = !$lei->{opt}->{stdin}; + my @q; + for my $x (@argv) { + if (my $term = extract_1($lei, $x)) { + push @q, $term; + } elsif ($strict) { + return $lei->fail(<<""); +could not extract Message-ID from $x + + } + } + delete $lei->{-lms_rw}; + @q ? join(' OR ', @q) : $lei->fail("no Message-ID in: @argv"); +} + +sub do_lcat { # lei->do_env cb + my ($lei) = @_; + my @argv = split(/\s+/, delete($lei->{stdin_buf})); + $lei->{mset_opt}->{qstr} = extract_all($lei, @argv) or return; + $lei->_start_query; +} + +sub lei_lcat { + my ($lei, @argv) = @_; + my $lxs = $lei->lxs_prepare or return; + $lei->ale->refresh_externals($lxs, $lei); + $lei->_lei_store(1); + my $opt = $lei->{opt}; + my %mset_opt; + $mset_opt{asc} = $opt->{'reverse'} ? 1 : 0; + $opt->{sort} //= 'relevance'; + $mset_opt{relevance} = 1; + $lei->{mset_opt} = \%mset_opt; + $opt->{'format'} //= 'text' unless defined($opt->{output}); + if ($lei->{opt}->{stdin}) { + return $lei->fail(<<'') if @argv; +no args allowed on command-line with --stdin + + return $lei->slurp_stdin(\&do_lcat); + } + $lei->{mset_opt}->{qstr} = extract_all($lei, @argv) or return; + $lei->_start_query; +} + +sub _complete_lcat { + require PublicInbox::LeiRefreshMailSync; + PublicInbox::LeiRefreshMailSync::_complete_refresh_mail_sync(@_); + # TODO: message-ids?, blobs? could get expensive... +} + +1; diff --git a/lib/PublicInbox/LeiLsExternal.pm b/lib/PublicInbox/LeiLsExternal.pm new file mode 100644 index 00000000..2cdd0c4d --- /dev/null +++ b/lib/PublicInbox/LeiLsExternal.pm @@ -0,0 +1,34 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei ls-external" command +package PublicInbox::LeiLsExternal; +use strict; +use v5.10.1; +use PublicInbox::Config qw(glob2re); + +# TODO: does this need JSON output? +sub lei_ls_external { + my ($lei, $filter) = @_; + my $do_glob = !$lei->{opt}->{globoff}; # glob by default + my ($OFS, $ORS) = $lei->{opt}->{z} ? ("\0", "\0\0") : (" ", "\n"); + $filter //= '*'; + my $re = $do_glob ? glob2re($filter) : undef; + $re .= '/?\\z' if defined $re; + $re //= index($filter, '/') < 0 ? + qr!/\Q$filter\E/?\z! : # exact basename match + qr/\Q$filter\E/; # grep -F semantics + my @ext = $lei->externals_each(my $boost = {}); + @ext = $lei->{opt}->{'invert-match'} ? grep(!/$re/, @ext) + : grep(/$re/, @ext); + if ($lei->{opt}->{'local'} && !$lei->{opt}->{remote}) { + @ext = grep(!m!\A[a-z\+]+://!, @ext); + } elsif ($lei->{opt}->{remote} && !$lei->{opt}->{'local'}) { + @ext = grep(m!\A[a-z\+]+://!, @ext); + } + for my $loc (@ext) { + $lei->out($loc, $OFS, 'boost=', $boost->{$loc}, $ORS); + } +} + +1; diff --git a/lib/PublicInbox/LeiLsLabel.pm b/lib/PublicInbox/LeiLsLabel.pm new file mode 100644 index 00000000..474224d4 --- /dev/null +++ b/lib/PublicInbox/LeiLsLabel.pm @@ -0,0 +1,17 @@ +# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei ls-label" command +package PublicInbox::LeiLsLabel; +use strict; +use v5.10.1; + +sub lei_ls_label { # the "lei ls-label" method + my ($lei, @argv) = @_; + # TODO: document stats/counts (expensive) + my @L = eval { $lei->_lei_store->search->all_terms('L') }; + my $ORS = $lei->{opt}->{z} ? "\0" : "\n"; + $lei->out(map { $_.$ORS } @L); +} + +1; diff --git a/lib/PublicInbox/LeiLsMailSource.pm b/lib/PublicInbox/LeiLsMailSource.pm new file mode 100644 index 00000000..ab6c1e60 --- /dev/null +++ b/lib/PublicInbox/LeiLsMailSource.pm @@ -0,0 +1,126 @@ +# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# command for listing NNTP groups and IMAP folders, +# handy for users with git-credential-helper configured +# TODO: list JMAP labels +package PublicInbox::LeiLsMailSource; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC PublicInbox::LeiInput); + +sub input_path_url { # overrides LeiInput version + my ($self, $url) = @_; + # TODO: support ndjson and other JSONs we support elsewhere + my $lei = $self->{lei}; + my $json = $lei->{json}; + my $ORS = $self->{lei}->{opt}->{z} ? "\0" : "\n"; + my @f; + if ($url =~ m!\Aimaps?://!i) { + my $uri = PublicInbox::URIimap->new($url); + my $sec = $lei->{net}->can('uri_section')->($uri); + my $mic = $lei->{net}->mic_get($uri) or + return $lei->err("E: $uri"); + my $l = $mic->folders_hash($uri->path); # server-side filter + @$l = map { $_->[2] } # undo Schwartzian transform below: + sort { $a->[0] cmp $b->[0] || $a->[1] <=> $b->[1] } + map { # prepare to sort -imapd slices numerically + $_->{name} =~ /\A(.+?)\.([0-9]+)\z/ ? + [ $1, $2 + 0, $_ ] : [ $_->{name}, -1, $_ ]; + } @$l; + @f = map { "$sec/$_->{name}" } @$l; + if ($json) { + $_->{url} = "$sec/$_->{name}" for @$l; + $lei->puts($json->encode($l)); + } else { + if ($self->{lei}->{opt}->{url}) { + $_->{name} = "$sec/$_->{name}" for @$l; + } + $lei->out(join($ORS, (map { $_->{name} } @$l), '')); + } + } elsif ($url =~ m!\A(?:nntps?|s?news)://!i) { + my $uri = PublicInbox::URInntps->new($url); + my $nn = $lei->{net}->nn_get($uri) or + return $lei->err("E: $uri"); + # $l = name => description + my $l = $nn->newsgroups($uri->group) // return $lei->err(<<EOM); +E: $uri LIST NEWSGROUPS: ${\($lei->{net}->ndump($nn->message))} +E: login may be required, try adding `-c nntp.debug' to your command +EOM + my $sec = $lei->{net}->can('uri_section')->($uri); + if ($json) { + my $all = $nn->list; + my @x; + for my $ng (sort keys %$l) { + my $desc = $l->{$ng}; + +# we need to drop CR ourselves iff using IO::Socket::SSL since +# Net::Cmd::getline doesn't get used by Net::NNTP if TLS is in play, noted in: +# <https://rt.cpan.org/Ticket/Display.html?id=129966> + $desc =~ s/\r\z//; + + my ($high, $low, $status) = @{$all->{$ng}}; + push @x, { name => $ng, url => "$sec/$ng", + low => $low + 0, + high => $high + 0, status => $status, + description => $desc }; + } + @f = map { "$sec/$_" } keys %$all; + $lei->puts($json->encode(\@x)); + } else { + @f = map { "$sec/$_" } keys %$l; + if ($self->{lei}->{opt}->{url}) { + $lei->out(join($ORS, sort(@f), '')); + } else { + $lei->out(join($ORS, sort(keys %$l), '')); + } + } + } else { die "BUG: $url not supported" } + if (@f) { + my $fc = $lei->url_folder_cache; + my $lk = $fc->lock_for_scope; + $fc->dbh->begin_work; + my $now = time; + $fc->set($_, $now) for @f; + $fc->dbh->commit; + } +} + +sub lei_ls_mail_source { + my ($lei, $url, $pfx) = @_; + $url =~ m!\A(?:imaps?|nntps?|s?news)://!i or return + $lei->fail('only NNTP and IMAP URLs supported'); + my $self = bless { pfx => $pfx, -ls_ok => 1 }, __PACKAGE__; + $self->{cfg} = $lei->_lei_cfg; # may be undef + $self->prepare_inputs($lei, [ $url ]) or return; + my $isatty = -t $lei->{1}; + if ($lei->{opt}->{l}) { + my $json = ref(PublicInbox::Config->json)->new->utf8->canonical; + $lei->{json} = $json; + $json->ascii(1) if $lei->{opt}->{ascii}; + $json->pretty(1)->indent(2) if $isatty || $lei->{opt}->{pretty}; + } + $lei->start_pager if $isatty; + $lei->{-err_type} = 'non-fatal'; + $lei->wq1_start($self); +} + +sub _complete_ls_mail_source { + my ($lei, @argv) = @_; + my $match_cb = $lei->complete_url_prepare(\@argv); + my @k = $lei->url_folder_cache->keys($argv[-1] // undef, 1); + my @m = map { $match_cb->($_) } @k; + my %f = map { $_ => 1 } (@m ? @m : @k); + if (my $lms = $lei->lms) { + @k = $lms->folders($argv[-1] // undef, 1); + @m = map { $match_cb->($_) } grep(m!\A[a-z]+://!, @k); + if (@m) { @f{@m} = @m } else { @f{@k} = @k } + } + keys %f; +} + +no warnings 'once'; +*ipc_atfork_child = \&PublicInbox::LeiInput::input_only_atfork_child; +*net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done; + +1; diff --git a/lib/PublicInbox/LeiLsMailSync.pm b/lib/PublicInbox/LeiLsMailSync.pm new file mode 100644 index 00000000..1400d488 --- /dev/null +++ b/lib/PublicInbox/LeiLsMailSync.pm @@ -0,0 +1,31 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# front-end for the "lei ls-mail-sync" sub-command +package PublicInbox::LeiLsMailSync; +use strict; +use v5.10.1; +use PublicInbox::LeiMailSync; +use PublicInbox::Config qw(glob2re); + +sub lei_ls_mail_sync { + my ($lei, $filter) = @_; + my $lms = $lei->lms or return; + my $opt = $lei->{opt}; + my $re = $opt->{globoff} ? undef : glob2re($filter // '*'); + $re .= '/?\\z' if defined $re; + $re //= index($filter, '/') < 0 ? + qr!/\Q$filter\E/?\z! : # exact basename match + qr/\Q$filter\E/; # grep -F semantics + my @f = $lms->folders; + @f = $opt->{'invert-match'} ? grep(!/$re/, @f) : grep(/$re/, @f); + if ($opt->{'local'} && !$opt->{remote}) { + @f = grep(!m!\A[a-z\+]+://!i, @f); + } elsif ($opt->{remote} && !$opt->{'local'}) { + @f = grep(m!\A[a-z\+]+://!i, @f); + } + my $ORS = $opt->{z} ? "\0" : "\n"; + $lei->out(join($ORS, @f, '')); +} + +1; diff --git a/lib/PublicInbox/LeiLsSearch.pm b/lib/PublicInbox/LeiLsSearch.pm new file mode 100644 index 00000000..0193e590 --- /dev/null +++ b/lib/PublicInbox/LeiLsSearch.pm @@ -0,0 +1,109 @@ +# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei ls-search" to display results saved via "lei q --save" +package PublicInbox::LeiLsSearch; +use strict; +use v5.10.1; +use PublicInbox::LeiSavedSearch; +use parent qw(PublicInbox::IPC); + +sub do_ls_search_long { + my ($self, $pfx) = @_; + # TODO: share common JSON output code with LeiOverview + my $json = $self->{json}->new->utf8->canonical; + my $lei = $self->{lei}; + $json->ascii(1) if $lei->{opt}->{ascii}; + my $fmt = $lei->{opt}->{'format'}; + $lei->{1}->autoflush(0); + my $ORS = "\n"; + my $pretty = $lei->{opt}->{pretty}; + my $EOR; # TODO: compact pretty like "lei q" + if ($fmt =~ /\A(concat)?json\z/ && $pretty) { + $EOR = ($1//'') eq 'concat' ? "\n}" : "\n},"; + } + if ($fmt eq 'json') { + $lei->out('['); + $ORS = ",\n"; + } + my @x = sort(grep(/\A\Q$pfx/, PublicInbox::LeiSavedSearch::list($lei))); + while (my $x = shift @x) { + $ORS = '' if !scalar(@x); + my $lss = PublicInbox::LeiSavedSearch->up($lei, $x) or next; + my $cfg = $lss->{-cfg}; + my $ent = { + q => $cfg->get_all('lei.q'), + output => $cfg->{'lei.q.output'}, + }; + for my $k ($lss->ARRAY_FIELDS) { + my $ary = $cfg->get_all("lei.q.$k") // next; + $ent->{$k} = $ary; + } + for my $k ($lss->BOOL_FIELDS) { + my $val = $cfg->{"lei.q.$k"} // next; + $ent->{$k} = $val; + } + if (defined $EOR) { # pretty, but compact + $EOR = "\n}" if !scalar(@x); + my $buf = "{\n"; + $buf .= join(",\n", map {; + my $f = $_; + if (my $v = $ent->{$f}) { + $v = $json->encode([$v]); + qq{ "$f": }.substr($v, 1, -1); + } else { + (); + } + # key order by importance + } (qw(output q), $lss->ARRAY_FIELDS, + $lss->BOOL_FIELDS) ); + $lei->out($buf .= $EOR); + } else { + $lei->out($json->encode($ent), $ORS); + } + } + if ($fmt eq 'json') { + $lei->out("]\n"); + } elsif ($fmt eq 'concatjson') { + $lei->out("\n"); + } +} + +sub bg_worker ($$$) { + my ($lei, $pfx, $json) = @_; + my $self = bless { json => $json }, __PACKAGE__; + my ($op_c, $ops) = $lei->workers_start($self, 1); + $lei->{wq1} = $self; + $self->wq_io_do('do_ls_search_long', [], $pfx); + $self->wq_close; + $lei->wait_wq_events($op_c, $ops); +} + +sub lei_ls_search { + my ($lei, $pfx) = @_; + my $fmt = $lei->{opt}->{'format'} // ''; + if ($lei->{opt}->{l}) { + $lei->{opt}->{'format'} //= $fmt = 'json'; + } + my $json; + my $tty = -t $lei->{1}; + $lei->start_pager if $tty; + if ($fmt =~ /\A(ldjson|ndjson|jsonl|(?:concat)?json)\z/) { + $lei->{opt}->{pretty} //= $tty; + $json = ref(PublicInbox::Config->json); + } elsif ($fmt ne '') { + return $lei->fail("unknown format: $fmt"); + } + my $ORS = "\n"; + if ($lei->{opt}->{z}) { + return $lei->fail('-z and --format do not mix') if $json; + $ORS = "\0"; + } + $pfx //= ''; + return bg_worker($lei, $pfx, $json) if $json; + for (sort(grep(/\A\Q$pfx/, PublicInbox::LeiSavedSearch::list($lei)))) { + $lei->out($_, $ORS); + } +} + +1; diff --git a/lib/PublicInbox/LeiLsWatch.pm b/lib/PublicInbox/LeiLsWatch.pm new file mode 100644 index 00000000..f96dc4ec --- /dev/null +++ b/lib/PublicInbox/LeiLsWatch.pm @@ -0,0 +1,15 @@ +# Copyright all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +package PublicInbox::LeiLsWatch; +use strict; +use v5.10.1; + +sub lei_ls_watch { + my ($lei) = @_; + my $cfg = $lei->_lei_cfg or return; + my @w = (join("\n", keys %$cfg) =~ m/^watch\.(.+?)\.state$/sgm); + $lei->puts(join("\n", @w)) if @w; +} + +1; diff --git a/lib/PublicInbox/LeiMailDiff.pm b/lib/PublicInbox/LeiMailDiff.pm new file mode 100644 index 00000000..af6ecf82 --- /dev/null +++ b/lib/PublicInbox/LeiMailDiff.pm @@ -0,0 +1,44 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# The "lei mail-diff" sub-command, diffs input contents against +# the first message of input +package PublicInbox::LeiMailDiff; +use v5.12; +use parent qw(PublicInbox::IPC PublicInbox::LeiInput PublicInbox::MailDiff); +use PublicInbox::Spawn qw(run_wait); +require PublicInbox::LeiRediff; + +sub diff_a ($$) { + my ($self, $eml) = @_; + my $dir = "$self->{tmp}/N".(++$self->{nr}); + $self->dump_eml($dir, $eml); + my $cmd = [ qw(git diff --no-index) ]; + my $lei = $self->{lei}; + PublicInbox::LeiRediff::_lei_diff_prepare($lei, $cmd); + push @$cmd, qw(-- a), "N$self->{nr}"; + my $rdr = { -C => "$self->{tmp}" }; + @$rdr{1, 2} = @$lei{1, 2}; + run_wait($cmd, $lei->{env}, $rdr) and $lei->child_error($?); +} + +sub input_eml_cb { # used by PublicInbox::LeiInput::input_fh + my ($self, $eml) = @_; + $self->{tmp} ? diff_a($self, $eml) : $self->prep_a($eml); +} + +sub lei_mail_diff { + my ($lei, @argv) = @_; + my $self = bless {}, __PACKAGE__; + $self->prepare_inputs($lei, \@argv) or return; + my $isatty = -t $lei->{1}; + $lei->{opt}->{color} //= $isatty; + $lei->start_pager if $isatty; + $lei->{-err_type} = 'non-fatal'; + $self->{-raw_hdr} = $lei->{opt}->{'raw-header'}; + $lei->wq1_start($self); +} + +no warnings 'once'; +*net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done; +1; diff --git a/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm new file mode 100644 index 00000000..c498421c --- /dev/null +++ b/lib/PublicInbox/LeiMailSync.pm @@ -0,0 +1,712 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# for maintaining synchronization between lei/store <=> Maildir|MH|IMAP|JMAP +package PublicInbox::LeiMailSync; +use strict; +use v5.10.1; +use parent qw(PublicInbox::Lock); +use PublicInbox::Compat qw(uniqstr); +use DBI qw(:sql_types); # SQL_BLOB +use PublicInbox::ContentHash qw(git_sha); +use Carp (); +use PublicInbox::Git qw(%HEXLEN2SHA); +use PublicInbox::IO qw(read_all); + +sub dbh_new { + my ($self) = @_; + my $f = $self->{filename}; + my $creat = !-s $f; + if ($creat) { + require PublicInbox::Syscall; + open my $fh, '+>>', $f or Carp::croak "open($f): $!"; + PublicInbox::Syscall::nodatacow_fh($fh); + } + my $dbh = DBI->connect("dbi:SQLite:dbname=$f",'','', { + AutoCommit => 1, + RaiseError => 1, + PrintError => 0, + sqlite_use_immediate_transaction => 1, + }); + # no sqlite_unicode, here, all strings are binary + create_tables($self, $dbh); + $dbh->do('PRAGMA journal_mode = WAL') if $creat; + $dbh->do('PRAGMA case_sensitive_like = ON'); + $dbh; +} + +sub new { + my ($cls, $f) = @_; + bless { + filename => $f, + fmap => {}, + lock_path => "$f.flock", + }, $cls; +} + +sub lms_write_prepare { ($_[0]->{dbh} //= dbh_new($_[0])); $_[0] } + +sub lms_pause { + my ($self) = @_; + $self->{fmap} = {}; + my $dbh = delete $self->{dbh}; + eval { $dbh->do('PRAGMA optimize') } if $dbh; +} + +sub create_tables { + my ($self, $dbh) = @_; + my $lk = $self->lock_for_scope; + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS folders ( + fid INTEGER PRIMARY KEY, + loc VARBINARY NOT NULL, /* URL;UIDVALIDITY=$N or $TYPE:/pathname */ + UNIQUE (loc) +) + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS blob2num ( + oidbin VARBINARY NOT NULL, + fid INTEGER NOT NULL, /* folder ID */ + uid INTEGER NOT NULL, /* NNTP article number, IMAP UID, MH number */ + /* not UNIQUE(fid, uid), since we may have broken servers */ + UNIQUE (oidbin, fid, uid) +) + + # speeds up LeiImport->ck_update_kw (for "lei import") by 5-6x: + $dbh->do(<<''); +CREATE INDEX IF NOT EXISTS idx_fid_uid ON blob2num(fid,uid) + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS blob2name ( + oidbin VARBINARY NOT NULL, + fid INTEGER NOT NULL, /* folder ID */ + name VARBINARY NOT NULL, /* Maildir basename, JMAP blobId */ + /* not UNIQUE(fid, name), since we may have broken software */ + UNIQUE (oidbin, fid, name) +) + + # speeds up LeiImport->pmdir_cb (for "lei import") by ~6x: + $dbh->do(<<''); +CREATE INDEX IF NOT EXISTS idx_fid_name ON blob2name(fid,name) + +} + +# used to fixup pre-1.7.0 folders +sub update_fid ($$$) { + my ($dbh, $fid, $loc) = @_; + my $sth = $dbh->prepare(<<''); +UPDATE folders SET loc = ? WHERE fid = ? + + $sth->bind_param(1, $loc, SQL_BLOB); + $sth->bind_param(2, $fid); + $sth->execute; +} + +sub get_fid ($$$) { + my ($sth, $folder, $dbh) = @_; + $sth->bind_param(1, $folder, SQL_BLOB); + $sth->execute; + my ($fid) = $sth->fetchrow_array; + if (defined $fid) { # for downgrade+upgrade (1.8 -> 1.7 -> 1.8) + my $del = $dbh->prepare_cached(<<''); +DELETE FROM folders WHERE loc = ? AND fid != ? + + $del->execute($folder, $fid); + } else { + $sth->bind_param(1, $folder, SQL_VARCHAR); + $sth->execute; # fixup old stuff + ($fid) = $sth->fetchrow_array; + update_fid($dbh, $fid, $folder) if defined($fid); + } + $fid; +} + +sub fid_for { + my ($self, $folder, $creat) = @_; + my $dbh = $self->{dbh} //= dbh_new($self); + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT fid FROM folders WHERE loc = ? LIMIT 1 + + my $fid = get_fid($sth, $folder, $dbh); + return $fid if defined($fid); + + # caller had trailing slash (LeiToMail) + if ($folder =~ s!\A((?:maildir|mh):.*?)/+\z!$1!i) { + $fid = get_fid($sth, $folder, $dbh); + if (defined $fid) { + update_fid($dbh, $fid, $folder); + return $fid; + } + # sometimes we stored trailing slash.. + } elsif ($folder =~ m!\A(?:maildir|mh):!i) { + $fid = get_fid($sth, $folder, $dbh); + if (defined $fid) { + update_fid($dbh, $fid, $folder); + return $fid; + } + } elsif ($creat && $folder =~ m!\Aimaps?://!i) { + require PublicInbox::URIimap; + my $uri = PublicInbox::URIimap->new($folder); + $uri->uidvalidity // + Carp::croak("BUG: $folder has no UIDVALIDITY"); + defined($uri->uid) and Carp::confess("BUG: $folder has UID"); + } + + return unless $creat; + ($fid) = $dbh->selectrow_array('SELECT MAX(fid) FROM folders'); + + $fid += 1; + # in case we're reusing, clobber existing stale refs: + $dbh->do('DELETE FROM blob2name WHERE fid = ?', undef, $fid); + $dbh->do('DELETE FROM blob2num WHERE fid = ?', undef, $fid); + + $sth = $dbh->prepare('INSERT INTO folders (fid, loc) VALUES (?, ?)'); + $sth->bind_param(1, $fid); + $sth->bind_param(2, $folder, SQL_BLOB); + $sth->execute; + + $fid; +} + +sub add_folders { + my ($self, @folders) = @_; + my $lk = $self->lock_for_scope; + for my $f (@folders) { $self->{fmap}->{$f} //= fid_for($self, $f, 1) } +} + +sub set_src { + my ($self, $oidbin, $folder, $id) = @_; + my $lk = $self->lock_for_scope; + my $fid = $self->{fmap}->{$folder} //= fid_for($self, $folder, 1); + my $dbh = $self->{dbh}; + my ($sth, @param3, $del_old); + if (ref($id)) { # scalar name + @param3 = ($$id, SQL_BLOB); + $sth = $dbh->prepare_cached(<<''); +INSERT OR IGNORE INTO blob2name (oidbin, fid, name) VALUES (?, ?, ?) + + $del_old = $dbh->prepare_cached(<<''); +DELETE FROM blob2name WHERE oidbin = ? AND fid = ? AND name = ? + + } else { # numeric ID (IMAP UID, MH number) + @param3 = ($id); + $sth = $dbh->prepare_cached(<<''); +INSERT OR IGNORE INTO blob2num (oidbin, fid, uid) VALUES (?, ?, ?) + + $del_old = $dbh->prepare_cached(<<''); +DELETE FROM blob2num WHERE oidbin = ? AND fid = ? AND uid = ? + + } + $sth->bind_param(1, $oidbin, SQL_BLOB); + $sth->bind_param(2, $fid); + $sth->bind_param(3, @param3); + my $ret = $sth->execute; + $del_old->execute($oidbin, $fid, $param3[0]); + $ret; +} + +sub clear_src { + my ($self, $folder, $id) = @_; + my $lk = $self->lock_for_scope; + my $fid = $self->{fmap}->{$folder} //= fid_for($self, $folder, 1); + my ($sth, @param3); + if (ref($id)) { # scalar name + @param3 = ($$id, SQL_BLOB); + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM blob2name WHERE fid = ? AND name = ? + + } else { + @param3 = ($id); + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM blob2num WHERE fid = ? AND uid = ? + + } + $sth->bind_param(1, $fid); + $sth->bind_param(2, @param3); + my $ret = $sth->execute; + + # older versions may not have used SQL_BLOB: + if (defined($ret) && $ret == 0 && scalar(@param3) == 2) { + $sth->bind_param(1, $fid); + $sth->bind_param(2, $param3[0]); + $ret = $sth->execute; + } + $ret; +} + +# Maildir-only +sub mv_src { + my ($self, $folder, $oidbin, $id, $newbn) = @_; + my $lk = $self->lock_for_scope; + my $fid = $self->{fmap}->{$folder} //= fid_for($self, $folder, 1); + $self->{dbh}->begin_work; + my $sth = $self->{dbh}->prepare_cached(<<''); +UPDATE blob2name SET name = ? WHERE fid = ? AND oidbin = ? AND name = ? + + # eval since unique constraint may fail due to race + $sth->bind_param(1, $newbn, SQL_BLOB); + $sth->bind_param(2, $fid); + $sth->bind_param(3, $oidbin, SQL_BLOB); + $sth->bind_param(4, $$id, SQL_BLOB); + my $nr = eval { $sth->execute }; + if (!defined($nr) || $nr == 0) { # $nr may be `0E0' + # delete from old, pre-SQL_BLOB rows: + my $del_old = $self->{dbh}->prepare_cached(<<''); +DELETE FROM blob2name WHERE fid = ? AND oidbin = ? AND name = ? + + $del_old->execute($fid, $oidbin, $$id); # missing-OK + $del_old->execute($fid, $oidbin, $newbn); # ditto + + # may race with a clear_src, ensure new value exists + $sth = $self->{dbh}->prepare_cached(<<''); +INSERT OR IGNORE INTO blob2name (oidbin, fid, name) VALUES (?, ?, ?) + + $sth->bind_param(1, $oidbin, SQL_BLOB); + $sth->bind_param(2, $fid); + $sth->bind_param(3, $newbn, SQL_BLOB); + $sth->execute; + } + $self->{dbh}->commit; +} + +# read-only, iterates every oidbin + UID or name for a given folder +sub each_src { + my ($self, $folder, $cb, @args) = @_; + my $dbh = $self->{dbh} //= dbh_new($self); + my ($fid, @rng); + my $and_ge_le = ''; + if (ref($folder) eq 'HASH') { + $fid = $folder->{fid} // die "BUG: no `fid'"; + @rng = grep(defined, @$folder{qw(min max)}); + $and_ge_le = 'AND uid >= ? AND uid <= ?' if @rng; + } else { + $fid = $self->{fmap}->{$folder} //= + fid_for($self, $folder) // return; + } + + # minimize implicit txn time to avoid blocking writers by + # batching SELECTs. This looks wonky but is necessary since + # $cb-> may access the DB on its own. + my $ary = $dbh->selectall_arrayref(<<"", undef, $fid, @rng); +SELECT _rowid_,oidbin,uid FROM blob2num WHERE fid = ? $and_ge_le +ORDER BY _rowid_ ASC LIMIT 1000 + + my $min = @$ary ? $ary->[-1]->[0] : undef; + while (defined $min) { + for my $row (@$ary) { $cb->($row->[1], $row->[2], @args) } + + $ary = $dbh->selectall_arrayref(<<"", undef, $fid, @rng, $min); +SELECT _rowid_,oidbin,uid FROM blob2num +WHERE fid = ? $and_ge_le AND _rowid_ > ? +ORDER BY _rowid_ ASC LIMIT 1000 + + $min = @$ary ? $ary->[-1]->[0] : undef; + } + + $ary = $dbh->selectall_arrayref(<<'', undef, $fid); +SELECT _rowid_,oidbin,name FROM blob2name WHERE fid = ? +ORDER BY _rowid_ ASC LIMIT 1000 + + $min = @$ary ? $ary->[-1]->[0] : undef; + while (defined $min) { + for my $row (@$ary) { $cb->($row->[1], \($row->[2]), @args) } + + $ary = $dbh->selectall_arrayref(<<'', undef, $fid, $min); +SELECT _rowid_,oidbin,name FROM blob2name WHERE fid = ? AND _rowid_ > ? +ORDER BY _rowid_ ASC LIMIT 1000 + + $min = @$ary ? $ary->[-1]->[0] : undef; + } +} + +sub location_stats { + my ($self, $folder) = @_; + my $dbh = $self->{dbh} //= dbh_new($self); + my $fid; + my $ret = {}; + $fid = $self->{fmap}->{$folder} //= fid_for($self, $folder) // return; + my ($row) = $dbh->selectrow_array(<<"", undef, $fid); +SELECT COUNT(name) FROM blob2name WHERE fid = ? + + $ret->{'name.count'} = $row if $row; + my $ntype = ($folder =~ m!\A(?:nntps?|s?news)://!i) ? 'article' : + (($folder =~ m!\Aimaps?://!i) ? 'uid' : "TODO<$folder>"); + for my $op (qw(count min max)) { + ($row) = $dbh->selectrow_array(<<"", undef, $fid); +SELECT $op(uid) FROM blob2num WHERE fid = ? + + $row or last; + $ret->{"$ntype.$op"} = $row; + } + $ret; +} + +# must be called with lock +sub _forget_fids ($;@) { + my $dbh = shift; + $dbh->begin_work; + for my $t (qw(blob2name blob2num folders)) { + my $sth = $dbh->prepare_cached("DELETE FROM $t WHERE fid = ?"); + $sth->execute($_) for @_; + } + $dbh->commit; +} + +# returns a { location => [ list-of-ids-or-names ] } mapping +sub locations_for { + my ($self, $oidbin) = @_; + my ($fid, $sth, $id, %fid2id, %seen); + my $dbh = $self->{dbh} //= dbh_new($self); + $sth = $dbh->prepare('SELECT fid,uid FROM blob2num WHERE oidbin = ?'); + $sth->bind_param(1, $oidbin, SQL_BLOB); + $sth->execute; + while (my ($fid, $uid) = $sth->fetchrow_array) { + push @{$fid2id{$fid}}, $uid; + $seen{"$uid.$fid"} = 1; + } + + # deal with 1.7.0 DBs :< + $sth->bind_param(1, $oidbin, SQL_VARCHAR); + $sth->execute; + while (my ($fid, $uid) = $sth->fetchrow_array) { + next if $seen{"$uid.$fid"}; + push @{$fid2id{$fid}}, $uid; + } + + %seen = (); + $sth = $dbh->prepare('SELECT fid,name FROM blob2name WHERE oidbin = ?'); + $sth->bind_param(1, $oidbin, SQL_BLOB); + $sth->execute; + while (my ($fid, $name) = $sth->fetchrow_array) { + push @{$fid2id{$fid}}, $name; + $seen{"$fid.$name"} = 1; + } + + # deal with 1.7.0 DBs :< + $sth->bind_param(1, $oidbin, SQL_VARCHAR); + $sth->execute; + while (my ($fid, $name) = $sth->fetchrow_array) { + next if $seen{"$fid.$name"}; + push @{$fid2id{$fid}}, $name; + } + + $sth = $dbh->prepare('SELECT loc FROM folders WHERE fid = ? LIMIT 1'); + my $ret = {}; + my $drop_fids = $dbh->{ReadOnly} ? undef : {}; + while (my ($fid, $ids) = each %fid2id) { + $sth->execute($fid); + my ($loc) = $sth->fetchrow_array; + unless (defined $loc) { + my $del = ''; + if ($drop_fids) { + $del = ' (deleting)'; + $drop_fids->{$fid} = $fid; + } + my $oidhex = unpack('H*', $oidbin); + warn "E: fid=$fid for $oidhex stale/unknown:\n", map { + 'E: '.(ref() ? $$_ : "#$_")."$del\n"; + } @$ids; + next; + } + $ret->{$loc} = $ids; + } + if ($drop_fids && scalar(values %$drop_fids)) { + my $lk = $self->lock_for_scope; + _forget_fids($self->{dbh}, values %$drop_fids); + } + scalar(keys %$ret) ? $ret : undef; +} + +# returns a list of folders used for completion +sub folders { + my ($self, @pfx) = @_; + my $sql = 'SELECT loc FROM folders'; + my $re; + if (defined($pfx[0])) { + $sql .= ' WHERE loc REGEXP ?'; # DBD::SQLite uses perlre + if (ref($pfx[0])) { # assume qr// "Regexp" + $re = $pfx[0]; + } else { + $re = !!$pfx[1] ? '.*' : ''; + $re .= quotemeta($pfx[0]); + $re .= '.*'; + } + } + my $sth = ($self->{dbh} //= dbh_new($self))->prepare($sql); + $sth->bind_param(1, $re) if defined($re); + $sth->execute; + map { $_->[0] } @{$sth->fetchall_arrayref}; +} + +sub blob_mismatch ($$$) { + my ($f, $oidhex, $rawref) = @_; + my $sha = $HEXLEN2SHA{length($oidhex)}; + my $got = git_sha($sha, $rawref)->hexdigest; + $got eq $oidhex ? undef : warn("$f changed $oidhex => $got\n"); +} + +sub local_blob { + my ($self, $oidhex, $vrfy) = @_; + my $dbh = $self->{dbh} //= dbh_new($self); + my $oidbin = pack('H*', $oidhex); + + my $b2n = $dbh->prepare(<<''); +SELECT f.loc,b.name FROM blob2name b +LEFT JOIN folders f ON b.fid = f.fid +WHERE b.oidbin = ? + + $b2n->bind_param(1, $oidbin, SQL_BLOB); + $b2n->execute; + while (my ($d, $n) = $b2n->fetchrow_array) { + substr($d, 0, length('maildir:')) = ''; + # n.b. both mbsync and offlineimap use ":2," as a suffix + # in "new/", despite (from what I understand of reading + # <https://cr.yp.to/proto/maildir.html>), the ":2," only + # applies to files in "cur/". + my @try = $n =~ /:2,[a-zA-Z]+\z/ ? qw(cur new) : qw(new cur); + for my $x (@try) { + my $f = "$d/$x/$n"; + open my $fh, '<', $f or next; + # some (buggy) Maildir writers are non-atomic: + my $raw = read_all($fh, -s $fh // next); + next if $vrfy && blob_mismatch $f, $oidhex, \$raw; + return \$raw; + } + } + + # MH, except `uid' is not always unique (can be packed) + $b2n = $dbh->prepare(<<''); +SELECT f.loc,b.uid FROM blob2num b +LEFT JOIN folders f ON b.fid = f.fid +WHERE b.oidbin = ? AND f.loc REGEXP '^mh:/' + + $b2n->bind_param(1, $oidbin, SQL_BLOB); + $b2n->execute; + while (my ($f, $n) = $b2n->fetchrow_array) { + $f =~ s/\Amh://s or die "BUG: not MH: $f"; + $f .= "/$n"; + open my $fh, '<', $f or next; + my $raw = read_all($fh, -s $fh // next); + next if blob_mismatch $f, $oidhex, \$raw; + return \$raw; + } + undef; +} + +sub match_imap_url { + my ($self, $url, $all) = @_; # $all = [ $lms->folders ]; + $all //= [ $self->folders ]; + require PublicInbox::URIimap; + my $want = PublicInbox::URIimap->new($url)->canonical; + my ($s, $h, $mb) = ($want->scheme, $want->host, $want->mailbox); + my @uri = map { PublicInbox::URIimap->new($_)->canonical } + grep(m!\A\Q$s\E://.*?\Q$h\E\b.*?/\Q$mb\E\b!, @$all); + my @match; + for my $x (@uri) { + next if $x->mailbox ne $want->mailbox; + next if $x->host ne $want->host; + next if $x->port != $want->port; + my $x_uidval = $x->uidvalidity; + next if ($want->uidvalidity // $x_uidval) != $x_uidval; + + # allow nothing in want to possibly match ";AUTH=ANONYMOUS" + if (defined($x->auth) && !defined($want->auth) && + !defined($want->user)) { + push @match, $x; + # or maybe user was forgotten on CLI: + } elsif (defined($x->user) && !defined($want->user)) { + push @match, $x; + } elsif (($x->user//"\0") eq ($want->user//"\0")) { + push @match, $x; + } + } + return @match if wantarray; + scalar(@match) <= 1 ? $match[0] : + "E: `$url' is ambiguous:\n\t".join("\n\t", @match)."\n"; +} + +sub match_nntp_url ($$$) { + my ($self, $url, $all) = @_; # $all = [ $lms->folders ]; + $all //= [ $self->folders ]; + require PublicInbox::URInntps; + my $want = PublicInbox::URInntps->new($url)->canonical; + my ($s, $h, $p) = ($want->scheme, $want->host, $want->port); + my $ng = $want->group; # force scalar (no article ranges) + my @uri = map { PublicInbox::URInntps->new($_)->canonical } + grep(m!\A\Q$s\E://.*?\Q$h\E\b.*?/\Q$ng\E\b!, @$all); + my @match; + for my $x (@uri) { + next if $x->group ne $ng || $x->host ne $h || $x->port != $p; + # maybe user was forgotten on CLI: + if (defined($x->userinfo) && !defined($want->userinfo)) { + push @match, $x; + } elsif (($x->userinfo//"\0") eq ($want->userinfo//"\0")) { + push @match, $x; + } + } + return @match if wantarray; + scalar(@match) <= 1 ? $match[0] : + "E: `$url' is ambiguous:\n\t".join("\n\t", @match)."\n"; +} + +# returns undef on failure, number on success +sub group2folders { + my ($self, $lei, $all, $folders) = @_; + return $lei->fail(<<EOM) if @$folders; +--all= not compatible with @$folders on command-line +EOM + my %x = map { $_ => $_ } split(/,/, $all); + my @ok = grep(defined, delete(@x{qw(local remote), ''})); + push(@ok, '') if $all eq ''; + my @no = keys %x; + if (@no) { + @no = (join(',', @no)); + return $lei->fail(<<EOM); +--all=@no not accepted (must be `local' and/or `remote') +EOM + } + my @all = $self->folders; + for my $ok (@ok) { + if ($ok eq 'local') { + push @$folders, grep(!m!\A[a-z0-9\+]+://!i, @all); + } elsif ($ok eq 'remote') { + push @$folders, grep(m!\A[a-z0-9\+]+://!i, @all); + } elsif ($ok ne '') { + return $lei->fail("--all=$all not understood"); + } else { + push @$folders, @all; + } + } + @$folders = uniqstr @$folders; + scalar(@$folders) || $lei->fail(<<EOM); +no --mail-sync folders known to lei +EOM +} + +# map CLI args to folder table entries, returns undef on failure +sub arg2folder { + my ($self, $lei, $folders) = @_; + my @all = $self->folders; + my %all = map { $_ => 1 } @all; + my @no; + for (@$folders) { + next if $all{$_}; # ok + if (m!\A(maildir|mh):(.+)!i) { + my $type = lc $1; + my $d = "$type:".$lei->abs_path($2); + push(@no, $_) unless $all{$d}; + $_ = $d; + } elsif (-d "$_/new" && -d "$_/cur") { + my $d = 'maildir:'.$lei->abs_path($_); + push(@no, $_) unless $all{$d}; + $_ = $d; + } elsif (m!\Aimaps?://!i) { + my $orig = $_; + my $res = match_imap_url($self, $orig, \@all); + if (ref $res) { + $_ = $$res; + $lei->qerr(<<EOM); +# using `$res' instead of `$orig' +EOM + } else { + warn($res, "\n") if defined $res; + push @no, $orig; + } + } elsif (m!\A(?:nntps?|s?news)://!i) { + my $orig = $_; + my $res = match_nntp_url($self, $orig, \@all); + if (ref $res) { + $_ = $$res; + $lei->qerr(<<EOM); +# using `$res' instead of `$orig' +EOM + } else { + warn($res, "\n") if defined $res; + push @no, $orig; + } + } else { + push @no, $_; + } + } + if (@no) { + my $no = join("\n\t", @no); + die <<EOF; +No sync information for: $no +Run `lei ls-mail-sync' to display valid choices +EOF + } +} + +sub forget_folders { + my ($self, @folders) = @_; + my $lk = $self->lock_for_scope; + _forget_fids($self->{dbh}, map { + delete($self->{fmap}->{$_}) // + fid_for($self, $_) // (); + } @folders); +} + +# only used for changing canonicalization errors +sub rename_folder { + my ($self, $old, $new) = @_; + my $lk = $self->lock_for_scope; + my $ofid = delete($self->{fmap}->{$old}) // + fid_for($self, $old) // return; + eval { + $self->{dbh}->do(<<EOM, undef, $new, $ofid); +UPDATE folders SET loc = ? WHERE fid = ? +EOM + }; + if ($@ =~ /\bunique\b/i) { + my $nfid = $self->{fmap}->{$new} // fid_for($self, $new); + for my $t (qw(blob2name blob2num)) { + $self->{dbh}->do(<<EOM, undef, $nfid, $ofid); +UPDATE OR REPLACE $t SET fid = ? WHERE fid = ? +EOM + } + $self->{dbh}->do(<<EOM, undef, $ofid); +DELETE FROM folders WHERE fid = ? +EOM + } +} + +sub num_oidbin ($$$) { + my ($self, $url, $uid) = @_; # $url MUST have UIDVALIDITY if IMAP + my $fid = $self->{fmap}->{$url} //= fid_for($self, $url) // return (); + my $sth = $self->{dbh}->prepare_cached(<<EOM, undef, 1); +SELECT oidbin FROM blob2num WHERE fid = ? AND uid = ? ORDER BY _rowid_ +EOM + $sth->execute($fid, $uid); + # for public-inbox <= 1.7.0: + uniqstr(map { $_->[0] } @{$sth->fetchall_arrayref}); +} + +sub name_oidbin ($$$) { + my ($self, $mdir, $nm) = @_; + my $fid = $self->{fmap}->{$mdir} //= fid_for($self, $mdir) // return; + my $sth = $self->{dbh}->prepare_cached(<<EOM, undef, 1); +SELECT oidbin FROM blob2name WHERE fid = ? AND name = ? +EOM + $sth->bind_param(1, $fid); + $sth->bind_param(2, $nm, SQL_BLOB); + $sth->execute; + my @bin = map { $_->[0] } @{$sth->fetchall_arrayref}; + $sth->bind_param(1, $fid); + $sth->bind_param(2, $nm, SQL_VARCHAR); + $sth->execute; + my @old = map { $_->[0] } @{$sth->fetchall_arrayref}; + uniqstr @bin, @old # for public-inbox <= 1.7.0 +} + +sub imap_oidhex { + my ($self, $lei, $uid_uri) = @_; + my $mailbox_uri = $uid_uri->clone; + $mailbox_uri->uid(undef); + my $folders = [ $$mailbox_uri ]; + eval { $self->arg2folder($lei, $folders) }; + $lei->qerr("# no sync information for $mailbox_uri") if $@; + map { unpack('H*',$_) } num_oidbin($self, $folders->[0], $uid_uri->uid) +} + +1; diff --git a/lib/PublicInbox/LeiMirror.pm b/lib/PublicInbox/LeiMirror.pm new file mode 100644 index 00000000..e7c265bd --- /dev/null +++ b/lib/PublicInbox/LeiMirror.pm @@ -0,0 +1,1386 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei add-external --mirror" support (also "public-inbox-clone"); +package PublicInbox::LeiMirror; +use v5.12; +use parent qw(PublicInbox::IPC); +use IO::Uncompress::Gunzip qw(gunzip $GunzipError); +use IO::Compress::Gzip qw(gzip $GzipError); +use PublicInbox::Spawn qw(spawn run_wait run_die run_qx); +use PublicInbox::IO qw(write_file); +use File::Path (); +use File::Temp (); +use File::Spec (); +use Fcntl qw(SEEK_SET O_CREAT O_EXCL O_WRONLY); +use Carp qw(croak); +use URI; +use PublicInbox::Config qw(glob2re); +use PublicInbox::Inbox; +use PublicInbox::LeiCurl; +use PublicInbox::OnDestroy; +use PublicInbox::SHA qw(sha256_hex sha_all); +use POSIX qw(strftime); +use PublicInbox::Admin qw(fmt_localtime); +use autodie qw(chdir chmod close open pipe readlink + seek symlink sysopen sysseek truncate unlink); +use PublicInbox::Git qw(git_exe); + +our $LIVE; # pid => callback +our $FGRP_TODO; # objstore -> [[ to resume ], [ to clone ]] +our $TODO; # reference => [ non-fgrp mirror objects ] +our @PUH; # post-update hooks + +sub keep_going ($) { + $LIVE && (!$_[0]->{lei}->{child_error} || + $_[0]->{lei}->{opt}->{'keep-going'}); +} + +sub _wq_done_wait { # awaitpid cb (via wq_eof) + my ($pid, $mrr, $lei) = @_; + if ($?) { + $lei->child_error($?); + } elsif (!$lei->{child_error}) { + if (!$mrr->{dry_run} && $lei->{cmd} ne 'public-inbox-clone') { + require PublicInbox::LeiAddExternal; + PublicInbox::LeiAddExternal::_finish_add_external( + $lei, $mrr->{dst}); + } + $lei->qerr("# mirrored $mrr->{src} => $mrr->{dst}"); + } + $lei->dclose; +} + +# for old installations without manifest.js.gz +sub try_scrape { + my ($self, $fallback_manifest) = @_; + my $uri = URI->new($self->{src}); + my $lei = $self->{lei}; + my $curl = $self->{curl} //= PublicInbox::LeiCurl->new($lei) or return; + my $cmd = $curl->for_uri($lei, $uri, '--compressed'); + my $opt = { 0 => $lei->{0}, 2 => $lei->{2} }; + my $html = run_qx($cmd, undef, $opt); + return $lei->child_error($?, "@$cmd failed") if $?; + + # we grep with URL below, we don't want Subject/From headers + # making us clone random URLs. This assumes remote instances + # prior to public-inbox 1.7.0 + # 5b96edcb1e0d8252 (www: move mirror instructions to /text/, 2021-08-28) + my @html = split(/<hr>/, $html); + my @urls = ($html[-1] =~ m!\bgit clone --mirror ([a-z\+]+://\S+)!g); + if (!@urls && $fallback_manifest) { + warn <<EOM; +W: failed to extract URLs from $uri, trying manifest.js.gz... +EOM + return start_clone_url($self); + } + my $url = $uri->as_string; + chop($url) eq '/' or die "BUG: $uri not canonicalized"; + + # since this is for old instances w/o manifest.js.gz, try v1 first + return clone_v1($self) if grep(m!\A\Q$url\E/*\z!, @urls); + if (my @v2_urls = grep(m!\A\Q$url\E/[0-9]+\z!, @urls)) { + my %v2_epochs = map { + my ($n) = (m!/([0-9]+)\z!); + $n => [ URI->new($_), '' ] + } @v2_urls; # uniq + clone_v2_prep($self, \%v2_epochs); + delete local $lei->{opt}->{epoch}; + clone_all($self); + return; + } + + # filter out common URLs served by WWW (e.g /$MSGID/T/) + if (@urls && $url =~ s!/+[^/]+\@[^/]+/.*\z!! && + grep(m!\A\Q$url\E/*\z!, @urls)) { + die <<""; +E: confused by scraping <$uri>, did you mean <$url>? + + } + @urls and die <<""; +E: confused by scraping <$uri>, got ambiguous results: +@urls + + die "E: scraping <$uri> revealed nothing\n"; +} + +sub clone_cmd { + my ($lei, $opt) = @_; + my @cmd = (git_exe); + $opt->{$_} = $lei->{$_} for (0..2); + # we support "-c $key=$val" for arbitrary git config options + # e.g.: git -c http.proxy=socks5h://127.0.0.1:9050 + push(@cmd, '-c', $_) for @{$lei->{opt}->{c} // []}; + push @cmd, qw(clone --mirror); + push @cmd, '-q' if $lei->{opt}->{quiet} || + ($lei->{opt}->{jobs} // 1) > 1; + push @cmd, '-v' if $lei->{opt}->{verbose}; + # XXX any other options to support? + # --reference is tricky with multiple epochs, but handled + # automatically if using manifest.js.gz + @cmd; +} + +sub ft_rename ($$$;$) { + my ($ft, $dst, $open_mode, $fh) = @_; + my @st = stat($fh // $dst); + my $mode = @st ? ($st[2] & 07777) : ($open_mode & ~umask); + chmod($mode, $ft); + require File::Copy; + File::Copy::mv($ft->filename, $dst) or croak "E: mv($ft => $dst): $!"; + $ft->unlink_on_destroy(0); +} + +sub do_reap ($;$) { + my ($self, $jobs) = @_; + $jobs //= $self->{-jobs} //= $self->{lei}->{opt}->{jobs} // 1; + $jobs = 1 if $jobs < 1; + while (keys(%$LIVE) >= $jobs) { + my $pid = waitpid(-1, 0) // die "waitpid(-1): $!"; + if (my $x = delete $LIVE->{$pid}) { + my $cb = shift @$x; + $cb->(@$x) if $cb; + } else { + warn "reaped unknown PID=$pid ($?)\n"; + } + } +} + +sub _get_txt_start { # non-fatal + my ($self, $endpoint, $fini) = @_; + my $uri = URI->new($self->{cur_src} // $self->{src}); + my $lei = $self->{lei}; + my $path = $uri->path; + chop($path) eq '/' or die "BUG: $uri not canonicalized"; + $uri->path("$path/$endpoint"); + my $f = (split(m!/!, $endpoint))[-1]; + my $ft = File::Temp->new(TEMPLATE => "$f-XXXX", TMPDIR => 1); + my $opt = { 0 => $lei->{0}, 1 => $lei->{1}, 2 => $lei->{2} }; + my $cmd = $self->{curl}->for_uri($lei, $uri, qw(--compressed -R -o), + $ft->filename); + do_reap($self); + $lei->qerr("# @$cmd"); + return if $self->{dry_run}; + $self->{"-get_txt.$endpoint"} = [ $ft, $cmd, $uri ]; + $LIVE->{spawn($cmd, undef, $opt)} = + [ \&_get_txt_done, $self, $endpoint, $fini ]; +} + +sub _get_txt_done { # returns true on error (non-fatal), undef on success + my ($self, $endpoint) = @_; + my ($fh, $cmd, $uri) = @{delete $self->{"-get_txt.$endpoint"}}; + my $cerr = $?; + $? = 0; # don't influence normal lei exit + return warn("$uri missing\n") if ($cerr >> 8) == 22; + return warn("# @$cmd failed (non-fatal)\n") if $cerr; + seek($fh, 0, SEEK_SET); + $self->{"mtime.$endpoint"} = (stat($fh))[9]; + $self->{"txt.$endpoint"} = PublicInbox::IO::read_all $fh, -s _; + undef; # success +} + +sub _write_inbox_config { + my ($self) = @_; + my $buf = delete($self->{'txt._/text/config/raw'}) // return; + my $dst = $self->{cur_dst} // $self->{dst}; + my $f = "$dst/inbox.config.example"; + my $mtime = delete $self->{'mtime._/text/config/raw'}; + if (CORE::sysopen(my $fh, $f, O_CREAT|O_EXCL|O_WRONLY)) { + print $fh $buf; + chmod(0444 & ~umask, $fh); + $fh->flush or die "flush($f): $!"; + if (defined $mtime) { + utime($mtime, $mtime, $fh) or die "utime($f): $!"; + } + } elsif (!$!{EEXIST}) { + die "open($f): $!"; + } + my $cfg = PublicInbox::Config->git_config_dump($f, + { 2 => $self->{lei}->{2} }); + my $ibx = $self->{ibx} = {}; # for indexing + for my $sec (grep(/\Apublicinbox\./, @{$cfg->{-section_order}})) { + for (qw(address newsgroup nntpmirror)) { + $ibx->{$_} = $cfg->{"$sec.$_"}; + } + } +} + +sub set_description ($) { + my ($self) = @_; + my $dst = $self->{cur_dst} // $self->{dst}; + chomp(my $orig = PublicInbox::IO::try_cat("$dst/description")); + my $d = $orig; + while (defined($d) && ($d =~ m!^\(\$INBOX_DIR/description missing\)! || + $d =~ /^Unnamed repository/ || $d !~ /\S/)) { + $d = delete($self->{'txt.description'}); + } + $d //= 'mirror of '.($self->{cur_src} // $self->{src}); + atomic_write($dst, 'description', $d."\n") if $d ne $orig; +} + +sub index_cloned_inbox { + my ($self, $iv) = @_; + my $lei = $self->{lei}; + + # n.b. public-inbox-clone works w/o (SQLite || Xapian) + # lei is useless without Xapian + SQLite + if ($lei->{cmd} ne 'public-inbox-clone') { + require PublicInbox::InboxWritable; + require PublicInbox::Admin; + my $ibx = delete($self->{ibx}) // { + address => [ 'lei@example.com' ], + version => $iv, + }; + $ibx->{inboxdir} = $self->{cur_dst} // $self->{dst}; + PublicInbox::Inbox->new($ibx); + PublicInbox::InboxWritable->new($ibx); + my $opt = {}; + for my $sw ($lei->index_opt) { + my ($k) = ($sw =~ /\A([\w-]+)/); + $opt->{$k} = $lei->{opt}->{$k}; + } + # force synchronous awaitpid for v2: + local $PublicInbox::DS::in_loop = 0; + my $cfg = PublicInbox::Config->new(undef, { 2 => $lei->{2} }); + my $env = PublicInbox::Admin::index_prepare($opt, $cfg); + local %ENV = (%ENV, %$env) if $env; + PublicInbox::Admin::progress_prepare($opt, $lei->{2}); + PublicInbox::Admin::index_inbox($ibx, undef, $opt); + } + return if defined $self->{cur_dst}; # one of many repos to clone +} + +sub run_reap { + my ($lei, $cmd, $opt) = @_; + $lei->qerr("# @$cmd"); + my $ret = run_wait($cmd, undef, $opt); + $? = 0; # don't let it influence normal exit + $ret; +} + +sub start_cmd { + my ($self, $cmd, $opt, $fini) = @_; + do_reap($self); + $self->{lei}->qerr("# @$cmd"); + return if $self->{dry_run}; + $LIVE->{spawn($cmd, undef, $opt)} = [ \&reap_cmd, $self, $cmd, $fini ] +} + +sub fetch_args ($$) { + my ($lei, $opt) = @_; + my @cmd; # (git --git-dir=...) to be added by caller + $opt->{$_} = $lei->{$_} for (0..2); + # we support "-c $key=$val" for arbitrary git config options + # e.g.: git -c http.proxy=socks5h://127.0.0.1:9050 + push(@cmd, '-c', $_) for @{$lei->{opt}->{c} // []}; + push @cmd, 'fetch'; + push @cmd, '-q' if $lei->{opt}->{quiet} || + ($lei->{opt}->{jobs} // 1) > 1; + push @cmd, '-v' if $lei->{opt}->{verbose}; + push(@cmd, '-p') if $lei->{opt}->{prune}; + PublicInbox::Git::git_version() ge v2.29.0 and + push(@cmd, '--no-write-fetch-head'); + @cmd; +} + +sub upr { # feed `git update-ref --stdin -z' verbosely + my ($lei, $w, $op, @rest) = @_; # ($ref, $oid) = @rest + $lei->qerr("# $op @rest") if $lei->{opt}->{verbose}; + print $w "$op ", join("\0", @rest, '') or die "print(w): $!"; +} + +sub start_update_ref { + my ($fgrp) = @_; + pipe(my $r, my $w); + my $cmd = [ git_exe, "--git-dir=$fgrp->{cur_dst}", + qw(update-ref --stdin -z) ]; + my $pack = on_destroy \&satellite_done, $fgrp; + start_cmd($fgrp, $cmd, { 0 => $r, 2 => $fgrp->{lei}->{2} }, $pack); + close $r; + $fgrp->{dry_run} ? undef : $w; +} + +sub upref_warn { warn "E: close(update-ref --stdin): $! (need git 1.8.5+)\n" } + +sub fgrp_update { + my ($fgrp) = @_; + return if !keep_going($fgrp); + my $srcfh = delete $fgrp->{srcfh} or return; + my $dstfh = delete $fgrp->{dstfh} or return; + seek($srcfh, 0, SEEK_SET); + seek($dstfh, 0, SEEK_SET); + my %src = map { chomp; split /\0/ } PublicInbox::IO::read_all $srcfh; + my %dst = map { chomp; split /\0/ } PublicInbox::IO::read_all $dstfh; + $srcfh = $dstfh = undef; + my $w = start_update_ref($fgrp) or return; + my $lei = $fgrp->{lei}; + my $ndel; + for my $ref (keys %dst) { + my $new = delete $src{$ref}; + my $old = $dst{$ref}; + if (defined $new) { + $new eq $old or + upr($lei, $w, 'update', $ref, $new, $old); + } else { + upr($lei, $w, 'delete', $ref, $old); + ++$ndel; + } + } + # git's ref files backend doesn't allow directory/file conflicts + # between `delete' and `create' ops: + if ($ndel && scalar(keys %src)) { + $fgrp->{-create_refs} = \%src; + } else { + while (my ($ref, $oid) = each %src) { + upr($lei, $w, 'create', $ref, $oid); + } + } + $w->close or upref_warn(); +} + +sub satellite_done { + my ($fgrp) = @_; + if (my $create = delete $fgrp->{-create_refs}) { + my $w = start_update_ref($fgrp) or return; + while (my ($ref, $oid) = each %$create) { + upr($fgrp->{lei}, $w, 'create', $ref, $oid); + } + $w->close or upref_warn(); + } else { + pack_refs($fgrp, $fgrp->{cur_dst}); + run_puh($fgrp); + } +} + +sub pack_refs { + my ($self, $git_dir) = @_; + my $cmd = [git_exe, "--git-dir=$git_dir", qw(pack-refs --all --prune)]; + start_cmd($self, $cmd, { 2 => $self->{lei}->{2} }); +} + +sub unlink_fetch_head ($) { + my ($git_dir) = @_; + return if CORE::unlink("$git_dir/FETCH_HEAD") || $!{ENOENT}; + warn "W: unlink($git_dir/FETCH_HEAD): $!"; +} + +sub fgrpv_done { + my ($fgrpv) = @_; + return if !$LIVE; + my $first = $fgrpv->[0] // die 'BUG: no fgrpv->[0]'; + return if !keep_going($first); + unlink_fetch_head($first->{-osdir}) if !$first->{dry_run}; + pack_refs($first, $first->{-osdir}); # objstore refs always packed + for my $fgrp (@$fgrpv) { + my $rn = $fgrp->{-remote}; + my %opt = ( 2 => $fgrp->{lei}->{2} ); + my $update_ref = on_destroy \&fgrp_update, $fgrp; + my $src = [ git_exe, "--git-dir=$fgrp->{-osdir}", + 'for-each-ref', + "--format=refs/%(refname:lstrip=3)%00%(objectname)", + "refs/remotes/$rn/" ]; + open(my $sfh, '+>', undef); + $fgrp->{srcfh} = $sfh; + start_cmd($fgrp, $src, { %opt, 1 => $sfh }, $update_ref); + my $dst = [ git_exe, "--git-dir=$fgrp->{cur_dst}", + 'for-each-ref', '--format=%(refname)%00%(objectname)' ]; + open(my $dfh, '+>', undef); + $fgrp->{dstfh} = $dfh; + start_cmd($fgrp, $dst, { %opt, 1 => $dfh }, $update_ref); + } +} + +sub fgrp_fetch_all { + my ($self) = @_; + my $todo = $FGRP_TODO; + $FGRP_TODO = \'BUG on further use'; + keys(%$todo) or return; + + # Rely on the fgrptmp remote groups in the config file rather + # than listing all remotes since the remote name list may exceed + # system argv limits: + my $grp = 'fgrptmp'; + + my @git = (@{$self->{-torsocks}}, git_exe); + my $j = $self->{lei}->{opt}->{jobs}; + my $opt = {}; + my @fetch = do { + local $self->{lei}->{opt}->{jobs} = 1; + (fetch_args($self->{lei}, $opt), qw(--no-tags --multiple)); + }; + push(@fetch, "-j$j") if $j; + while (my ($osdir, $fgrp_old_new) = each %$todo) { + my $f = "$osdir/config"; + return if !keep_going($self); + my ($old, $new) = @$fgrp_old_new; + @$old = sort { $b->{-sort} <=> $a->{-sort} } @$old; + # $new is ordered by {references} + my $cmd = [ git_exe, "--git-dir=$osdir", qw(config -f), $f ]; + + # clobber settings from previous run atomically + for ("remotes.$grp", 'fetch.hideRefs') { + my $c = [ @$cmd, '--unset-all', $_ ]; + $self->{lei}->qerr("# @$c"); + next if $self->{dry_run}; + run_wait($c, undef, $opt); + die "E: @$c \$?=$?" if ($? && ($? >> 8) != 5); + } + + # permanent configs: + my $cfg = PublicInbox::Config->git_config_dump($f); + for my $fgrp (@$old, @$new) { + my $u = $fgrp->{-uri} // die 'BUG: no {-uri}'; + my $rn = $fgrp->{-remote} // die 'BUG: no {-remote}'; + for ("url=$u", "fetch=+refs/*:refs/remotes/$rn/*", + 'tagopt=--no-tags') { + my ($k, $v) = split(/=/, $_, 2); + $k = "remote.$rn.$k"; + next if ($cfg->{$k} // '') eq $v; + my $c = [@$cmd, $k, $v]; + $fgrp->{lei}->qerr("# @$c"); + next if $fgrp->{dry_run}; + run_die($c, undef, $opt); + } + } + + if (!$self->{dry_run}) { + # update the config atomically via O_APPEND while + # respecting git-config locking + sysopen(my $lk, "$f.lock", O_CREAT|O_EXCL|O_WRONLY); + open my $fh, '>>', $f; + $fh->autoflush(1); + my $buf = ''; + if (@$old) { + $buf = "[fetch]\n\thideRefs = refs\n"; + $buf .= join('', map { + "\thideRefs = !refs/remotes/" . + "$_->{-remote}/\n"; + } @$old); + } + $buf .= join('', "[remotes]\n", + (map { "\t$grp = $_->{-remote}\n" } @$old), + (map { "\t$grp = $_->{-remote}\n" } @$new)); + print $fh $buf or die "print($f): $!"; + close $fh; + unlink("$f.lock"); + } + $cmd = [ @git, "--git-dir=$osdir", @fetch, $grp ]; + push @$old, @$new; + my $end = on_destroy \&fgrpv_done, $old; + start_cmd($self, $cmd, $opt, $end); + } +} + +# keep this idempotent for future use by public-inbox-fetch +sub forkgroup_prep { + my ($self, $uri) = @_; + $self->{-ent} // return; + my $os = $self->{-objstore} // return; + my $fg = $self->{-ent}->{forkgroup} // return; + my $dir = "$os/$fg.git"; + if (!-d $dir && !$self->{dry_run}) { + PublicInbox::Import::init_bare($dir); + write_file '+>>', "$dir/config", <<EOM; +[repack] + useDeltaIslands = true +[pack] + island = refs/remotes/([^/]+)/ +EOM + } + my $key = $self->{-key} // die 'BUG: no -key'; + my $rn = substr(sha256_hex($key), 0, 16); + if (!-d $self->{cur_dst} && !$self->{dry_run}) { + PublicInbox::Import::init_bare($self->{cur_dst}); + write_file '+>>', "$self->{cur_dst}/config", <<EOM; +; rely on the "$rn" remote in the +; $fg fork group for fetches +; only uncomment the following iff you detach from fork groups +; [remote "origin"] +; url = $uri +; fetch = +refs/*:refs/* +; mirror = true +EOM + } + if (!$self->{dry_run}) { + my $alt = File::Spec->rel2abs("$dir/objects"); + my $o = "$self->{cur_dst}/objects"; + my $l = File::Spec->abs2rel($alt, File::Spec->rel2abs($o)); + open my $fh, '+>>', my $f = "$o/info/alternates"; + seek($fh, 0, SEEK_SET); # Perl did SEEK_END when it saw '>>' + my $seen = grep /\A\Q$l\E\n/, PublicInbox::IO::read_all $fh; + print $fh "$l\n" if !$seen; + close $fh; + } + bless { + %$self, -osdir => $dir, -remote => $rn, -uri => $uri + }, __PACKAGE__; +} + +sub fp_done { + my ($self, $cmd, $cb, @arg) = @_; + if ($?) { + $self->{lei}->err("@$cmd failed (\$?=$?) (non-fatal)"); + $? = 0; # don't let it influence normal exit + } + return if !keep_going($self); + my $fh = delete $self->{-show_ref} // die 'BUG: no show-ref output'; + sysseek($fh, 0, SEEK_SET); + $self->{-ent} // die 'BUG: no -ent'; + my $A = $self->{-ent}->{fingerprint} // die 'BUG: no fingerprint'; + my $B = sha_all(1, $fh)->hexdigest; + return $cb->($self, @arg) if $A ne $B; + $self->{lei}->qerr("# $self->{-key} up-to-date"); +} + +sub cmp_fp_do { + my ($self, $cb, @arg) = @_; + # $cb is either resume_fetch or fgrp_enqueue + $self->{-ent} // return $cb->($self, @arg); + my $new = $self->{-ent}->{fingerprint} // return $cb->($self, @arg); + my $key = $self->{-key} // die 'BUG: no -key'; + if (my $cur_ent = $self->{-local_manifest}->{$key}) { + # runs go_fetch->DESTROY run if eq + return if $cur_ent->{fingerprint} eq $new; + } + my $dst = $self->{cur_dst} // $self->{dst}; + my $cmd = [git_exe, "--git-dir=$dst", 'show-ref']; + my $opt = { 2 => $self->{lei}->{2} }; + open($opt->{1}, '+>', undef); + $self->{-show_ref} = $opt->{1}; + do_reap($self); + $self->{lei}->qerr("# @$cmd"); + $LIVE->{spawn($cmd, undef, $opt)} = [ \&fp_done, $self, $cmd, + $cb, @arg ]; +} + +sub resume_fetch { + my ($self, $uri, $fini) = @_; + return if !keep_going($self); + my $dst = $self->{cur_dst} // $self->{dst}; + my @git = (git_exe, "--git-dir=$dst"); + my $opt = { 2 => $self->{lei}->{2} }; + my $rn = 'random'.int(rand(1 << 30)); + for ("url=$uri", "fetch=+refs/*:refs/*", 'mirror=true') { + push @git, '-c', "remote.$rn.$_"; + } + my $cmd = [ @{$self->{-torsocks}}, @git, + fetch_args($self->{lei}, $opt), $rn ]; + push @$cmd, '-P' if $self->{lei}->{prune}; # --prune-tags implied + my $run_puh = on_destroy \&run_puh, $self, $fini; + ++$self->{chg}->{nr_chg}; + start_cmd($self, $cmd, $opt, $run_puh); +} + +sub fgrp_enqueue { + my ($fgrp, $end) = @_; # $end calls fgrp_fetch_all + return if !keep_going($fgrp); + ++$fgrp->{chg}->{nr_chg}; + my $dst = $FGRP_TODO->{$fgrp->{-osdir}} //= [ [], [] ]; # [ old, new ] + push @{$dst->[defined($fgrp->{-sort} ? 0 : 1)]}, $fgrp; +} + +sub clone_v1 { + my ($self, $end) = @_; + my $lei = $self->{lei}; + my $curl = $self->{curl} //= PublicInbox::LeiCurl->new($lei) or return; + my $uri = URI->new($self->{cur_src} // $self->{src}); + my $path = $uri->path; + $path =~ s!/*\z!! and $uri->path($path); + defined($lei->{opt}->{epoch}) and + die "$uri is a v1 inbox, --epoch is not supported\n"; + $self->{-torsocks} //= $curl->torsocks($lei, $uri) or return; + my $dst = $self->{cur_dst} // $self->{dst}; + my $resume = -d $dst; + if ($resume) { # respect read-only cloned w/ --epoch= + my @st = stat(_); # for root + if (!-w _ || !($st[2] & 0222)) { + warn "# skipping $dst, not writable\n"; + return; + } + } + my $fini = on_destroy \&v1_done, $self; + if (my $fgrp = forkgroup_prep($self, $uri)) { + $fgrp->{-fini} = $fini; + if ($resume) { + $fgrp->{-sort} = $fgrp->{-ent}->{modified}; + cmp_fp_do($fgrp, \&fgrp_enqueue, $end); + } else { # new repo, save for last + fgrp_enqueue($fgrp, $end); + } + } elsif ($resume) { + cmp_fp_do($self, \&resume_fetch, $uri, $fini); + } else { # normal clone + my $cmd = [ @{$self->{-torsocks}}, + clone_cmd($lei, my $opt = {}), "$uri", $dst ]; + if (defined($self->{-ent})) { + if (defined(my $ref = $self->{-ent}->{reference})) { + -e "$self->{dst}$ref" and + push @$cmd, '--reference', + "$self->{dst}$ref"; + } + } + ++$self->{chg}->{nr_chg}; + start_cmd($self, $cmd, $opt, + on_destroy(\&run_puh, $self, $fini)); + } + if (!$self->{-is_epoch} && $lei->{opt}->{'inbox-config'} =~ + /\A(?:always|v1)\z/s && + !-f "$dst/inbox.config.example") { + _get_txt_start($self, '_/text/config/raw', $fini); + } + + my $d = $self->{-ent} ? $self->{-ent}->{description} : undef; + utf8::encode($self->{'txt.description'} = $d) if defined $d; + (!defined($d) && !$end) and + _get_txt_start($self, 'description', $fini); + + $end or do_reap($self, 1); # for non-manifest clone +} + +sub parse_epochs ($$) { + my ($opt_epochs, $v2_epochs) = @_; # $epochs "LOW..HIGH" + $opt_epochs // return; # undef => all epochs + my ($lo, $dotdot, $hi, @extra) = split(/(\.\.)/, $opt_epochs); + undef($lo) if ($lo // '') eq ''; + my $re = qr/\A~?[0-9]+\z/; + if (@extra || (($lo // '0') !~ $re) || + (($hi // '0') !~ $re) || + !(grep(defined, $lo, $hi))) { + die <<EOM; +--epoch=$opt_epochs not in the form of `LOW..HIGH', `LOW..', nor `..HIGH' +EOM + } + my @n = sort { $a <=> $b } keys %$v2_epochs; + for (grep(defined, $lo, $hi)) { + if (/\A[0-9]+\z/) { + $_ > $n[-1] and die +"`$_' exceeds maximum available epoch ($n[-1])\n"; + $_ < $n[0] and die +"`$_' is lower than minimum available epoch ($n[0])\n"; + } elsif (/\A~([0-9]+)/) { + my $off = -$1 - 1; + $n[$off] // die "`$_' is out of range\n"; + $_ = $n[$off]; + } else { die "`$_' not understood\n" } + } + defined($lo) && defined($hi) && $lo > $hi and die +"low value (`$lo') exceeds high (`$hi')\n"; + $lo //= $n[0] if $dotdot; + $hi //= $n[-1] if $dotdot; + $hi //= $lo; + my $want = {}; + for ($lo..$hi) { + if (defined $v2_epochs->{$_}) { + $want->{$_} = 1; + } else { + warn +"# epoch $_ is not available (non-fatal, $lo..$hi)\n"; + } + } + $want +} + +sub init_placeholder ($$$) { + my ($src, $edst, $ent) = @_; + PublicInbox::Import::init_bare($edst); + my @owner = defined($ent->{owner}) ? (<<EOM) : (); +[gitweb] + owner = $ent->{owner} +EOM + write_file '>>', "$edst/config", <<EOM, @owner; +[remote "origin"] + url = $src + fetch = +refs/*:refs/* + mirror = true + +; This git epoch was created read-only and "public-inbox-fetch" +; will not fetch updates for it unless write permission is added. +; Hint: chmod +w $edst +EOM + my %map = (head => 'HEAD', description => undef); + while (my ($key, $fn) = each %map) { + my $val = $ent->{$key} // next; + $fn //= $key; + write_file '>', "$edst/$fn", $val; + } +} + +sub reap_cmd { # async, called via SIGCHLD + my ($self, $cmd) = @_; + my $cerr = $?; + $? = 0; # don't let it influence normal exit + $self->{lei}->child_error($cerr, "@$cmd failed (\$?=$cerr)") if $cerr; +} + +sub up_fp_done { + my ($self) = @_; + return if !keep_going($self); + my $fh = delete $self->{-show_ref_up} // die 'BUG: no show-ref output'; + sysseek($fh, 0, SEEK_SET); + $self->{-ent} // die 'BUG: no -ent'; + my $A = $self->{-ent}->{fingerprint} // die 'BUG: no fingerprint'; + my $B = sha_all(1, $fh)->hexdigest; + return if $A eq $B; + $self->{-ent}->{fingerprint} = $B; + push @{$self->{chg}->{fp_mismatch}}, $self->{-key}; +} + +sub atomic_write ($$$) { + my ($dn, $bn, $raw) = @_; + my $ft = File::Temp->new(DIR => $dn, TEMPLATE => "$bn-XXXX"); + print $ft $raw; + $ft->flush or die "flush($ft): $!"; + ft_rename($ft, "$dn/$bn", 0666); +} + +sub run_next_puh { + my ($self) = @_; + my $puh = shift @{$self->{-puh_todo}} // return delete($self->{-fini}); + my $fini = on_destroy \&run_next_puh, $self; + my $cmd = [ @$puh, ($self->{cur_dst} // $self->{dst}) ]; + my $opt = +{ map { $_ => $self->{lei}->{$_} } (0..2) }; + start_cmd($self, $cmd, undef, $opt, $fini); +} + +sub run_puh { + my ($self, $fini) = @_; + $self->{-fini} = $fini; + @{$self->{-puh_todo}} = @PUH; + run_next_puh($self); +} + +# modifies the to-be-written manifest entry, and sets values from it, too +sub update_ent { + my ($self) = @_; + my $key = $self->{-key} // die 'BUG: no -key'; + my $new = $self->{-ent}->{fingerprint}; + my $cur = $self->{-local_manifest}->{$key}->{fingerprint} // "\0"; + my $dst = $self->{cur_dst} // $self->{dst}; + if (defined($new) && $new ne $cur) { + my $cmd = [git_exe, "--git-dir=$dst", 'show-ref']; + my $opt = { 2 => $self->{lei}->{2} }; + open($opt->{1}, '+>', undef); + $self->{-show_ref_up} = $opt->{1}; + my $done = on_destroy \&up_fp_done, $self; + start_cmd($self, $cmd, $opt, $done); + } + $new = $self->{-ent}->{head}; + $cur = $self->{-local_manifest}->{$key}->{head} // "\0"; + if (defined($new) && $new ne $cur) { + # n.b. grokmirror writes raw contents to $dst/HEAD w/o locking + my $cmd = [ git_exe, "--git-dir=$dst" ]; + if ($new =~ s/\Aref: //) { + push @$cmd, qw(symbolic-ref HEAD), $new; + } elsif ($new =~ /\A[a-f0-9]{40,}\z/) { + push @$cmd, qw(update-ref --no-deref HEAD), $new; + } else { + undef $cmd; + warn "W: $key: {head} => `$new' not understood\n"; + } + start_cmd($self, $cmd, { 2 => $self->{lei}->{2} }) if $cmd; + } + if (my $symlinks = $self->{-ent}->{symlinks}) { + my $top = File::Spec->rel2abs($self->{dst}); + push @{$self->{-new_symlinks}}, @$symlinks; + for my $p (@$symlinks) { + my $ln = "$top/$p"; + $ln =~ tr!/!/!s; + my (undef, $dn, $bn) = File::Spec->splitpath($ln); + File::Path::mkpath($dn); + my $tgt = "$top/$key"; + $tgt = File::Spec->abs2rel($tgt, $dn); + if (lstat($ln)) { + if (-l _) { + next if readlink($ln) eq $tgt; + unlink($ln); + } else { + push @{$self->{chg}->{badlink}}, $p; + } + } + symlink($tgt, $ln); + ++$self->{chg}->{nr_chg}; + } + } + if (defined(my $t = $self->{-ent}->{modified})) { + my ($dn, $bn) = ("$dst/info/web", 'last-modified'); + my $orig = PublicInbox::IO::try_cat("$dn/$bn"); + $t = strftime('%F %T', gmtime($t))." +0000\n"; + File::Path::mkpath($dn); + atomic_write($dn, $bn, $t) if $orig ne $t; + } + + $new = $self->{-ent}->{owner} // return; + $cur = $self->{-local_manifest}->{$key}->{owner} // "\0"; + return if $cur eq $new; + utf8::encode($new); # to octets + my $cmd = [ git_exe, qw(config -f), "$dst/config", + 'gitweb.owner', $new ]; + start_cmd($self, $cmd, { 2 => $self->{lei}->{2} }); +} + +sub v1_done { # called via OnDestroy + my ($self) = @_; + return if $self->{dry_run} || !keep_going($self); + _write_inbox_config($self); + my $dst = $self->{cur_dst} // $self->{dst}; + unlink_fetch_head($dst); + update_ent($self) if $self->{-ent}; + my $o = "$dst/objects"; + if (CORE::open(my $fh, '<', my $fn = "$o/info/alternates")) {; + my $base = File::Spec->rel2abs($o); + my @l = <$fh>; + my $ft; + for (@l) { + next unless m!\A/!; + $_ = File::Spec->abs2rel($_, $base); + $ft //= File::Temp->new(TEMPLATE => '.XXXX', + DIR => "$o/info"); + } + if ($ft) { + print $ft @l; + $ft->flush or die "flush($ft): $!"; + ft_rename($ft, $fn, 0666, $fh); + } + } + eval { set_description($self) }; + warn $@ if $@; + return if ($self->{-is_epoch} || + $self->{lei}->{opt}->{'inbox-config'} ne 'always'); + write_makefile($dst, 1); + index_cloned_inbox($self, 1); +} + +sub v2_done { # called via OnDestroy + my ($self) = @_; + return if $self->{dry_run} || !keep_going($self); + my $dst = $self->{cur_dst} // $self->{dst}; + require PublicInbox::Lock; + my $lk = PublicInbox::Lock->new("$dst/inbox.lock"); + my $lck = $lk->lock_for_scope; + _write_inbox_config($self); + require PublicInbox::MultiGit; + my $mg = PublicInbox::MultiGit->new($dst, 'all.git', 'git'); + $mg->fill_alternates; + for my $i ($mg->git_epochs) { $mg->epoch_cfg_set($i) } + for my $edst (@{delete($self->{-read_only}) // []}) { + my @st = stat($edst) or die "stat($edst): $!"; + chmod($st[2] & 0555, $edst); + } + write_makefile($dst, 2); + undef $lck; # unlock + eval { set_description($self) }; + warn $@ if $@; + index_cloned_inbox($self, 2); +} + +sub clone_v2_prep ($$;$) { + my ($self, $v2_epochs, $m) = @_; # $m => manifest.js.gz hashref + my $lei = $self->{lei}; + my $curl = $self->{curl} //= PublicInbox::LeiCurl->new($lei) or return; + my $first_uri = (map { $_->[0] } values %$v2_epochs)[0]; + $self->{-torsocks} //= $curl->torsocks($lei, $first_uri) or return; + my $dst = $self->{cur_dst} // $self->{dst}; + my $want = parse_epochs($lei->{opt}->{epoch}, $v2_epochs); + my $task = $m ? bless { %$self }, __PACKAGE__ : $self; + my (@skip, $desc); + my $fini = on_destroy \&v2_done, $task; + for my $nr (sort { $a <=> $b } keys %$v2_epochs) { + my ($uri, $key) = @{$v2_epochs->{$nr}}; + my $src = $uri->as_string; + my $edst = $dst; + $src =~ m!/([0-9]+)(?:\.git)?\z! or die <<""; +failed to extract epoch number from $src + + $1 + 0 == $nr or die "BUG: <$uri> miskeyed $1 != $nr"; + $edst .= "/git/$nr.git"; + my $ent; + if ($m) { + $ent = $m->{$key} // + die("BUG: `$key' not in manifest.js.gz"); + if (defined(my $d = $ent->{description})) { + $d =~ s/ \[epoch [0-9]+\]\z//s; + $desc = $d; + } + } + if (!$want || $want->{$nr}) { + my $etask = bless { %$task, -key => $key }, __PACKAGE__; + $etask->{-ent} = $ent; # may have {reference} + $etask->{cur_src} = $src; + $etask->{cur_dst} = $edst; + $etask->{-is_epoch} = $fini; + my $ref = $ent->{reference} // ''; + push @{$TODO->{$ref}}, $etask; + $self->{any_want}->{$key} = 1; + } else { # create a placeholder so users only need to chmod +w + init_placeholder($src, $edst, $ent); + push @{$task->{-read_only}}, $edst; + push @skip, $key; + } + } + # filter out the epochs we skipped + $self->{chg}->{manifest} = 1 if $m && delete(@$m{@skip}); + + $self->{dry_run} or File::Path::mkpath($dst); + + if ($lei->{opt}->{'inbox-config'} =~ /\A(?:always|v2)\z/s && + !-f "$dst/inbox.config.example") { + _get_txt_start($task, '_/text/config/raw', $fini); + } + + defined($desc) ? ($task->{'txt.description'} = $desc) : + _get_txt_start($task, 'description', $fini); +} + +sub decode_manifest ($$$) { + my ($fh, $fn, $uri) = @_; + my $js; + my $gz = PublicInbox::IO::read_all $fh; + gunzip(\$gz => \$js, MultiStream => 1) or + die "gunzip($uri): $GunzipError\n"; + my $m = eval { PublicInbox::Config->json->decode($js) }; + die "$uri: error decoding `$js': $@\n" if $@; + ref($m) eq 'HASH' or die "$uri unknown type: ".ref($m); + $m; +} + +sub load_current_manifest ($) { + my ($self) = @_; + my $fn = $self->{-manifest} // return; + if (CORE::open(my $fh, '<', $fn)) { + decode_manifest($fh, $fn, $fn); + } elsif ($!{ENOENT}) { # non-fatal, we can just do it slowly + $self->{-initial_clone} or + warn "W: open($fn): $! (non-fatal)\n"; + undef; + } else { + die "E: open($fn): $!\n"; + } +} + +sub multi_inbox ($$$) { + my ($self, $path, $m) = @_; + my $incl = $self->{lei}->{opt}->{include}; + my $excl = $self->{lei}->{opt}->{exclude}; + + # assuming everything not v2 is v1, for now + my @v1 = sort grep(!m!.+/git/[0-9]+\.git\z!, keys %$m); + my @v2_epochs = sort grep(m!.+/git/[0-9]+\.git\z!, keys %$m); + my $v2 = {}; + + for (@v2_epochs) { + m!\A(/.+)/git/[0-9]+\.git\z! or die "BUG: $_"; + push @{$v2->{$1}}, $_; + } + my $n = scalar(keys %$v2) + scalar(@v1); + my @orig = defined($incl // $excl) ? (keys %$v2, @v1) : (); + if (defined $incl) { + my $re = '(?:'.join('\\z|', map { + glob2re($_) // qr/\A\Q$_\E/ + } @$incl).'\\z)'; + my @gone = delete @$v2{grep(!/$re/, keys %$v2)}; + delete @$m{map { @$_ } @gone} and $self->{chg}->{manifest} = 1; + delete @$m{grep(!/$re/, @v1)} and $self->{chg}->{manifest} = 1; + @v1 = grep(/$re/, @v1); + } + if (defined $excl) { + my $re = '(?:'.join('\\z|', map { + glob2re($_) // qr/\A\Q$_\E/ + } @$excl).'\\z)'; + my @gone = delete @$v2{grep(/$re/, keys %$v2)}; + delete @$m{map { @$_ } @gone} and $self->{chg}->{manifest} = 1; + delete @$m{grep(/$re/, @v1)} and $self->{chg}->{manifest} = 1; + @v1 = grep(!/$re/, @v1); + } + my $ret; # { v1 => [ ... ], v2 => { "/$inbox_name" => [ epochs ] }} + $ret->{v1} = \@v1 if @v1; + $ret->{v2} = $v2 if keys %$v2; + $ret //= @orig ? "Nothing to clone, available repositories:\n\t". + join("\n\t", sort @orig) + : "Nothing available to clone\n"; + my $path_pfx = ''; + + # PSGI mount prefixes and manifest.js.gz prefixes don't always align... + if (@v2_epochs) { + until (grep(m!\A\Q$$path\E/git/[0-9]+\.git\z!, + @v2_epochs) == @v2_epochs) { + $$path =~ s!\A(/[^/]+)/!/! or last; + $path_pfx .= $1; + } + } elsif (@v1) { + while (!defined($m->{$$path}) && $$path =~ s!\A(/[^/]+)/!/!) { + $path_pfx .= $1; + } + } + ($path_pfx, $n, $ret); +} + +sub clone_all { + my ($self, $m) = @_; + my $todo = $TODO; + $TODO = \'BUG on further use'; + my $end = on_destroy \&fgrp_fetch_all, $self; + { + my $nodep = delete $todo->{''}; + + # do not download unwanted deps + my $any_want = delete $self->{any_want}; + my @unwanted = grep { !$any_want->{$_} } keys %$todo; + my @nodep = delete(@$todo{@unwanted}); + push(@$nodep, @$_) for @nodep; + + # handle no-dependency repos, first + for (@$nodep) { + clone_v1($_, $end); + return if !keep_going($self); + } + } + # resolve references, deepest, first: + while (scalar keys %$todo) { + for my $x (keys %$todo) { + my ($nr, $nxt); + # resolve multi-level references + while ($m && defined($nxt = $m->{$x}->{reference})) { + exists($todo->{$nxt}) or last; + if (++$nr > 1000) { + $m->{$x}->{reference} = undef; + $m->{$nxt}->{reference} = undef; + warn <<EOM +E: dependency loop detected (`$x' => `$nxt'), breaking +EOM + } + $x = $nxt; + } + my $y = delete $todo->{$x} // next; # already done + for (@$y) { + clone_v1($_, $end); + return if !keep_going($self); + } + last; # restart %$todo iteration + } + } + + # $end->DESTROY will call fgrp_fetch_all once all references + # in $LIVE are gone, and do_reap will eventually drain $LIVE + $end = undef; + do_reap($self, 1); +} + +sub dump_manifest ($$) { + my ($m, $ft) = @_; + # write the smaller manifest if epochs were skipped so + # users won't have to delete manifest if they +w an + # epoch they no longer want to skip + my $json = PublicInbox::Config->json->encode($m); + my $mtime = (stat($ft))[9]; + seek($ft, 0, SEEK_SET); + truncate($ft, 0); + gzip(\$json => $ft) or die "gzip($ft): $GzipError"; + $ft->flush or die "flush($ft): $!"; + utime($mtime, $mtime, "$ft") or die "utime(..., $ft): $!"; +} + +sub dump_project_list ($$) { + my ($self, $m) = @_; + my $f = $self->{'-project-list'}; + my $old = defined($f) ? PublicInbox::IO::try_cat($f) : ''; + my %new; + + open my $dh, '<', '.'; + if (!$self->{dry_run} || -d $self->{dst}) { + chdir($self->{dst}); + } + my @local = grep { -e $_ ? ($new{$_} = undef) : 1 } split(/\n/s, $old); + chdir($dh); + + $new{substr($_, 1)} = 1 for keys %$m; # drop leading '/' + my @list = sort keys %new; + my @remote = grep { !defined($new{$_}) } @list; + my %lnk = map { substr($_, 1) => undef } @{$self->{-new_symlinks}}; + @remote = grep { !exists($lnk{$_}) } @remote; + + if (@remote) { + warn <<EOM; +The following local repositories are ignored/gone from $self->{src}: +EOM + warn "\t", $_, "\n" for @remote; + + if ($self->{lei}->{opt}->{purge} && !$self->{dry_run}) { + my $o = {}; + $o->{verbose} = 1 if $self->{lei}->{opt}->{verbose}; + my $dst = $self->{dst}; + File::Path::remove_tree(map { "$dst/$_" } @remote, $o); + my %rm = map { $_ => undef } @remote; + @list = grep { !exists($rm{$_}) } @list; + $self->{lei}->qerr('# purged '); + } + } + if (defined($f) && @local) { + warn <<EOM; +The following repos in $f no longer exist on the filesystem: +EOM + warn "\t", $_, "\n" for @local; + } + $self->{chg}->{nr_chg} += scalar(@remote) + scalar(@local); + return if !defined($f) || $self->{dry_run}; + my (undef, $dn, $bn) = File::Spec->splitpath($f); + my $new = join("\n", @list, ''); + atomic_write($dn, $bn, $new) if $new ne $old; +} + +# FIXME: this gets confused by single inbox instance w/ global manifest.js.gz +sub try_manifest { + my ($self) = @_; + my $uri = URI->new($self->{src}); + my $lei = $self->{lei}; + my $curl = $self->{curl} //= PublicInbox::LeiCurl->new($lei) or return; + $self->{-torsocks} //= $curl->torsocks($lei, $uri) or return; + my $path = $uri->path; + chop($path) eq '/' or die "BUG: $uri not canonicalized"; + my $rmf = $lei->{opt}->{'remote-manifest'} // '/manifest.js.gz'; + if ($rmf =~ m!\A[^/:]+://!) { + $uri = URI->new($rmf); + } else { + $rmf = "/$rmf" if index($rmf, '/') != 0; + $uri->path($path.$rmf); + } + my $manifest = $self->{-manifest} // "$self->{dst}/manifest.js.gz"; + my %opt = (UNLINK => 1, SUFFIX => '.tmp', TMPDIR => 1); + if (!$self->{dry_run} && $manifest =~ m!\A(.+?)/[^/]+\z! and -d $1) { + $opt{DIR} = $1; # allows fast rename(2) w/o EXDEV + delete $opt{TMPDIR}; + } + my $ft = File::Temp->new(TEMPLATE => '.manifest-XXXX', %opt); + my $cmd = $curl->for_uri($lei, $uri, qw(-R -o), $ft->filename); + push(@$cmd, '-z', $manifest) if -f $manifest; + my $mf_url = "$uri"; + %opt = map { $_ => $lei->{$_} } (0..2); + my $cerr = run_reap($lei, $cmd, \%opt); + if ($cerr) { + return try_scrape($self) if ($cerr >> 8) == 22; # 404 missing + return $lei->child_error($cerr, "@$cmd failed"); + } + + # bail out if curl -z/--timecond hit 304 Not Modified, $ft will be empty + if (-f $manifest && !-s $ft) { + $lei->child_error(127 << 8) if $lei->{opt}->{'exit-code'}; + return $lei->qerr("# $manifest unchanged"); + } + + my $m = eval { decode_manifest($ft, $ft, $uri) }; + if ($@) { + warn $@; + return try_scrape($self); + } + local $self->{chg} = {}; + local $self->{-local_manifest} = load_current_manifest($self); + local $self->{-new_symlinks} = []; + my ($path_pfx, $n, $multi) = multi_inbox($self, \$path, $m); + return $lei->child_error(0, $multi) if !ref($multi); + my $v2 = delete $multi->{v2}; + if ($v2) { + for my $name (sort keys %$v2) { + my $epochs = delete $v2->{$name}; + my %v2_epochs = map { + $uri->path($n > 1 ? $path_pfx.$path.$_ + : $path_pfx.$_); + my ($e) = ("$uri" =~ m!/([0-9]+)\.git\z!); + $e // die "no [0-9]+\.git in `$uri'"; + $e => [ $uri->clone, $_ ]; + } @$epochs; + ("$uri" =~ m!\A(.+/)git/[0-9]+\.git\z!) or + die "BUG: `$uri' !~ m!/git/[0-9]+.git!"; + local $self->{cur_src} = $1; + local $self->{cur_dst} = $self->{dst}; + if ($n > 1 && $uri->path =~ m!\A\Q$path_pfx$path\E/(.+)/ + git/[0-9]+\.git\z!x) { + $self->{cur_dst} .= "/$1"; + } + index($self->{cur_dst}, "\n") >= 0 and die <<EOM; +E: `$self->{cur_dst}' must not contain newline +EOM + clone_v2_prep($self, \%v2_epochs, $m); + return if !keep_going($self); + } + } + if (my $v1 = delete $multi->{v1}) { + my $p = $path_pfx.$path; + chop($p) if substr($p, -1, 1) eq '/'; + $uri->path($p); + for my $name (@$v1) { + my $task = bless { %$self }, __PACKAGE__; + $task->{-ent} = $m->{$name} // + die("BUG: no `$name' in manifest"); + $task->{cur_src} = "$uri"; + $task->{cur_dst} = $task->{dst}; + $task->{-key} = $name; + if ($n > 1) { + $task->{cur_dst} .= $name; + $task->{cur_src} .= $name; + } + index($task->{cur_dst}, "\n") >= 0 and die <<EOM; +E: `$task->{cur_dst}' must not contain newline +EOM + $task->{cur_src} .= '/'; + my $dep = $task->{-ent}->{reference} // ''; + push @{$TODO->{$dep}}, $task; # for clone_all + $self->{any_want}->{$name} = 1; + } + } + delete local $lei->{opt}->{epoch} if defined($v2); + clone_all($self, $m); + return if !keep_going($self); + + # set by clone_v2_prep/-I/--exclude + my $mis = delete $self->{chg}->{fp_mismatch}; + if ($mis) { + my $t = fmt_localtime((stat($ft))[9]); + warn <<EOM; +W: Fingerprints for the following repositories do not match +W: $mf_url @ $t: +W: These repositories may have updated since $t: +EOM + warn "\t", $_, "\n" for @$mis; + warn <<EOM if !$self->{lei}->{opt}->{prune}; +W: The above fingerprints may never match without --prune +EOM + } + if ((delete($self->{chg}->{manifest}) || $mis) && !$self->{dry_run}) { + dump_manifest($m => $ft); + } + my $bad = delete $self->{chg}->{badlink}; + warn(<<EOM, map { ("\t", $_, "\n") } @$bad) if $bad; +W: The following exist and have not been converted to symlinks +EOM + dump_project_list($self, $m); + ft_rename($ft, $manifest, 0666) if !$self->{dry_run}; + !$self->{chg}->{nr_chg} && $lei->{opt}->{'exit-code'} and + $lei->child_error(127 << 8); +} + +sub start_clone_url { + my ($self) = @_; + return try_manifest($self) if $self->{src} =~ m!\Ahttps?://!; + die "TODO: non-HTTP/HTTPS clone of $self->{src} not supported, yet"; +} + +sub do_mirror { # via wq_io_do or public-inbox-clone + my ($self) = @_; + my $lei = $self->{lei}; + $self->{dry_run} = 1 if $lei->{opt}->{'dry-run'}; + umask($lei->{client_umask}) if defined $lei->{client_umask}; + $self->{-initial_clone} = 1 if !-d $self->{dst}; + local @PUH; + if (defined(my $puh = $lei->{opt}->{'post-update-hook'})) { + require Text::ParseWords; + @PUH = map { [ Text::ParseWords::shellwords($_) ] } @$puh; + } + eval { + my $ic = $lei->{opt}->{'inbox-config'} //= 'always'; + $ic =~ /\A(?:v1|v2|always|never)\z/s or die <<""; +--inbox-config must be one of `always', `v2', `v1', or `never' + + # we support these switches with '' (empty string). + # defaults match example conf distributed with grokmirror + my @pairs = qw(objstore objstore manifest manifest.js.gz + project-list projects.list); + while (@pairs) { + my ($k, $default) = splice(@pairs, 0, 2); + my $v = $lei->{opt}->{$k} // next; + $v = $default if $v eq ''; + $v = "$self->{dst}/$v" if $v !~ m!\A\.{0,2}/!; + $self->{"-$k"} = $v; + } + + local $LIVE = {}; + local $TODO = {}; + local $FGRP_TODO = {}; + my $iv = $lei->{opt}->{'inbox-version'} // + return start_clone_url($self); + return clone_v1($self) if $iv == 1; + die "bad --inbox-version=$iv\n" if $iv != 2; + die <<EOM if $self->{src} !~ m!://!; +cloning local v2 inboxes not supported +EOM + try_scrape($self, 1); + }; + $lei->fail($@) if $@; +} + +sub start { + my ($cls, $lei, $src, $dst) = @_; + my $self = bless { src => $src, dst => $dst }, $cls; + $lei->request_umask; + my ($op_c, $ops) = $lei->workers_start($self, 1); + $lei->{wq1} = $self; + $self->wq_io_do('do_mirror', []); + $self->wq_close; + $lei->wait_wq_events($op_c, $ops); +} + +sub ipc_atfork_child { + my ($self) = @_; + $self->{lei}->_lei_atfork_child; + $self->SUPER::ipc_atfork_child; +} + +sub write_makefile { + my ($dir, $ibx_ver) = @_; + my $f = "$dir/Makefile"; + if (CORE::sysopen my $fh, $f, O_CREAT|O_EXCL|O_WRONLY) { + print $fh <<EOM; +# This is a v$ibx_ver public-inbox, see the public-inbox-v$ibx_ver-format(5) +# manpage for more information on the format. This Makefile is +# intended as a familiar wrapper for users unfamiliar with +# public-inbox-* commands. +# +# See the respective manpages for public-inbox-fetch(1), +# public-inbox-index(1), etc for more information on +# some of the commands used by this Makefile. +# +# This Makefile will not be modified nor read by public-inbox, +# so you may edit it freely with your own convenience targets +# and notes. public-inbox-fetch will recreate it if removed. +EOM + print $fh <<'EOM'; +# the default target: +help : + @echo Common targets: + @echo ' make fetch - fetch from remote git repostorie(s)' + @echo ' make update - fetch and update index ' + @echo + @echo Rarely needed targets: + @echo ' make reindex - may be needed for new features/bugfixes' + @echo ' make compact - rewrite Xapian storage to save space' + @echo ' make index - initial index after clone' + +fetch : + public-inbox-fetch +update : + @if ! public-inbox-fetch --exit-code; \ + then \ + c=$$?; \ + test $$c -eq 127 && exit 0; \ + exit $$c; \ + elif test -f msgmap.sqlite3 || test -f public-inbox/msgmap.sqlite3; \ + then \ + public-inbox-index; \ + else \ + echo 'public-inbox index not initialized'; \ + echo 'see public-inbox-index(1) man page'; \ + fi +index : + public-inbox-index +reindex : + public-inbox-index --reindex +compact : + public-inbox-compact + +.PHONY : help fetch update index reindex compact +EOM + close $fh; + } else { + die "open($f): $!" unless $!{EEXIST}; + } +} + +1; diff --git a/lib/PublicInbox/LeiNoteEvent.pm b/lib/PublicInbox/LeiNoteEvent.pm new file mode 100644 index 00000000..8d900d0c --- /dev/null +++ b/lib/PublicInbox/LeiNoteEvent.pm @@ -0,0 +1,133 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# internal command for dealing with inotify, kqueue vnodes, etc +# it is a semi-persistent worker +package PublicInbox::LeiNoteEvent; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC); +use PublicInbox::DS; +use Errno qw(ENOENT); + +our $to_flush; # { cfgpath => $lei } + +sub flush_lei ($;$) { + my ($lei, $manual) = @_; + my $lne = delete $lei->{cfg}->{-lei_note_event} // return; + $lne->{lei_sock} = $lei->{sock} if $manual; + $lne->wq_close; # runs _lei_wq_eof; +} + +# we batch up writes and flush every 5s (matching Linux default +# writeback behavior) since MUAs can trigger a storm of inotify events +sub flush_task { # PublicInbox::DS timer callback + my $todo = $to_flush // return; + $to_flush = undef; + for my $lei (values %$todo) { flush_lei($lei) } +} + +sub eml_event ($$$$) { + my ($self, $eml, $vmd, $state) = @_; + my $sto = $self->{lei}->{sto}; + if ($state =~ /\Aimport-(?:rw|ro)\z/) { + $sto->wq_do('set_eml', $eml, $vmd); + } elsif ($state =~ /\Aindex-(?:rw|ro)\z/) { + my $xoids = $self->{lei}->ale->xoids_for($eml); + $sto->wq_do('index_eml_only', $eml, $vmd, $xoids); + } elsif ($state =~ /\Atag-(?:rw|ro)\z/) { + my $docids = []; + my $c = eval { + $self->{lse}->kw_changed($eml, $vmd->{kw}, $docids); + } // 1; # too new, assume changed since still to-be-committed. + if (scalar @$docids) { # already in lei/store + $sto->wq_do('set_eml_vmd', undef, $vmd, $docids) if $c; + } elsif (my $xoids = $self->{lei}->ale->xoids_for($eml)) { + # it's in an external, only set kw, here + $sto->wq_do('set_xvmd', $xoids, $eml, $vmd); + } # else { totally unknown: ignore + } else { + warn "unknown state: $state (in $self->{lei}->{cfg}->{'-f'})\n"; + } +} + +sub maildir_event { # via wq_nonblock_do + my ($self, $fn, $vmd, $state) = @_; + if (my $eml = PublicInbox::InboxWritable::eml_from_path($fn)) { + eml_event($self, $eml, $vmd, $state); + } elsif ($! == ENOENT) { + $self->{lms}->clear_src(@{$vmd->{sync_info}}); + } # else: eml_from_path already warns +} + +sub _mh_cb { # mh_read_one cb + my ($dir, $bn, $kw, $eml, $self, $state) = @_; +} + +sub mh_event { # via wq_nonblock_do + my ($self, $folder, $bn, $state) = @_; + my $dir = substr($folder, 3); + require PublicInbox::MHreader; # if we forked early + my $mhr = PublicInbox::MHreader->new($dir, $self->{lei}->{3}); + $mhr->mh_read_one($bn, \&_mh_cb, $self, $state); +} + +sub lei_note_event { + my ($lei, $folder, $new_cur, $bn, $fn, @rest) = @_; + die "BUG: unexpected: @rest" if @rest; + my $cfg = $lei->_lei_cfg or return; # gone (race) + my $sto = $lei->_lei_store or return; # gone + return flush_lei($lei, 1) if $folder eq 'done'; # special case + my $lms = $lei->lms or return; + $lms->lms_write_prepare if $new_cur eq ''; # for ->clear_src below + $lei->{opt}->{quiet} = 1; + $lms->arg2folder($lei, [ $folder ]); + my $state = $cfg->get_1("watch.$folder.state") // 'tag-rw'; + return if $state eq 'pause'; + if ($new_cur eq '') { + my $id = $folder =~ /\Amaildir:/ ? \$bn : $bn + 0; + return $lms->clear_src($folder, $id); + } + $lms->lms_pause; + $lei->ale; # prepare + $sto->write_prepare($lei); + require PublicInbox::MHreader if $folder =~ /\Amh:/; # optimistic + my $self = $cfg->{-lei_note_event} //= do { + my $wq = bless { lms => $lms }, __PACKAGE__; + # MUAs such as mutt can trigger massive rename() storms so + # use some CPU, but don't overwhelm slower storage, either + my $jobs = $wq->detect_nproc // 1; + $jobs = 4 if $jobs > 4; # same default as V2Writable + my ($op_c, $ops) = $lei->workers_start($wq, $jobs); + $lei->wait_wq_events($op_c, $ops); + PublicInbox::DS::add_uniq_timer('flush_timer', 5, \&flush_task); + $to_flush->{$lei->{cfg}->{'-f'}} //= $lei; + $wq->prepare_nonblock; + $lei->{lne} = $wq; + }; + if ($folder =~ /\Amaildir:/i) { + require PublicInbox::MdirReader; + my $fl = PublicInbox::MdirReader::maildir_basename_flags($bn) + // return; + return if index($fl, 'T') >= 0; + my $kw = PublicInbox::MdirReader::flags2kw($fl); + my $vmd = { kw => $kw, sync_info => [ $folder, \$bn ] }; + $self->wq_nonblock_do('maildir_event', $fn, $vmd, $state); + } elsif ($folder =~ /\Amh:/) { + $self->wq_nonblock_do('mh_event', $folder, $bn, $state); + } # else: TODO: imap +} + +sub ipc_atfork_child { + my ($self) = @_; + $self->{lei}->_lei_atfork_child(1); # persistent, for a while + $self->{lms}->lms_write_prepare; + $self->{lse} = $self->{lei}->{sto}->search; + $self->SUPER::ipc_atfork_child; +} + +sub _lei_wq_eof { # EOF callback for main lei daemon + $_[0]->wq_eof('lne'); +} + +1; diff --git a/lib/PublicInbox/LeiOverview.pm b/lib/PublicInbox/LeiOverview.pm new file mode 100644 index 00000000..0529bbe4 --- /dev/null +++ b/lib/PublicInbox/LeiOverview.pm @@ -0,0 +1,263 @@ +# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# per-mitem/smsg iterators for search results +# "ovv" => "Overview viewer" +package PublicInbox::LeiOverview; +use strict; +use v5.10.1; +use parent qw(PublicInbox::Lock); +use POSIX qw(strftime); +use Fcntl qw(F_GETFL O_APPEND); +use File::Spec; +use File::Temp (); +use PublicInbox::MID qw($MID_EXTRACT); +use PublicInbox::Address qw(pairs); +use PublicInbox::Config; +use PublicInbox::Search qw(get_pct); +use PublicInbox::LeiDedupe; +use PublicInbox::LeiToMail; + +# cf. https://en.wikipedia.org/wiki/JSON_streaming +my $JSONL = 'ldjson|ndjson|jsonl'; # 3 names for the same thing + +sub iso8601 ($) { strftime('%Y-%m-%dT%H:%M:%SZ', gmtime($_[0])) } + +# we open this in the parent process before ->wq_io_do handoff +sub ovv_out_lk_init ($) { + my ($self) = @_; + my $tmp = File::Temp->new("lei-ovv.dst.$$.lock-XXXX", + TMPDIR => 1, UNLINK => 0); + $self->{"lk_id.$self.$$"} = $self->{lock_path} = $tmp->filename; +} + +sub ovv_out_lk_cancel ($) { + my ($self) = @_; + my $lock_path = delete $self->{"lk_id.$self.$$"} or return; + unlink($lock_path); +} + +sub detect_fmt ($) { + my ($dst) = @_; + if ($dst =~ m!\A([:/]+://)!) { + die "$1 support not implemented, yet\n"; + } elsif (!-e $dst || -d _) { # maildir is the default TODO: MH + -e "$dst/inbox.lock" ? 'v2' : 'maildir'; + } elsif (-f _ || -p _) { + die "unable to determine mbox family of $dst\n"; + } else { + die "unable to determine format of $dst\n"; + } +} + +sub new { + my ($class, $lei, $ofmt_key) = @_; + my $opt = $lei->{opt}; + my $dst = $opt->{output} // '-'; + $dst = '/dev/stdout' if $dst eq '-'; + $ofmt_key //= 'format'; + + my $fmt = $opt->{$ofmt_key}; + $fmt = lc($fmt) if defined $fmt; + if ($dst =~ m!\A([a-z0-9\+]+)://!is) { + defined($fmt) and die <<""; +--$ofmt_key=$fmt invalid with URL $dst + + $fmt = lc $1; + } elsif ($dst =~ s/\A([a-z0-9]+)://is) { # e.g. Maildir:/home/user/Mail/ + my $ofmt = lc $1; + $fmt //= $ofmt; + die <<"" if $fmt ne $ofmt; +--$ofmt_key=$fmt and --output=$ofmt conflict + + } + my $devfd = $lei->path_to_fd($dst) // return; + $fmt //= $devfd >= 0 ? 'json' : detect_fmt($dst); + + if (index($dst, '://') < 0) { # not a URL, so assume path + $dst = $lei->canonpath_harder($dst); + } # else URL + + my $self = bless { fmt => $fmt, dst => $dst }, $class; + $lei->{ovv} = $self; + my $json; + if ($fmt =~ /\A($JSONL|(?:concat)?json)\z/) { + $json = $self->{json} = ref(PublicInbox::Config->json); + } + if ($devfd >= 0) { + my $isatty = $lei->{need_pager} = -t $lei->{$devfd}; + $opt->{pretty} //= $isatty; + if (!$isatty && -f _) { + my $fl = fcntl($lei->{$devfd}, F_GETFL, 0) // + die("fcntl(/dev/fd/$devfd): $!\n"); + ovv_out_lk_init($self) unless ($fl & O_APPEND); + } else { + ovv_out_lk_init($self); + } + } elsif (!$opt->{quiet}) { + $lei->{-progress} = 1; + } + if ($json) { + $lei->{dedupe} //= PublicInbox::LeiDedupe->new($lei); + } else { + $lei->{l2m} = PublicInbox::LeiToMail->new($lei); + if ($opt->{mua} && $lei->{l2m}->lock_free) { + $lei->{early_mua} = 1; + $opt->{alert} //= [ ':WINCH,:bell' ] if -t $lei->{1}; + } + } + die("--shared is only for v2 inbox output\n") if + $self->{fmt} ne 'v2' && $lei->{opt}->{shared}; + $self; +} + +# called once by parent +sub ovv_begin { + my ($self, $lei) = @_; + if ($self->{fmt} eq 'json') { + $lei->out('['); + } # TODO HTML/Atom/... +} + +# called once by parent (via PublicInbox::PktOp '' => query_done) +sub ovv_end { + my ($self, $lei) = @_; + if ($self->{fmt} eq 'json') { + # JSON doesn't allow trailing commas, and preventing + # trailing commas is a PITA when parallelizing outputs + $lei->out("null]\n"); + } elsif ($self->{fmt} eq 'concatjson') { + $lei->out("\n"); + } +} + +# prepares an smsg for JSON +sub _unbless_smsg { + my ($smsg, $mitem) = @_; + + # TODO: make configurable + # num/tid are nonsensical with multi-inbox search, + # lines/bytes are not generally useful + delete @$smsg{qw(num tid lines bytes)}; + $smsg->{rt} = iso8601(delete $smsg->{ts}); # JMAP receivedAt + $smsg->{dt} = iso8601(delete $smsg->{ds}); # JMAP UTCDate + $smsg->{pct} = get_pct($mitem) if $mitem; + if (my $r = delete $smsg->{references}) { + @{$smsg->{refs}} = ($r =~ m/$MID_EXTRACT/go); + } + if (my $m = delete($smsg->{mid})) { + $smsg->{'m'} = $m; + } + for my $f (qw(from to cc)) { + my $v = delete $smsg->{$f} or next; + $smsg->{substr($f, 0, 1)} = pairs($v); + } + $smsg->{'s'} = delete $smsg->{subject}; + my $kw = delete($smsg->{kw}); + scalar { %$smsg, ($kw && scalar(@$kw) ? (kw => $kw) : ()) }; # unbless +} + +sub ovv_atexit_child { + my ($self, $lei) = @_; + if (my $bref = delete $lei->{ovv_buf}) { + my $lk = $self->lock_for_scope; + $lei->out($$bref); + } +} + +# JSON module ->pretty output wastes too much vertical white space, +# this (IMHO) provides better use of screen real-estate while not +# being excessively compact: +sub _json_pretty { + my ($json, $k, $v) = @_; + if (ref $v eq 'ARRAY') { + if (@$v) { + my $sep = ",\n" . (' ' x (length($k) + 7)); + if (ref($v->[0])) { # f/t/c + $v = '[' . join($sep, map { + my $pair = $json->encode($_); + $pair =~ s/(null|"),"/$1, "/g; + $pair; + } @$v) . ']'; + } elsif ($k eq 'kw') { # keywords are short, one-line + $v = $json->encode($v); + $v =~ s/","/", "/g; + } else { # refs, labels, ... + $v = '[' . join($sep, map { + substr($json->encode([$_]), 1, -1); + } @$v) . ']'; + } + } else { + $v = '[]'; + } + } + qq{ "$k": }.$v; +} + +sub ovv_each_smsg_cb { # runs in wq worker usually + my ($self, $lei) = @_; + my ($json, $dedupe); + if (my $pkg = $self->{json}) { + $json = $pkg->new; + $json->utf8->canonical; + $json->ascii(1) if $lei->{opt}->{ascii}; + } + my $l2m = $lei->{l2m}; + if (!$l2m) { + $dedupe = $lei->{dedupe} // die 'BUG: {dedupe} missing'; + $dedupe->prepare_dedupe; + } + $lei->{ovv_buf} = \(my $buf = '') if !$l2m; + if ($l2m) { + sub { + my ($smsg, $mitem, $eml) = @_; + $smsg->{pct} = get_pct($mitem) if $mitem; + eval { $l2m->wq_io_do('write_mail', [], $smsg, $eml) }; + $lei->fail($@) if $@ && !$!{ECONNRESET} && !$!{EPIPE}; + } + } elsif ($self->{fmt} =~ /\A(concat)?json\z/ && $lei->{opt}->{pretty}) { + my $EOR = ($1//'') eq 'concat' ? "\n}" : "\n},"; + my $lse = $lei->{lse}; + sub { # DIY prettiness :P + my ($smsg, $mitem) = @_; + return if $dedupe->is_smsg_dup($smsg); + $lse->xsmsg_vmd($smsg, $smsg->{L} ? undef : 1); + $smsg = _unbless_smsg($smsg, $mitem); + $buf .= "{\n"; + $buf .= join(",\n", map { + my $v = $smsg->{$_}; + if (ref($v)) { + _json_pretty($json, $_, $v); + } else { + $v = $json->encode([$v]); + qq{ "$_": }.substr($v, 1, -1); + } + } sort keys %$smsg); + $buf .= $EOR; + return if length($buf) < 65536; + my $lk = $self->lock_for_scope; + $lei->out($buf); + $buf = ''; + } + } elsif ($json) { + my $ORS = $self->{fmt} eq 'json' ? ",\n" : "\n"; # JSONL + my $lse = $lei->{lse}; + sub { + my ($smsg, $mitem) = @_; + return if $dedupe->is_smsg_dup($smsg); + $lse->xsmsg_vmd($smsg, $smsg->{L} ? undef : 1); + $buf .= $json->encode(_unbless_smsg(@_)) . $ORS; + return if length($buf) < 65536; + my $lk = $self->lock_for_scope; + $lei->out($buf); + $buf = ''; + } + } else { + die "TODO: unhandled case $self->{fmt}" + } +} + +no warnings 'once'; +*DESTROY = \&ovv_out_lk_cancel; + +1; diff --git a/lib/PublicInbox/LeiP2q.pm b/lib/PublicInbox/LeiP2q.pm new file mode 100644 index 00000000..68faa016 --- /dev/null +++ b/lib/PublicInbox/LeiP2q.pm @@ -0,0 +1,198 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# front-end for the "lei patch-to-query" sub-command +package PublicInbox::LeiP2q; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC PublicInbox::LeiInput); +use PublicInbox::Eml; +use PublicInbox::Smsg; +use PublicInbox::MsgIter qw(msg_part_text); +use PublicInbox::Git qw(git_unquote); +use PublicInbox::OnDestroy; +use URI::Escape qw(uri_escape_utf8); +my $FN = qr!((?:"?[^/\n]+/[^\r\n]+)|/dev/null)!; + +sub xphrase ($) { + my ($s) = @_; + return () unless $s =~ /\S/; + # cf. xapian-core/queryparser/queryparser.lemony + # [\./:\\\@] - is_phrase_generator (implicit phrase search) + # FIXME not really sure about these..., we basically want to + # extract the longest phrase possible that Xapian can handle + map { + s/\A\s*//; + s/\s+\z//; + m![^\./:\\\@\-\w]! ? qq("$_") : $_ ; + } ($s =~ m!(\w[\|=><,\./:\\\@\-\w\s]+)!g); +} + +sub add_qterm ($$@) { + my ($self, $p, @v) = @_; + for (@v) { + $self->{qseen}->{"$p\0$_"} //= + push(@{$self->{qterms}->{$p}}, $_); + } +} + +sub extract_terms { # eml->each_part callback + my ($p, $self) = @_; + my $part = $p->[0]; # ignore $depth and @idx; + my $ct = $part->content_type || 'text/plain'; + my ($s, undef) = msg_part_text($part, $ct); + defined $s or return; + my $in_diff; + # TODO: b: nq: q: + for (split(/\n/, $s)) { + if ($in_diff && s/^ //) { # diff context + add_qterm($self, 'dfctx', xphrase($_)); + } elsif (/^-- $/) { # email signature begins + $in_diff = undef; + } elsif (m!^diff --git $FN $FN!) { + # wait until "---" and "+++" to capture filenames + $in_diff = 1; + } elsif (/^index ([a-f0-9]+)\.\.([a-f0-9]+)\b/) { + my ($oa, $ob) = ($1, $2); + add_qterm($self, 'dfpre', $oa); + add_qterm($self, 'dfpost', $ob); + # who uses dfblob? + } elsif (m!^(?:---|\+{3}) ($FN)!) { + next if $1 eq '/dev/null'; + my $fn = (split(m!/!, git_unquote($1.''), 2))[1]; + add_qterm($self, 'dfn', xphrase($fn)); + } elsif ($in_diff && s/^\+//) { # diff added + add_qterm($self, 'dfb', xphrase($_)); + } elsif ($in_diff && s/^-//) { # diff removed + add_qterm($self, 'dfa', xphrase($_)); + } elsif (/^@@ (?:\S+) (?:\S+) @@\s*$/) { + # traditional diff w/o -p + } elsif (/^@@ (?:\S+) (?:\S+) @@\s*(\S+.*)/) { + add_qterm($self, 'dfhh', xphrase($1)); + } elsif (/^(?:dis)similarity index/ || + /^(?:old|new) mode/ || + /^(?:deleted|new) file mode/ || + /^(?:copy|rename) (?:from|to) / || + /^(?:dis)?similarity index / || + /^\\ No newline at end of file/ || + /^Binary files .* differ/) { + } elsif ($_ eq '') { + # possible to be in diff context, some mail may be + # stripped by MUA or even GNU diff(1). "git apply" + # treats a bare "\n" as diff context, too + } else { + $in_diff = undef; + } + } +} + +my %pfx2smsg = ( + t => [ qw(to) ], + c => [ qw(cc) ], + f => [ qw(from) ], + tc => [ qw(to cc) ], + tcf => [ qw(to cc from) ], + a => [ qw(to cc from) ], + s => [ qw(subject) ], + bs => [ qw(subject) ], # body handled elsewhere + d => [ qw(ds) ], # nonsense? + dt => [ qw(ds) ], # ditto... + rt => [ qw(ts) ], # ditto... +); + +sub input_eml_cb { # used by PublicInbox::LeiInput::input_fh + my ($self, $eml) = @_; + my $diff_want = $self->{diff_want} // do { + my $want = $self->{lei}->{opt}->{want} // [ qw(dfpost7) ]; + my @want = split(/[, ]+/, "@$want"); + for (@want) { + /\A(?:(d|dt|rt):)?([0-9]+)(\.(?:day|weeks)s?)?\z/ + or next; + my ($pfx, $n, $unit) = ($1, $2, $3); + $n *= 86400 * ($unit =~ /week/i ? 7 : 1); + $_ = [ $pfx, $n ]; + } + $self->{want_order} = \@want; + $self->{diff_want} = +{ map { $_ => 1 } @want }; + }; + my $smsg = bless {}, 'PublicInbox::Smsg'; + $smsg->populate($eml); + while (my ($pfx, $fields) = each %pfx2smsg) { + next unless $diff_want->{$pfx}; + for my $f (@$fields) { + my $v = $smsg->{$f} // next; + add_qterm($self, $pfx, xphrase($v)); + } + } + $eml->each_part(\&extract_terms, $self, 1); +} + +sub emit_query { + my ($self) = @_; + my $lei = $self->{lei}; + if ($lei->{opt}->{debug}) { + my $json = ref(PublicInbox::Config->json)->new; + $json->utf8->canonical->pretty; + print { $lei->{2} } $json->encode($self->{qterms}); + } + my (@q, %seen); + for my $pfx (@{$self->{want_order}}) { + if (ref($pfx) eq 'ARRAY') { + my ($p, $t_range) = @$pfx; # TODO + + } elsif ($pfx =~ m!\A(?:OR|XOR|AND|NOT)\z! || + $pfx =~ m!\A(?:ADJ|NEAR)(?:/[0-9]+)?\z!) { + push @q, $pfx; + } else { + my $plusminus = ($pfx =~ s/\A([\+\-])//) ? $1 : ''; + my $end = ($pfx =~ s/([0-9\*]+)\z//) ? $1 : ''; + my $x = delete($self->{qterms}->{$pfx}) or next; + my $star = $end =~ tr/*//d ? '*' : ''; + my $min_len = ($end || 0) + 0; + + # no wildcards for bool_pfx_external + $star = '' if $pfx =~ /\A(dfpre|dfpost|mid)\z/; + $pfx = "$plusminus$pfx:"; + if ($min_len) { + push @q, map { + my @t = ($pfx.$_.$star); + while (length > $min_len) { + chop $_; + push @t, 'OR', $pfx.$_.$star; + } + @t; + } @$x; + } else { + push @q, map { + my $k = $pfx.$_.$star; + $seen{$k}++ ? () : $k + } @$x; + } + } + } + if ($lei->{opt}->{uri}) { + @q = (join('+', map { uri_escape_utf8($_) } @q)); + } else { + @q = (join(' ', @q)); + } + $lei->out(@q, "\n"); +} + +sub lei_p2q { # the "lei patch-to-query" entry point + my ($lei, @inputs) = @_; + $lei->{opt}->{'in-format'} //= 'eml' if $lei->{opt}->{stdin}; + my $self = bless { missing_ok => 1 }, __PACKAGE__; + $self->prepare_inputs($lei, \@inputs) or return; + $lei->wq1_start($self); +} + +sub ipc_atfork_child { + my ($self) = @_; + PublicInbox::LeiInput::input_only_atfork_child($self); + on_destroy \&emit_query, $self; +} + +no warnings 'once'; +*net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done; + +1; diff --git a/lib/PublicInbox/LeiPmdir.pm b/lib/PublicInbox/LeiPmdir.pm new file mode 100644 index 00000000..d4aa0212 --- /dev/null +++ b/lib/PublicInbox/LeiPmdir.pm @@ -0,0 +1,54 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# WQ worker for dealing with parallel Maildir reads; +# this does NOT use the {shard_info} field of LeiToMail +# (and we may remove {shard_info}) +# WQ key: {pmd} +package PublicInbox::LeiPmdir; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC); + +sub new { + my ($cls, $lei, $ipt) = @_; + my $self = bless { -wq_ident => 'lei Maildir worker' }, $cls; + my $jobs = $lei->{opt}->{jobs} // ''; + $jobs =~ /\A[0-9]+,([0-9]+)\z/ and $jobs = $1; + my $nproc = $jobs || do { + # barely tested with >=4 CPUs, though I suspect I/O latency + # of SATA SSD storage will make >=4 processes unnecessary, + # here. NVMe users may wish to use '-j' + my $n = $self->detect_nproc; + $n = $n > 4 ? 4 : $n; + }; + my ($op_c, $ops) = $lei->workers_start($self, $nproc, + undef, { ipt => $ipt }); # LeiInput subclass + $op_c->{ops} = $ops; # for PktOp->event_step + $self->{lei_sock} = $lei->{sock}; # keep client for pmd_done_wait + $lei->{pmd} = $self; +} + +sub ipc_atfork_child { + my ($self) = @_; + my $ipt = $self->{ipt} // die 'BUG: no self->{ipt}'; + my $lei = $ipt->{lei} = $self->{lei}; + delete @$lei{qw(auth net)}; # no network access in this worker + $ipt->ipc_atfork_child; # calls _lei_atfork_child; +} + +sub each_mdir_fn { # maildir_each_file callback + my ($f, $fl, $self, @args) = @_; + $self->wq_io_do('mdir_iter', [], $f, $fl, @args); +} + +sub mdir_iter { # via wq_io_do + my ($self, $f, $fl, @args) = @_; + $self->{ipt}->pmdir_cb($f, $fl, @args); +} + +sub _lei_wq_eof { # EOF callback for main lei daemon + $_[0]->wq_eof('pmd'); +} + +1; diff --git a/lib/PublicInbox/LeiQuery.pm b/lib/PublicInbox/LeiQuery.pm new file mode 100644 index 00000000..eadf811f --- /dev/null +++ b/lib/PublicInbox/LeiQuery.pm @@ -0,0 +1,241 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# handles "lei q" command and provides internals for +# several other sub-commands (up, lcat, ...) +package PublicInbox::LeiQuery; +use v5.12; + +sub prep_ext { # externals_each callback + my ($lxs, $exclude, $loc) = @_; + $lxs->prepare_external($loc) unless $exclude->{$loc}; +} + +sub _start_query { # used by "lei q" and "lei up" + my ($self) = @_; + require PublicInbox::LeiOverview; + PublicInbox::LeiOverview->new($self) or return; + my $opt = $self->{opt}; + require PublicInbox::OverIdx; # loads DBI + PublicInbox::OverIdx::fork_ok($opt); + my ($xj, $mj) = split(/,/, $opt->{jobs} // ''); + (defined($xj) && $xj ne '' && $xj !~ /\A[1-9][0-9]*\z/) and + die "`$xj' search jobs must be >= 1\n"; + my $lxs = $self->{lxs}; + $xj ||= $lxs->concurrency($opt); # allow: "--jobs ,$WRITER_ONLY" + my $nproc = $lxs->detect_nproc || 1; # don't memoize, schedtool(1) exists + $xj = $nproc if $xj > $nproc; + $lxs->{-wq_nr_workers} = $xj; + (defined($mj) && $mj !~ /\A[1-9][0-9]*\z/) and + die "`$mj' writer jobs must be >= 1\n"; + my $l2m = $self->{l2m}; + # we use \1 (a ref) to distinguish between default vs. user-supplied + if ($l2m && grep { $opt->{$_} //= \1 } (qw(mail-sync import-remote + import-before))) { + $self->_lei_store(1)->write_prepare($self); + if ($opt->{'mail-sync'}) { + my $lms = $l2m->{-lms_rw} = $self->lms(1); + $lms->lms_write_prepare->lms_pause; # just create + } + } + $l2m and $l2m->{-wq_nr_workers} //= $mj // do { + # keep some CPU for git, and don't overload IMAP destinations + my $n = int($nproc * 0.75 + 0.5); + $self->{net} && $n > 4 ? 4 : $n; + }; + + # descending docid order is cheapest, MUA controls sorting order + $self->{mset_opt}->{relevance} //= -2 if $l2m || $opt->{threads}; + + my $tot = $self->{mset_opt}->{total} //= $self->{opt}->{limit} // 10000; + $self->{mset_opt}->{limit} = $tot > 10000 ? 10000 : $tot; + $self->{mset_opt}->{offset} //= 0; + $self->{mset_opt}->{threads} //= $opt->{threads}; + + if ($self->{net}) { + require PublicInbox::LeiAuth; + $self->{auth} = PublicInbox::LeiAuth->new + } + $lxs->do_query($self); +} + +sub do_qry { # do_env cb + my ($lei) = @_; + $lei->{mset_opt}->{q_raw} = $lei->{mset_opt}->{qstr} + = delete $lei->{stdin_buf}; + $lei->{lse}->query_approxidate($lei->{lse}->git, + $lei->{mset_opt}->{qstr}); + _start_query($lei); +} + +# make the URI||PublicInbox::{Inbox,ExtSearch} a config-file friendly string +sub cfg_ext ($) { + my ($x) = @_; + $x->isa('URI') ? "$x" : ($x->{inboxdir} // $x->{topdir}); +} + +sub lxs_prepare { + my ($self) = @_; + require PublicInbox::LeiXSearch; + # prepare any number of LeiXSearch || LeiSearch || Inbox || URL + my $lxs = $self->{lxs} = PublicInbox::LeiXSearch->new; + my $opt = $self->{opt}; + my @only = @{$opt->{only} // []}; + # --local is enabled by default unless --only is used + # we'll allow "--only $LOCATION --local" + my $sto = $self->_lei_store(1); + $self->{lse} = $sto->search; + if ($opt->{'local'} //= scalar(@only) ? 0 : 1) { + $lxs->prepare_external($self->{lse}); + } + if (@only) { + my $only; + for my $loc (@only) { + my @loc = $self->get_externals($loc) or return; + for (@loc) { + my $x = $lxs->prepare_external($_); + push(@$only, cfg_ext($x)) if $x; + } + } + $opt->{only} = $only if $only; + } else { + my (@ilocals, @iremotes, $incl); + for my $loc (@{$opt->{include} // []}) { + my @loc = $self->get_externals($loc) or return; + for (@loc) { + my $x = $lxs->prepare_external($_); + push(@$incl, cfg_ext($x)) if $x; + } + @ilocals = @{$lxs->{locals} // []}; + @iremotes = @{$lxs->{remotes} // []}; + } + $opt->{include} = $incl if $incl; + # --external is enabled by default, but allow --no-external + if ($opt->{external} //= 1) { + my $ex = $self->canonicalize_excludes($opt->{exclude}); + my @excl = keys %$ex; + $opt->{exclude} = \@excl if scalar(@excl); + $self->externals_each(\&prep_ext, $lxs, $ex); + $opt->{remote} //= !($lxs->locals - $opt->{'local'}); + $lxs->{locals} = \@ilocals if !$opt->{'local'}; + $lxs->{remotes} = \@iremotes if !$opt->{remote}; + } + } + ($lxs->locals || $lxs->remotes) ? ($self->{lxs} = $lxs) : + die("no local or remote inboxes to search\n"); +} + +# the main "lei q SEARCH_TERMS" method +sub lei_q { + my ($self, @argv) = @_; + PublicInbox::Config->json; # preload before forking + my $lxs = lxs_prepare($self) or return; + $self->ale->refresh_externals($lxs, $self); + my $opt = $self->{opt}; + my %mset_opt; + $mset_opt{asc} = $opt->{'reverse'} ? 1 : 0; + if (defined(my $sort = $opt->{'sort'})) { + if ($sort eq 'relevance') { + $mset_opt{relevance} = 1; + } elsif ($sort eq 'docid') { + $mset_opt{relevance} = $mset_opt{asc} ? -1 : -2; + } elsif ($sort =~ /\Areceived(?:-?[aA]t)?\z/) { + # the default + } else { + die "unrecognized --sort=$sort\n"; + } + $opt->{save} and return + $self->fail('--save and --sort are incompatible'); + } + $self->{mset_opt} = \%mset_opt; + + if ($opt->{stdin}) { + return $self->fail(<<'') if @argv; +no query allowed on command-line with --stdin + + return $self->slurp_stdin(\&do_qry); + } + chomp(@argv) and $self->qerr("# trailing `\\n' removed"); + $mset_opt{q_raw} = [ @argv ]; # copy + $mset_opt{qstr} = + $self->{lse}->query_argv_to_string($self->{lse}->git, \@argv); + _start_query($self); +} + +# shell completion helper called by lei__complete +sub _complete_q { + my ($self, @argv) = @_; + join('', @argv) =~ /\bL:\S*\z/ and + return eval { $self->_lei_store->search->all_terms('L') }; + my @cur; + my $cb = $self->lazy_cb(qw(forget-external _complete_)); + while (@argv) { + if ($argv[-1] =~ /\A(?:-I|(?:--(?:include|exclude|only)))\z/) { + my @c = $cb->($self, @cur); + # try basename match: + if (scalar(@cur) == 1 && index($cur[0], '/') < 0) { + my $all = $self->externals_each; + my %bn; + for my $loc (keys %$all) { + my $bn = (split(m!/!, $loc))[-1]; + ++$bn{$bn}; + } + push @c, grep { + $bn{$_} == 1 && /\A\Q$cur[0]/ + } keys %bn; + } + return @c if @c; + } + unshift(@cur, pop @argv); + } + (); +} + +# Stuff we may pass through to curl (as of 7.64.0), see curl manpage for +# details, so most options which make sense for HTTP/HTTPS (including proxy +# support for Tor and other methods of getting past weird networks). +# Most of these are untested by us, some may not make sense for our use case +# and typos below are likely. +# n.b. some short options (-$NUMBER) are not supported since they conflict +# with other "lei q" switches. +# FIXME: Getopt::Long doesn't easily let us support support options with +# '.' in them (e.g. --http1.1) +# TODO: should we depend on "-c http.*" options for things which have +# analogues in git(1)? that would reduce likelihood of conflicts with +# our other CLI options +# Note: some names are renamed to avoid potential conflicts, +# see %lei2curl in lib/PublicInbox/LeiCurl.pm +sub curl_opt { qw( + curl-config=s@ + abstract-unix-socket=s anyauth basic cacert=s capath=s + cert-status cert-type cert=s ciphers=s + connect-timeout=s connect-to=s cookie-jar=s cookie=s crlfile=s + digest disable dns-interface=s dns-ipv4-addr=s dns-ipv6-addr=s + dns-servers=s doh-url=s egd-file=s engine=s false-start + happy-eyeballs-timeout-ms=s haproxy-protocol header=s@ + http2-prior-knowledge http2 insecure + interface=s ipv4 ipv6 junk-session-cookies + key-type=s key=s limit-rate=s local-port=s location-trusted location + max-redirs=i max-time=s negotiate netrc-file=s netrc-optional netrc + no-alpn no-buffer no-npn no-sessionid noproxy=s ntlm-wb ntlm + pass=s pinnedpubkey=s post301 post302 post303 preproxy=s + proxy-anyauth proxy-basic proxy-cacert=s proxy-capath=s + proxy-cert-type=s proxy-cert=s proxy-ciphers=s proxy-crlfile=s + proxy-digest proxy-header=s@ proxy-insecure + proxy-key-type=s proxy-key proxy-negotiate proxy-ntlm proxy-pass=s + proxy-pinnedpubkey=s proxy-service-name=s proxy-ssl-allow-beast + proxy-tls13-ciphers=s proxy-tlsauthtype=s proxy-tlspassword=s + proxy-tlsuser=s proxy-tlsv1 proxy-user=s proxy=s + proxytunnel=s pubkey=s random-file=s referer=s resolve=s + retry-connrefused retry-delay=s retry-max-time=s retry=i + sasl-ir service-name=s socks4=s socks4a=s socks5-basic + socks5-gssapi-service-name=s socks5-gssapi socks5-hostname=s socks5=s + speed-limit speed-type ssl-allow-beast sslv2 sslv3 + suppress-connect-headers tcp-fastopen tls-max=s + tls13-ciphers=s tlsauthtype=s tlspassword=s tlsuser=s + tlsv1 trace-ascii=s trace-time trace=s + unix-socket=s user-agent=s user=s +) +} + +1; diff --git a/lib/PublicInbox/LeiRediff.pm b/lib/PublicInbox/LeiRediff.pm new file mode 100644 index 00000000..66359dd4 --- /dev/null +++ b/lib/PublicInbox/LeiRediff.pm @@ -0,0 +1,299 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# The "lei rediff" sub-command, regenerates diffs with new options +package PublicInbox::LeiRediff; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC PublicInbox::LeiInput); +use File::Temp 0.19 (); # 0.19 for ->newdir +use PublicInbox::Spawn qw(run_wait popen_wr which); +use PublicInbox::MsgIter qw(msg_part_text); +use PublicInbox::ViewDiff; +use PublicInbox::LeiBlob; +use PublicInbox::Git qw(git_quote git_unquote); +use PublicInbox::Import; +use PublicInbox::LEI; +use PublicInbox::SolverGit; + +my $MODE = '(100644|120000|100755|160000)'; + +sub rediff_user_cb { # called by solver when done + my ($res, $self) = @_; + my $lei = $self->{lei}; + my $log_buf = delete $lei->{log_buf}; + $$log_buf =~ s/^/# /sgm; + ref($res) eq 'ARRAY' or return $lei->child_error(0, $$log_buf); + $lei->qerr($$log_buf); + my ($git, $oid, $type, $size, $di) = @$res; + my $oid_want = delete $self->{cur_oid_want}; + + # don't try to support all the git-show(1) options for non-blob, + # this is just a convenience: + $type ne 'blob' and return warn(<<EOF); +# $oid is a $type of $size bytes in: +# $git->{git_dir} (wanted: $oid_want) +EOF + $self->{blob}->{$oid_want} = $oid; + push @{$self->{gits}}, $git if $git->{-tmp}; +} + +# returns a full blob for oid_want +sub solve_1 ($$$) { + my ($self, $oid_want, $hints) = @_; + return if $oid_want =~ /\A0+\z/; + $self->{cur_oid_want} = $oid_want; + my $solver = bless { + gits => $self->{gits}, + user_cb => \&rediff_user_cb, + uarg => $self, + inboxes => [ $self->{lxs}->locals, @{$self->{rmt}} ], + }, 'PublicInbox::SolverGit'; + open my $log, '+>', \(my $log_buf = '') or die "PerlIO::scalar: $!"; + $self->{lei}->{log_buf} = \$log_buf; + local $PublicInbox::DS::in_loop = 0; # waitpid synchronously + $solver->solve($self->{lei}->{env}, $log, $oid_want, $hints); + $self->{blob}->{$oid_want}; # full OID +} + +sub _lei_diff_prepare ($$) { + my ($lei, $cmd) = @_; + my $opt = $lei->{opt}; + push @$cmd, '--'.($opt->{color} && !$opt->{'no-color'} ? '' : 'no-'). + 'color'; + for my $o (@PublicInbox::LEI::diff_opt) { + my $c = ''; + # remove single char short option + $o =~ s/\|([a-z0-9])\b//i and $c = $1; + if ($o =~ s/=[is]@\z//) { + my $v = $opt->{$o} or next; + push @$cmd, map { $c ? "-$c$_" : "--$o=$_" } @$v; + } elsif ($o =~ s/=[is]\z//) { + my $v = $opt->{$o} // next; + push @$cmd, $c ? "-$c$v" : "--$o=$v"; + } elsif ($o =~ s/:[is]\z//) { + my $v = $opt->{$o} // next; + push @$cmd, $c ? "-$c$v" : + ($v eq '' ? "--$o" : "--$o=$v"); + } elsif ($o =~ s/!\z//) { + my $v = $opt->{$o} // next; + push @$cmd, $v ? "--$o" : "--no-$o"; + } elsif ($opt->{$o}) { + push @$cmd, $c ? "-$c" : "--$o"; + } + } + push(@$cmd, "-O$opt->{'order-file'}") if $opt->{'order-file'}; +} + +sub diff_ctxq ($$) { + my ($self, $ctxq) = @_; + return unless $ctxq; + my $blob = $self->{blob}; + my $ta = <<'EOM'; +reset refs/heads/A +commit refs/heads/A +author <a@s> 0 +0000 +committer <c@s> 0 +0000 +data 0 +EOM + my $tb = $ta; + $tb =~ tr!A!B!; + my $lei = $self->{lei}; + while (my ($oid_a, $oid_b, $pa, $pb, $ma, $mb) = splice(@$ctxq, 0, 6)) { + my $xa = $blob->{$oid_a} //= solve_1($self, $oid_a, + { path_b => $pa }); + my $xb = $blob->{$oid_b} //= solve_1($self, $oid_b, { + oid_a => $oid_a, + path_a => $pa, + path_b => $pb + }); + $ta .= "M $ma $xa ".git_quote($pa)."\n" if $xa; + $tb .= "M $mb $xb ".git_quote($pb)."\n" if $xb; + } + my $rw = $self->{gits}->[-1]; # has all known alternates + if (!$rw->{-tmp}) { + my $d = "$self->{rdtmp}/for_tree.git"; + -d $d or PublicInbox::Import::init_bare($d); + # always overwrite + PublicInbox::IO::write_file '>', "$d/objects/info/alternates", + map { $_->git_path('objects')."\n" } @{$self->{gits}}; + $rw = PublicInbox::Git->new($d); + } + my $w = popen_wr($rw->cmd(qw(fast-import + --quiet --done --date-format=raw)), + $lei->{env}, { 2 => $lei->{2} }); + print $w $ta, "\n", $tb, "\ndone\n" or die "print fast-import: $!"; + $w->close or die "close w fast-import: \$?=$? \$!=$!"; + + my $cmd = $rw->cmd('diff'); + _lei_diff_prepare($lei, $cmd); + $lei->qerr("# git @$cmd[2..$#$cmd]"); + push @$cmd, qw(A B); + run_wait($cmd, $lei->{env}, { 2 => $lei->{2}, 1 => $lei->{1} }) and + $lei->child_error($?); # for git diff --exit-code + undef; +} + +# awaitpid callback +sub wait_requote { $_[1]->child_error($?) if $? } + +sub requote ($$) { # '> ' prefix(es) lei->{1} + my ($lei, $pfx) = @_; + my $opt = { 1 => $lei->{1}, 2 => $lei->{2} }; + # $^X (perl) is overkill, but maybe there's a weird system w/o sed + my $w = popen_wr([$^X, '-pe', "s/^/$pfx/"], $lei->{env}, $opt, + \&wait_requote, $lei); + binmode $w, ':utf8'; + $w; +} + +sub extract_oids { # Eml each_part callback + my ($ary, $self) = @_; + my $lei = $self->{lei}; + my ($p, undef, $idx) = @$ary; + $lei->out($p->header_obj->as_string, "\n"); + my ($s, undef) = msg_part_text($p, $p->content_type || 'text/plain'); + defined $s or return; + + $self->{dqre} && $s =~ s/$self->{dqre}//g && $lei->{opt}->{drq} and + local $lei->{1} = requote($lei, $1); + + my @top = split($PublicInbox::ViewDiff::EXTRACT_DIFFS, $s); + undef $s; + my $blobs = $self->{blobs}; # blobs to resolve + my $ctxq; + while (defined(my $x = shift @top)) { + if (scalar(@top) >= 4 && + $top[1] =~ $PublicInbox::ViewDiff::IS_OID && + $top[0] =~ $PublicInbox::ViewDiff::IS_OID) { + my ($ma, $mb); + $x =~ /^old mode $MODE/sm and $ma = $1; + $x =~ /^new mode $MODE/sm and $mb = $1; + if (!defined($ma) && $x =~ + /^index [a-z0-9]+\.\.[a-z0-9]+ $MODE/sm) { + $ma = $mb = $1; + } + $ma //= '100644'; + $mb //= $ma; + my ($oid_a, $oid_b, $pa, $pb) = splice(@top, 0, 4); + $pa eq '/dev/null' or + $pa = (split(m'/', git_unquote($pa), 2))[1]; + $pb eq '/dev/null' or + $pb = (split(m'/', git_unquote($pb), 2))[1]; + $blobs->{$oid_a} //= undef; + $blobs->{$oid_b} //= undef; + push @$ctxq, $oid_a, $oid_b, $pa, $pb, $ma, $mb; + } elsif ($ctxq) { + my @out; + for (split(/^/sm, $x)) { + if (/\A-- \r?\n/s) { # email sig starts + push @out, $_; + $ctxq = diff_ctxq($self, $ctxq); + } elsif ($ctxq && (/\A[\+\- ]/ || /\A@@ / || + # allow totally blank lines w/o leading + # SP, git-apply does: + /\A\r?\n/s)) { + next; + } else { + push @out, $_; + } + } + $self->{lei}->out(@out) if @out; + } else { + $ctxq = diff_ctxq($self, $ctxq); + $self->{lei}->out($x); + } + } + $ctxq = diff_ctxq($self, $ctxq); +} + +# ensure dequoted parts are available for rebuilding patches: +sub dequote_add { # Eml each_part callback + my ($ary, $self) = @_; + my ($p, undef, $idx) = @$ary; + my ($s, undef) = msg_part_text($p, $p->content_type || 'text/plain'); + defined $s or return; + if ($s =~ s/$self->{dqre}//g) { # remove '> ' prefix(es) + substr($s, 0, 0, "part-dequoted: $idx\n\n"); + utf8::encode($s); + $self->{tmp_sto}->add_eml(PublicInbox::Eml->new(\$s)); + } +} + +sub input_eml_cb { # callback for all emails + my ($self, $eml) = @_; + { + local $SIG{__WARN__} = sub { + return if "@_" =~ /^no email in From: .*? or Sender:/; + return if PublicInbox::Eml::warn_ignore(@_); + warn @_; + }; + $self->{tmp_sto}->add_eml($eml); + $eml->each_part(\&dequote_add, $self) if $self->{dqre}; + $self->{tmp_sto}->done; + } + $eml->each_part(\&extract_oids, $self, 1); +} + +sub lei_rediff { + my ($lei, @inputs) = @_; + ($lei->{opt}->{drq} && $lei->{opt}->{'dequote-only'}) and return + $lei->fail('--drq and --dequote-only are mutually exclusive'); + ($lei->{opt}->{drq} && !$lei->{opt}->{verbose}) and + $lei->{opt}->{quiet} //= 1; + $lei->_lei_store(1)->write_prepare($lei); + $lei->{opt}->{'in-format'} //= 'eml' if $lei->{opt}->{stdin}; + # maybe it's a non-email (code) blob from a coderepo + my $git_dirs = $lei->{opt}->{'git-dir'} //= []; + if ($lei->{opt}->{cwd} // 1) { + my $cgd = PublicInbox::LeiBlob::get_git_dir($lei, '.'); + unshift(@$git_dirs, $cgd) if defined $cgd; + } + return $lei->fail('no --git-dir to try') unless @$git_dirs; + my $lxs = $lei->lxs_prepare; + if ($lxs->remotes) { + require PublicInbox::LeiRemote; + $lei->{curl} //= which('curl') or return + $lei->fail('curl needed for '.join(', ',$lxs->remotes)); + } + $lei->ale->refresh_externals($lxs, $lei); + my $self = bless { + -force_eml => 1, # for LeiInput->input_fh + lxs => $lxs, + }, __PACKAGE__; + $self->prepare_inputs($lei, \@inputs) or return; + my $isatty = -t $lei->{1}; + $lei->{opt}->{color} //= $isatty; + $lei->start_pager if $isatty; + $lei->wq1_start($self); +} + +sub ipc_atfork_child { + my ($self) = @_; + PublicInbox::LeiInput::input_only_atfork_child(@_); + my $lei = $self->{lei}; + $lei->{1}->autoflush(1); + binmode $lei->{1}, ':utf8'; + $self->{blobs} = {}; # oidhex => filename + $self->{rdtmp} = File::Temp->newdir('lei-rediff-XXXX', TMPDIR => 1); + $self->{tmp_sto} = PublicInbox::LeiStore->new( + "$self->{rdtmp}/tmp.store", + { creat => { nproc => 1 }, indexlevel => 'medium' }); + $self->{tmp_sto}->{priv_eidx}->{parallel} = 0; + $self->{rmt} = [ $self->{tmp_sto}->search, map { + PublicInbox::LeiRemote->new($lei, $_) + } $self->{lxs}->remotes ]; + $self->{gits} = [ map { + PublicInbox::Git->new($lei->rel2abs($_)) + } @{$self->{lei}->{opt}->{'git-dir'}} ]; + $lei->{env}->{TMPDIR} = $self->{rdtmp}->dirname; + if (my $nr = ($lei->{opt}->{drq} || $lei->{opt}->{'dequote-only'})) { + my $re = '\s*> ' x $nr; + $self->{dqre} = qr/^($re)/ms; + } + undef; +} + +no warnings 'once'; +*net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done; +1; diff --git a/lib/PublicInbox/LeiRefreshMailSync.pm b/lib/PublicInbox/LeiRefreshMailSync.pm new file mode 100644 index 00000000..dde23274 --- /dev/null +++ b/lib/PublicInbox/LeiRefreshMailSync.pm @@ -0,0 +1,109 @@ +# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei refresh-mail-sync" drops dangling sync information +# and attempts to detect moved files +package PublicInbox::LeiRefreshMailSync; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC PublicInbox::LeiInput); +use PublicInbox::LeiImport; +use PublicInbox::InboxWritable qw(eml_from_path); +use PublicInbox::Import; + +sub folder_missing { # may be called by LeiInput + my ($self, $folder) = @_; + $self->{lms}->forget_folders($folder); +} + +sub prune_mdir { # lms->each_src callback + my ($oidbin, $id, $self, $mdir) = @_; + my @try = $$id =~ /:2,[a-zA-Z]*\z/ ? qw(cur new) : qw(new cur); + for (@try) { return if -f "$mdir/$_/$$id" } + # both tries failed + $self->{lms}->clear_src("maildir:$mdir", $id); +} + +sub prune_imap { # lms->each_src callback + my ($oidbin, $uid, $self, $uids, $url) = @_; + return if exists $uids->{$uid}; + $self->{lms}->clear_src($url, $uid); +} + +# detects missed file moves +sub pmdir_cb { # called via LeiPmdir->each_mdir_fn + my ($self, $f, $fl) = @_; + my ($folder, $bn) = ($f =~ m!\A(.+?)/(?:new|cur)/([^/]+)\z!) or + die "BUG: $f was not from a Maildir?"; + substr($folder, 0, 0) = 'maildir:'; # add prefix + return if scalar($self->{lms}->name_oidbin($folder, $bn)); + my $eml = eml_from_path($f) // return; + my $oidbin = $self->{lei}->git_oid($eml)->digest; + $self->{lms}->set_src($oidbin, $folder, \$bn); +} + +sub input_path_url { # overrides PublicInbox::LeiInput::input_path_url + my ($self, $input, @args) = @_; + if ($input =~ /\Amaildir:(.+)/i) { + $self->{lms}->each_src($input, \&prune_mdir, $self, $1); + $self->{lse} //= $self->{lei}->{sto}->search; + # call pmdir_cb (via maildir_each_file -> each_mdir_fn) + PublicInbox::LeiInput::input_path_url($self, $input); + } elsif ($input =~ m!\Aimaps?://!i) { + my $uri = PublicInbox::URIimap->new($input); + if (my $mic = $self->{lei}->{net}->mic_for_folder($uri)) { + my $uids = $mic->search('UID 1:*'); + $uids = +{ map { $_ => undef } @$uids }; + $self->{lms}->each_src($$uri, \&prune_imap, $self, + $uids, $$uri) + } else { + $self->folder_missing($$uri); + } + } else { die "BUG: $input not supported" } + $self->{lei}->sto_barrier_request; +} + +sub lei_refresh_mail_sync { + my ($lei, @folders) = @_; + my $sto = $lei->_lei_store or return $lei->fail(<<EOM); +lei/store uninitialized, see lei-import(1) +EOM + my $lms = $lei->lms or return $lei->fail(<<EOM); +lei mail_sync.sqlite3 uninitialized, see lei-import(1) +EOM + if (defined(my $all = $lei->{opt}->{all})) { + $lms->group2folders($lei, $all, \@folders) or return; + # TODO: handle NNTP servers which delete messages + @folders = grep(!m!\Anntps?://!, @folders); + } else { + $lms->arg2folder($lei, \@folders); # may die + } + $lms->lms_pause; # must be done before fork + $sto->write_prepare($lei); + my $self = bless { missing_ok => 1, lms => $lms }, __PACKAGE__; + $lei->{opt}->{'mail-sync'} = 1; # for prepare_inputs + $self->prepare_inputs($lei, \@folders) or return; + $lei->{-err_type} = 'non-fatal'; + $lei->wq1_start($self); +} + +sub ipc_atfork_child { # needed for PublicInbox::LeiPmdir + my ($self) = @_; + PublicInbox::LeiInput::input_only_atfork_child($self); + $self->{lms}->lms_write_prepare; + undef; +} + +sub _complete_refresh_mail_sync { + my ($lei, @argv) = @_; + my $lms = $lei->lms or return (); + my $match_cb = $lei->complete_url_prepare(\@argv); + my @k = $lms->folders($argv[-1] // undef, 1); + my @m = map { $match_cb->($_) } @k; + @m ? @m : @k +} + +no warnings 'once'; +*net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done; + +1; diff --git a/lib/PublicInbox/LeiReindex.pm b/lib/PublicInbox/LeiReindex.pm new file mode 100644 index 00000000..3f109f33 --- /dev/null +++ b/lib/PublicInbox/LeiReindex.pm @@ -0,0 +1,49 @@ +# Copyright all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei reindex" command to reindex everything in lei/store +package PublicInbox::LeiReindex; +use v5.12; +use parent qw(PublicInbox::IPC); + +sub reindex_full { + my ($lei) = @_; + my $sto = $lei->{sto}; + my $max = $sto->search->over(1)->max; + $lei->qerr("# reindexing 1..$max"); + $sto->wq_do('reindex_art', $_) for (1..$max); +} + +sub reindex_store { # via wq_do + my ($self) = @_; + my ($lei, $argv) = delete @$self{qw(lei argv)}; + if (!@$argv) { + reindex_full($lei); + } +} + +sub lei_reindex { + my ($lei, @argv) = @_; + my $sto = $lei->_lei_store or return $lei->fail('nothing indexed'); + $sto->write_prepare($lei); + my $self = bless { lei => $lei, argv => \@argv }, __PACKAGE__; + my ($op_c, $ops) = $lei->workers_start($self, 1); + $lei->{wq1} = $self; + $lei->wait_wq_events($op_c, $ops); + $self->wq_do('reindex_store'); + $self->wq_close; +} + +sub _lei_wq_eof { # EOF callback for main lei daemon + my ($lei) = @_; + $lei->{sto}->wq_do('reindex_done'); + $lei->wq_eof; +} + +sub ipc_atfork_child { + my ($self) = @_; + $self->{lei}->_lei_atfork_child; + $self->SUPER::ipc_atfork_child; +} + +1; diff --git a/lib/PublicInbox/LeiRemote.pm b/lib/PublicInbox/LeiRemote.pm new file mode 100644 index 00000000..d6fc40a4 --- /dev/null +++ b/lib/PublicInbox/LeiRemote.pm @@ -0,0 +1,84 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Make remote externals HTTP(S) inboxes behave like +# PublicInbox::Inbox and PublicInbox::Search/ExtSearch. +# This exists solely for SolverGit. It is a high-latency a +# synchronous API that is not at all fast. +package PublicInbox::LeiRemote; +use v5.10.1; +use strict; +use IO::Uncompress::Gunzip; +use PublicInbox::MboxReader; +use PublicInbox::Spawn qw(popen_rd); +use PublicInbox::LeiCurl; +use PublicInbox::ContentHash qw(git_sha); + +sub new { + my ($cls, $lei, $uri) = @_; + bless { uri => $uri, lei => $lei }, $cls; +} + +sub isrch { $_[0] } # SolverGit expcets this + +sub each_mboxrd_eml { # callback for MboxReader->mboxrd + my ($eml, $self) = @_; + my $lei = $self->{lei}; + my $xoids = $lei->{ale}->xoids_for($eml, 1); + my $smsg = bless {}, 'PublicInbox::Smsg'; + if ($lei->{sto} && !$xoids) { # memoize locally + my $res = $lei->{sto}->wq_do('add_eml', $eml); + $smsg = $res if ref($res) eq ref($smsg); + } + $smsg->{blob} //= $xoids ? (keys(%$xoids))[0] + : $lei->git_oid($eml)->hexdigest; + $smsg->populate($eml); + $smsg->{mid} //= '(none)'; + push @{$self->{smsg}}, $smsg; +} + +sub mset { + my ($self, $qstr, undef) = @_; # $opt ($_[2]) ignored + my $lei = $self->{lei}; + my $curl = PublicInbox::LeiCurl->new($lei, $lei->{curl}); + push @$curl, '-s', '-d', ''; + my $uri = $self->{uri}->clone; + $uri->query_form(q => $qstr, x => 'm', r => 1); # r=1: relevance + my $cmd = $curl->for_uri($self->{lei}, $uri); + $self->{lei}->qerr("# $cmd"); + $self->{smsg} = []; + my $fh = popen_rd($cmd, undef, { 2 => $lei->{2} }); + $fh = IO::Uncompress::Gunzip->new($fh, MultiStream=>1, AutoClose=>1); + eval { PublicInbox::MboxReader->mboxrd($fh, \&each_mboxrd_eml, $self) }; + my $err = $@ ? ": $@" : ''; + my $wait = $self->{lei}->{sto}->wq_do('barrier'); + $lei->child_error($?, "@$cmd failed$err") if $err || $?; + $self; # we are the mset (and $ibx, and $self) +} + +sub size { scalar @{$_[0]->{smsg}} } # size of previous results + +sub mset_to_smsg { + my ($self, $ibx, $mset) = @_; # all 3 are $self + wantarray ? ($self->size, @{$self->{smsg}}) : $self->{smsg}; +} + +sub base_url { "$_[0]->{uri}" } + +sub smsg_eml { + my ($self, $smsg) = @_; + my $bref = $self->{lei}->ale->git->cat_file($smsg->{blob}) // do { + my $lms = $self->{lei}->lms; + ($lms ? $lms->local_blob($smsg->{blob}, 1) : undef) // do { + my $sto = $self->{lei}->{sto} // + $self->{lei}->_lei_store; + $sto && $sto->{-wq_s1} ? + $sto->wq_do('cat_blob', $smsg->{blob}) : undef; + } + }; + return PublicInbox::Eml->new($bref) if $bref; + warn("E: $self->{uri} $smsg->{blob} gone <$smsg->{mid}>\n"); + undef; +} + +1; diff --git a/lib/PublicInbox/LeiRm.pm b/lib/PublicInbox/LeiRm.pm new file mode 100644 index 00000000..00b12485 --- /dev/null +++ b/lib/PublicInbox/LeiRm.pm @@ -0,0 +1,30 @@ +# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# implements the "lei rm" command, you can point this at +# an entire spam mailbox or read a message from stdin +package PublicInbox::LeiRm; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC PublicInbox::LeiInput); + +sub input_eml_cb { # used by PublicInbox::LeiInput::input_fh + my ($self, $eml) = @_; + $self->{lei}->{sto}->wq_do('remove_eml', $eml); +} + +sub lei_rm { + my ($lei, @inputs) = @_; + $lei->_lei_store(1)->write_prepare($lei); + $lei->{opt}->{'in-format'} //= 'eml' if $lei->{opt}->{stdin}; + my $self = bless {}, __PACKAGE__; + $self->prepare_inputs($lei, \@inputs) or return; + $lei->{-err_type} = 'non-fatal'; + $lei->wq1_start($self); +} + +no warnings 'once'; +*ipc_atfork_child = \&PublicInbox::LeiInput::input_only_atfork_child; +*net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done; + +1; diff --git a/lib/PublicInbox/LeiRmWatch.pm b/lib/PublicInbox/LeiRmWatch.pm new file mode 100644 index 00000000..19bee3ab --- /dev/null +++ b/lib/PublicInbox/LeiRmWatch.pm @@ -0,0 +1,31 @@ +# Copyright all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei rm-watch" command +package PublicInbox::LeiRmWatch; +use strict; +use v5.10.1; +use parent qw(PublicInbox::LeiInput); + +sub lei_rm_watch { + my ($lei, @argv) = @_; + my $cfg = $lei->_lei_cfg(1); + $lei->{opt}->{'mail-sync'} = 1; # for prepare_inputs + my $self = bless { missing_ok => 1 }, __PACKAGE__; + $self->prepare_inputs($lei, \@argv) or return; + for my $w (@{$self->{inputs}}) { + $lei->_config('--remove-section', "watch.$w") or return; + } + delete $lei->{cfg}; # force reload + $lei->refresh_watches; +} + +sub _complete_rm_watch { + my ($lei, @argv) = @_; + my $cfg = $lei->_lei_cfg or return; + my $match_cb = $lei->complete_url_prepare(\@argv); + my @w = (join("\n", keys %$cfg) =~ m/^watch\.(.+?)\.state$/sgm); + map { $match_cb->($_) } @w; +} + +1; diff --git a/lib/PublicInbox/LeiSavedSearch.pm b/lib/PublicInbox/LeiSavedSearch.pm new file mode 100644 index 00000000..9ae9dcdb --- /dev/null +++ b/lib/PublicInbox/LeiSavedSearch.pm @@ -0,0 +1,304 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# pretends to be like LeiDedupe and also PublicInbox::Inbox +package PublicInbox::LeiSavedSearch; +use v5.12; +use parent qw(PublicInbox::Lock); +use PublicInbox::Git; +use PublicInbox::OverIdx; +use PublicInbox::LeiSearch; +use PublicInbox::Config; +use PublicInbox::Spawn qw(run_die); +use PublicInbox::ContentHash qw(git_sha); +use PublicInbox::MID qw(mids_for_index); +use PublicInbox::SHA qw(sha256_hex); +use File::Temp (); +use IO::Handle (); +our $LOCAL_PFX = qr!\A(?:maildir|mh|mbox.+|mmdf|v2):!i; # TODO: put in LeiToMail? + +# move this to PublicInbox::Config if other things use it: +my %cquote = ("\n" => '\\n', "\t" => '\\t', "\b" => '\\b'); +sub cquote_val ($) { # cf. git-config(1) + my ($val) = @_; + $val =~ s/([\n\t\b])/$cquote{$1}/g; + $val =~ s/\"/\\\"/g; + $val; +} + +sub ARRAY_FIELDS () { qw(only include exclude) } +sub BOOL_FIELDS () { + qw(external local remote import-remote import-before threads) +} + +sub SINGLE_FIELDS () { qw(limit dedupe output) } + +sub lss_dir_for ($$;$) { + my ($lei, $dstref, $on_fs) = @_; + my $pfx; + if ($$dstref =~ m,\Aimaps?://,i) { # already canonicalized + require PublicInbox::URIimap; + my $uri = PublicInbox::URIimap->new($$dstref)->canonical; + $$dstref = $$uri; + $pfx = $uri->mailbox; + } else { + # can't use Cwd::abs_path since dirname($$dstref) may not exist + $$dstref = $lei->rel2abs($$dstref); + $$dstref =~ tr!/!/!s; + $pfx = $$dstref; + } + ($pfx) = ($pfx =~ m{([^/]+)/*\z}); # basename + my $lss_dir = $lei->share_path . '/saved-searches/'; + my $d = "$lss_dir$pfx-".sha256_hex($$dstref); + + # fall-back to looking up by st_ino + st_dev in case we're in + # a symlinked or bind-mounted path + if ($on_fs && !-d $d && -e $$dstref) { + my @cur = stat(_); + my $want = pack('dd', @cur[1,0]); # st_ino + st_dev + my ($c, $o, @st); + for my $g ("$pfx-*", '*') { + my @maybe = glob("$lss_dir$g/lei.saved-search"); + for my $f (@maybe) { + $c = $lei->cfg_dump($f) // next; + $o = $c->{'lei.q.output'} // next; + $o =~ s!$LOCAL_PFX!! or next; + @st = stat($o) or next; + next if pack('dd', @st[1,0]) ne $want; + $f =~ m!\A(.+?)/[^/]+\z! and return $1; + } + } + } + $d; +} + +sub list { + my ($lei, $pfx) = @_; + my $lss_dir = $lei->share_path.'/saved-searches'; + return () unless -d $lss_dir; + # TODO: persist the cache? Use another format? + my $fh = File::Temp->new(TEMPLATE => 'lss_list-XXXX', TMPDIR => 1) or + die "File::Temp->new: $!"; + print $fh "[include]\n"; + for my $p (glob("$lss_dir/*/lei.saved-search")) { + print $fh "\tpath = ", cquote_val($p), "\n"; + } + $fh->flush or die "flush: $fh"; + my $cfg = $lei->cfg_dump($fh->filename); + my $out = $cfg ? $cfg->get_all('lei.q.output') : []; + s!$LOCAL_PFX!! for @$out;; + @$out; +} + +sub translate_dedupe ($$) { + my ($self, $lei) = @_; + my $dd = $lei->{opt}->{dedupe} // 'content'; + return 1 if $dd eq 'content'; # the default + return $self->{"-dedupe_$dd"} = 1 if ($dd eq 'oid' || $dd eq 'mid'); + die("--dedupe=$dd requires --no-save\n"); +} + +sub up { # updating existing saved search via "lei up" + my ($cls, $lei, $dst) = @_; + my $f; + my $self = bless { ale => $lei->ale }, $cls; + my $dir = $dst; + output2lssdir($self, $lei, \$dir, \$f) or + return die("--no-save was used with $dst cwd=". + $lei->rel2abs('.')."\n"); + $self->{-cfg} = $lei->cfg_dump($f) // return $lei->child_error; + $self->{-ovf} = "$dir/over.sqlite3"; + $self->{'-f'} = $f; + $self->{lock_path} = "$self->{-f}.flock"; + $self; +} + +sub new { # new saved search "lei q --save" + my ($cls, $lei) = @_; + my $self = bless { ale => $lei->ale }, $cls; + require File::Path; + my $dst = $lei->{ovv}->{dst}; + + # canonicalize away relative paths into the config + if ($lei->{ovv}->{fmt} eq 'maildir' && + $dst =~ m!(?:/*|\A)\.\.(?:/*|\z)! && !-d $dst) { + File::Path::make_path($dst); + $lei->{ovv}->{dst} = $dst = $lei->abs_path($dst); + } + my $dir = lss_dir_for($lei, \$dst); + File::Path::make_path($dir); # raises on error + $self->{-cfg} = {}; + my $f = $self->{'-f'} = "$dir/lei.saved-search"; + translate_dedupe($self, $lei) or return; + open my $fh, '>', $f or return $lei->fail("open $f: $!"); + my $sq_dst = PublicInbox::Config::squote_maybe($dst); + my $q = $lei->{mset_opt}->{q_raw} // die 'BUG: {q_raw} missing'; + if (ref $q) { + $q = join("\n", map { "\tq = ".cquote_val($_) } @$q); + } else { + $q = "\tq = ".cquote_val($q); + } + $dst = "$lei->{ovv}->{fmt}:$dst" if $dst !~ m!\Aimaps?://!i; + $lei->{opt}->{output} = $dst; + print $fh <<EOM; +; to refresh with new results, run: lei up $sq_dst +; `maxuid' and `lastresult' lines are maintained by "lei up" for optimization +[lei] +$q +[lei "q"] +EOM + for my $k (ARRAY_FIELDS) { + my $ary = $lei->{opt}->{$k} // next; + for my $x (@$ary) { + print $fh "\t$k = ".cquote_val($x)."\n"; + } + } + for my $k (BOOL_FIELDS) { + my $val = $lei->{opt}->{$k} // next; + print $fh "\t$k = ".($val ? 1 : 0)."\n"; + } + for my $k (SINGLE_FIELDS) { + my $val = $lei->{opt}->{$k} // next; + print $fh "\t$k = $val\n"; + } + $lei->{opt}->{stdin} and print $fh <<EOM; +[lei "internal"] + rawstr = 1 # stdin was used initially +EOM + close($fh) or return $lei->fail("close $f: $!"); + $self->{lock_path} = "$self->{-f}.flock"; + $self->{-ovf} = "$dir/over.sqlite3"; + $self; +} + +sub description { $_[0]->{qstr} } # for WWW + +sub cfg_set { # called by LeiXSearch + my ($self, @args) = @_; + my $lk = $self->lock_for_scope; # git-config doesn't wait + run_die([qw(git config -f), $self->{'-f'}, @args]); +} + +# drop-in for LeiDedupe API +sub is_dup { + my ($self, $eml, $smsg) = @_; + my $oidx = $self->{oidx} // die 'BUG: no {oidx}'; + my $lk; + if ($self->{-dedupe_mid}) { + $lk //= $self->lock_for_scope_fast; + for my $mid (@{mids_for_index($eml)}) { + my ($id, $prv); + return 1 if $oidx->next_by_mid($mid, \$id, \$prv); + } + } + my $blob = $smsg ? $smsg->{blob} : git_sha(1, $eml)->hexdigest; + $lk //= $self->lock_for_scope_fast; + return 1 if $oidx->blob_exists($blob); + if (my $xoids = PublicInbox::LeiSearch::xoids_for($self, $eml, 1)) { + for my $docid (values %$xoids) { + $oidx->add_xref3($docid, -1, $blob, '.'); + } + $oidx->commit_lazy; + if ($self->{-dedupe_oid}) { + exists $xoids->{$blob} ? 1 : undef; + } else { + 1; + } + } else { + # n.b. above xoids_for fills out eml->{-lei_fake_mid} if needed + unless ($smsg) { + $smsg = bless {}, 'PublicInbox::Smsg'; + $smsg->{bytes} = 0; + $smsg->populate($eml); + } + $smsg->{blob} //= $blob; + $oidx->begin_lazy; + $smsg->{num} = $oidx->adj_counter('eidx_docid', '+'); + $oidx->add_overview($eml, $smsg); + $oidx->add_xref3($smsg->{num}, -1, $blob, '.'); + $oidx->commit_lazy; + undef; + } +} + +sub prepare_dedupe { + my ($self) = @_; + $self->{oidx} //= do { + my $creat = !-f $self->{-ovf}; + my $lk = $self->lock_for_scope; # git-config doesn't wait + my $oidx = PublicInbox::OverIdx->new($self->{-ovf}); + $oidx->{-no_fsync} = 1; + $oidx->dbh; + if ($creat) { + $oidx->{dbh}->do('PRAGMA journal_mode = WAL'); + $oidx->eidx_prep; # for xref3 + } + $oidx + }; +} + +sub over { $_[0]->{oidx} } # for xoids_for + +# don't use ale->git directly since is_dup is called inside +# ale->git->cat_async callbacks +sub git { $_[0]->{git} //= PublicInbox::Git->new($_[0]->{ale}->git->{git_dir}) } + +sub pause_dedupe { + my ($self) = @_; + my ($git, $oidx) = delete @$self{qw(git oidx)}; + $git->cleanup if $git; + $oidx->commit_lazy if $oidx; + delete $self->{lockfh}; # from lock_for_scope_fast; +} + +sub reset_dedupe { + my ($self) = @_; + prepare_dedupe($self); + my $lk = $self->lock_for_scope_fast; + for my $t (qw(xref3 over id2num)) { + $self->{oidx}->{dbh}->do("DELETE FROM $t"); + } + pause_dedupe($self); +} + +sub mm { undef } + +sub altid_map { {} } + +sub cloneurl { [] } + +# find existing directory containing a `lei.saved-search' file based on +# $dir_ref which is an output +sub output2lssdir { + my ($self, $lei, $dir_ref, $fn_ref) = @_; + my $dst = $$dir_ref; # imap://$MAILBOX, /path/to/maildir, /path/to/mbox + my $dir = lss_dir_for($lei, \$dst, 1); + my $f = "$dir/lei.saved-search"; + if (-f $f && -r _) { + $self->{-cfg} = $lei->cfg_dump($f) // return; + $$dir_ref = $dir; + $$fn_ref = $f; + return 1; + } + undef; +} + +# cf. LeiDedupe->has_entries +sub has_entries { + my $oidx = $_[0]->{oidx} // die 'BUG: no {oidx}'; + my @n = $oidx->{dbh}->selectrow_array('SELECT num FROM over LIMIT 1'); + scalar(@n) ? 1 : undef; +} + +no warnings 'once'; +*nntp_url = \&cloneurl; +*base_url = \&PublicInbox::Inbox::base_url; +*smsg_eml = \&PublicInbox::Inbox::smsg_eml; +*smsg_by_mid = \&PublicInbox::Inbox::smsg_by_mid; +*msg_by_mid = \&PublicInbox::Inbox::msg_by_mid; +*modified = \&PublicInbox::Inbox::modified; +*max_git_epoch = *nntp_usable = *msg_by_path = \&mm; # undef +*isrch = *search = \&mm; # TODO +*DESTROY = \&pause_dedupe; + +1; diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm new file mode 100644 index 00000000..684668c5 --- /dev/null +++ b/lib/PublicInbox/LeiSearch.pm @@ -0,0 +1,187 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# read-only counterpart for PublicInbox::LeiStore +package PublicInbox::LeiSearch; +use strict; +use v5.10.1; +use parent qw(PublicInbox::ExtSearch); # PublicInbox::Search->reopen +use PublicInbox::Search qw(xap_terms); +use PublicInbox::ContentHash qw(content_digest content_hash git_sha); +use PublicInbox::MID qw(mids mids_for_index); +use PublicInbox::Compat qw(uniqstr); +use Carp qw(croak); + +sub _msg_kw { # retry_reopen callback + my ($self, $num) = @_; + my $xdb = $self->xdb; # set {nshard} for num2docid; + xap_terms('K', $xdb, $self->num2docid($num)); +} + +sub msg_keywords { # array or hashref + my ($self, $num) = @_; + $self->retry_reopen(\&_msg_kw, $num); +} + +sub _oid_kw { # retry_reopen callback + my ($self, $nums) = @_; + my $xdb = $self->xdb; # set {nshard}; + my %kw; + for my $num (@$nums) { # there should only be one... + my $doc = $xdb->get_document($self->num2docid($num)); + my $x = xap_terms('K', $doc); + %kw = (%kw, %$x); + } + \%kw; +} + +# returns undef if blob is unknown +sub oidbin_keywords { + my ($self, $oidbin) = @_; + my @num = $self->over->oidbin_exists($oidbin) or return; + $self->retry_reopen(\&_oid_kw, \@num); +} + +sub _xsmsg_vmd { # retry_reopen + my ($self, $smsg, $want_label) = @_; + my $xdb = $self->xdb; # set {nshard}; + my (@kw, @L, $doc, $x); + @kw = qw(flagged) if delete($smsg->{lei_q_tt_flagged}); + my @num = $self->over->blob_exists($smsg->{blob}); + for my $num (@num) { # there should only be one... + $doc = $xdb->get_document($self->num2docid($num)); + push @kw, xap_terms('K', $doc); + push @L, xap_terms('L', $doc) if $want_label # JSON/JMAP only + } + @{$smsg->{kw}} = sort(uniqstr(@kw)) if @kw; + @{$smsg->{L}} = uniqstr(@L) if @L; +} + +# lookup keywords+labels for external messages +sub xsmsg_vmd { + my ($self, $smsg, $want_label) = @_; + return if $smsg->{kw}; # already set by LeiXSearch->mitem_kw + eval { $self->retry_reopen(\&_xsmsg_vmd, $smsg, $want_label) }; + warn "$$ $0 (nshard=$self->{nshard}) $smsg->{blob}: $@" if $@; +} + +# when a message has no Message-IDs at all, this is needed for +# unsent Draft messages, at least +sub content_key ($) { + my ($eml) = @_; + my $dig = content_digest($eml); + my $chash = $dig->clone->digest; + my $mids = mids_for_index($eml); + unless (@$mids) { + $eml->{-lei_fake_mid} = $mids->[0] = + PublicInbox::Import::digest2mid($dig, $eml, 0); + } + ($chash, $mids); +} + +sub _cmp_1st { # git->cat_async callback + my ($bref, $oid, $type, $size, $cmp) = @_; + # cmp: [chash, xoids, smsg, lms] + $bref //= $cmp->[3] ? $cmp->[3]->local_blob($oid, 1) : undef; + if ($bref && content_hash(PublicInbox::Eml->new($bref)) eq $cmp->[0]) { + $cmp->[1]->{$oid} = $cmp->[2]->{num}; + } +} + +# returns { OID => num } mapping for $eml matches +# The `num' hash value only makes sense from LeiSearch itself +# and is nonsense from the PublicInbox::LeiALE subclass +sub xoids_for { + my ($self, $eml, $min) = @_; + my ($chash, $mids) = content_key($eml); + my @overs = ($self->over // $self->overs_all); + my $git = $self->git; + my $xoids = {}; + # no lms when used via {ale}: + my $lms = $self->{-lms_rw} //= lms($self) if defined($self->{topdir}); + for my $mid (@$mids) { + for my $o (@overs) { + my ($id, $prev); + while (my $cur = $o->next_by_mid($mid, \$id, \$prev)) { + # {bytes} may be '' from old bug + $cur->{bytes} = 1 if $cur->{bytes} eq ''; + next if $cur->{bytes} == 0 || + $xoids->{$cur->{blob}}; + $git->cat_async($cur->{blob}, \&_cmp_1st, + [$chash, $xoids, $cur, $lms]); + if ($min && scalar(keys %$xoids) >= $min) { + $git->async_wait_all; + return $xoids; + } + } + } + } + $git->async_wait_all; + + # it could be an 'lei index'-ed file that just got renamed + if (scalar(keys %$xoids) < ($min // 1) && defined($self->{topdir})) { + my $hex = git_sha(1, $eml)->hexdigest; + my @n = $overs[0]->blob_exists($hex); + for (@n) { $xoids->{$hex} //= $_ } + } + scalar(keys %$xoids) ? $xoids : undef; +} + +# returns true if $eml is indexed by lei/store and keywords don't match +sub kw_changed { + my ($self, $eml, $new_kw_sorted, $docids) = @_; + my $cur_kw; + if ($eml) { + my $xoids = xoids_for($self, $eml) // return; + $docids //= []; + @$docids = sort { $a <=> $b } values %$xoids; + if (!@$docids && $self->over) { + my $bin = git_sha(1, $eml)->digest; + @$docids = $self->over->oidbin_exists($bin); + } + } + for my $id (@$docids) { + $cur_kw = eval { msg_keywords($self, $id) } and last; + } + if (!defined($cur_kw) && $@) { + $docids = join(', num:', @$docids); + croak "E: num:$docids keyword lookup failure: $@"; + } + # RFC 5550 sec 5.9 on the $Forwarded keyword states: + # "Once set, the flag SHOULD NOT be cleared" + if (exists($cur_kw->{forwarded}) && + !grep(/\Aforwarded\z/, @$new_kw_sorted)) { + delete $cur_kw->{forwarded}; + } + $cur_kw = join("\0", sort keys %$cur_kw); + join("\0", @$new_kw_sorted) eq $cur_kw ? 0 : 1; +} + +sub qparse_new { + my ($self) = @_; + my $qp = $self->SUPER::qparse_new; # PublicInbox::Search + $self->{qp_flags} |= PublicInbox::Search::FLAG_PHRASE() | + PublicInbox::Search::FLAG_PURE_NOT(); + $qp->add_boolean_prefix('kw', 'K'); + $qp->add_boolean_prefix('L', 'L'); + $qp +} + +sub lms { + my ($self) = @_; + require PublicInbox::LeiMailSync; + my $f = "$self->{topdir}/mail_sync.sqlite3"; + -f $f ? PublicInbox::LeiMailSync->new($f) : undef; +} + +# allow SolverGit->resolve_patch to work with "lei index" +sub smsg_eml { + my ($self, $smsg) = @_; + PublicInbox::Inbox::smsg_eml($self, $smsg) // do { + my $lms = lms($self); + my $bref = $lms ? $lms->local_blob($smsg->{blob}, 1) : undef; + $bref ? PublicInbox::Eml->new($bref) : undef; + }; +} + +1; diff --git a/lib/PublicInbox/LeiSelfSocket.pm b/lib/PublicInbox/LeiSelfSocket.pm new file mode 100644 index 00000000..0e15bc7c --- /dev/null +++ b/lib/PublicInbox/LeiSelfSocket.pm @@ -0,0 +1,37 @@ +# Copyright all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# dummy placeholder socket for internal lei commands. +# This receives what script/lei receives, but isn't connected +# to an interactive terminal so I'm not sure what to do with it... +package PublicInbox::LeiSelfSocket; +use v5.12; +use parent qw(PublicInbox::DS); +use Data::Dumper; +$Data::Dumper::Useqq = 1; # should've been the Perl default :P +use PublicInbox::Syscall qw(EPOLLIN); +use PublicInbox::IPC; + +sub new { + my ($cls, $r) = @_; + my $self = bless {}, $cls; + $r->blocking(0); + $self->SUPER::new($r, EPOLLIN); +} + +sub event_step { + my ($self) = @_; + my ($buf, @fds); + @fds = $PublicInbox::IPC::recv_cmd->($self->{sock}, $buf, 4096 * 33); + if (scalar(@fds) == 1 && !defined($fds[0])) { + return if $!{EAGAIN}; + die "recvmsg: $!" unless $!{ECONNRESET}; + } else { # just in case open so perl can auto-close them: + for (@fds) { open my $fh, '+<&=', $_ }; + } + return $self->close if $buf eq ''; + warn Dumper({ 'unexpected self msg' => $buf, fds => \@fds }); + # TODO: figure out what to do with these messages... +} + +1; diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm new file mode 100644 index 00000000..b2da2bc3 --- /dev/null +++ b/lib/PublicInbox/LeiStore.pm @@ -0,0 +1,667 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Local storage (cache/memo) for lei(1), suitable for personal/private +# mail iff on encrypted device/FS. Based on v2, but only deduplicates +# git storage based on git OID (index deduplication is done in ContentHash) +# +# for xref3, the following are constant: $eidx_key = '.', $xnum = -1 +# +# We rely on the synchronous IPC API for this in lei-daemon and +# multiple lei clients to write to it at once. This allows the +# lei/store IPC process to be decoupled from network latency in +# lei WQ workers. +package PublicInbox::LeiStore; +use strict; +use v5.10.1; +use parent qw(PublicInbox::Lock PublicInbox::IPC); +use PublicInbox::ExtSearchIdx; +use PublicInbox::Eml; +use PublicInbox::Import; +use PublicInbox::InboxWritable qw(eml_from_path); +use PublicInbox::V2Writable; +use PublicInbox::ContentHash qw(content_hash); +use PublicInbox::MID qw(mids); +use PublicInbox::LeiSearch; +use PublicInbox::MDA; +use PublicInbox::Spawn qw(spawn); +use PublicInbox::MdirReader; +use PublicInbox::LeiToMail; +use PublicInbox::Compat qw(uniqstr); +use PublicInbox::OnDestroy; +use File::Temp qw(tmpnam); +use POSIX (); +use IO::Handle (); # ->autoflush +use Sys::Syslog qw(syslog openlog); +use Errno qw(EEXIST ENOENT); +use PublicInbox::Syscall qw(rename_noreplace); +use PublicInbox::LeiStoreErr; +use PublicInbox::DS qw(add_uniq_timer); + +sub new { + my (undef, $dir, $opt) = @_; + my $eidx = PublicInbox::ExtSearchIdx->new($dir, $opt); + my $self = bless { priv_eidx => $eidx }, __PACKAGE__; + eidx_init($self)->done if $opt->{creat}; + $self; +} + +sub git { $_[0]->{priv_eidx}->git } # read-only + +sub packing_factor { $PublicInbox::V2Writable::PACKING_FACTOR } + +sub rotate_bytes { + $_[0]->{rotate_bytes} // ((1024 * 1024 * 1024) / $_[0]->packing_factor) +} + +sub git_ident ($) { + my ($git) = @_; + my $rdr = {}; + open $rdr->{2}, '>', '/dev/null' or die "open /dev/null: $!"; + chomp(my $i = $git->qx([qw(var GIT_COMMITTER_IDENT)], undef, $rdr)); + $i =~ /\A(.+) <([^>]+)> [0-9]+ [-\+]?[0-9]+$/ and return ($1, $2); + my ($user, undef, undef, undef, undef, undef, $gecos) = getpwuid($<); + ($user) = (($user // $ENV{USER} // '') =~ /([\w\-\.\+]+)/); + $user //= 'lei-user'; + ($gecos) = (($gecos // '') =~ /([\w\-\.\+ \t]+)/); + $gecos //= 'lei user'; + require Sys::Hostname; + my ($host) = (Sys::Hostname::hostname() =~ /([\w\-\.]+)/); + $host //= 'localhost'; + ($gecos, "$user\@$host") +} + +sub importer { + my ($self) = @_; + my $max; + my $im = $self->{im}; + if ($im) { + return $im if $im->{bytes_added} < $self->rotate_bytes; + + delete $self->{im}; + $im->done; + undef $im; + $self->barrier; + $max = $self->{priv_eidx}->{mg}->git_epochs + 1; + } + my (undef, $tl) = eidx_init($self); # acquire lock + $max //= $self->{priv_eidx}->{mg}->git_epochs; + while (1) { + my $latest = $self->{priv_eidx}->{mg}->add_epoch($max); + my $git = PublicInbox::Git->new($latest); + $self->done; # unlock + # re-acquire lock, update alternates for new epoch + (undef, $tl) = eidx_init($self); + my $packed_bytes = $git->packed_bytes; + my $unpacked_bytes = $packed_bytes / $self->packing_factor; + if ($unpacked_bytes >= $self->rotate_bytes) { + $max++; + next; + } + my ($n, $e) = git_ident($git); + $self->{im} = $im = PublicInbox::Import->new($git, $n, $e); + $im->{bytes_added} = int($packed_bytes / $self->packing_factor); + $im->{lock_path} = undef; + $im->{path_type} = 'v2'; + return $im; + } +} + +sub search { + PublicInbox::LeiSearch->new($_[0]->{priv_eidx}->{topdir}); +} + +sub cat_blob { + my ($self, $oid) = @_; + $self->{im} ? $self->{im}->cat_blob($oid) : undef; +} + +sub schedule_commit { + my ($self, $sec) = @_; + add_uniq_timer($self->{priv_eidx}->{topdir}, $sec, \&barrier, $self); +} + +# follows the stderr file +sub _tail_err { + my ($self) = @_; + my $err = $self->{-tmp_err} // return; + $err->clearerr; # clear EOF marker + my @msg = readline($err); + PublicInbox::LeiStoreErr::emit($self->{-err_wr}, @msg) and return; + # syslog is the last resort if lei-daemon broke + syslog('warning', '%s', $_) for @msg; +} + +sub eidx_init { + my ($self) = @_; + my $eidx = $self->{priv_eidx}; + my $tl = wantarray && $self->{-err_wr} ? + on_destroy(\&_tail_err, $self) : + undef; + $eidx->idx_init({-private => 1}); # acquires lock + wantarray ? ($eidx, $tl) : $eidx; +} + +sub _docids_for ($$) { + my ($self, $eml) = @_; + my %docids; + my $eidx = $self->{priv_eidx}; + my ($chash, $mids) = PublicInbox::LeiSearch::content_key($eml); + my $oidx = $eidx->{oidx}; + my $im = $self->{im}; + for my $mid (@$mids) { + my ($id, $prev); + while (my $cur = $oidx->next_by_mid($mid, \$id, \$prev)) { + next if $cur->{bytes} == 0; # external-only message + my $oid = $cur->{blob}; + my $docid = $cur->{num}; + my $bref = $im ? $im->cat_blob($oid) : undef; + $bref //= $eidx->git->cat_file($oid) // + _lms_rw($self)->local_blob($oid, 1) // do { + warn "W: $oid (#$docid) <$mid> not found\n"; + next; + }; + local $self->{current_info} = $oid; + my $x = PublicInbox::Eml->new($bref); + $docids{$docid} = $docid if content_hash($x) eq $chash; + } + } + sort { $a <=> $b } values %docids; +} + +# n.b. similar to LeiExportKw->export_kw_md, but this is for a single eml +sub export1_kw_md ($$$$$) { + my ($self, $mdir, $bn, $oidbin, $vmdish) = @_; # vmd/vmd_mod + my $orig = $bn; + my (@try, $unkn, $kw); + if ($bn =~ s/:2,([a-zA-Z]*)\z//) { + ($kw, $unkn) = PublicInbox::MdirReader::flags2kw($1); + if (my $set = $vmdish->{kw}) { + $kw = $set; + } elsif (my $add = $vmdish->{'+kw'}) { + @$kw{@$add} = (); + } elsif (my $del = $vmdish->{-kw}) { + delete @$kw{@$del}; + } # else no changes... + @try = qw(cur new); + } else { # no keywords, yet, could be in new/ + @try = qw(new cur); + $unkn = []; + if (my $set = $vmdish->{kw}) { + $kw = $set; + } elsif (my $add = $vmdish->{'+kw'}) { + @$kw{@$add} = (); # auto-vivify + } else { # ignore $vmdish->{-kw} + $kw = []; + } + } + $kw = [ keys %$kw ] if ref($kw) eq 'HASH'; + $bn .= ':2,'. PublicInbox::LeiToMail::kw2suffix($kw, @$unkn); + return if $orig eq $bn; # no change + + # we use link(2) + unlink(2) since rename(2) may + # inadvertently clobber if the "uniquefilename" part wasn't + # actually unique. + my $dst = "$mdir/cur/$bn"; + for my $d (@try) { + my $src = "$mdir/$d/$orig"; + if (rename_noreplace($src, $dst)) { + # TODO: verify oidbin? + $self->{lms}->mv_src("maildir:$mdir", + $oidbin, \$orig, $bn); + return; + } elsif ($! == EEXIST) { # lost race with "lei export-kw"? + return; + } elsif ($! != ENOENT) { + syslog('warning', "rename_noreplace($src -> $dst): $!"); + } + } + for (@try) { return if -e "$mdir/$_/$orig" }; + $self->{lms}->clear_src("maildir:$mdir", \$orig); +} + +sub sto_export_kw ($$$) { + my ($self, $docid, $vmdish) = @_; # vmdish (vmd or vmd_mod) + my ($eidx, $tl) = eidx_init($self); + my $lms = _lms_rw($self) // return; + my $xr3 = $eidx->{oidx}->get_xref3($docid, 1); + for my $row (@$xr3) { + my (undef, undef, $oidbin) = @$row; + my $locs = $lms->locations_for($oidbin) // next; + while (my ($loc, $ids) = each %$locs) { + if ($loc =~ s!\Amaildir:!!i) { + for my $id (@$ids) { + export1_kw_md($self, $loc, $id, + $oidbin, $vmdish); + } + } + # TODO: IMAP + } + } +} + +# vmd = { kw => [ qw(seen ...) ], L => [ qw(inbox ...) ] } +sub set_eml_vmd { + my ($self, $eml, $vmd, $docids) = @_; + my ($eidx, $tl) = eidx_init($self); + $docids //= [ _docids_for($self, $eml) ]; + for my $docid (@$docids) { + $eidx->idx_shard($docid)->ipc_do('set_vmd', $docid, $vmd); + sto_export_kw($self, $docid, $vmd); + } + $docids; +} + +sub add_eml_vmd { + my ($self, $eml, $vmd) = @_; + my ($eidx, $tl) = eidx_init($self); + my @docids = _docids_for($self, $eml); + for my $docid (@docids) { + $eidx->idx_shard($docid)->ipc_do('add_vmd', $docid, $vmd); + } + \@docids; +} + +sub remove_eml_vmd { # remove just the VMD + my ($self, $eml, $vmd) = @_; + my ($eidx, $tl) = eidx_init($self); + my @docids = _docids_for($self, $eml); + for my $docid (@docids) { + $eidx->idx_shard($docid)->ipc_do('remove_vmd', $docid, $vmd); + } + \@docids; +} + +sub _lms_rw ($) { # it is important to have eidx processes open before lms + my ($self) = @_; + $self->{lms} // do { + require PublicInbox::LeiMailSync; + my ($eidx, $tl) = eidx_init($self); + my $f = "$self->{priv_eidx}->{topdir}/mail_sync.sqlite3"; + my $lms = PublicInbox::LeiMailSync->new($f); + $lms->lms_write_prepare; + $self->{lms} = $lms; + }; +} + +sub _remove_if_local { # git->cat_async arg + my ($bref, $oidhex, $type, $size, $self) = @_; + $self->{im}->remove($bref) if $bref; +} + +sub remove_docids ($;@) { + my ($self, @docids) = @_; + my $eidx = eidx_init($self); + for my $docid (@docids) { + $eidx->remove_doc($docid); + $eidx->{oidx}->{dbh}->do(<<EOF, undef, $docid); +DELETE FROM xref3 WHERE docid = ? +EOF + } +} + +# remove the entire message from the index, does not touch mail_sync.sqlite3 +sub remove_eml { + my ($self, $eml) = @_; + my $im = $self->importer; # may create new epoch + my ($eidx, $tl) = eidx_init($self); + my $oidx = $eidx->{oidx}; + my @docids = _docids_for($self, $eml); + my $git = $eidx->git; + for my $docid (@docids) { + my $xr3 = $oidx->get_xref3($docid, 1); + for my $row (@$xr3) { + my (undef, undef, $oidbin) = @$row; + my $oidhex = unpack('H*', $oidbin); + $git->cat_async($oidhex, \&_remove_if_local, $self); + } + } + $git->async_wait_all; + remove_docids($self, @docids); + \@docids; +} + +sub oid2docid ($$) { + my ($self, $oid) = @_; + my $eidx = eidx_init($self); + my ($docid, @cull) = $eidx->{oidx}->blob_exists($oid); + if (@cull) { # fixup old bugs... + warn <<EOF; +W: $oid indexed as multiple docids: $docid @cull, culling to fixup old bugs +EOF + remove_docids($self, @cull); + } + $docid; +} + +sub _add_vmd ($$$$) { + my ($self, $idx, $docid, $vmd) = @_; + $idx->ipc_do('add_vmd', $docid, $vmd); + sto_export_kw($self, $docid, $vmd); +} + +sub _docids_and_maybe_kw ($$) { + my ($self, $docids) = @_; + return $docids unless wantarray; + my (@kw, $idx, @tmp); + for my $num (@$docids) { # likely only 1, unless ContentHash changes + # can't use ->search->msg_keywords on uncommitted docs + $idx = $self->{priv_eidx}->idx_shard($num); + @tmp = eval { $idx->ipc_do('get_terms', 'K', $num) }; + $@ ? warn("#$num get_terms: $@") : push(@kw, @tmp); + } + @kw = sort(uniqstr(@kw)) if @$docids > 1; + ($docids, \@kw); +} + +sub _reindex_1 { # git->cat_async callback + my ($bref, $hex, $type, $size, $smsg) = @_; + my $self = delete $smsg->{-sto}; + my ($eidx, $tl) = eidx_init($self); + $bref //= _lms_rw($self)->local_blob($hex, 1); + if ($bref) { + my $eml = PublicInbox::Eml->new($bref); + $smsg->{-merge_vmd} = 1; # preserve existing keywords + $eidx->idx_shard($smsg->{num})->index_eml($eml, $smsg); + } elsif ($type eq 'missing') { + # pre-release/buggy lei may've indexed external-only msgs, + # try to correct that, here + warn("E: missing $hex, culling (ancient lei artifact?)\n"); + $smsg->{to} = $smsg->{cc} = $smsg->{from} = ''; + $smsg->{bytes} = 0; + $eidx->{oidx}->update_blob($smsg, ''); + my $eml = PublicInbox::Eml->new("\r\n\r\n"); + $eidx->idx_shard($smsg->{num})->index_eml($eml, $smsg); + } else { + warn("E: $type $hex\n"); + } +} + +sub reindex_art { + my ($self, $art) = @_; + my ($eidx, $tl) = eidx_init($self); + my $smsg = $eidx->{oidx}->get_art($art) // return; + return if $smsg->{bytes} == 0; # external-only message + $smsg->{-sto} = $self; + $eidx->git->cat_async($smsg->{blob} // die("no blob (#$art)"), + \&_reindex_1, $smsg); +} + +sub reindex_done { + my ($self) = @_; + my ($eidx, $tl) = eidx_init($self); + $eidx->git->async_wait_all; + # ->done to be called via sto_barrier_request +} + +sub add_eml { + my ($self, $eml, $vmd, $xoids) = @_; + my $im = $self->{-fake_im} // $self->importer; # may create new epoch + my ($eidx, $tl) = eidx_init($self); + my $oidx = $eidx->{oidx}; # PublicInbox::Import::add checks this + my $smsg = bless { -oidx => $oidx }, 'PublicInbox::Smsg'; + $smsg->{-eidx_git} = $eidx->git if !$self->{-fake_im}; + my $im_mark = $im->add($eml, undef, $smsg); + if ($vmd && $vmd->{sync_info}) { + _lms_rw($self)->set_src($smsg->oidbin, @{$vmd->{sync_info}}); + } + unless ($im_mark) { # duplicate blob returns undef + return unless wantarray || $vmd; + my @docids = $oidx->blob_exists($smsg->{blob}); + if ($vmd) { + for my $docid (@docids) { + my $idx = $eidx->idx_shard($docid); + _add_vmd($self, $idx, $docid, $vmd); + } + } + return _docids_and_maybe_kw $self, \@docids; + } + + local $self->{current_info} = $smsg->{blob}; + my $vivify_xvmd = delete($smsg->{-vivify_xvmd}) // []; # exact matches + if ($xoids) { # fuzzy matches from externals in ale->xoids_for + delete $xoids->{$smsg->{blob}}; # added later + if (scalar keys %$xoids) { + my %docids = map { $_ => 1 } @$vivify_xvmd; + for my $oid (keys %$xoids) { + my $docid = oid2docid($self, $oid); + $docids{$docid} = $docid if defined($docid); + } + @$vivify_xvmd = sort { $a <=> $b } keys(%docids); + } + } + if (@$vivify_xvmd) { # docids list + $xoids //= {}; + $xoids->{$smsg->{blob}} = 1; + for my $docid (@$vivify_xvmd) { + my $cur = $oidx->get_art($docid); + my $idx = $eidx->idx_shard($docid); + if (!$cur || $cur->{bytes} == 0) { # really vivifying + $smsg->{num} = $docid; + $oidx->add_overview($eml, $smsg); + $smsg->{-merge_vmd} = 1; + $idx->index_eml($eml, $smsg); + } else { # lse fuzzy hit off ale + $idx->ipc_do('add_eidx_info', $docid, '.', $eml); + } + for my $oid (keys %$xoids) { + $oidx->add_xref3($docid, -1, $oid, '.'); + } + _add_vmd($self, $idx, $docid, $vmd) if $vmd; + } + _docids_and_maybe_kw $self, $vivify_xvmd; + } elsif (my @docids = _docids_for($self, $eml)) { + # fuzzy match from within lei/store + for my $docid (@docids) { + my $idx = $eidx->idx_shard($docid); + $oidx->add_xref3($docid, -1, $smsg->{blob}, '.'); + # add_eidx_info for List-Id + $idx->ipc_do('add_eidx_info', $docid, '.', $eml); + _add_vmd($self, $idx, $docid, $vmd) if $vmd; + } + _docids_and_maybe_kw $self, \@docids; + } else { # totally new message, no keywords + delete $smsg->{-oidx}; # for IPC-friendliness + $smsg->{num} = $oidx->adj_counter('eidx_docid', '+'); + $oidx->add_overview($eml, $smsg); + $oidx->add_xref3($smsg->{num}, -1, $smsg->{blob}, '.'); + my $idx = $eidx->idx_shard($smsg->{num}); + $idx->index_eml($eml, $smsg); + _add_vmd($self, $idx, $smsg->{num}, $vmd) if $vmd; + wantarray ? ($smsg, []) : $smsg; + } +} + +sub set_eml { + my ($self, $eml, $vmd, $xoids) = @_; + add_eml($self, $eml, $vmd, $xoids) // + set_eml_vmd($self, $eml, $vmd); +} + +sub index_eml_only { + my ($self, $eml, $vmd, $xoids) = @_; + require PublicInbox::FakeImport; + local $self->{-fake_im} = PublicInbox::FakeImport->new; + set_eml($self, $eml, $vmd, $xoids); +} + +# store {kw} / {L} info for a message which is only in an external +sub _external_only ($$$) { + my ($self, $xoids, $eml) = @_; + my $eidx = $self->{priv_eidx}; + my $oidx = $eidx->{oidx} // die 'BUG: {oidx} missing'; + my $smsg = bless { blob => '' }, 'PublicInbox::Smsg'; + $smsg->{num} = $oidx->adj_counter('eidx_docid', '+'); + # save space for an externals-only message + my $hdr = $eml->header_obj; + $smsg->populate($hdr); # sets lines == 0 + $smsg->{bytes} = 0; + delete @$smsg{qw(From Subject)}; + $smsg->{to} = $smsg->{cc} = $smsg->{from} = ''; + $oidx->add_overview($hdr, $smsg); # subject+references for threading + $smsg->{subject} = ''; + for my $oid (keys %$xoids) { + $oidx->add_xref3($smsg->{num}, -1, $oid, '.'); + } + my $idx = $eidx->idx_shard($smsg->{num}); + $idx->index_eml(PublicInbox::Eml->new("\n\n"), $smsg); + ($smsg, $idx); +} + +sub update_xvmd { + my ($self, $xoids, $eml, $vmd_mod) = @_; + my ($eidx, $tl) = eidx_init($self); + my $oidx = $eidx->{oidx}; + my %seen; + for my $oid (keys %$xoids) { + my $docid = oid2docid($self, $oid) // next; + delete $xoids->{$oid}; + next if $seen{$docid}++; + my $idx = $eidx->idx_shard($docid); + $idx->ipc_do('update_vmd', $docid, $vmd_mod); + sto_export_kw($self, $docid, $vmd_mod); + } + return unless scalar(keys(%$xoids)); + + # see if it was indexed, but with different OID(s) + if (my @docids = _docids_for($self, $eml)) { + for my $docid (@docids) { + next if $seen{$docid}; + for my $oid (keys %$xoids) { + $oidx->add_xref3($docid, -1, $oid, '.'); + } + my $idx = $eidx->idx_shard($docid); + $idx->ipc_do('update_vmd', $docid, $vmd_mod); + sto_export_kw($self, $docid, $vmd_mod); + } + return; + } + # totally unseen + my ($smsg, $idx) = _external_only($self, $xoids, $eml); + $idx->ipc_do('update_vmd', $smsg->{num}, $vmd_mod); + sto_export_kw($self, $smsg->{num}, $vmd_mod); +} + +# set or update keywords for external message, called via ipc_do +sub set_xvmd { + my ($self, $xoids, $eml, $vmd) = @_; + + my ($eidx, $tl) = eidx_init($self); + my $oidx = $eidx->{oidx}; + my %seen; + + # see if we can just update existing docs + for my $oid (keys %$xoids) { + my $docid = oid2docid($self, $oid) // next; + delete $xoids->{$oid}; # all done with this oid + next if $seen{$docid}++; + my $idx = $eidx->idx_shard($docid); + $idx->ipc_do('set_vmd', $docid, $vmd); + sto_export_kw($self, $docid, $vmd); + } + return unless scalar(keys(%$xoids)); + + # n.b. we don't do _docids_for here, we expect the caller + # already checked $lse->kw_changed before calling this sub + + return unless (@{$vmd->{kw} // []}) || (@{$vmd->{L} // []}); + # totally unseen: + my ($smsg, $idx) = _external_only($self, $xoids, $eml); + $idx->ipc_do('add_vmd', $smsg->{num}, $vmd); + sto_export_kw($self, $smsg->{num}, $vmd); +} + +sub check_done { + my ($self) = @_; + $self->git->_active ? + add_uniq_timer("$self-check_done", 5, \&check_done, $self) : + done($self); +} + +sub xchg_stderr { + my ($self) = @_; + _tail_err($self) if $self->{-err_wr}; + my $dir = $self->{priv_eidx}->{topdir}; + return unless -e $dir; + delete $self->{-tmp_err}; + my ($err, $name) = tmpnam(); + open STDERR, '>>', $name or die "dup2: $!"; + unlink($name); + STDERR->autoflush(1); # shared with shard subprocesses + $self->{-tmp_err} = $err; # separate file description for RO access + undef; +} + +sub _commit ($$) { + my ($self, $cmd) = @_; # cmd is 'done' or 'barrier' + my ($errfh, $lei_sock) = @$self{0, 1}; # via sto_barrier_request + my @err; + if ($self->{im}) { + eval { $self->{im}->$cmd }; + push(@err, "E: import $cmd: $@\n") if $@; + } + delete $self->{lms}; + eval { $self->{priv_eidx}->$cmd }; + push(@err, "E: priv_eidx $cmd: $@\n") if $@; + print { $errfh // \*STDERR } @err; + send($lei_sock, 'child_error 256', 0) if @err && $lei_sock; + xchg_stderr($self); + die @err if @err; + # $lei_sock goes out-of-scope and script/lei can terminate +} + +sub barrier { + my ($self) = @_; + _commit $self, 'barrier'; + add_uniq_timer("$self-check_done", 5, \&check_done, $self); + undef; +} + +sub done { _commit $_[0], 'done' } + +sub ipc_atfork_child { + my ($self) = @_; + my $lei = $self->{lei}; + $lei->_lei_atfork_child(1) if $lei; + xchg_stderr($self); + if (my $to_close = delete($self->{to_close})) { + close($_) for @$to_close; + } + openlog('lei/store', 'pid,nowait,nofatal,ndelay', 'user'); + $self->SUPER::ipc_atfork_child; +} + +sub recv_and_run { + my ($self, @args) = @_; + local $PublicInbox::DS::in_loop = 0; # waitpid synchronously + $self->SUPER::recv_and_run(@args); +} + +sub _sto_atexit { # awaitpid cb + my ($pid) = @_; + warn "lei/store PID:$pid died \$?=$?\n" if $?; +} + +sub write_prepare { + my ($self, $lei) = @_; + $lei // die 'BUG: $lei not passed'; + unless ($self->{-wq_s1}) { + my $dir = $lei->store_path; + substr($dir, -length('/lei/store'), 10, ''); + pipe(my ($r, $w)) or die "pipe: $!"; + $w->autoflush(1); + # Mail we import into lei are private, so headers filtered out + # by -mda for public mail are not appropriate + local @PublicInbox::MDA::BAD_HEADERS = (); + local $SIG{ALRM} = 'IGNORE'; + $self->wq_workers_start("lei/store $dir", 1, $lei->oldset, { + lei => $lei, + -err_wr => $w, + to_close => [ $r ], + }, \&_sto_atexit); + PublicInbox::LeiStoreErr->new($r, $lei); + } + $lei->{sto} = $self; +} + +1; diff --git a/lib/PublicInbox/LeiStoreErr.pm b/lib/PublicInbox/LeiStoreErr.pm new file mode 100644 index 00000000..c8bc72b6 --- /dev/null +++ b/lib/PublicInbox/LeiStoreErr.pm @@ -0,0 +1,62 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# forwards stderr from lei/store process to any lei clients using +# the same store, falls back to syslog if no matching clients exist. +package PublicInbox::LeiStoreErr; +use v5.12; +use parent qw(PublicInbox::DS); +use PublicInbox::Syscall qw(EPOLLIN); +use Sys::Syslog qw(openlog syslog closelog); +use IO::Handle (); # ->blocking +use Time::HiRes (); +use autodie qw(open); +our $err_wr; + +# We don't want blocked stderr on clients to block lei/store or lei-daemon. +# We can't make stderr non-blocking since it can break MUAs or anything +# lei might spawn. So we setup a timer to wake us up after a second if +# printing to a user's stderr hasn't completed, yet. Unfortunately, +# EINTR alone isn't enough since Perl auto-restarts writes on signals, +# so to interrupt writes to clients with blocked stderr, we dup the +# error output to $err_wr ahead-of-time and close $err_wr in the +# SIGALRM handler to ensure `print' gets aborted: + +sub abort_err_wr { close($err_wr) if $err_wr; undef $err_wr } + +sub emit ($@) { + my ($efh, @msg) = @_; + open(local $err_wr, '>&', $efh); # fdopen(dup(fileno($efh)), "w") + local $SIG{ALRM} = \&abort_err_wr; + Time::HiRes::alarm(1.0, 0.1); + my $ret = print $err_wr @msg; + Time::HiRes::alarm(0); + $ret; +} + +sub new { + my ($cls, $rd, $lei) = @_; + my $self = bless { sock => $rd, store_path => $lei->store_path }, $cls; + $rd->blocking(0); + $self->SUPER::new($rd, EPOLLIN); # level-trigger +} + +sub event_step { + my ($self) = @_; + my $n = sysread($self->{sock}, my $buf, 8192); + return ($!{EAGAIN} ? 0 : $self->close) if !defined($n); + return $self->close if !$n; + my $printed; + for my $lei (grep defined, @PublicInbox::DS::FD_MAP) { + my $cb = $lei->can('store_path') // next; + next if $cb->($lei) ne $self->{store_path}; + emit($lei->{2} // next, $buf) and $printed = 1; + } + if (!$printed) { + openlog('lei/store', 'pid,nowait,nofatal,ndelay', 'user'); + for my $l (split(/\n/, $buf)) { syslog('warning', '%s', $l) } + closelog(); # don't share across fork + } +} + +1; diff --git a/lib/PublicInbox/LeiSucks.pm b/lib/PublicInbox/LeiSucks.pm new file mode 100644 index 00000000..ddb3faf7 --- /dev/null +++ b/lib/PublicInbox/LeiSucks.pm @@ -0,0 +1,74 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Undocumented hidden command somebody might discover if they're +# frustrated and need to report a bug. There's no manpage and +# it won't show up in tab completions or help. +package PublicInbox::LeiSucks; +use strict; +use v5.10.1; +use PublicInbox::SHA qw(sha1_hex); +use Config; +use POSIX (); +use PublicInbox::Config; +use PublicInbox::IPC; +use PublicInbox::IO qw(read_all); + +sub lei_sucks { + my ($lei, @argv) = @_; + $lei->start_pager if -t $lei->{1}; + my ($os, undef, $rel, undef, $mac)= POSIX::uname(); + if ($mac eq 'x86_64' && $Config{ptrsize} == 4) { + $mac .= $Config{cppsymbols} =~ /\b__ILP32__=1\b/ ? + ',u=x32' : ',u=x86'; + } + eval { require PublicInbox }; + my $pi_ver = eval('$PublicInbox::VERSION') // '(???)'; + my $nproc = PublicInbox::IPC::detect_nproc() // '?'; + my @out = ("lei $pi_ver\n", + "perl $Config{version} / $os $rel / $mac ". + "ptrsize=$Config{ptrsize} nproc=$nproc\n"); + chomp(my $gv = `git --version` || "git missing"); + $gv =~ s/ version / /; + my $json = ref(PublicInbox::Config->json); + $json .= ' ' . eval('$'.$json.'::VERSION') if $json; + $json ||= '(no JSON)'; + push @out, "$gv / $json\n"; + if (eval { require PublicInbox::Over }) { + push @out, 'SQLite '. + (eval('$DBD::SQLite::sqlite_version') // '(undef)') . + ', DBI '.(eval('$DBI::VERSION') // '(undef)') . + ', DBD::SQLite '. + (eval('$DBD::SQLite::VERSION') // '(undef)')."\n"; + } else { + push @out, "Unable to load DBI / DBD::SQLite: $@\n"; + } + if (eval { require PublicInbox::Search } && + PublicInbox::Search::load_xapian()) { + push @out, 'Xapian '. + join('.', map { + $PublicInbox::Search::Xap->can($_)->(); + } qw(major_version minor_version revision)) . + ", bindings: $PublicInbox::Search::Xap"; + my $xs_ver = eval '$'."$PublicInbox::Search::Xap".'::VERSION'; + push @out, $xs_ver ? " $xs_ver\n" : " SWIG\n"; + } else { + push @out, "Xapian not available: $@\n"; + } + push @out, "public-inbox blob OIDs of loaded features:\n"; + for my $m (grep(m{^PublicInbox/}, sort keys %INC)) { + my $f = $INC{$m} // next; # lazy require failed (missing dep) + open my $fh, '<', $f or do { warn "open($f): $!"; next }; + my $size = -s $fh; + my $hex = sha1_hex("blob $size\0".read_all($fh, $size)); + push @out, ' '.$hex.' '.$m."\n"; + } + push @out, <<'EOM'; +Let us know how it sucks! Please include the above and any other +relevant information when sending plain-text mail to us at: +meta@public-inbox.org -- archives: https://public-inbox.org/meta/ +EOM + $lei->out(@out); +} + +1; diff --git a/lib/PublicInbox/LeiTag.pm b/lib/PublicInbox/LeiTag.pm new file mode 100644 index 00000000..da8caeb7 --- /dev/null +++ b/lib/PublicInbox/LeiTag.pm @@ -0,0 +1,105 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# handles "lei tag" command +package PublicInbox::LeiTag; +use v5.12; +use parent qw(PublicInbox::IPC PublicInbox::LeiInput); +use PublicInbox::InboxWritable qw(eml_from_path); +use PublicInbox::OnDestroy; + +sub input_eml_cb { # used by PublicInbox::LeiInput::input_fh + my ($self, $eml) = @_; + if (my $xoids = $self->{lse}->xoids_for($eml) // # tries LeiMailSync + $self->{lei}->{ale}->xoids_for($eml)) { + $self->{lei}->{sto}->wq_do('update_xvmd', $xoids, $eml, + $self->{lei}->{vmd_mod}); + } else { + ++$self->{-nr_unimported}; + } +} + +sub pmdir_cb { # called via wq_io_do from LeiPmdir->each_mdir_fn + my ($self, $f) = @_; + my $eml = eml_from_path($f) or return; + input_eml_cb($self, $eml); +} + +sub lei_tag { # the "lei tag" method + my ($lei, @argv) = @_; + $lei->{opt}->{'in-format'} //= 'eml' if $lei->{opt}->{stdin}; + my $sto = $lei->_lei_store(1)->write_prepare($lei); + my $self = bless {}, __PACKAGE__; + $lei->ale; # refresh and prepare + $self->prepare_inputs($lei, \@argv) or return; + grep(defined, @{$lei->{vmd_mod}}{qw(+kw +L -L -kw)}) or + return $lei->fail('no keywords or labels specified'); + $lei->{-err_type} = 'non-fatal'; + $lei->wq1_start($self); +} + +sub note_unimported { + my ($self) = @_; + my $n = $self->{-nr_unimported} or return; + $self->{lei}->{pkt_op_p}->pkt_do('incr', -nr_unimported => $n); +} + +sub ipc_atfork_child { + my ($self) = @_; + PublicInbox::LeiInput::input_only_atfork_child($self); + $self->{lse} = $self->{lei}->{sto}->search; + # this goes out-of-scope at worker process exit: + on_destroy \¬e_unimported, $self; +} + +# Workaround bash word-splitting s to ['kw', ':', 'keyword' ...] +# Maybe there's a better way to go about this in +# contrib/completion/lei-completion.bash +sub _complete_tag_common ($) { + my ($argv) = @_; + # Workaround bash word-splitting URLs to ['https', ':', '//' ...] + # Maybe there's a better way to go about this in + # contrib/completion/lei-completion.bash + my $re = ''; + my $cur = pop(@$argv) // ''; + if (@$argv) { + my @x = @$argv; + if ($cur eq ':' && @x) { + push @x, $cur; + $cur = ''; + } + while (@x > 2 && $x[0] !~ /\A[+\-](?:kw|L)\z/ && + $x[1] ne ':') { + shift @x; + } + if (@x >= 2) { # qw(kw : $KEYWORD) or qw(kw :) + $re = join('', @x); + } else { # just return everything and hope for the best + $re = join('', @$argv); + } + $re = quotemeta($re); + } + ($cur, $re); +} + +# FIXME: same problems as _complete_forget_external and similar +sub _complete_tag { + my ($self, @argv) = @_; + require PublicInbox::LeiImport; + my @in = PublicInbox::LeiImport::_complete_import(@_); + my @L = eval { $self->_lei_store->search->all_terms('L') }; + my @kwL = ((map { ("+kw:$_", "-kw:$_") } @PublicInbox::LeiInput::KW), + (map { ("+L:$_", "-L:$_") } @L)); + my ($cur, $re) = _complete_tag_common(\@argv); + my @m = map { + # only return the part specified on the CLI + # don't duplicate if already 100% completed + /\A$re(\Q$cur\E.*)/ ? ($cur eq $1 ? () : $1) : (); + } grep(/$re\Q$cur/, @kwL); + (@in, (@m ? @m : @kwL)); +} + +no warnings 'once'; # the following works even when LeiAuth is lazy-loaded +*net_merge_all_done = \&PublicInbox::LeiInput::input_only_net_merge_all_done; + +1; diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm new file mode 100644 index 00000000..5481b5e4 --- /dev/null +++ b/lib/PublicInbox/LeiToMail.pm @@ -0,0 +1,878 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Writes PublicInbox::Eml objects atomically to a mbox variant or Maildir +package PublicInbox::LeiToMail; +use strict; +use v5.10.1; +use parent qw(PublicInbox::IPC); +use PublicInbox::Eml; +use PublicInbox::IO; +use PublicInbox::Git; +use PublicInbox::Spawn qw(spawn); +use PublicInbox::Import; +use IO::Handle; # ->autoflush +use Fcntl qw(SEEK_SET SEEK_END O_CREAT O_EXCL O_WRONLY); +use PublicInbox::Syscall qw(rename_noreplace); +use autodie qw(pipe open seek close); +use Carp qw(croak); + +my %kw2char = ( # Maildir characters + draft => 'D', + flagged => 'F', + forwarded => 'P', # passed + answered => 'R', + seen => 'S', +); + +my %kw2status = ( + flagged => [ 'X-Status' => 'F' ], + answered => [ 'X-Status' => 'A' ], + seen => [ 'Status' => 'R' ], + draft => [ 'X-Status' => 'T' ], +); + +sub _mbox_hdr_buf ($$$) { + my ($eml, $type, $smsg) = @_; + $eml->header_set($_) for (qw(Lines Bytes Content-Length)); + + my %hdr = (Status => []); # set Status, X-Status + for my $k (@{$smsg->{kw} // []}) { + if (my $ent = $kw2status{$k}) { + push @{$hdr{$ent->[0]}}, $ent->[1]; + } else { # X-Label? + warn "# keyword `$k' not supported for mbox\n"; + } + } + # When writing to empty mboxes, messages are always 'O' + # (not-\Recent in IMAP), it saves MUAs the trouble of + # rewriting the mbox if no other changes are made. + # We put 'O' at the end (e.g. "Status: RO") to match mutt(1) output. + # We only set smsg->{-recent} if augmenting existing stores. + my $status = join('', sort(@{$hdr{Status}})); + $status .= 'O' unless $smsg->{-recent}; + $eml->header_set('Status', $status) if $status; + if (my $chars = delete $hdr{'X-Status'}) { + $eml->header_set('X-Status', join('', sort(@$chars))); + } + my $buf = delete $eml->{hdr}; + + PublicInbox::Eml::strip_from($$buf); + my $ident = $smsg->{blob} // 'lei'; + if (defined(my $pct = $smsg->{pct})) { $ident .= "=$pct" } + + substr($$buf, 0, 0, # prepend From line + "From $ident\@$type Thu Jan 1 00:00:00 1970$eml->{crlf}"); + $buf; +} + +sub atomic_append { # for on-disk destinations (O_APPEND, or O_EXCL) + my ($lei, $buf) = @_; + if (defined(my $w = syswrite($lei->{1} // return, $$buf))) { + return if $w == length($$buf); + $buf = "short atomic write: $w != ".length($$buf); + } elsif ($!{EPIPE}) { + return $lei->note_sigpipe(1); + } else { + $buf = "atomic write: $!"; + } + $lei->fail($buf); +} + +sub eml2mboxrd ($;$) { + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxrd', $smsg); + if (my $bdy = delete $eml->{bdy}) { + $$bdy =~ s/^(>*From )/>$1/gm; + $$buf .= $eml->{crlf}; + substr($$bdy, 0, 0, $$buf); # prepend header + $buf = $bdy; + } + $$buf .= $eml->{crlf}; + $buf; +} + +sub eml2mboxo { + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxo', $smsg); + if (my $bdy = delete $eml->{bdy}) { + $$bdy =~ s/^From />From /gm; + $$buf .= $eml->{crlf}; + substr($$bdy, 0, 0, $$buf); # prepend header + $buf = $bdy; + } + $$buf .= $eml->{crlf}; + $buf; +} + +sub _mboxcl_common ($$$) { + my ($buf, $bdy, $crlf) = @_; + # add Lines: so mutt won't have to add it on MUA close + my $lines = $$bdy =~ tr!\n!\n!; + $$buf .= 'Content-Length: '.length($$bdy).$crlf. + 'Lines: '.$lines.$crlf.$crlf; + substr($$bdy, 0, 0, $$buf); # prepend header + $$bdy .= $crlf; + $bdy; +} + +# mboxcl still escapes "From " lines +sub eml2mboxcl { + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxcl', $smsg); + my $bdy = delete($eml->{bdy}) // \(my $empty = ''); + $$bdy =~ s/^From />From /gm; + _mboxcl_common($buf, $bdy, $eml->{crlf}); +} + +# mboxcl2 has no "From " escaping +sub eml2mboxcl2 { + my ($eml, $smsg) = @_; + my $buf = _mbox_hdr_buf($eml, 'mboxcl2', $smsg); + my $bdy = delete($eml->{bdy}) // \(my $empty = ''); + _mboxcl_common($buf, $bdy, $eml->{crlf}); +} + +sub git_to_mail { # git->cat_async callback + my ($bref, $oid, $type, $size, $smsg) = @_; + $type // return; # called by PublicInbox::Git::close + return if $PublicInbox::Git::in_cleanup; + my $self = delete $smsg->{l2m} // croak "BUG: no l2m (type=$type)"; + $self->{lei} // croak "BUG: no {lei} (type=$type)"; + eval { + if ($type eq 'missing' && + ($bref = $self->{-lms_rw}->local_blob($oid, 1))) { + $type = 'blob'; + $size = length($$bref); + } + $type eq 'blob' or return $self->{lei}->child_error(0, + "W: $oid is $type (!= blob)"); + $size or return $self->{lei}->child_error(0,"E: $oid is empty"); + $smsg->{blob} eq $oid or die "BUG: expected=$smsg->{blob}"; + $smsg->{bytes} ||= $size; + $self->{wcb}->($bref, $smsg); + }; + $self->{lei}->fail("$@ (oid=$oid)") if $@; +} + +sub reap_compress { # awaitpid callback + my ($pid, $lei, $cmd, $old_out) = @_; + $lei->{1} = $old_out; + $lei->fail($?, "@$cmd failed") if $?; +} + +sub _post_augment_mbox { # open a compressor process from top-level lei-daemon + my ($self, $lei) = @_; + my $zsfx = $self->{zsfx} or return; + my $cmd = PublicInbox::MboxReader::zsfx2cmd($zsfx, undef, $lei); + my ($r, $w) = @{delete $lei->{zpipe}}; + my $rdr = { 0 => $r, 1 => $lei->{1}, 2 => $lei->{2}, pgid => 0 }; + $lei->{1} = PublicInbox::IO::attach_pid($w, spawn($cmd, undef, $rdr), + \&reap_compress, $lei, $cmd, $lei->{1}); +} + +# --augment existing output destination, with deduplication +sub _augment { # MboxReader eml_cb + my ($eml, $lei) = @_; + # ignore return value, just populate the skv + $lei->{dedupe}->is_dup($eml); +} + +sub _mbox_augment_kw_maybe { + my ($eml, $lei, $lse, $augment) = @_; + my $kw = PublicInbox::MboxReader::mbox_keywords($eml); + update_kw_maybe($lei, $lse, $eml, $kw); + _augment($eml, $lei) if $augment; +} + +sub _mbox_write_cb ($$) { + my ($self, $lei) = @_; + my $ovv = $lei->{ovv}; + my $m = 'eml2'.$ovv->{fmt}; + my $eml2mbox = $self->can($m) or die "$self->$m missing"; + $lei->{1} // die "no stdout ($m, $ovv->{dst})"; # redirected earlier + $lei->{1}->autoflush(1); + my $atomic_append = !defined($ovv->{lock_path}); + my $dedupe = $lei->{dedupe}; + $dedupe->prepare_dedupe; + my $lse = $lei->{lse}; # may be undef + my $set_recent = $dedupe->has_entries; + sub { # for git_to_mail + my ($buf, $smsg, $eml) = @_; + $eml //= PublicInbox::Eml->new($buf); + ++$self->{-nr_seen}; + return if $dedupe->is_dup($eml, $smsg); + $lse->xsmsg_vmd($smsg) if $lse; + $smsg->{-recent} = 1 if $set_recent; + $buf = $eml2mbox->($eml, $smsg); + if ($atomic_append) { + atomic_append($lei, $buf); + } else { + my $lk = $ovv->lock_for_scope; + $lei->out($$buf); + } + ++$self->{-nr_write}; + } +} + +sub update_kw_maybe ($$$$) { + my ($lei, $lse, $eml, $kw) = @_; + return unless $lse; + my $c = $lse->kw_changed($eml, $kw, my $docids = []); + my $vmd = { kw => $kw }; + if (scalar @$docids) { # already in lei/store + $lei->{sto}->wq_do('set_eml_vmd', undef, $vmd, $docids) if $c; + } elsif (my $xoids = $lei->{ale}->xoids_for($eml)) { + # it's in an external, only set kw, here + $lei->{sto}->wq_do('set_xvmd', $xoids, $eml, $vmd); + } else { # never-before-seen, import the whole thing + # XXX this is critical in protecting against accidental + # data loss without --augment + $lei->{sto}->wq_do('set_eml', $eml, $vmd); + } +} + +sub _md_update { # maildir_each_eml cb + my ($f, $kw, $eml, $lei, $lse, $unlink) = @_; + update_kw_maybe($lei, $lse, $eml, $kw); + $unlink ? unlink($f) : _augment($eml, $lei); +} + +# maildir_each_file callback, \&CORE::unlink doesn't work with it +sub _unlink { unlink($_[0]) } + +sub _rand () { + state $seq = 0; + sprintf('%x,%x,%x,%x', rand(0xffffffff), time, $$, ++$seq); +} + +sub kw2suffix ($;@) { + my $kw = shift; + join('', sort(map { $kw2char{$_} // () } @$kw, @_)); +} + +sub _buf2maildir ($$$$) { + my ($dst, $buf, $smsg, $dir) = @_; + my $kw = $smsg->{kw} // []; + my $rand = ''; # chosen by die roll :P + my ($tmp, $fh, $base, $ok); + my $common = $smsg->{blob} // _rand; + if (defined(my $pct = $smsg->{pct})) { $common .= "=$pct" } + do { + $tmp = $dst.'tmp/'.$rand.$common; + } while (!($ok = sysopen($fh, $tmp, O_CREAT|O_EXCL|O_WRONLY)) && + $!{EEXIST} && ($rand = _rand.',')); + if ($ok && print $fh $$buf and $fh->close) { + $dst .= $dir; # 'new/' or 'cur/' + $rand = ''; + do { + $base = $rand.$common.':2,'.kw2suffix($kw); + } while (!($ok = rename_noreplace($tmp, $dst.$base)) && + $!{EEXIST} && ($rand = _rand.',')); + \$base; + } else { + my $err = "Error writing $smsg->{blob} to $dst: $!\n"; + $_[0] = undef; # clobber dst + unlink($tmp); + die $err; + } +} + +sub _maildir_write_cb ($$) { + my ($self, $lei) = @_; + my $dedupe = $lei->{dedupe}; + $dedupe->prepare_dedupe if $dedupe; + my $dst = $lei->{ovv}->{dst}; + my $lse = $lei->{lse}; # may be undef + my $lms = $self->{-lms_rw}; + my $out = $lms ? 'maildir:'.$lei->abs_path($dst) : undef; + $lms->lms_write_prepare if $lms; + + # Favor cur/ and only write to new/ when augmenting. This + # saves MUAs from having to do a mass rename when the initial + # search result set is huge. + my $dir = $dedupe && $dedupe->has_entries ? 'new/' : 'cur/'; + sub { # for git_to_mail + my ($bref, $smsg, $eml) = @_; + $dst // return $lei->fail; # dst may be undef-ed in last run + + ++$self->{-nr_seen}; + return if $dedupe && $dedupe->is_dup($eml // + PublicInbox::Eml->new($$bref), + $smsg); + $lse->xsmsg_vmd($smsg) if $lse; + my $n = _buf2maildir($dst, $bref // \($eml->as_string), + $smsg, $dir); + $lms->set_src($smsg->oidbin, $out, $n) if $lms; + ++$self->{-nr_write}; + } +} + +sub _imap_write_cb ($$) { + my ($self, $lei) = @_; + my $dedupe = $lei->{dedupe}; + $dedupe->prepare_dedupe if $dedupe; + my $append = $lei->{net}->can('imap_append'); + my $uri = $self->{uri} // die 'BUG: no {uri}'; + my $mic = $lei->{net}->mic_get($uri) // die <<EOM; +E: $uri connection failed. +E: Consider using `--jobs ,1' to limit IMAP connections +EOM + my $folder = $uri->mailbox; + $uri->uidvalidity($mic->uidvalidity($folder)); + my $lse = $lei->{lse}; # may be undef + my $lms = $self->{-lms_rw}; + $lms->lms_write_prepare if $lms; + sub { # for git_to_mail + my ($bref, $smsg, $eml) = @_; + $mic // return $lei->fail; # mic may be undef-ed in last run + + ++$self->{-nr_seen}; + return if $dedupe && $dedupe->is_dup($eml // + PublicInbox::Eml->new($$bref), + $smsg); + $lse->xsmsg_vmd($smsg) if $lse; + my $uid = eval { $append->($mic, $folder, $bref, $smsg, $eml) }; + if (my $err = $@) { + undef $mic; + die $err; + } + # imap_append returns UID if IMAP server has UIDPLUS extension + ($lms && $uid =~ /\A[0-9]+\z/) and + $lms->set_src($smsg->oidbin, $$uri, $uid + 0); + ++$self->{-nr_write}; + } +} + +sub _text_write_cb ($$) { + my ($self, $lei) = @_; + my $dedupe = $lei->{dedupe}; + $dedupe->prepare_dedupe if $dedupe; + my $lvt = $lei->{lvt}; + my $ovv = $lei->{ovv}; + $lei->{1} // die "no stdout ($ovv->{dst})"; # redirected earlier + $lei->{1}->autoflush(1); + binmode $lei->{1}, ':utf8'; + my $lse = $lei->{lse}; # may be undef + sub { # for git_to_mail + my ($bref, $smsg, $eml) = @_; + $lse->xsmsg_vmd($smsg) if $lse; + $eml //= PublicInbox::Eml->new($bref); + return if $dedupe && $dedupe->is_dup($eml, $smsg); + my $lk = $ovv->lock_for_scope; + $lei->out(${$lvt->eml_to_text($smsg, $eml)}, "\n"); + } +} + +sub _v2_write_cb ($$) { + my ($self, $lei) = @_; + my $dedupe = $lei->{dedupe}; + $dedupe->prepare_dedupe if $dedupe; + # only call in worker + $PublicInbox::Import::DROP_UNIQUE_UNSUB = $lei->{-drop_unique_unsub}; + sub { # for git_to_mail + my ($bref, $smsg, $eml) = @_; + $eml //= PublicInbox::Eml->new($bref); + ++$self->{-nr_seen}; + return if $dedupe && $dedupe->is_dup($eml, $smsg); + $lei->{v2w}->add($eml) and ++$self->{-nr_write}; + } +} + +sub write_cb { # returns a callback for git_to_mail + my ($self, $lei) = @_; + # _mbox_write_cb, _maildir_write_cb, _imap_write_cb, _v2_write_cb + my $m = "_$self->{base_type}_write_cb"; + $self->$m($lei); +} + +sub new { + my ($cls, $lei) = @_; + my $fmt = $lei->{ovv}->{fmt}; + my $dst = $lei->{ovv}->{dst}; + my $self = bless {}, $cls; + my @conflict; + if ($fmt eq 'maildir') { + require PublicInbox::MdirReader; + $self->{base_type} = 'maildir'; + -e $dst && !-d _ and die + "$dst exists and is not a directory\n"; + $lei->{ovv}->{dst} = $dst .= '/' if substr($dst, -1) ne '/'; + $lei->{opt}->{save} //= \1 if $lei->{cmd} eq 'q'; + } elsif ($fmt eq 'mh') { + -e $dst && !-d _ and die + "$dst exists and is not a directory\n"; + $lei->{ovv}->{dst} = $dst .= '/' if substr($dst, -1) ne '/'; + $lei->{opt}->{save} //= \1 if $lei->{cmd} eq 'q'; + } elsif (substr($fmt, 0, 4) eq 'mbox') { + require PublicInbox::MboxReader; + $self->can("eml2$fmt") or die <<EOM; +E: bad mbox format: $fmt (did you mean: mboxrd, mboxo, mboxcl, or mboxcl2?) +EOM + $self->{base_type} = 'mbox'; + if ($lei->{cmd} eq 'q' && + (($lei->path_to_fd($dst) // -1) < 0) && + (-f $dst || !-e _)) { + $lei->{opt}->{save} //= \1; + } + } elsif ($fmt =~ /\Aimaps?\z/) { + require PublicInbox::NetWriter; + require PublicInbox::URIimap; + # {net} may exist from "lei up" for auth + my $net = $lei->{net} // PublicInbox::NetWriter->new; + $net->{quiet} = $lei->{opt}->{quiet}; + my $uri = PublicInbox::URIimap->new($dst)->canonical; + $net->add_url($$uri); + my $err = $net->errors($lei); + return $lei->fail($err) if $err; + $uri->mailbox or return $lei->fail("No mailbox: $dst"); + $self->{uri} = $uri; + $dst = $lei->{ovv}->{dst} = $$uri; # canonicalized + $lei->{net} = $net; + $self->{base_type} = 'imap'; + $lei->{opt}->{save} //= \1 if $lei->{cmd} eq 'q'; + } elsif ($fmt eq 'text' || $fmt eq 'reply') { + require PublicInbox::LeiViewText; + $lei->{lvt} = PublicInbox::LeiViewText->new($lei, $fmt); + $self->{base_type} = 'text'; + $self->{-wq_nr_workers} = 1; # for pager + @conflict = qw(mua save); + } elsif ($fmt eq 'v2') { + die "--dedupe=oid and v2 are incompatible\n" if + ($lei->{opt}->{dedupe}//'') eq 'oid'; + $self->{base_type} = 'v2'; + $self->{-wq_nr_workers} = 1; # v2 has shards + $lei->{opt}->{save} //= \1 if $lei->{cmd} eq 'q'; + $dst = $lei->{ovv}->{dst} = $lei->abs_path($dst); + @conflict = qw(mua sort); + } else { + die "bad mail --format=$fmt\n"; + } + if ($self->{base_type} =~ /\A(?:text|mbox)\z/) { + (-d $dst || (-e _ && !-w _)) and die + "$dst exists and is not a writable file\n"; + } + $lei->{input_opt} and # lei_convert sets this + @conflict = grep { !$lei->{input_opt}->{$_} } @conflict; + my @err = map { defined($lei->{opt}->{$_}) ? "--$_" : () } @conflict; + die "@err incompatible with $fmt\n" if @err; + $self->{dst} = $dst; + $lei->{dedupe} = $lei->{lss} // do { + my $dd_cls = 'PublicInbox::'. + ($lei->{opt}->{save} ? 'LeiSavedSearch' : 'LeiDedupe'); + eval "require $dd_cls"; + die "$dd_cls: $@" if $@; + my $dd = $dd_cls->new($lei); + $lei->{lss} //= $dd if $dd && $dd->can('cfg_set'); + $dd; + }; + $self; +} + +sub _pre_augment_maildir { + my ($self, $lei) = @_; + my $dst = $lei->{ovv}->{dst}; + require File::Path; + File::Path::make_path(map { $dst.$_ } qw(tmp new cur)); + # for utime, so no opendir + open $self->{poke_dh}, '<', "${dst}cur"; +} + +sub clobber_dst_prepare ($;$) { + my ($lei, $f) = @_; + if (my $lms = defined($f) ? $lei->lms : undef) { + $lms->lms_write_prepare; + $lms->forget_folders($f); + } + my $dedupe = $lei->{dedupe} or return; + $dedupe->reset_dedupe if $dedupe->can('reset_dedupe'); +} + +sub _do_augment_maildir { + my ($self, $lei) = @_; + return if $lei->{cmd} eq 'up'; + my $dst = $lei->{ovv}->{dst}; + my $lse = $lei->{opt}->{'import-before'} ? $lei->{lse} : undef; + my $mdr = PublicInbox::MdirReader->new; + if ($lei->{opt}->{augment}) { + my $dedupe = $lei->{dedupe}; + if ($dedupe && $dedupe->prepare_dedupe) { + $mdr->{shard_info} = $self->{shard_info}; + $mdr->maildir_each_eml($dst, \&_md_update, $lei, $lse); + $dedupe->pause_dedupe; + } + } elsif ($lse) { + clobber_dst_prepare($lei, "maildir:$dst"); + $mdr->{shard_info} = $self->{shard_info}; + $mdr->maildir_each_eml($dst, \&_md_update, $lei, $lse, 1); + } else {# clobber existing Maildir + clobber_dst_prepare($lei, "maildir:$dst"); + $mdr->maildir_each_file($dst, \&_unlink); + } +} + +sub _imap_augment_or_delete { # PublicInbox::NetReader::imap_each cb + my ($uri, $uid, $kw, $eml, $lei, $lse, $delete_mic) = @_; + update_kw_maybe($lei, $lse, $eml, $kw); + if ($delete_mic) { + $lei->{net}->imap_delete_1($uri, $uid, $delete_mic); + } else { + _augment($eml, $lei); + } +} + +sub _do_augment_imap { + my ($self, $lei) = @_; + return if $lei->{cmd} eq 'up'; + my $net = $lei->{net}; + my $lse = $lei->{opt}->{'import-before'} ? $lei->{lse} : undef; + if ($lei->{opt}->{augment}) { + my $dedupe = $lei->{dedupe}; + if ($dedupe && $dedupe->prepare_dedupe) { + $net->imap_each($self->{uri}, \&_imap_augment_or_delete, + $lei, $lse); + $dedupe->pause_dedupe; + } + } elsif ($lse) { + my $delete_mic; + clobber_dst_prepare($lei, "$self->{uri}"); + $net->imap_each($self->{uri}, \&_imap_augment_or_delete, + $lei, $lse, \$delete_mic); + $delete_mic->expunge if $delete_mic; + } elsif (!$self->{-wq_worker_nr}) { # undef or 0 + # clobber existing IMAP folder + clobber_dst_prepare($lei, "$self->{uri}"); + $net->imap_delete_all($self->{uri}); + } +} + +sub _pre_augment_text { + my ($self, $lei) = @_; + my $dst = $lei->{ovv}->{dst}; + my $out; + my $devfd = $lei->path_to_fd($dst) // die "bad $dst"; + if ($devfd >= 0) { + $out = $lei->{$devfd}; + } else { # normal-looking path + if (-p $dst) { + open $out, '>', $dst; + } elsif (-f _ || !-e _) { + # text allows augment, HTML/Atom won't + my $mode = $lei->{opt}->{augment} ? '>>' : '>'; + open $out, $mode, $dst; + } else { + die "$dst is not a file or FIFO\n"; + } + } + $lei->{ovv}->ovv_out_lk_init if !$lei->{ovv}->{lock_path}; + $lei->{1} = $out; + undef; +} + +sub _pre_augment_mbox { + my ($self, $lei) = @_; + my $dst = $lei->{ovv}->{dst}; + my $out; + my $devfd = $lei->path_to_fd($dst) // die "bad $dst"; + if ($devfd >= 0) { + $out = $lei->{$devfd}; + } else { # normal-looking path + if (-p $dst) { + open $out, '>', $dst; + } elsif (-f _ || !-e _) { + require PublicInbox::MboxLock; + my $m = $lei->{opt}->{'lock'} // + PublicInbox::MboxLock->defaults; + $self->{mbl} = PublicInbox::MboxLock->acq($dst, 1, $m); + $out = $self->{mbl}->{fh}; + } else { + die "$dst is not a file or FIFO\n"; + } + $lei->{old_1} = $lei->{1}; # keep for spawning MUA + } + # Perl does SEEK_END even with O_APPEND :< + $self->{seekable} = $out->seek(0, SEEK_SET); + if (!$self->{seekable} && !$!{ESPIPE} && !defined($devfd)) { + die "seek($dst): $!\n"; + } + if (!$self->{seekable}) { + my $imp_before = $lei->{opt}->{'import-before'}; + die "--import-before specified but $dst is not seekable\n" + if $imp_before && !ref($imp_before); + die "--augment specified but $dst is not seekable\n" if + $lei->{opt}->{augment}; + die "cannot --save with unseekable $dst\n" if + $lei->{dedupe} && $lei->{dedupe}->can('reset_dedupe'); + } + if ($self->{zsfx} = PublicInbox::MboxReader::zsfx($dst)) { + pipe(my $r, my $w); + $lei->{zpipe} = [ $r, $w ]; + $lei->{ovv}->{lock_path} and + die 'BUG: unexpected {ovv}->{lock_path}'; + $lei->{ovv}->ovv_out_lk_init; + } elsif (!$self->{seekable} && !$lei->{ovv}->{lock_path}) { + $lei->{ovv}->ovv_out_lk_init; + } + $lei->{1} = $out; + undef; +} + +sub finish_output { + my ($self, $lei) = @_; + my $out = delete $lei->{1} // die 'BUG: no lei->{1}'; + my $old = delete $lei->{old_1} or return; # path only + $lei->{1} = $old; + return if $out->close; # reaps gzip|pigz|xz|bzip2 + my $msg = "E: Error closing $lei->{ovv}->{dst}"; + $? ? $lei->child_error($?) : ($msg .= " ($!)"); + die $msg; +} + +sub _do_augment_mbox { + my ($self, $lei) = @_; + return unless $self->{seekable}; + my $opt = $lei->{opt}; + return if $lei->{cmd} eq 'up'; + my $out = $lei->{1}; + my ($fmt, $dst) = @{$lei->{ovv}}{qw(fmt dst)}; + return clobber_dst_prepare($lei) unless -s $out; + unless ($opt->{augment} || $opt->{'import-before'}) { + truncate($out, 0) or die "truncate($dst): $!"; + return; + } + my $rd; + if (my $zsfx = $self->{zsfx}) { + $rd = PublicInbox::MboxReader::zsfxcat($out, $zsfx, $lei); + } else { + open($rd, '+>>&', $out); + } + my $dedupe; + if ($opt->{augment}) { + $dedupe = $lei->{dedupe}; + $dedupe->prepare_dedupe if $dedupe; + } else { + clobber_dst_prepare($lei); + } + if ($opt->{'import-before'}) { # the default + my $lse = $lei->{lse}; + PublicInbox::MboxReader->$fmt($rd, \&_mbox_augment_kw_maybe, + $lei, $lse, $opt->{augment}); + if (!$opt->{augment} and !truncate($out, 0)) { + die "truncate($dst): $!"; + } + } else { # --augment --no-import-before + PublicInbox::MboxReader->$fmt($rd, \&_augment, $lei); + } + # maybe some systems don't honor O_APPEND, Perl does this: + seek($out, 0, SEEK_END); + $dedupe->pause_dedupe if $dedupe; +} + +sub _pre_augment_v2 { + my ($self, $lei) = @_; + my $dir = $self->{dst}; + require PublicInbox::InboxWritable; + my ($ibx, @creat); + if (-d $dir) { + my $opt = { -min_inbox_version => 2 }; + require PublicInbox::Admin; + my @ibx = PublicInbox::Admin::resolve_inboxes([ $dir ], $opt); + $ibx = $ibx[0] or die "$dir is not a v2 inbox\n"; + } else { + $creat[0] = {}; + $ibx = PublicInbox::Inbox->new({ + name => 'lei-result', # XXX configurable + inboxdir => $dir, + version => 2, + address => [ 'lei@example.com' ], + }); + } + PublicInbox::InboxWritable->new($ibx, @creat); + local $PublicInbox::Import::DROP_UNIQUE_UNSUB; # only for workers + PublicInbox::Import::load_config(PublicInbox::Config->new, sub { + $lei->x_it(shift); + die "E: can't write v2 inbox with broken config\n"; + }); + $lei->{-drop_unique_unsub} = $PublicInbox::Import::DROP_UNIQUE_UNSUB; + $ibx->init_inbox if @creat; + $lei->{v2w} = $ibx->importer; + return if !$lei->{opt}->{shared}; + my $d = "$lei->{ale}->{git}->{git_dir}/objects"; + open my $fh, '+>>', my $f = "$dir/git/0.git/objects/info/alternates"; + seek($fh, 0, SEEK_SET); # Perl did SEEK_END when it saw '>>' + my $seen = grep /\A\Q$d\E\n/, PublicInbox::IO::read_all $fh; + print $fh "$d\n" if !$seen; + close $fh; +} + +sub pre_augment { # fast (1 disk seek), runs in same process as post_augment + my ($self, $lei) = @_; + # _pre_augment_maildir, _pre_augment_mbox, _pre_augment_v2 + my $m = $self->can("_pre_augment_$self->{base_type}") or return; + $m->($self, $lei); +} + +sub do_augment { # slow, runs in wq worker + my ($self, $lei) = @_; + # _do_augment_maildir, _do_augment_mbox, or _do_augment_imap + my $m = $self->can("_do_augment_$self->{base_type}") or return; + $m->($self, $lei); +} + +sub post_augment_call ($$$$) { + my ($self, $lei, $m, $post_augment_done) = @_; + eval { $m->($self, $lei) }; + $lei->{post_augment_err} = $@ if $@; # for post_augment_done +} + +# fast (spawn compressor or mkdir), runs in same process as pre_augment +sub post_augment { + my ($self, $lei, $post_augment_done) = @_; + $self->{-au_noted}++ and $lei->qerr("# writing to $self->{dst} ..."); + + # _post_augment_mbox + my $m = $self->can("_post_augment_$self->{base_type}") or return; + + # --import-before is only for lei-(q|lcat), not lei-convert + $lei->{opt}->{'import-before'} or + return post_augment_call $self, $lei, $m, $post_augment_done; + + # we can't deal with post_augment until import-before commits: + require PublicInbox::EOFpipe; + my @io = @$lei{qw(2 sock)}; + pipe(my $r, $io[2]); + PublicInbox::EOFpipe->new($r, \&post_augment_call, + $self, $lei, $m, $post_augment_done); + $lei->{sto}->wq_io_do('barrier', \@io); + # _post_augment_* && post_augment_done run when barrier is complete +} + +# called by every single l2m worker process +sub do_post_auth { + my ($self) = @_; + my $lei = $self->{lei}; + # lei_xsearch can start as soon as all l2m workers get here + $lei->{pkt_op_p}->pkt_do('incr_start_query') or + die "incr_start_query: $!"; + my $aug; + if (lock_free($self)) { # all workers do_augment + my $mod = $self->{-wq_nr_workers}; + my $shard = $self->{-wq_worker_nr}; + if (my $net = $lei->{net}) { + $net->{shard_info} = [ $mod, $shard ]; + } else { # Maildir + $self->{shard_info} = [ $mod, $shard ]; + } + $aug = 'incr_post_augment'; + } elsif ($self->{-wq_worker_nr} == 0) { # 1st worker do_augment + $aug = 'do_post_augment'; + } + if ($aug) { + local $0 = 'do_augment'; + eval { do_augment($self, $lei) }; + $lei->fail($@) if $@; + $lei->{pkt_op_p}->pkt_do($aug) or die "pkt_do($aug): $!"; + } + # done augmenting, connect the compressor pipe for each worker + if (my $zpipe = delete $lei->{zpipe}) { + $lei->{1} = $zpipe->[1]; + close $zpipe->[0]; + } + my $au_peers = delete $self->{au_peers}; + if ($au_peers) { # wait for peer l2m to finish augmenting: + $au_peers->[1] = undef; + sysread($au_peers->[0], my $barrier1, 1); + } + eval { $self->{wcb} = $self->write_cb($lei) }; + $lei->fail($@) if $@; + if ($au_peers) { # wait for peer l2m to set write_cb + $au_peers->[3] = undef; + sysread($au_peers->[2], my $barrier2, 1); + } +} + +sub ipc_atfork_child { + my ($self) = @_; + my $lei = $self->{lei}; + $lei->_lei_atfork_child; + $lei->{auth}->do_auth_atfork($self) if $lei->{auth}; + $SIG{__WARN__} = PublicInbox::Eml::warn_ignore_cb(); + $self->{git} = $self->{lei}->{ale}->git; + $SIG{TERM} = sub { # avoid ->DESTROY ordering problems + my $git = delete $self->{git}; + $git->async_wait_all if $git; + exit(15 + 128); + }; + $self->SUPER::ipc_atfork_child; +} + +sub lock_free { + $_[0]->{base_type} =~ /\A(?:maildir|imap|jmap)\z/ ? 1 : 0; +} + +# wakes up the MUA when complete so it can refresh messages list +sub poke_dst { + my ($self) = @_; + if ($self->{base_type} eq 'maildir') { + my $t = time + 1; + utime($t, $t, $self->{poke_dh}) or warn "futimes: $!"; + } +} + +sub write_mail { # via ->wq_io_do + my ($self, $smsg, $eml) = @_; + if ($eml) { + eval { $self->{wcb}->(undef, $smsg, $eml) }; + $self->{lei}->fail("blob=$smsg->{blob} $@") if $@; + } else { + $smsg->{l2m} = $self; + $self->{git}->cat_async($smsg->{blob}, \&git_to_mail, $smsg); + } +} + +sub wq_atexit_child { + my ($self) = @_; + local $PublicInbox::DS::in_loop = 0; # waitpid synchronously + my $lei = $self->{lei}; + $lei->{ale}->git->async_wait_all; + my ($nr_w, $nr_s) = delete(@$self{qw(-nr_write -nr_seen)}); + if (my $v2w = delete $lei->{v2w}) { + eval { $v2w->done }; + $lei->child_error($?, "E: $@ ($v2w->{ibx}->{inboxdir})") if $@; + } + delete $self->{wcb}; + (($nr_w //= 0) + ($nr_s //= 0)) or return; + return if $lei->{early_mua} || !$lei->{-progress} || !$lei->{pkt_op_p}; + $lei->{pkt_op_p}->pkt_do('incr', -nr_write => $nr_w, -nr_seen => $nr_s) +} + +# runs on a 1s timer in lei-daemon +sub augment_inprogress { + my ($err, $opt, $dst, $au_noted) = @_; + eval { + return if $$au_noted++ || !$err || !defined(fileno($err)); + print $err '# '.($opt->{'import-before'} ? + "importing non-external contents of $dst" : ( + ($opt->{dedupe} // 'content') ne 'none') ? + "scanning old contents of $dst for dedupe" : + "removing old contents of $dst")." ...\n"; + }; + warn "E: $@ ($dst)" if $@; +} + +# called in top-level lei-daemon when LeiAuth is done +sub net_merge_all_done { + my ($self, $lei) = @_; + if ($PublicInbox::DS::in_loop && + $self->can("_do_augment_$self->{base_type}") && + !$lei->{opt}->{quiet}) { + $self->{-au_noted} = 0; + PublicInbox::DS::add_timer(1, \&augment_inprogress, + $lei->{2}, $lei->{opt}, + $self->{dst}, \$self->{-au_noted}); + } + $self->wq_broadcast('do_post_auth'); + $self->wq_close; +} + +1; diff --git a/lib/PublicInbox/LeiUp.pm b/lib/PublicInbox/LeiUp.pm new file mode 100644 index 00000000..9931f017 --- /dev/null +++ b/lib/PublicInbox/LeiUp.pm @@ -0,0 +1,222 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "lei up" - updates the result of "lei q --save" +package PublicInbox::LeiUp; +use v5.12; +# n.b. we use LeiInput to setup IMAP auth +use parent qw(PublicInbox::IPC PublicInbox::LeiInput); +use PublicInbox::LeiSavedSearch; # OverIdx +use PublicInbox::DS; +use PublicInbox::PktOp; +use PublicInbox::LeiFinmsg; +use PublicInbox::LEI; +my $REMOTE_RE = qr!\A(?:imap|http)s?://!i; # http(s) will be for JMAP + +sub up1 ($$) { + my ($lei, $out) = @_; + # precedence note for CLI switches between lei q and up: + # `lei q --only' > `lei q --no-(remote|local|external)' + # `lei up --no-(remote|local|external)' > `lei.q.only' in saved search + my %no = map { + my $v = $lei->{opt}->{$_}; # set by CLI + (defined($v) && !$v) ? ($_ => 1) : (); + } qw(remote local external); + my $cli_exclude = delete $lei->{opt}->{exclude}; + my $lss = PublicInbox::LeiSavedSearch->up($lei, $out) or return; + my $f = $lss->{'-f'}; + my $mset_opt = $lei->{mset_opt} = { relevance => -2 }; + my $q = $lss->{-cfg}->get_all('lei.q') // + die("lei.q unset in $f (out=$out)\n"); + my $lse = $lei->{lse} // die 'BUG: {lse} missing'; + my $rawstr = $lss->{-cfg}->{'lei.internal.rawstr'} // + (scalar(@$q) == 1 && substr($q->[0], -1) eq "\n"); + if ($rawstr) { + die <<EOM if scalar(@$q) > 1; +$f: lei.q has multiple values (@$q) (out=$out) +$f: while lei.internal.rawstr is set +EOM + $lse->query_approxidate($lse->git, $mset_opt->{qstr} = $q->[0]); + } else { + $mset_opt->{qstr} = $lse->query_argv_to_string($lse->git, $q); + } + # n.b. only a few CLI args are accepted for "up", so //= usually sets + for my $k ($lss->ARRAY_FIELDS) { + my $v = $lss->{-cfg}->get_all("lei.q.$k") // next; + $lei->{opt}->{$k} //= $v; + } + + # --no-(local|remote) CLI flags overrided saved `lei.q.only' + my $only = $lei->{opt}->{only}; + @$only = map { $lei->get_externals($_) } @$only if $only; + if (scalar keys %no && $only) { + @$only = grep(!m!\Ahttps?://!i, @$only) if $no{remote}; + @$only = grep(m!\Ahttps?://!i, @$only) if $no{'local'}; + } + if ($cli_exclude) { + my $ex = $lei->canonicalize_excludes($cli_exclude); + @$only = grep { !$ex->{$_} } @$only if $only; + push @{$lei->{opt}->{exclude}}, @$cli_exclude; + } + delete $lei->{opt}->{only} if $no{external} || ($only && !@$only); + for my $k ($lss->BOOL_FIELDS, $lss->SINGLE_FIELDS) { + my $v = $lss->{-cfg}->get_1("lei.q.$k") // next; + $lei->{opt}->{$k} //= $v; + } + my $o = $lei->{opt}->{output} // ''; + return die("lei.q.output unset in $f (out=$out)\n") if $o eq ''; + $lss->translate_dedupe($lei) or return; + $lei->{lss} = $lss; # for LeiOverview->new and query_remote_mboxrd + my $lxs = $lei->lxs_prepare or return; + $lei->ale->refresh_externals($lxs, $lei); + $lei->_start_query; +} + +sub redispatch_all ($$) { + my ($self, $lei) = @_; + my $upq = [ (@{$self->{o_local} // []}, @{$self->{o_remote} // []}) ]; + return up1($lei, $upq->[0]) if @$upq == 1; # just one, may start MUA + + PublicInbox::OverIdx::fork_ok($lei->{opt}); + # FIXME: this is also used per-query, see lei->_start_query + my $j = $lei->{opt}->{jobs} || do { + my $n = $self->detect_nproc // 1; + $n > 4 ? 4 : $n; + }; + $j = ($j =~ /\A([0-9]+)/) ? $1 + 0 : 1; # may be --jobs=$x,$m on CLI + # re-dispatch into our event loop w/o creating an extra fork-level + # $upq will be drained via DESTROY as each query finishes + $lei->{fmsg} = PublicInbox::LeiFinmsg->new($lei); + my ($op_c, $op_p) = PublicInbox::PktOp->pair; + # call lei->dclose when upq is done processing: + $op_c->{ops} = { '' => [ $lei->can('dclose'), $lei ] }; + my @first_batch = splice(@$upq, 0, $j); # initial parallelism + $lei->{-upq} = $upq; + $lei->event_step_init; # wait for client disconnects + for my $out (@first_batch) { + PublicInbox::DS::requeue( + PublicInbox::LeiUp1::nxt($lei, $out, $op_p)); + } +} + +sub filter_lss { + my ($self, $lei, $all) = @_; + my @outs = PublicInbox::LeiSavedSearch::list($lei); + if ($all eq 'local') { + $self->{o_local} = [ grep(!/$REMOTE_RE/, @outs) ]; + } elsif ($all eq 'remote') { + $self->{o_remote} = [ grep(/$REMOTE_RE/, @outs) ]; + } elsif ($all eq '') { + $self->{o_remote} = [ grep(/$REMOTE_RE/, @outs) ]; + $self->{o_local} = [ grep(!/$REMOTE_RE/, @outs) ]; + } else { + undef; + } +} + +sub lei_up { + my ($lei, @outs) = @_; + my $opt = $lei->{opt}; + my $self = bless { -mail_sync => 1 }, __PACKAGE__; + if (defined(my $all = $opt->{all})) { + return $lei->fail("--all and @outs incompatible") if @outs; + defined($opt->{mua}) and return + $lei->fail('--all and --mua= are incompatible'); + filter_lss($self, $lei, $all) // return + $lei->fail("only --all=$all not understood"); + } elsif ($lei->{lse}) { # redispatched + scalar(@outs) == 1 or die "BUG: lse set w/ >1 out[@outs]"; + return up1($lei, $outs[0]); + } else { + $self->{o_remote} = [ grep(/$REMOTE_RE/, @outs) ]; + $self->{o_local} = [ grep(!/$REMOTE_RE/, @outs) ]; + } + $lei->{lse} = $lei->_lei_store(1)->write_prepare($lei)->search; + ((@{$self->{o_local} // []} + @{$self->{o_remote} // []}) > 1 && + defined($opt->{mua})) and return $lei->fail(<<EOM); +multiple outputs and --mua= are incompatible +EOM + if ($self->{o_remote}) { # setup lei->{auth} + $self->prepare_inputs($lei, $self->{o_remote}) or return; + } + if ($lei->{auth}) { # start auth worker + require PublicInbox::NetWriter; + bless $lei->{net}, 'PublicInbox::NetWriter'; + $lei->wq1_start($self); + # net_merge_all_done will fire when auth is done + } else { + redispatch_all($self, $lei); # see below + } +} + +# called in top-level lei-daemon when LeiAuth is done +sub net_merge_all_done { + my ($self, $lei) = @_; + $lei->{net} = delete($self->{-net_new}) if $self->{-net_new}; + $self->wq_close; + eval { redispatch_all($self, $lei) }; + $lei->child_error(0, "E: $@") if $@; +} + +sub _complete_up { # lei__complete hook + my ($lei, @argv) = @_; + my $match_cb = $lei->complete_url_prepare(\@argv); + map { $match_cb->($_) } PublicInbox::LeiSavedSearch::list($lei); +} + +sub _wq_done_wait { # awaitpid cb + my ($pid, $wq, $lei) = @_; + $lei->child_error($?, 'auth failure') if $? +} + +no warnings 'once'; +*ipc_atfork_child = \&PublicInbox::LeiInput::input_only_atfork_child; + +package PublicInbox::LeiUp1; # for redispatch_all +use v5.12; + +sub nxt ($$$) { + my ($lei, $out, $op_p) = @_; + bless { lei => $lei, out => $out, op_p => $op_p }, __PACKAGE__; +} + +sub event_step { # runs via PublicInbox::DS::requeue + my ($self) = @_; + my $lei = $self->{lei}; # the original, from lei_up + my $l = bless { %$lei }, ref($lei); # per-output copy + delete($l->{sock}) or return; # client disconnected if {sock} is gone + $l->{opt} = { %{$l->{opt}} }; # deep copy + delete $l->{opt}->{all}; + $l->qerr("# updating $self->{out}"); + my $o = " (output: $self->{out})"; # add to all warnings + my $cb = $SIG{__WARN__} // \&CORE::warn; + local $SIG{__WARN__} = sub { + my @m = @_; + push(@m, $o) if !@m || $m[-1] !~ s/\n\z/$o\n/; + $cb->(@m); + }; + $l->{-up1} = $self; # for LeiUp1->DESTROY + delete @$l{qw(-socks -event_init_done)}; + my ($op_c, $op_p) = PublicInbox::PktOp->pair; + $self->{unref_on_destroy} = $op_c->{sock}; # to cleanup $lei->{-socks} + $lei->pkt_ops($op_c->{ops} //= {}); # errors from $l -> script/lei + push @{$lei->{-socks}}, $op_c->{sock}; # script/lei signals to $l + $l->{sock} = $op_p->{op_p}; # receive signals from op_c->{sock} + $op_c = $op_p = undef; + + eval { $l->dispatch('up', $self->{out}) }; + $lei->child_error(0, $@) if $@ || $l->{failed}; # lei->fail() +} + +sub DESTROY { + my ($self) = @_; + return if ($PublicInbox::LEI::daemon_pid // -1) != $$; + my $lei = $self->{lei}; # the original, from lei_up + my $sock = delete $self->{unref_on_destroy}; + my $s = $lei->{-socks} // []; + @$s = grep { $_ != $sock } @$s; + my $out = shift(@{$lei->{-upq}}) or return; + PublicInbox::DS::requeue(nxt($lei, $out, $self->{op_p})); +} + +1; diff --git a/lib/PublicInbox/LeiViewText.pm b/lib/PublicInbox/LeiViewText.pm new file mode 100644 index 00000000..c7d72c71 --- /dev/null +++ b/lib/PublicInbox/LeiViewText.pm @@ -0,0 +1,308 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# PublicInbox::Eml to (optionally colorized) text coverter for terminals +# the non-HTML counterpart to PublicInbox::View +package PublicInbox::LeiViewText; +use strict; +use v5.10.1; +use PublicInbox::MsgIter qw(msg_part_text); +use PublicInbox::MID qw(references); +use PublicInbox::View; +use PublicInbox::Hval; +use PublicInbox::ViewDiff; +use PublicInbox::Spawn qw(popen_rd); +use Term::ANSIColor; +use POSIX (); +use PublicInbox::Address; + +sub _xs { + # xhtml_map works since we don't search for HTML ([&<>'"]) + $_[0] =~ s/([\x7f\x00-\x1f])/$PublicInbox::Hval::xhtml_map{$1}/sge; +} + +my %DEFAULT_COLOR = ( + # mutt names, loaded from ~/.config/lei/config + quoted => 'blue', + hdrdefault => 'cyan', + status => 'bright_cyan', # smsg stuff + attachment => 'bright_red', + + # git names and defaults, falls back to ~/.gitconfig + new => 'green', + old => 'red', + meta => 'bold', + frag => 'cyan', + func => undef, + context => undef, +); + +my $COLOR = qr/(?:bright)? + (?:normal|black|red|green|yellow|blue|magenta|cyan|white)/x; + +sub my_colored { + my ($self, $slot, $buf) = @_; + my $val = $self->{"color.$slot"} //= + $self->{-leicfg}->{"color.$slot"} // + $self->{-gitcfg}->{"color.diff.$slot"} // + $self->{-gitcfg}->{"diff.color.$slot"} // + $DEFAULT_COLOR{$slot}; + $val = $val->[-1] if ref($val) eq 'ARRAY'; + if (defined $val) { + $val = lc $val; + # git doesn't use "_", Term::ANSIColor does + $val =~ s/\Abright([^_])/bright_$1/ig; + + # git: "green black" => T::A: "green on_black" + $val =~ s/($COLOR)(.+?)($COLOR)/$1$2on_$3/; + + # FIXME: convert git #XXXXXX to T::A-compatible colors + # for 256-color terminals + + ${$self->{obuf}} .= colored($buf, $val); + } else { + ${$self->{obuf}} .= $buf; + } +} + +sub uncolored { ${$_[0]->{obuf}} .= $_[2] } + +sub new { + my ($cls, $lei, $fmt) = @_; + my $self = bless { %{$lei->{opt}}, -colored => \&uncolored }, $cls; + $self->{-quote_reply} = 1 if $fmt eq 'reply'; + return $self unless $self->{color} //= -t $lei->{1}; + my @cmd = qw(git config -z --includes -l); # reuse normal git config + my $r = popen_rd(\@cmd, undef, { 2 => $lei->{2} }); + my $cfg = PublicInbox::Config::config_fh_parse($r, "\0", "\n"); + if (!$r->close) { + warn "# @cmd failed, no color (non-fatal \$?=$?)\n"; + return $self; + } + $self->{-colored} = \&my_colored; + $self->{-gitcfg} = $cfg; + $self->{-leicfg} = $lei->{cfg}; + $self; +} + +sub quote_hdr_buf ($$) { + my ($self, $eml) = @_; + my $hbuf = ''; + my $to = $eml->header_raw('Reply-To') // + $eml->header_raw('From') // + $eml->header_raw('Sender'); + my $cc = ''; + for my $f (qw(To Cc)) { + for my $v ($eml->header_raw($f)) { + next if $v !~ /\S/; + $cc .= ", $v"; + $to //= $v; + } + } + substr($cc, 0, 2, ''); # s/^, //; + PublicInbox::View::fold_addresses($to); + PublicInbox::View::fold_addresses($cc); + _xs($to); + _xs($cc); + $hbuf .= "To: $to\n" if defined $to && $to =~ /\S/; + $hbuf .= "Cc: $cc\n" if $cc =~ /\S/; + my $s = $eml->header_str('Subject') // 'your mail'; + _xs($s); + substr($s, 0, 0, 'Re: ') if $s !~ /\bRe:/i; + $hbuf .= "Subject: $s\n"; + if (defined(my $irt = $eml->header_raw('Message-ID'))) { + _xs($irt); + $hbuf .= "In-Reply-To: $irt\n"; + } + $self->{-colored}->($self, 'hdrdefault', $hbuf); + my ($n) = PublicInbox::Address::names($eml->header_str('From') // + $eml->header_str('Sender') // + $eml->header_str('Reply-To') // + 'unknown sender'); + my $d = $eml->header_raw('Date') // 'some unknown date'; + _xs($d); + _xs($n); + ${delete $self->{obuf}} . "\nOn $d, $n wrote:\n"; +} + +sub hdr_buf ($$) { + my ($self, $eml) = @_; + my $hbuf = ''; + for my $f (qw(From To Cc)) { + for my $v ($eml->header($f)) { + next if $v !~ /\S/; + PublicInbox::View::fold_addresses($v); + _xs($v); + $hbuf .= "$f: $v\n"; + } + } + for my $f (qw(Subject Date Newsgroups Message-ID X-Message-ID)) { + for my $v ($eml->header($f)) { + _xs($v); + $hbuf .= "$f: $v\n"; + } + } + if (my @irt = $eml->header_raw('In-Reply-To')) { + for my $v (@irt) { + _xs($v); + $hbuf .= "In-Reply-To: $v\n"; + } + } else { + my $refs = references($eml); + if (defined(my $irt = pop @$refs)) { + _xs($irt); + $hbuf .= "In-Reply-To: <$irt>\n"; + } + if (@$refs) { + my $max = $self->{-max_cols}; + $hbuf .= 'References: ' . + join("\n\t", map { '<'._xs($_).'>' } @$refs) . + ">\n"; + } + } + $self->{-colored}->($self, 'hdrdefault', $hbuf .= "\n"); +} + +sub attach_note ($$$$;$) { + my ($self, $ct, $p, $fn, $err) = @_; + my ($part, $depth, $idx) = @$p; + my $nl = $idx eq '1' ? '' : "\n"; # like join("\n", ...) + my $abuf = $err ? <<EOF : ''; +[-- Warning: decoded text below may be mangled, UTF-8 assumed --] +EOF + $abuf .= "[-- Attachment #$idx: "; + _xs($ct); + my $size = length($part->body); + my $ts = "Type: $ct, Size: $size bytes"; + my $d = $part->header('Content-Description') // $fn // ''; + _xs($d); + $abuf .= $d eq '' ? "$ts --]\n" : "$d --]\n[-- $ts --]\n"; + if (my $blob = $self->{-smsg}->{blob}) { + $abuf .= "[-- lei blob $blob:$idx --]\n"; + } + $self->{-colored}->($self, 'attachment', $abuf); + hdr_buf($self, $part) if $part->{is_submsg}; +} + +sub flush_text_diff ($$) { + my ($self, $cur) = @_; + my @top = split($PublicInbox::ViewDiff::EXTRACT_DIFFS, $$cur); + undef $$cur; # free memory + my $dctx; + my $obuf = $self->{obuf}; + my $colored = $self->{-colored}; + while (defined(my $x = shift @top)) { + if (scalar(@top) >= 4 && + $top[1] =~ $PublicInbox::ViewDiff::IS_OID && + $top[0] =~ $PublicInbox::ViewDiff::IS_OID) { + splice(@top, 0, 4); + $dctx = 1; + $colored->($self, 'meta', $x); + } elsif ($dctx) { + # Quiet "Complex regular subexpression recursion limit" + # warning. Perl will truncate matches upon hitting + # that limit, giving us more (and shorter) scalars than + # would be ideal, but otherwise it's harmless. + # + # We could replace the `+' metacharacter with `{1,100}' + # to limit the matches ourselves to 100, but we can + # let Perl do it for us, quietly. + no warnings 'regexp'; + + for my $s (split(/((?:(?:^\+[^\n]*\n)+)| + (?:(?:^-[^\n]*\n)+)| + (?:^@@ [^\n]+\n))/xsm, $x)) { + if (!defined($dctx)) { + ${$self->{obuf}} .= $s; + } elsif ($s =~ s/\A(@@ \S+ \S+ @@\s*)//) { + $colored->($self, 'frag', $1); + $colored->($self, 'func', $s); + } elsif ($s =~ /\A\+/) { + $colored->($self, 'new', $s); + } elsif ($s =~ /\A-- $/sm) { # email sig starts + $dctx = undef; + ${$self->{obuf}} .= $s; + } elsif ($s =~ /\A-/) { + $colored->($self, 'old', $s); + } else { + $colored->($self, 'context', $s); + } + } + } else { + ${$self->{obuf}} .= $x; + } + } +} + +sub add_text_buf { # callback for Eml->each_part + my ($p, $self) = @_; + my ($part, $depth, $idx) = @$p; + my $ct = $part->content_type || 'text/plain'; + my $fn = $part->filename; + my ($s, $err) = msg_part_text($part, $ct); + return attach_note($self, $ct, $p, $fn) unless defined $s; + hdr_buf($self, $part) if $part->{is_submsg}; + $s =~ s/\r+\n/\n/sg; + _xs($s); + my $diff = ($s =~ /^--- [^\n]+\n\+{3} [^\n]+\n@@ /ms); + my @sections = PublicInbox::MsgIter::split_quotes($s); + undef $s; # free memory + if (defined($fn) || ($depth > 0 && !$part->{is_submsg}) || $err) { + # badly-encoded message with $err? tell the world about it! + attach_note($self, $ct, $p, $fn, $err); + ${$self->{obuf}} .= "\n"; + } + my $colored = $self->{-colored}; + for my $cur (@sections) { + if ($cur =~ /\A>/) { + $colored->($self, 'quoted', $cur); + } elsif ($diff) { + flush_text_diff($self, \$cur); + } else { + ${$self->{obuf}} .= $cur; + } + undef $cur; # free memory + } +} + +# returns a stringref suitable for $lei->out or print +sub eml_to_text { + my ($self, $smsg, $eml) = @_; + local $Term::ANSIColor::EACHLINE = "\n"; + $self->{obuf} = \(my $obuf = ''); + $self->{-smsg} = $smsg; + $self->{-max_cols} = ($self->{columns} //= 80) - 8; # for header wrap + my $h = []; + if ($self->{-quote_reply}) { + my $blob = $smsg->{blob} // 'unknown-blob'; + my $pct = $smsg->{pct} // 'unknown'; + my $t = POSIX::asctime(gmtime($smsg->{ts} // $smsg->{ds} // 0)); + $h->[0] = "From $blob\@$pct $t"; + } else { + for my $f (qw(blob pct)) { + push @$h, "$f:$smsg->{$f}" if defined $smsg->{$f}; + } + @$h = ("# @$h\n") if @$h; + for my $f (qw(kw L)) { + my $v = $smsg->{$f} or next; + push @$h, "# $f:".join(',', @$v)."\n" if @$v; + } + } + $h = join('', @$h); + $self->{-colored}->($self, 'status', $h); + my $quote_hdr; + if ($self->{-quote_reply}) { + $quote_hdr = ${delete $self->{obuf}}; + $quote_hdr .= quote_hdr_buf($self, $eml); + } else { + hdr_buf($self, $eml); + } + $eml->each_part(\&add_text_buf, $self, 1); + if (defined $quote_hdr) { + ${$self->{obuf}} =~ s/^/> /sgm; + substr(${$self->{obuf}}, 0, 0, $quote_hdr); + } + delete $self->{obuf}; +} + +1; diff --git a/lib/PublicInbox/LeiWatch.pm b/lib/PublicInbox/LeiWatch.pm new file mode 100644 index 00000000..b30e5152 --- /dev/null +++ b/lib/PublicInbox/LeiWatch.pm @@ -0,0 +1,12 @@ +# Copyright all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# represents a Maildir, MH or IMAP "watch" item +package PublicInbox::LeiWatch; +use v5.12; +use parent qw(PublicInbox::IPC); + +# "url" may be something like "maildir:/path/to/dir" or "mh:/path/to/dir" +sub new { bless { url => $_[1] }, $_[0] } + +1; diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm new file mode 100644 index 00000000..43dedd10 --- /dev/null +++ b/lib/PublicInbox/LeiXSearch.pm @@ -0,0 +1,660 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Combine any combination of PublicInbox::Search, +# PublicInbox::ExtSearch, and PublicInbox::LeiSearch objects +# into one Xapian DB +package PublicInbox::LeiXSearch; +use strict; +use v5.10.1; +use parent qw(PublicInbox::LeiSearch PublicInbox::IPC); +use PublicInbox::DS qw(now); +use File::Temp 0.19 (); # 0.19 for ->newdir +use File::Spec (); +use PublicInbox::Search qw(xap_terms); +use PublicInbox::Spawn qw(popen_rd popen_wr which); +use PublicInbox::MID qw(mids mid_escape); +use PublicInbox::Smsg; +use PublicInbox::Eml; +use PublicInbox::LEI; +use Fcntl qw(SEEK_SET F_SETFL O_APPEND O_RDWR); +use PublicInbox::ContentHash qw(git_sha); +use POSIX qw(strftime); +use autodie qw(close open read seek truncate); +use PublicInbox::Syscall qw($F_SETPIPE_SZ); +use PublicInbox::OnDestroy; + +sub new { + my ($class) = @_; + PublicInbox::Search::load_xapian(); + bless { + qp_flags => $PublicInbox::Search::QP_FLAGS | + PublicInbox::Search::FLAG_PURE_NOT(), + }, $class +} + +sub attach_external { + my ($self, $ibxish) = @_; # ibxish = ExtSearch or Inbox + my $desc = $ibxish->{inboxdir} // $ibxish->{topdir}; + my $srch = $ibxish->search // + return warn("$desc not indexed for Xapian ($@ $!)\n"); + my @shards = $srch->xdb_shards_flat or + return warn("$desc has no Xapian shards\n"); + + if (delete $self->{xdb}) { # XXX: do we need this? + # clobber existing {xdb} if amending + my $expect = delete $self->{nshard}; + my $shards = delete $self->{shards_flat}; + scalar(@$shards) == $expect or die + "BUG: {nshard}$expect != shards=".scalar(@$shards); + + my $prev = {}; + for my $old_ibxish (@{$self->{shard2ibx}}) { + next if $prev == $old_ibxish; + $prev = $old_ibxish; + my @shards = $old_ibxish->search->xdb_shards_flat; + push @{$self->{shards_flat}}, @shards; + } + my $nr = scalar(@{$self->{shards_flat}}); + $nr == $expect or die + "BUG: reloaded $nr shards, expected $expect" + } + push @{$self->{shards_flat}}, @shards; + push(@{$self->{shard2ibx}}, $ibxish) for (@shards); +} + +# returns a list of local inboxes (or count in scalar context) +sub locals { @{$_[0]->{locals} // []} } + +sub remotes { @{$_[0]->{remotes} // []} } + +# called by PublicInbox::Search::xdb (usually via ->mset) +sub xdb_shards_flat { @{$_[0]->{shards_flat} // []} } + +sub _mitem_kw { # retry_reopen callback + my ($srch, $smsg, $mitem, $flagged) = @_; + my $doc = $mitem->get_document; + my $kw = xap_terms('K', $doc); + $kw->{flagged} = 1 if $flagged; + my @L = xap_terms('L', $doc); + # we keep the empty {kw} array here to prevent expensive work in + # ->xsmsg_vmd, _unbless_smsg will clobber it iff it's empty + $smsg->{kw} = [ sort keys %$kw ]; + $smsg->{L} = \@L if scalar(@L); +} + +sub mitem_kw ($$$;$) { + my ($srch, $smsg, $mitem, $flagged) = @_; + $srch->retry_reopen(\&_mitem_kw, $smsg, $mitem, $flagged); +} + +# like over->get_art +sub smsg_for { + my ($self, $mitem) = @_; + # cf. https://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID + my $nshard = $self->{nshard}; + my $docid = $mitem->get_docid; + my $shard = ($docid - 1) % $nshard; + my $num = int(($docid - 1) / $nshard) + 1; + my $ibx = $self->{shard2ibx}->[$shard]; + my $smsg = $ibx->over->get_art($num); + return if $smsg->{bytes} == 0; # external message + if ($ibx->can('msg_keywords')) { + mitem_kw($self, $smsg, $mitem); + } + $smsg; +} + +sub over {} + +sub _check_mset_limit ($$$) { + my ($lei, $desc, $mset) = @_; + return if defined($lei->{opt}->{limit}); # user requested limit + my $est = $mset->get_matches_estimated; + my $tot = $lei->{mset_opt}->{total}; + $est > $tot and $lei->qerr(<<""); +# $desc estimated matches ($est) exceeds default --limit=$tot + +} + +sub _mset_more ($$) { + my ($mset, $mo) = @_; + my $size = $mset->size; + $size >= $mo->{limit} && (($mo->{offset} += $size) < $mo->{total}); +} + +# $startq will see `q' in do_post_augment -> start_mua if spawning MUA. +# Otherwise $startq will EOF when do_augment is done augmenting and allow +# query_combined_mset and query_thread_mset to proceed. +sub wait_startq ($) { + my ($lei) = @_; + read(delete($lei->{startq}) // return, my $buf, 1) or return; # EOF + die "BUG: wrote `$buf' to au_done" if $buf ne 'q'; + $lei->{opt}->{quiet} = 1; + delete $lei->{opt}->{verbose}; + delete $lei->{-progress}; +} + +sub mset_progress { + my $lei = shift; + return if $lei->{early_mua} || !$lei->{-progress}; + if ($lei->{pkt_op_p}) { + $lei->{pkt_op_p}->pkt_do('mset_progress', @_); + } else { # single lei-daemon consumer + my ($desc, $mset_size, $mset_total_est) = @_; + $lei->{-mset_total} += $mset_size if $mset_total_est ne '?'; + $lei->qerr("# $desc $mset_size/$mset_total_est"); + } +} + +sub query_one_mset { # for --threads and l2m w/o sort + my ($self, $ibxish) = @_; + my $lei = $self->{lei}; + my ($srch, $over) = ($ibxish->search, $ibxish->over); + my $dir = $ibxish->{inboxdir} // $ibxish->{topdir}; + return warn("$dir not indexed by Xapian\n") unless ($srch && $over); + bless $srch, 'PublicInbox::LeiSearch'; # for ->qparse_new + my $mo = { %{$lei->{mset_opt}} }; # copy + local $0 = "$0 1 $mo->{qstr}"; + my $mset; + my $each_smsg = $lei->{ovv}->ovv_each_smsg_cb($lei); + my $can_kw = !!$ibxish->can('msg_keywords'); + my $threads = $lei->{opt}->{threads} // 0; + my $fl = $threads > 1 ? 1 : undef; + my $mid = $lei->{opt}->{'thread-id'}; + $mo->{threadid} = $over->mid2tid($mid) if defined $mid; + my $lss = $lei->{lss}; + my $maxk = "external.$dir.maxuid"; # max of previous, so our min + my $min = $lss ? ($lss->{-cfg}->{$maxk} // 0) : 0; + ref($min) and return warn("$maxk=$min has multiple values\n"); + ($min =~ /[^0-9]/) and return warn("$maxk=$min not numeric\n"); + my $first_ids; + do { + $mset = eval { $srch->mset($mo->{qstr}, $mo) }; + return $lei->child_error(22 << 8, "E: $@") if $@; # 22 from curl + mset_progress($lei, $dir, $mo->{offset} + $mset->size, + $mset->get_matches_estimated); + wait_startq($lei); # wait for keyword updates + my $ids = $srch->mset_to_artnums($mset, $mo); + my $i = 0; + if ($threads) { + # copy $ids if $lss since over->expand_thread + # shifts @{$ctx->{ids}} + $first_ids = [ @$ids ] if $lss; + my $ctx = { ids => $ids, min => $min }; + my %n2item = map { $ids->[$i++] => $_ } $mset->items; + while ($over->expand_thread($ctx)) { # fills {xids} + for my $n (@{delete $ctx->{xids}}) { + my $smsg = $over->get_art($n) or next; + my $mi = delete $n2item{$n}; + next if $smsg->{bytes} == 0; + if ($mi && $can_kw) { + mitem_kw($srch, $smsg, $mi, $fl) + } elsif ($mi && $fl) { + # call ->xsmsg_vmd, later + $smsg->{lei_q_tt_flagged} = 1; + } + $each_smsg->($smsg, $mi); + } + } + } else { + $first_ids = $ids; + my @items = $mset->items; # parallel with @$ids + for my $n (@$ids) { + my $mitem = $items[$i++]; + next if $n <= $min; + my $smsg = $over->get_art($n) or next; + next if $smsg->{bytes} == 0; + mitem_kw($srch, $smsg, $mitem, $fl) if $can_kw; + $each_smsg->($smsg, $mitem); + } + } + } while (_mset_more($mset, $mo)); + _check_mset_limit($lei, $dir, $mset); + if ($lss && scalar(@$first_ids)) { + my $max = $first_ids->[0]; + $lss->cfg_set($maxk, $max); + undef $lss; + } + undef $each_smsg; # may commit + $lei->{ovv}->ovv_atexit_child($lei); +} + +sub query_combined_mset { # non-parallel for non-"--threads" users + my ($self) = @_; + my $lei = $self->{lei}; + my $mo = { %{$lei->{mset_opt}} }; + local $0 = "$0 C $mo->{qstr}"; + my $mset; + for my $loc (locals($self)) { + attach_external($self, $loc); + } + my $each_smsg = $lei->{ovv}->ovv_each_smsg_cb($lei); + do { + $mset = eval { $self->mset($mo->{qstr}, $mo) }; + return $lei->child_error(22 << 8, "E: $@") if $@; # 22 from curl + mset_progress($lei, 'xsearch', $mo->{offset} + $mset->size, + $mset->get_matches_estimated); + wait_startq($lei); # wait for keyword updates + for my $mitem ($mset->items) { + my $smsg = smsg_for($self, $mitem) or next; + $each_smsg->($smsg, $mitem); + } + } while (_mset_more($mset, $mo)); + _check_mset_limit($lei, 'xsearch', $mset); + undef $each_smsg; # may commit + $lei->{ovv}->ovv_atexit_child($lei); +} + +sub _smsg_fill ($$) { + my ($smsg, $eml) = @_; + $smsg->populate($eml); + $smsg->parse_references($eml, mids($eml)); + $smsg->{$_} //= '' for qw(from to cc ds subject references mid); + delete @$smsg{qw(From Subject -ds -ts)}; +} + +sub each_remote_eml { # callback for MboxReader->mboxrd + my ($eml, $self, $lei, $each_smsg) = @_; + my $xoids = $lei->{ale}->xoids_for($eml, 1); + my $smsg = bless {}, 'PublicInbox::Smsg'; + if ($self->{import_sto} && !$xoids) { + my ($res, $kw) = $self->{import_sto}->wq_do('add_eml', $eml); + if (ref($res) eq ref($smsg)) { # totally new message + $smsg = $res; + $self->{-sto_imported} = 1; + } + $smsg->{kw} = $kw; # short-circuit xsmsg_vmd + } + $smsg->{blob} //= $xoids ? (keys(%$xoids))[0] + : $lei->git_oid($eml)->hexdigest; + _smsg_fill($smsg, $eml); + wait_startq($lei); + my $nr = ++$lei->{-nr_remote_eml}; # needed for lss->cfg_set + if ($lei->{-progress}) { + my $now = now(); + my $next = $lei->{-next_progress} //= ($now + 1); + if ($now > $next) { + $lei->{-next_progress} = $now + 1; + mset_progress($lei, $lei->{-current_url}, $nr, '?'); + } + } + $each_smsg->($smsg, undef, $eml); +} + +sub fudge_qstr_time ($$$) { + my ($lei, $uri, $qstr) = @_; + return ($qstr, undef) unless $lei->{lss}; + my $cfg = $lei->{lss}->{-cfg} // die 'BUG: no lss->{-cfg}'; + my $cfg_key = "external.$uri.lastresult"; + my $lr = $cfg->{$cfg_key} or return ($qstr, $cfg_key); + if ($lr !~ /\A\-?[0-9]+\z/) { + $lei->child_error(0, + "$cfg->{-f}: $cfg_key=$lr not an integer, ignoring"); + return ($qstr, $cfg_key); + } + my $rft = $lei->{opt}->{'remote-fudge-time'}; + if ($rft && $rft !~ /\A-?[0-9]+\z/) { + my @t = $lei->{lss}->git->date_parse($rft); + my $diff = time - $t[0]; + $lei->qerr("# $rft => $diff seconds"); + $rft = $diff; + } + $lr -= ($rft || (48 * 60 * 60)); + require PublicInbox::Admin; + $lei->qerr("# $uri limiting to ". + PublicInbox::Admin::fmt_localtime($lr).' and newer'); + # this should really be rt: (received-time), but no stable + # public-inbox releases support it, yet. + my $dt = 'dt:'.strftime('%Y%m%d%H%M%S', gmtime($lr)).'..'; + if ($qstr =~ /\S/) { + substr($qstr, 0, 0, '('); + $qstr .= ') AND '; + } + ($qstr .= $dt, $cfg_key); +} + +sub query_remote_mboxrd { + my ($self, $uris) = @_; + local $SIG{TERM} = sub { exit(0) }; # for DESTROY (File::Temp, $reap) + my $lei = $self->{lei}; + my $opt = $lei->{opt}; + my $qstr = $lei->{mset_opt}->{qstr}; + local $0 = "$0 R $qstr"; + my @qform = (x => 'm'); + push(@qform, t => 1) if $opt->{threads}; + open my $cerr, '+>', undef; + my $rdr = { 2 => $cerr }; + my @lbf_tee; + if ($opt->{verbose}) { + # spawn a line-buffered tee(1) script, otherwise curl + # will write 1 character at-a-time and parallel outputs + # mmmaaayyy llloookkk llliiikkkeee ttthhhiiisss + # (n.b. POSIX tee(1) cannot do any buffering) + my $o = { 1 => $cerr, 2 => $lei->{2} }; + delete $rdr->{2}; + @lbf_tee = ([ $^X, qw(-w -p -e), <<'' ], undef, $o); +BEGIN { $| = 1; use IO::Handle; STDERR->autoflush(1); } +print STDERR $_; + + } + my $curl = PublicInbox::LeiCurl->new($lei, $self->{curl}) or return; + push @$curl, '-s', '-d', ''; + my $each_smsg = $lei->{ovv}->ovv_each_smsg_cb($lei); + $self->{import_sto} = $lei->{sto} if $lei->{opt}->{'import-remote'}; + if (defined(my $mid = $opt->{'thread-id'})) { + $mid = mid_escape($mid); + for my $uri (@$uris) { + $uri->path($uri->path.$mid.'/'); + } + } + for my $uri (@$uris) { + $lei->{-current_url} = $uri->as_string; + my $start = time; + my ($q, $key) = fudge_qstr_time($lei, $uri, $qstr); + $uri->query_form(@qform, q => $q); + my $cmd = $curl->for_uri($lei, $uri); + $lei->qerr("# $cmd"); + $rdr->{2} //= popen_wr(@lbf_tee) if @lbf_tee; + my $fh = popen_rd($cmd, undef, $rdr); + $fh = IO::Uncompress::Gunzip->new($fh, + MultiStream => 1, AutoClose => 1); + eval { + PublicInbox::MboxReader->mboxrd($fh, \&each_remote_eml, + $self, $lei, $each_smsg); + }; + my ($exc, $code) = ($@, $?); + $lei->sto_barrier_request if delete($self->{-sto_imported}); + die "E: $exc" if $exc && !$code; + my $nr = delete $lei->{-nr_remote_eml} // 0; + if (!$code) { # don't update if no results, maybe MTA is down + $lei->{lss}->cfg_set($key, $start) if $key && $nr; + mset_progress($lei, $lei->{-current_url}, $nr, $nr); + next; + } + delete($rdr->{2})->close if @lbf_tee; + seek($cerr, 0, SEEK_SET); + read($cerr, my $err, -s $cerr); + truncate($cerr, 0); + next if (($code >> 8) == 22 && $err =~ /\b404\b/); + $uri->query_form(q => $qstr); + $lei->child_error($code, "E: <$uri> `$cmd` failed"); + } + undef $each_smsg; + $lei->{ovv}->ovv_atexit_child($lei); +} + +sub git { $_[0]->{git} // die 'BUG: git uninitialized' } + +sub xsearch_done_wait { # awaitpid cb + my ($pid, $wq, $lei) = @_; + return if !$?; + my $s = $? & 127; + return $lei->child_error($?) if $s == 13 || $s == 15; + $lei->child_error($?, 'non-fatal error from '.ref($wq)." \$?=$?"); +} + +sub query_done { # EOF callback for main daemon + my ($lei) = @_; + my $l2m = delete $lei->{l2m}; + delete $lei->{lxs}; + ($lei->{opt}->{'mail-sync'} && !$lei->{sto}) and + warn "BUG: {sto} missing with --mail-sync"; + $lei->sto_barrier_request; + $lei->{ovv}->ovv_end($lei); + if ($l2m) { # close() calls LeiToMail reap_compress + $l2m->finish_output($lei); + if ($l2m->lock_free) { + $l2m->poke_dst; + $lei->poke_mua; + } else { # mbox users + delete $l2m->{mbl}; # drop dotlock + } + } + my $nr_w = delete($lei->{-nr_write}) // 0; + my $nr_dup = (delete($lei->{-nr_seen}) // 0) - $nr_w; + if ($lei->{-progress}) { + my $tot = $lei->{-mset_total} // 0; + my $x = "$tot matches"; + $x .= ", $nr_dup duplicates" if $nr_dup; + if ($l2m) { + my $m = "# $nr_w written to " . + "$lei->{ovv}->{dst} ($x)"; + $nr_w ? $lei->qfin($m) : $lei->qerr($m); + } else { + $lei->qerr("# $x"); + } + } + $lei->start_mua if $l2m && !$l2m->lock_free; + $lei->dclose; +} + +sub post_augment_done { # via on_destroy in top-level lei-daemon + my ($lei) = @_; + my $err = delete $lei->{post_augment_err}; + if ($err) { + if (my $lxs = delete $lei->{lxs}) { + $lxs->wq_kill(-POSIX::SIGTERM()); + $lxs->wq_close; + } + $lei->fail("$err"); + } + if (!$err && delete $lei->{early_mua}) { # non-augment case + eval { $lei->start_mua }; # may trigger wait_startq + $lei->fail($@) if $@; + } + close(delete $lei->{au_done}); # trigger wait_startq if start_mua didn't +} + +sub do_post_augment { + my ($lei) = @_; + my $l2m = $lei->{l2m} or return; # client disconnected + $l2m->post_augment($lei, on_destroy(\&post_augment_done, $lei)); +} + +sub incr_post_augment { # called whenever an l2m shard finishes augment + my ($lei) = @_; + my $l2m = $lei->{l2m} or return; # client disconnected + return if ++$lei->{nr_post_augment} != $l2m->{-wq_nr_workers}; + do_post_augment($lei); +} + +my $MAX_PER_HOST = 4; + +sub concurrency { + my ($self, $opt) = @_; + my $nl = $opt->{threads} ? locals($self) : 1; + my $nr = remotes($self); + $nr = $MAX_PER_HOST if $nr > $MAX_PER_HOST; + $nl + $nr; +} + +sub start_query ($$) { # always runs in main (lei-daemon) process + my ($self, $lei) = @_; + local $PublicInbox::LEI::current_lei = $lei; + if ($lei->{opt}->{threads} || + defined($lei->{opt}->{'thread-id'}) || + ($lei->{l2m} && !$lei->{opt}->{'sort'})) { + for my $ibxish (locals($self)) { + $self->wq_io_do('query_one_mset', [], $ibxish); + } + } elsif (locals($self)) { + $self->wq_io_do('query_combined_mset', []); + } + my $i = 0; + my $q = []; + for my $uri (remotes($self)) { + push @{$q->[$i++ % $MAX_PER_HOST]}, $uri; + } + for my $uris (@$q) { + $self->wq_io_do('query_remote_mboxrd', [], $uris); + } + if ($self->{-do_lcat}) { + $self->wq_io_do('lcat_dump', []); + } + $self->wq_close; # lei_xsearch workers stop when done +} + +sub incr_start_query { # called whenever an l2m shard starts do_post_auth + my ($lei, $self) = @_; + my $l2m = $lei->{l2m}; + return if ++$self->{nr_start_query} != $l2m->{-wq_nr_workers}; + start_query($self, $lei); +} + +sub ipc_atfork_child { + my ($self) = @_; + $self->{lei}->_lei_atfork_child; + $self->SUPER::ipc_atfork_child; +} + +sub do_query { + my ($self, $lei) = @_; + my $l2m = $lei->{l2m}; + my $qstr = \($lei->{mset_opt}->{qstr}); + chomp $$qstr; + $$qstr =~ s/[ \n\t]+/ /sg; # make URLs and $0 less ugly + my $ops = { + sigpipe_handler => [ $lei ], + fail_handler => [ $lei ], + do_post_augment => [ \&do_post_augment, $lei ], + incr_post_augment => [ \&incr_post_augment, $lei ], + '' => [ \&query_done, $lei ], + mset_progress => [ \&mset_progress, $lei ], + incr => [ $lei ], + x_it => [ $lei ], + child_error => [ $lei ], + incr_start_query => [ \&incr_start_query, $lei, $self ], + }; + $lei->{auth}->op_merge($ops, $l2m, $lei) if $l2m && $lei->{auth}; + my $end = $lei->pkt_op_pair; + $lei->{1}->autoflush(1); + $lei->start_pager if delete $lei->{need_pager}; + $lei->{ovv}->ovv_begin($lei); + die 'BUG: xdb|over open' if $lei->{lse}->{xdb} || $lei->{lse}->{over}; + if ($l2m) { + $l2m->pre_augment($lei); + if ($lei->{opt}->{augment} && delete $lei->{early_mua}) { + $lei->start_mua; + } + if ($l2m->{-wq_nr_workers} > 1 && + $l2m->{base_type} =~ /\A(?:maildir|mbox)\z/) { + # setup two barriers to coordinate ->has_entries + # between l2m workers + pipe(my ($a_r, $a_w)) or die "pipe: $!"; + fcntl($a_r, $F_SETPIPE_SZ, 4096) if $F_SETPIPE_SZ; + pipe(my ($b_r, $b_w)) or die "pipe: $!"; + fcntl($b_r, $F_SETPIPE_SZ, 4096) if $F_SETPIPE_SZ; + $l2m->{au_peers} = [ $a_r, $a_w, $b_r, $b_w ]; + } + $l2m->wq_workers_start('lei2mail', undef, + $lei->oldset, { lei => $lei }, + \&xsearch_done_wait, $lei); + pipe($lei->{startq}, $lei->{au_done}) or die "pipe: $!"; + fcntl($lei->{startq}, $F_SETPIPE_SZ, 4096) if $F_SETPIPE_SZ; + delete $l2m->{au_peers}; + close(delete $l2m->{-wq_s2}); # share wq_s1 with lei_xsearch + } + $self->wq_workers_start('lei_xsearch', undef, + $lei->oldset, { lei => $lei }, + \&xsearch_done_wait, $lei); + my $op_c = delete $lei->{pkt_op_c}; + delete $lei->{pkt_op_p}; + @$end = (); + $self->{-do_lcat} = !!(delete $lei->{lcat_todo}); + if ($l2m) { + $l2m->net_merge_all_done($lei) unless $lei->{auth}; + } else { + start_query($self, $lei); + } + $lei->event_step_init; # wait for shutdowns + $lei->wait_wq_events($op_c, $ops); +} + +sub add_uri { + my ($self, $uri) = @_; + if (my $curl = $self->{curl} //= which('curl') // 0) { + require PublicInbox::MboxReader; + require IO::Uncompress::Gunzip; + require PublicInbox::LeiCurl; + push @{$self->{remotes}}, $uri; + $uri; + } else { + warn "curl missing, ignoring $uri\n"; + undef; + } +} + +# returns URI or PublicInbox::Inbox-like object +sub prepare_external { + my ($self, $loc, $boost) = @_; # n.b. already ordered by boost + if (ref $loc) { # already a URI, or PublicInbox::Inbox-like object + return add_uri($self, $loc) if $loc->can('scheme'); + # fall-through on Inbox-like objects + } elsif ($loc =~ m!\Ahttps?://!) { + require URI; + return add_uri($self, URI->new($loc)); + } elsif (-f "$loc/ei.lock" && -d "$loc/ALL.git/objects") { + require PublicInbox::ExtSearch; + die "`\\n' not allowed in `$loc'\n" if index($loc, "\n") >= 0; + $loc = PublicInbox::ExtSearch->new($loc); + } elsif ((-f "$loc/inbox.lock" && -d "$loc/all.git/objects") || + (-d "$loc/public-inbox" && -d "$loc/objects")) { + die "`\\n' not allowed in `$loc'\n" if index($loc, "\n") >= 0; + require PublicInbox::Inbox; # v2, v1 + $loc = bless { inboxdir => $loc }, 'PublicInbox::Inbox'; + } elsif (!-e $loc) { + warn "W: $loc gone, perhaps run: lei forget-external $loc\n"; + return undef; + } else { + warn "W: $loc ignored, unable to determine external type\n"; + return undef; + } + push @{$self->{locals}}, $loc; + $loc; +} + +sub _lcat_i { # LeiMailSync->each_src iterator callback + my ($oidbin, $id, $each_smsg) = @_; + $each_smsg->({blob => unpack('H*', $oidbin), pct => 100}); +} + +sub _lcat2smsg { # git->cat_async callback + my ($bref, $oid, $type, $size, $smsg) = @_; + if ($bref) { + my $eml = PublicInbox::Eml->new($bref); + my $json_dump = delete $smsg->{-json_dump}; + bless $smsg, 'PublicInbox::Smsg'; + _smsg_fill($smsg, $eml); + $json_dump->($smsg, undef, $eml); + } +} + +sub lcat_dump { # via wq_io_do + my ($self) = @_; + my $lei = $self->{lei}; + my $each_smsg = $lei->{ovv}->ovv_each_smsg_cb($lei); + my $git = $lei->{ale}->git; + if (!$lei->{l2m}) { + my $json_dump = $each_smsg; + $each_smsg = sub { + my ($smsg) = @_; + $smsg->{-json_dump} = $json_dump; + $git->cat_async($smsg->{blob}, \&_lcat2smsg, $smsg); + }; + } + my $lms; + for my $ent (@{$lei->{lcat_todo}}) { + if (ref $ent eq 'HASH') { # { fid => $fid ,.. } + $lms //= $lei->{lse}->lms; + $lms->each_src($ent, \&_lcat_i, $each_smsg); + } else { # oidhex + $each_smsg->({ blob => $ent, pct => 100 }); + } + } + $git->async_wait_all; + undef $each_smsg; # may commit + $lei->{ovv}->ovv_atexit_child($lei); +} + +1; diff --git a/lib/PublicInbox/Limiter.pm b/lib/PublicInbox/Limiter.pm new file mode 100644 index 00000000..a8d08fc3 --- /dev/null +++ b/lib/PublicInbox/Limiter.pm @@ -0,0 +1,50 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +package PublicInbox::Limiter; +use v5.12; +use PublicInbox::Spawn; + +sub new { + my ($class, $max) = @_; + bless { + # 32 is same as the git-daemon connection limit + max => $max || 32, + running => 0, + run_queue => [], + # RLIMIT_CPU => undef, + # RLIMIT_DATA => undef, + # RLIMIT_CORE => undef, + }, $class; +} + +sub setup_rlimit { + my ($self, $name, $cfg) = @_; + for my $rlim (@PublicInbox::Spawn::RLIMITS) { + my $k = lc($rlim); + $k =~ tr/_//d; + $k = "publicinboxlimiter.$name.$k"; + my $v = $cfg->{$k} // next; + my @rlimit = split(/\s*,\s*/, $v); + if (scalar(@rlimit) == 1) { + push @rlimit, $rlimit[0]; + } elsif (scalar(@rlimit) != 2) { + warn "could not parse $k: $v\n"; + } + my $inf = $v =~ /\binfinity\b/i ? + $PublicInbox::Spawn::RLIMITS{RLIM_INFINITY} // eval { + require BSD::Resource; + BSD::Resource::RLIM_INFINITY(); + } // do { + warn "BSD::Resource missing for $rlim"; + next; + } : undef; + for my $i (0..$#rlimit) { + next if $rlimit[$i] ne 'INFINITY'; + $rlimit[$i] = $inf; + } + $self->{$rlim} = \@rlimit; + } +} + +1; diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm index a02eafc4..306a57e7 100644 --- a/lib/PublicInbox/Linkify.pm +++ b/lib/PublicInbox/Linkify.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # two-step linkification. @@ -11,8 +11,8 @@ # Maybe this could be done more efficiently... package PublicInbox::Linkify; use strict; -use warnings; -use Digest::SHA qw/sha1_hex/; +use v5.10.1; +use PublicInbox::SHA qw(sha1_hex); use PublicInbox::Hval qw(ascii_html mid_href); use PublicInbox::MID qw($MID_EXTRACT); @@ -68,23 +68,22 @@ sub linkify_1 { # salt this, as this could be exploited to show # links in the HTML which don't show up in the raw mail. my $key = sha1_hex($url . $SALT); - + $key =~ tr/0-9/A-J/; # no digits for YAML highlight $_[0]->{$key} = $url; - $beg . 'PI-LINK-'. $key . $end; + $beg . 'LINKIFY' . $key . $end; ^geo; $_[1]; } sub linkify_2 { - # Added "PI-LINK-" prefix to avoid false-positives on git commits - $_[1] =~ s!\bPI-LINK-([a-f0-9]{40})\b! + # Added "LINKIFY" prefix to avoid false-positives on git commits + $_[1] =~ s!\bLINKIFY([a-fA-J]{40})\b! my $key = $1; my $url = $_[0]->{$key}; if (defined $url) { "<a\nhref=\"$url\">$url</a>"; - } else { - # false positive or somebody tried to mess with us - $key; + } else { # false positive or somebody tried to mess with us + 'LINKIFY'.$key; } !ge; $_[1]; @@ -102,20 +101,20 @@ sub linkify_mids { # salt this, as this could be exploited to show # links in the HTML which don't show up in the raw mail. my $key = sha1_hex($html . $SALT); + $key =~ tr/0-9/A-J/; my $repl = qq(<<a\nhref="$pfx/$href/">$html</a>>); $repl .= qq{ (<a\nhref="$pfx/$href/raw">raw</a>)} if $raw; $self->{$key} = $repl; - 'PI-LINK-'. $key; + 'LINKIFY'.$key; !ge; $$str = ascii_html($$str); - $$str =~ s!\bPI-LINK-([a-f0-9]{40})\b! + $$str =~ s!\bLINKIFY([a-fA-J]{40})\b! my $key = $1; my $repl = $_[0]->{$key}; if (defined $repl) { $repl; - } else { - # false positive or somebody tried to mess with us - $key; + } else { # false positive or somebody tried to mess with us + 'LINKIFY'.$key; } !ge; } diff --git a/lib/PublicInbox/Listener.pm b/lib/PublicInbox/Listener.pm index 2e0fc248..c83901b2 100644 --- a/lib/PublicInbox/Listener.pm +++ b/lib/PublicInbox/Listener.pm @@ -1,14 +1,15 @@ -# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Used by -nntpd for listen sockets package PublicInbox::Listener; -use strict; +use v5.12; use parent 'PublicInbox::DS'; use Socket qw(SOL_SOCKET SO_KEEPALIVE IPPROTO_TCP TCP_NODELAY); use IO::Handle; -use PublicInbox::Syscall qw(EPOLLIN EPOLLEXCLUSIVE EPOLLET); -use Errno qw(EAGAIN ECONNABORTED EPERM); +use PublicInbox::Syscall qw(EPOLLIN EPOLLEXCLUSIVE); +use Errno qw(EAGAIN ECONNABORTED); +our $MULTI_ACCEPT = 0; # Warn on transient errors, mostly resource limitations. # EINTR would indicate the failure to set NonBlocking in systemd or similar @@ -16,42 +17,35 @@ my %ERR_WARN = map {; eval("Errno::$_()") => $_ } qw(EMFILE ENFILE ENOBUFS ENOMEM EINTR); -sub new ($$$) { - my ($class, $s, $cb) = @_; +sub new { + my ($class, $s, $cb, $multi_accept) = @_; setsockopt($s, SOL_SOCKET, SO_KEEPALIVE, 1); setsockopt($s, IPPROTO_TCP, TCP_NODELAY, 1); # ignore errors on non-TCP - listen($s, 1024); + listen($s, 2**31 - 1); # kernel will clamp my $self = bless { post_accept => $cb }, $class; - $self->SUPER::new($s, EPOLLIN|EPOLLET|EPOLLEXCLUSIVE); + $self->{multi_accept} = $multi_accept //= $MULTI_ACCEPT; + $self->SUPER::new($s, EPOLLIN|EPOLLEXCLUSIVE); } sub event_step { my ($self) = @_; my $sock = $self->{sock} or return; - - # no loop here, we want to fairly distribute clients - # between multiple processes sharing the same socket - # XXX our event loop needs better granularity for - # a single accept() here to be, umm..., acceptable - # on high-traffic sites. - if (my $addr = accept(my $c, $sock)) { - IO::Handle::blocking($c, 0); # no accept4 :< - eval { $self->{post_accept}->($c, $addr, $sock) }; - warn "E: $@\n" if $@; - $self->requeue; - } elsif ($! == EAGAIN || $! == ECONNABORTED || $! == EPERM) { - # EAGAIN is common and likely - # ECONNABORTED is common with bad connections - # EPERM happens if firewall rules prevent a connection - # on Linux (and everything that emulates Linux). - # Firewall rules are sometimes intentional, so we don't - # warn on EPERM to avoid being too noisy... - return; - } elsif (my $sym = $ERR_WARN{int($!)}) { - warn "W: accept(): $! ($sym)\n"; - } else { - warn "BUG?: accept(): $!\n"; - } + my $n = $self->{multi_accept}; + do { + if (my $addr = accept(my $c, $sock)) { + IO::Handle::blocking($c, 0); # no accept4 :< + eval { $self->{post_accept}->($c, $addr, $sock) }; + warn "E: $@\n" if $@; + } elsif ($! == EAGAIN || $! == ECONNABORTED) { + # EAGAIN is common and likely + # ECONNABORTED is common with bad connections + return; + } elsif (my $sym = $ERR_WARN{int($!)}) { + return warn "W: accept(): $! ($sym)\n"; + } else { + return warn "BUG?: accept(): $!\n"; + } + } while ($n--); } 1; diff --git a/lib/PublicInbox/Lock.pm b/lib/PublicInbox/Lock.pm index b2c8227f..7162d80e 100644 --- a/lib/PublicInbox/Lock.pm +++ b/lib/PublicInbox/Lock.pm @@ -1,35 +1,66 @@ -# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# Base class for per-inbox locking +# Base class for per-inbox locking, subclassed by several +# only uses {lock_path} and {lockfh} fields package PublicInbox::Lock; -use strict; -use warnings; -use Fcntl qw(:flock :DEFAULT); +use v5.12; +use Fcntl qw(LOCK_UN LOCK_EX O_RDWR O_CREAT); use Carp qw(croak); +use PublicInbox::OnDestroy; +use Errno qw(EINTR); +use autodie qw(close sysopen syswrite); + +sub xflock ($$) { + until (flock($_[0], $_[1])) { return if $! != EINTR } + 1; +} + +sub new { bless { lock_path => $_[1] }, $_[0] } # we only acquire the flock if creating or reindexing; # PublicInbox::Import already has the lock on its own. sub lock_acquire { my ($self) = @_; - my $lock_path = $self->{lock_path}; - croak 'already locked '.($lock_path // '(undef)') if $self->{lockfh}; - return unless defined($lock_path); - sysopen(my $lockfh, $lock_path, O_WRONLY|O_CREAT) or - croak "failed to open $lock_path: $!\n"; - flock($lockfh, LOCK_EX) or croak "lock $lock_path failed: $!\n"; - $self->{lockfh} = $lockfh; + my $fn = $self->{lock_path}; + croak 'already locked '.($fn // '(undef)') if $self->{lockfh}; + $fn // return; + sysopen(my $fh, $fn, O_RDWR|O_CREAT); + xflock($fh, LOCK_EX) or croak "LOCK_EX $fn: $!"; + $self->{lockfh} = $fh; } sub lock_release { my ($self, $wake) = @_; - defined(my $lock_path = $self->{lock_path}) or return; - my $lockfh = delete $self->{lockfh} or croak "not locked: $lock_path"; + my $fn = $self->{lock_path} // return; + my $fh = delete $self->{lockfh} or croak "not locked: $fn"; + syswrite($fh, '.') if $wake; + xflock($fh, LOCK_UN) or croak "LOCK_UN $fn: $!"; + close $fh; # may detect errors +} - syswrite($lockfh, '.') if $wake; +# caller must use return value +sub lock_for_scope { + my ($self) = @_; + lock_acquire($self) or return; # lock_path not set + on_destroy \&lock_release, $self; +} - flock($lockfh, LOCK_UN) or croak "unlock $lock_path failed: $!\n"; - close $lockfh or croak "close $lock_path failed: $!\n"; +sub lock_acquire_fast { + my $fh = $_[0]->{lockfh} or return lock_acquire($_[0]); + xflock($fh, LOCK_EX) or croak "LOCK_EX $_[0]->{lock_path}: $!"; +} + +sub lock_release_fast { + xflock($_[0]->{lockfh} // return, LOCK_UN) or + croak "LOCK_UN $_[0]->{lock_path}: $!" +} + +# caller must use return value +sub lock_for_scope_fast { + my ($self) = @_; + lock_acquire_fast($self) or return; # lock_path not set + on_destroy \&lock_release_fast, $self; } 1; diff --git a/lib/PublicInbox/MDA.pm b/lib/PublicInbox/MDA.pm index fa4a2ad8..f82194a3 100644 --- a/lib/PublicInbox/MDA.pm +++ b/lib/PublicInbox/MDA.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2013-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2013-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # For the -mda script (mail delivery agent) @@ -83,7 +83,7 @@ sub set_list_headers { } sub inboxes_for_list_id ($$) { - my ($klass, $config, $simple) = @_; + my ($klass, $pi_cfg, $simple) = @_; # newer Email::Simple allows header_raw, as does Email::MIME: my @list_ids = $simple->can('header_raw') ? @@ -92,7 +92,7 @@ sub inboxes_for_list_id ($$) { my @dests; for my $list_id (@list_ids) { $list_id =~ /<[ \t]*(.+)?[ \t]*>/ or next; - if (my $ibx = $config->lookup_list_id($1)) { + if (my $ibx = $pi_cfg->lookup_list_id($1)) { push @dests, $ibx; } } diff --git a/lib/PublicInbox/MHreader.pm b/lib/PublicInbox/MHreader.pm new file mode 100644 index 00000000..16e505a2 --- /dev/null +++ b/lib/PublicInbox/MHreader.pm @@ -0,0 +1,104 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# MH reader, based on Lib/mailbox.py in cpython source +package PublicInbox::MHreader; +use v5.12; +use PublicInbox::InboxWritable qw(eml_from_path); +use PublicInbox::OnDestroy; +use PublicInbox::IO qw(try_cat); +use PublicInbox::MdirSort; +use Carp qw(carp); +use autodie qw(chdir closedir opendir); + +my %FL2OFF = ( # mh_sequences key => our keyword + replied => 0, + flagged => 1, + unseen => 2, # negate +); +my @OFF2KW = qw(answered flagged); # [2] => unseen (negated) + +sub new { + my ($cls, $dir, $cwdfh) = @_; + if (substr($dir, -1) ne '/') { # TODO: do this earlier + carp "W: appending `/' to `$dir' (fix caller)\n"; + $dir .= '/'; + } + bless { dir => $dir, cwdfh => $cwdfh }, $cls; +} + +sub read_mh_sequences ($) { # caller must chdir($self->{dir}) + my ($self) = @_; + my ($fl, $off, @n); + my @seq = ('', '', ''); + for (split /\n+/s, try_cat('.mh_sequences')) { + ($fl, @n) = split /[: \t]+/; + $off = $FL2OFF{$fl} // do { warn <<EOM; +W: unknown `$fl' in $self->{dir}.mh_sequences (ignoring) +EOM + next; + }; + @n = grep /\A[0-9]+\z/s, @n; # don't stat, yet + if (@n) { + @n = sort { $b <=> $a } @n; # to avoid resize + my $buf = ''; + vec($buf, $_, 1) = 1 for @n; + $seq[$off] = $buf; + } + } + \@seq; +} + +sub mh_each_file { + my ($self, $efcb, @arg) = @_; + opendir(my $dh, my $dir = $self->{dir}); + my $restore = on_destroy \&chdir, $self->{cwdfh}; + chdir($dh); + my $sort = $self->{sort}; + if (defined $sort && "@$sort" ne 'none') { + my @sort = map { + my @tmp = $_ eq '' ? ('sequence') : split(/[, ]/); + # sorting by name alphabetically makes no sense for MH: + for my $k (@tmp) { + s/\A(\-|\+|)(?:name|)\z/$1sequence/; + } + @tmp; + } @$sort; + my @n = grep /\A[0-9]+\z/s, readdir $dh; + mdir_sort \@n, \@sort; + $efcb->($dir, $_, $self, @arg) for @n; + } else { + while (readdir $dh) { # perl v5.12+ to set $_ on readdir + $efcb->($dir, $_, $self, @arg) if /\A[0-9]+\z/s; + } + } + closedir $dh; # may die +} + +sub kw_for ($$) { + my ($self, $n) = @_; + my $seq = $self->{mh_seq} //= read_mh_sequences($self); + my @kw = map { vec($seq->[$_], $n, 1) ? $OFF2KW[$_] : () } (0, 1); + vec($seq->[2], $n, 1) or push @kw, 'seen'; + \@kw; +} + +sub _file2eml { # mh_each_file / mh_read_one cb + my ($dir, $n, $self, $ucb, @arg) = @_; + my $eml = eml_from_path($n); + $ucb->($dir, $n, kw_for($self, $n), $eml, @arg) if $eml; +} + +sub mh_each_eml { + my ($self, $ucb, @arg) = @_; + mh_each_file($self, \&_file2eml, $ucb, @arg); +} + +sub mh_read_one { + my ($self, $n, $ucb, @arg) = @_; + my $restore = on_destroy \&chdir, $self->{cwdfh}; + chdir(my $dir = $self->{dir}); + _file2eml($dir, $n, $self, $ucb, @arg); +} + +1; diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm index 5aeffb8c..36c05855 100644 --- a/lib/PublicInbox/MID.pm +++ b/lib/PublicInbox/MID.pm @@ -1,15 +1,15 @@ -# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Various Message-ID-related functions. package PublicInbox::MID; use strict; -use warnings; -use base qw/Exporter/; +use v5.10.1; # TODO: check unicode_strings compat for v5.12 +use parent qw(Exporter); our @EXPORT_OK = qw(mid_clean id_compress mid2path mid_escape MID_ESC - mids references mids_for_index $MID_EXTRACT); + mids references mids_for_index mids_in $MID_EXTRACT); use URI::Escape qw(uri_escape_utf8); -use Digest::SHA qw/sha1_hex/; +use PublicInbox::SHA qw(sha1_hex); require PublicInbox::Address; use constant { ID_MAX => 40, # SHA-1 hex length for HTML id anchors @@ -73,14 +73,17 @@ sub mids ($) { uniq_mids(extract_mids(@mids)); } +# for Resent-Message-ID and maybe others +sub mids_in ($@) { + my ($eml, @headers) = @_; + uniq_mids(extract_mids(map { ($eml->header_raw($_)) } @headers)); +} + # we allow searching on X-Alt-Message-ID since PublicInbox::NNTP uses them # to placate some clients, and we want to ensure NNTP-only clients can # import and index without relying on HTTP endpoints sub mids_for_index ($) { - my ($hdr) = @_; - my @mids = $hdr->header_raw('Message-ID'); - my @alts = $hdr->header_raw('X-Alt-Message-ID'); - uniq_mids(extract_mids(@mids, @alts)); + mids_in($_[0], qw(Message-ID X-Alt-Message-ID)); } # last References should be IRT, but some mail clients do things @@ -89,8 +92,7 @@ sub references ($) { my ($hdr) = @_; my @mids; foreach my $f (qw(References In-Reply-To)) { - my @v = $hdr->header_raw($f); - foreach my $v (@v) { + for my $v ($hdr->header_raw($f)) { push(@mids, ($v =~ /$MID_EXTRACT/g)); } } @@ -101,8 +103,7 @@ sub references ($) { my %addr = ( y => 1, n => 1 ); foreach my $f (qw(To From Cc)) { - my @v = $hdr->header_raw($f); - foreach my $v (@v) { + for my $v ($hdr->header_raw($f)) { $addr{$_} = 1 for (PublicInbox::Address::emails($v)); } } @@ -114,17 +115,17 @@ sub uniq_mids ($;$) { my @ret; $seen ||= {}; foreach my $mid (@$mids) { - $mid =~ tr/\n\t\r//d; + $mid =~ tr/\n\t\r\0//d; if (length($mid) > MAX_MID_SIZE) { warn "Message-ID: <$mid> too long, truncating\n"; $mid = substr($mid, 0, MAX_MID_SIZE); } - push(@ret, $mid) unless $seen->{$mid}++; + $seen->{$mid} //= push(@ret, $mid); } \@ret; } -# RFC3986, section 3.3: +# RFC3986, section 3.3 (pathnames only): sub MID_ESC () { '^A-Za-z0-9\-\._~!\$\&\'\(\)\*\+,;=:@' } sub mid_escape ($) { uri_escape_utf8($_[0], MID_ESC) } diff --git a/lib/PublicInbox/MailDiff.pm b/lib/PublicInbox/MailDiff.pm new file mode 100644 index 00000000..125360fe --- /dev/null +++ b/lib/PublicInbox/MailDiff.pm @@ -0,0 +1,137 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +package PublicInbox::MailDiff; +use v5.12; +use File::Temp 0.19 (); # 0.19 for ->newdir +use PublicInbox::ContentHash qw(content_digest); +use PublicInbox::MsgIter qw(msg_part_text); +use PublicInbox::ViewDiff qw(flush_diff); +use PublicInbox::GitAsyncCat; +use PublicInbox::ContentDigestDbg; +use PublicInbox::Qspawn; +use PublicInbox::IO qw(write_file); +use autodie qw(close mkdir); + +sub write_part { # Eml->each_part callback + my ($ary, $self) = @_; + my ($part, $depth, $idx) = @$ary; + if ($idx ne '1' || $self->{-raw_hdr}) { # lei mail-diff --raw-header + write_file '>', "$self->{curdir}/$idx.hdr", ${$part->{hdr}}; + } + my $ct = $part->content_type || 'text/plain'; + my ($s, $err) = msg_part_text($part, $ct); + my $sfx = defined($s) ? 'txt' : 'bin'; + $s //= $part->body; + $s =~ s/\r\n/\n/gs; # TODO: consider \r+\n to match View + $s =~ s/\s*\z//s; + write_file '>:utf8', "$self->{curdir}/$idx.$sfx", $s, "\n"; +} + +# public +sub dump_eml ($$$) { + my ($self, $dir, $eml) = @_; + local $self->{curdir} = $dir; + mkdir $dir; + $eml->each_part(\&write_part, $self); + my $fh = write_file '>', "$dir/content_digest"; + my $dig = PublicInbox::ContentDigestDbg->new($fh); + content_digest($eml, $dig); + say $fh "\n", $dig->hexdigest; + close $fh; +} + +# public +sub prep_a ($$) { + my ($self, $eml) = @_; + $self->{tmp} = File::Temp->newdir('mail-diff-XXXX', TMPDIR => 1); + dump_eml($self, "$self->{tmp}/a", $eml); +} + +# WWW-specific stuff below (TODO: split out for non-lei) + +sub next_smsg ($) { + my ($self) = @_; + my $ctx = $self->{ctx}; + my $over = $ctx->{ibx}->over; + $self->{smsg} = $over ? $over->next_by_mid(@{$self->{next_arg}}) + : $ctx->gone('over'); + if (!$self->{smsg}) { + $ctx->write('</pre>', $ctx->_html_end); + return $ctx->close; + } + PublicInbox::DS::requeue($self) if $ctx->{env}->{'pi-httpd.async'}; +} + +sub emit_msg_diff { + my ($bref, $self) = @_; # bref is `git diff' output + require PublicInbox::Hval; + PublicInbox::Hval::utf8_maybe($$bref); + + # will be escaped to `•' in HTML + $self->{ctx}->{ibx}->{obfuscate} and + PublicInbox::Hval::obfuscate_addrs($self->{ctx}->{ibx}, + $$bref, "\x{2022}"); + print { $self->{ctx}->{zfh} } '</pre><hr><pre>' if $self->{nr} > 1; + flush_diff($self->{ctx}, $bref); + next_smsg($self); +} + +sub do_diff { + my ($self, $eml) = @_; + my $n = 'N'.(++$self->{nr}); + my $dir = "$self->{tmp}/$n"; + $self->dump_eml($dir, $eml); + my $cmd = [ qw(git diff --no-index --no-color -- a), $n ]; + my $opt = { -C => "$self->{tmp}", quiet => 1 }; + my $qsp = PublicInbox::Qspawn->new($cmd, undef, $opt); + $qsp->psgi_qx($self->{ctx}->{env}, undef, \&emit_msg_diff, $self); +} + +sub diff_msg_i { + my ($self, $eml) = @_; + if ($eml) { + if ($self->{tmp}) { # 2nd..last message + do_diff($self, $eml); + } else { # first message: + prep_a($self, $eml); + next_smsg($self); + } + } else { + warn "W: $self->{smsg}->{blob} missing\n"; + next_smsg($self); + } +} + +sub diff_msg_i_async { + my ($bref, $oid, $type, $size, $self) = @_; + diff_msg_i($self, $bref ? PublicInbox::Eml->new($bref) : undef); +} + +sub event_step { + my ($self) = @_; + eval { + my $ctx = $self->{ctx}; + if ($ctx->{env}->{'pi-httpd.async'}) { + ibx_async_cat($ctx->{ibx}, $self->{smsg}->{blob}, + \&diff_msg_i_async, $self); + } else { + diff_msg_i($self, $ctx->{ibx}->smsg_eml($self->{smsg})); + } + }; + if ($@) { + warn "E: $@"; + delete $self->{smsg}; + $self->{ctx}->close; + } +} + +sub begin_mail_diff { + my ($self) = @_; + if ($self->{ctx}->{env}->{'pi-httpd.async'}) { + PublicInbox::DS::requeue($self); + } else { + event_step($self) while $self->{smsg}; + } +} + +1; diff --git a/lib/PublicInbox/ManifestJsGz.pm b/lib/PublicInbox/ManifestJsGz.pm index f98d9d01..be5d5f2a 100644 --- a/lib/PublicInbox/ManifestJsGz.pm +++ b/lib/PublicInbox/ManifestJsGz.pm @@ -1,105 +1,47 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# generates manifest.js.gz for grokmirror(1) +# generates manifest.js.gz for grokmirror(1) via PublicInbox::WWW +# This doesn't parse manifest.js.gz (that happens in LeiMirror) package PublicInbox::ManifestJsGz; -use strict; -use v5.10.1; +use v5.12; use parent qw(PublicInbox::WwwListing); -use Digest::SHA (); -use File::Spec (); -use bytes (); # length -use PublicInbox::Inbox; -use PublicInbox::Git; +use PublicInbox::Config; use IO::Compress::Gzip qw(gzip); use HTTP::Date qw(time2str); -*try_cat = \&PublicInbox::Inbox::try_cat; -our $json; -for my $mod (qw(JSON::MaybeXS JSON JSON::PP)) { - eval "require $mod" or next; - # ->ascii encodes non-ASCII to "\uXXXX" - $json = $mod->new->ascii(1) and last; -} +my $json = PublicInbox::Config::json(); -# called by WwwListing -sub url_regexp { +sub url_filter { my ($ctx) = @_; # grokmirror uses relative paths, so it's domain-dependent - # SUPER calls PublicInbox::WwwListing::url_regexp - $ctx->SUPER::url_regexp('publicInbox.grokManifest', 'match=domain'); + # SUPER calls PublicInbox::WwwListing::url_filter + $ctx->SUPER::url_filter('publicInbox.grokManifest', 'match=domain'); } -sub fingerprint ($) { - my ($git) = @_; - # TODO: convert to qspawn for fairness when there's - # thousands of repos - my ($fh, $pid) = $git->popen('show-ref'); - my $dig = Digest::SHA->new(1); - while (read($fh, my $buf, 65536)) { - $dig->add($buf); - } - close $fh; - waitpid($pid, 0); - return if $?; # empty, uninitialized git repo - $dig->hexdigest; +sub inject_entry ($$$;$) { + my ($ctx, $url_path, $ent, $git_dir) = @_; + $ctx->{-abs2urlpath}->{$git_dir // delete $ent->{git_dir}} = $url_path; + my $modified = $ent->{modified}; + $ctx->{-mtime} = $modified if $modified > ($ctx->{-mtime} // 0); + $ctx->{manifest}->{$url_path} = $ent; } -sub manifest_add ($$;$$) { +sub manifest_add ($$;$$) { # slow path w/o extindex "all" (or per-inbox) my ($ctx, $ibx, $epoch, $default_desc) = @_; my $url_path = "/$ibx->{name}"; - my $git_dir = $ibx->{inboxdir}; + my $git; if (defined $epoch) { - $git_dir .= "/git/$epoch.git"; $url_path .= "/git/$epoch.git"; + $git = $ibx->git_epoch($epoch) or return; + } else { + $git = $ibx->git; } - return unless -d $git_dir; - my $git = PublicInbox::Git->new($git_dir); - my $fingerprint = fingerprint($git) or return; # no empty repos - - chomp(my $owner = $git->qx('config', 'gitweb.owner')); - chomp(my $desc = try_cat("$git_dir/description")); - utf8::decode($owner); - utf8::decode($desc); - $owner = undef if $owner eq ''; - $desc = 'Unnamed repository' if $desc eq ''; - - # templates/hooks--update.sample and git-multimail in git.git - # only match "Unnamed repository", not the full contents of - # templates/this--description in git.git - if ($desc =~ /\AUnnamed repository/) { - $desc = "$default_desc [epoch $epoch]" if defined($epoch); - } - - my $reference; - chomp(my $alt = try_cat("$git_dir/objects/info/alternates")); - if ($alt) { - # n.b.: GitPython doesn't seem to handle comments or C-quoted - # strings like native git does; and we don't for now, either. - my @alt = split(/\n+/, $alt); - - # grokmirror only supports 1 alternate for "reference", - if (scalar(@alt) == 1) { - my $objdir = "$git_dir/objects"; - $reference = File::Spec->rel2abs($alt[0], $objdir); - $reference =~ s!/[^/]+/?\z!!; # basename - } - } - $ctx->{-abs2urlpath}->{$git_dir} = $url_path; - my $modified = $git->modified; - if ($modified > ($ctx->{-mtime} // 0)) { - $ctx->{-mtime} = $modified; - } - $ctx->{manifest}->{$url_path} = { - owner => $owner, - reference => $reference, - description => $desc, - modified => $modified, - fingerprint => $fingerprint, - }; + my $ent = $git->manifest_entry($epoch, $default_desc) or return; + inject_entry($ctx, $url_path, $ent, $git->{git_dir}); } -sub ibx_entry { +sub slow_manifest_add ($$) { my ($ctx, $ibx) = @_; eval { if (defined(my $max = $ibx->max_git_epoch)) { @@ -114,9 +56,47 @@ sub ibx_entry { warn "E: $@" if $@; } -sub hide_key { 'manifest' } +sub eidx_manifest_add ($$$) { + my ($ctx, $ALL, $ibx) = @_; + if (my $data = $ALL->misc->inbox_data($ibx)) { + $data = $json->decode($data); + delete $data->{''}; # private + while (my ($url_path, $ent) = each %$data) { + inject_entry($ctx, $url_path, $ent); + } + } else { + warn "E: `${\$ibx->eidx_key}' not indexed by $ALL->{topdir}\n"; + # do not use slow path for global manifest since + # it can become catastrophically slow. per-inbox manifest + # is not too bad with dozens of epochs, so never fail that: + slow_manifest_add($ctx, $ibx) if $ibx == $ctx->{ibx}; + } +} + +sub response { + my ($class, $ctx) = @_; + bless $ctx, $class; + my ($re, undef) = $ctx->url_filter; + $re // return psgi_triple($ctx); + my $iter = PublicInbox::ConfigIter->new($ctx->{www}->{pi_cfg}, + $ctx->can('list_match_i'), $re, $ctx); + sub { + $ctx->{-wcb} = $_[0]; # HTTP server callback + ($ctx->{www}->{pi_cfg}->ALL || + !$ctx->{env}->{'pi-httpd.async'}) ? + $iter->each_section : $iter->event_step; + } +} + +sub ibx_entry { + my ($ctx, $ibx) = @_; + my $ALL = $ctx->{www}->{pi_cfg}->ALL; + $ALL ? eidx_manifest_add($ctx, $ALL, $ibx) : + slow_manifest_add($ctx, $ibx); +} + +sub hide_key { 'manifest' } # for WwwListing->list_match_i -# overrides WwwListing->psgi_triple sub psgi_triple { my ($ctx) = @_; my $abs2urlpath = delete($ctx->{-abs2urlpath}) // {}; @@ -129,7 +109,13 @@ sub psgi_triple { gzip(\$manifest => \(my $out)); [ 200, [ qw(Content-Type application/gzip), 'Last-Modified', time2str($ctx->{-mtime}), - 'Content-Length', bytes::length($out) ], [ $out ] ] + 'Content-Length', length($out) ], [ $out ] ] +} + +sub per_inbox { + my ($ctx) = @_; + ibx_entry($ctx, $ctx->{ibx}); + psgi_triple($ctx); } 1; diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index 47025891..17893a09 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -1,10 +1,10 @@ -# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Streaming interface for mboxrd HTTP responses # See PublicInbox::GzipFilter for details. package PublicInbox::Mbox; -use strict; +use v5.12; use parent 'PublicInbox::GzipFilter'; use PublicInbox::MID qw/mid_escape/; use PublicInbox::Hval qw/to_filename/; @@ -17,15 +17,12 @@ use PublicInbox::Eml; sub getline { my ($ctx) = @_; # ctx my $smsg = $ctx->{smsg} or return; - my $ibx = $ctx->{-inbox}; - my $eml = $ibx->smsg_eml($smsg) or return; - my $n = $ctx->{smsg} = $ibx->over->next_by_mid(@{$ctx->{next_arg}}); - $ctx->zmore(msg_hdr($ctx, $eml, $smsg->{mid})); - if ($n) { - $ctx->translate(msg_body($eml)); + my $ibx = $ctx->{ibx}; + my $eml = delete($ctx->{eml}) // $ibx->smsg_eml($smsg) // return; + if (($ctx->{smsg} = $ibx->over->next_by_mid(@{$ctx->{next_arg}}))) { + $ctx->translate(msg_hdr($ctx, $eml), msg_body($eml)); } else { # last message - $ctx->zmore(msg_body($eml)); - $ctx->zflush; + $ctx->zflush(msg_hdr($ctx, $eml), msg_body($eml)); } } @@ -34,8 +31,8 @@ sub async_next { my ($http) = @_; # PublicInbox::HTTP my $ctx = $http->{forward} or return; # client aborted eval { - my $smsg = $ctx->{smsg} or return $ctx->close; - $ctx->smsg_blob($smsg); + my $smsg = $ctx->{smsg} // return $ctx->close; + $ctx->smsg_blob($smsg) if $smsg; }; warn "E: $@" if $@; } @@ -44,19 +41,19 @@ sub async_eml { # for async_blob_cb my ($ctx, $eml) = @_; my $smsg = delete $ctx->{smsg}; # next message - $ctx->{smsg} = $ctx->{-inbox}->over->next_by_mid(@{$ctx->{next_arg}}); - - $ctx->zmore(msg_hdr($ctx, $eml, $smsg->{mid})); - $ctx->{http_out}->write($ctx->translate(msg_body($eml))); + $ctx->{smsg} = $ctx->{ibx}->over->next_by_mid(@{$ctx->{next_arg}}); + local $ctx->{eml} = $eml; # for mbox_hdr + $ctx->write(msg_hdr($ctx, $eml), msg_body($eml)); } -sub res_hdr ($$) { - my ($ctx, $subject) = @_; - my $fn = $subject // ''; +sub mbox_hdr ($) { + my ($ctx) = @_; + my $eml = $ctx->{eml} //= $ctx->{ibx}->smsg_eml($ctx->{smsg}); + my $fn = $eml->header_str('Subject') // ''; $fn =~ s/^re:\s+//i; $fn = to_filename($fn) // 'no-subject'; my @hdr = ('Content-Type'); - if ($ctx->{-inbox}->{obfuscate}) { + if ($ctx->{ibx}->{obfuscate}) { # obfuscation is stupid, but maybe scrapers are, too... push @hdr, 'application/mbox'; $fn .= '.mbox'; @@ -64,69 +61,44 @@ sub res_hdr ($$) { push @hdr, 'text/plain'; $fn .= '.txt'; } + my $cs = $ctx->{eml}->ct->{attributes}->{charset} // 'UTF-8'; + $cs = 'UTF-8' if $cs =~ /[^a-zA-Z0-9\-\_]/; # avoid header injection + $hdr[-1] .= "; charset=$cs"; push @hdr, 'Content-Disposition', "inline; filename=$fn"; - \@hdr; + [ 200, \@hdr ]; } # for rare cases where v1 inboxes aren't indexed w/ ->over at all sub no_over_raw ($) { my ($ctx) = @_; - my $mref = $ctx->{-inbox}->msg_by_mid($ctx->{mid}) or return; - my $eml = PublicInbox::Eml->new($mref); - [ 200, res_hdr($ctx, $eml->header_str('Subject')), - [ msg_hdr($ctx, $eml, $ctx->{mid}) . msg_body($eml) ] ] + my $mref = $ctx->{ibx}->msg_by_mid($ctx->{mid}) or return; + my $eml = $ctx->{eml} = PublicInbox::Eml->new($mref); + [ @{mbox_hdr($ctx)}, [ msg_hdr($ctx, $eml) . msg_body($eml) ] ] } # /$INBOX/$MESSAGE_ID/raw sub emit_raw { my ($ctx) = @_; - $ctx->{base_url} = $ctx->{-inbox}->base_url($ctx->{env}); - my $over = $ctx->{-inbox}->over or return no_over_raw($ctx); + my $over = $ctx->{ibx}->over or return no_over_raw($ctx); my ($id, $prev); my $mip = $ctx->{next_arg} = [ $ctx->{mid}, \$id, \$prev ]; my $smsg = $ctx->{smsg} = $over->next_by_mid(@$mip) or return; - my $res_hdr = res_hdr($ctx, $smsg->{subject}); bless $ctx, __PACKAGE__; - $ctx->psgi_response(200, $res_hdr); + $ctx->psgi_response(\&mbox_hdr); } -sub msg_hdr ($$;$) { - my ($ctx, $eml, $mid) = @_; - my $header_obj = $eml->header_obj; +sub msg_hdr ($$) { + my ($ctx, $eml) = @_; - # drop potentially confusing headers, ssoma already should've dropped - # Lines and Content-Length - foreach my $d (qw(Lines Bytes Content-Length Status)) { - $header_obj->header_set($d); + # drop potentially confusing headers, various importers should've + # already dropped these, but we can't trust stuff we've cloned + for my $d (qw(Lines Bytes Content-Length Status)) { + $eml->header_set($d); } - my $ibx = $ctx->{-inbox}; - my $base = $ctx->{base_url}; - $mid = $ctx->{mid} unless defined $mid; - $mid = mid_escape($mid); - my @append = ( - 'Archived-At', "<$base$mid/>", - 'List-Archive', "<$base>", - 'List-Post', "<mailto:$ibx->{-primary_address}>", - ); - my $crlf = $header_obj->crlf; - my $buf = $header_obj->as_string; - # fixup old bug from import (pre-a0c07cba0e5d8b6a) - $buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; - $buf = "From mboxrd\@z Thu Jan 1 00:00:00 1970" . $crlf . $buf; - - for (my $i = 0; $i < @append; $i += 2) { - my $k = $append[$i]; - my $v = $append[$i + 1]; - my @v = $header_obj->header_raw($k); - foreach (@v) { - if ($v eq $_) { - $v = undef; - last; - } - } - $buf .= "$k: $v$crlf" if defined $v; - } - $buf .= $crlf; + my $crlf = $eml->crlf; + my $buf = $eml->header_obj->as_string; + PublicInbox::Eml::strip_from($buf); + "From mboxrd\@z Thu Jan 1 00:00:00 1970" . $crlf . $buf . $crlf; } sub msg_body ($) { @@ -147,8 +119,9 @@ sub thread_cb { return $smsg; } # refill result set - $ctx->{msgs} = $msgs = $ctx->{over}->get_thread($ctx->{mid}, - $ctx->{prev}); + my $over = $ctx->{ibx}->over or return $ctx->gone('over'); + $ctx->{msgs} = $msgs = $over->get_thread($ctx->{mid}, + $ctx->{prev}); return unless @$msgs; $ctx->{prev} = $msgs->[-1]; } @@ -159,7 +132,6 @@ sub thread_mbox { my $msgs = $ctx->{msgs} = $over->get_thread($ctx->{mid}, {}); return [404, [qw(Content-Type text/plain)], []] if !@$msgs; $ctx->{prev} = $msgs->[-1]; - $ctx->{over} = $over; # bump refcnt require PublicInbox::MboxGz; PublicInbox::MboxGz::mbox_gz($ctx, \&thread_cb, $msgs->[0]->{subject}); } @@ -178,97 +150,124 @@ sub emit_range { sub all_ids_cb { my ($ctx) = @_; + my $over = $ctx->{ibx}->over or return $ctx->gone('over'); my $ids = $ctx->{ids}; do { while ((my $num = shift @$ids)) { - my $smsg = $ctx->{over}->get_art($num) or next; + my $smsg = $over->get_art($num) or next; return $smsg; } - $ctx->{ids} = $ids = $ctx->{mm}->ids_after(\($ctx->{prev})); + $ctx->{ids} = $ids = $over->ids_after(\($ctx->{prev})); } while (@$ids); + undef; } sub mbox_all_ids { my ($ctx) = @_; - my $ibx = $ctx->{-inbox}; my $prev = 0; - my $mm = $ctx->{mm} = $ibx->mm; - my $ids = $mm->ids_after(\$prev) or return - [404, [qw(Content-Type text/plain)], ["No results found\n"]]; - $ctx->{over} = $ibx->over or + my $over = $ctx->{ibx}->over or return PublicInbox::WWW::need($ctx, 'Overview'); + my $ids = $over->ids_after(\$prev) or return + [404, [qw(Content-Type text/plain)], ["No results found\n"]]; $ctx->{ids} = $ids; $ctx->{prev} = $prev; + $ctx->{-low_prio} = 1; require PublicInbox::MboxGz; PublicInbox::MboxGz::mbox_gz($ctx, \&all_ids_cb, 'all'); } -sub results_cb { - my ($ctx) = @_; - my $over = $ctx->{-inbox}->over or return; - while (1) { - while (defined(my $num = shift(@{$ctx->{ids}}))) { - my $smsg = $over->get_art($num) or next; - return $smsg; - } - # refill result set - my $srch = $ctx->{-inbox}->search(undef, $ctx) or return; - my $mset = $srch->mset($ctx->{query}, $ctx->{qopts}); - my $size = $mset->size or return; - $ctx->{qopts}->{offset} += $size; - $ctx->{ids} = $srch->mset_to_artnums($mset); +my $refill_ids_cb = sub { # async_mset cb + my ($ctx, $http, $mset, $err) = @_; + $http = undef unless $ctx->{-really_async}; + if ($err) { + warn "E: $err"; + $ctx->close if $http; # our async httpd + return; } -} - -sub results_thread_cb { - my ($ctx) = @_; + # refill result set, deprioritize since there's many results + my $size = $mset->size or do { + $ctx->close if $http; + $ctx->{-mbox_done} = 1; + return; + }; + $ctx->{qopts}->{offset} += $size; + $ctx->{ids} = $ctx->{srch}->mset_to_artnums($mset, $ctx->{qopts}); + $ctx->{-low_prio} = 1; # true + return if !$http; + eval { + my $smsg = results_cb($ctx) // return $ctx->close; + return if !$smsg; # '' wait for async_mset + $ctx->smsg_blob($ctx->{smsg} = $smsg); + }; + warn "E: $@" if $@; +}; - my $over = $ctx->{-inbox}->over or return; +sub results_cb { # async_next or MboxGz->getline cb + my ($ctx, $http) = @_; + my $over = $ctx->{ibx}->over or return $ctx->gone('over'); while (1) { - while (defined(my $num = shift(@{$ctx->{xids}}))) { + my $ids = $ctx->{xids} // $ctx->{ids}; + while (defined(my $num = shift(@$ids))) { my $smsg = $over->get_art($num) or next; return $smsg; } - - # refills ctx->{xids} - next if $over->expand_thread($ctx); - - # refill result set - my $srch = $ctx->{-inbox}->search(undef, $ctx) or return; - my $mset = $srch->mset($ctx->{query}, $ctx->{qopts}); - my $size = $mset->size or return; - $ctx->{qopts}->{offset} += $size; - $ctx->{ids} = $srch->mset_to_artnums($mset); + next if $ctx->{xids} && $over->expand_thread($ctx); + return '' if $ctx->{srch}->async_mset(@$ctx{qw(query qopts)}, + $refill_ids_cb, $ctx, $http); + return if $ctx->{-mbox_done}; } +} +sub mbox_qry_cb { # async_mset cb + my ($ctx, $q, $mset, $err) = @_; + my $wcb = delete $ctx->{wcb}; + if ($err) { + warn "E: $err"; + return $wcb->([500, [qw(Content-Type text/plain)], + [ "Internal server error\n" ]]) + } + $ctx->{qopts}->{offset} = $mset->size or + return $wcb->([404, [qw(Content-Type text/plain)], + ["No results found\n"]]); + $ctx->{ids} = $ctx->{srch}->mset_to_artnums($mset, $ctx->{qopts}); + my $fn; + if ($q->{t} && $ctx->{srch}->has_threadid) { + $ctx->{xids} = []; # triggers over->expand_thread + $fn = "results-thread-$ctx->{query}"; + } else { + $fn = "results-$ctx->{query}"; + } + require PublicInbox::MboxGz; + my $res = PublicInbox::MboxGz::mbox_gz($ctx, \&results_cb, $fn); + ref($res) eq 'CODE' ? $res->($wcb) : $wcb->($res); } sub mbox_all { my ($ctx, $q) = @_; - my $q_string = $q->{'q'}; - return mbox_all_ids($ctx) if $q_string !~ /\S/; - my $srch = $ctx->{-inbox}->search or + my $qstr = $q->{'q'}; + return mbox_all_ids($ctx) if $qstr !~ /\S/; + my $srch = $ctx->{srch} = $ctx->{ibx}->isrch or return PublicInbox::WWW::need($ctx, 'Search'); - my $over = $ctx->{-inbox}->over or - return PublicInbox::WWW::need($ctx, 'Overview'); + my $opt = $ctx->{qopts} = { relevance => -2 }; # ORDER BY docid DESC - my $qopts = $ctx->{qopts} = { mset => 2 }; # order by docid - $qopts->{thread} = 1 if $q->{t}; - my $mset = $srch->mset($q_string, $qopts); - $qopts->{offset} = $mset->size or - return [404, [qw(Content-Type text/plain)], - ["No results found\n"]]; - $ctx->{query} = $q_string; - $ctx->{ids} = $srch->mset_to_artnums($mset); - require PublicInbox::MboxGz; - my $fn; - if ($q->{t} && $srch->has_threadid) { - $fn = 'results-thread-'.$q_string; - PublicInbox::MboxGz::mbox_gz($ctx, \&results_thread_cb, $fn); - } else { - $fn = 'results-'.$q_string; - PublicInbox::MboxGz::mbox_gz($ctx, \&results_cb, $fn); + # {threadid} limits results to a given thread + # {threads} collapses results from messages in the same thread, + # allowing us to use ->expand_thread w/o duplicates in our own code + if (defined($ctx->{mid})) { + my $over = ($ctx->{ibx}->{isrch} ? + $ctx->{ibx}->{isrch}->{es}->over : + $ctx->{ibx}->over) or + return PublicInbox::WWW::need($ctx, 'Overview'); + $opt->{threadid} = $over->mid2tid($ctx->{mid}); } + $opt->{threads} = 1 if $q->{t}; + $srch->query_approxidate($ctx->{ibx}->git, $qstr); + $ctx->{query} = $qstr; + sub { # called by PSGI server + $ctx->{wcb} = $_[0]; # PSGI server supplied write cb + $srch->async_mset($qstr, $opt, \&mbox_qry_cb, $ctx, $q) and + $ctx->{-really_async} = 1; + }; } 1; diff --git a/lib/PublicInbox/MboxGz.pm b/lib/PublicInbox/MboxGz.pm index 913be6e4..90e69c09 100644 --- a/lib/PublicInbox/MboxGz.pm +++ b/lib/PublicInbox/MboxGz.pm @@ -1,7 +1,7 @@ -# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::MboxGz; -use strict; +use v5.12; use parent 'PublicInbox::GzipFilter'; use PublicInbox::Eml; use PublicInbox::Hval qw/to_filename/; @@ -13,8 +13,8 @@ sub async_next ($) { my ($http) = @_; # PublicInbox::HTTP my $ctx = $http->{forward} or return; eval { - $ctx->{smsg} = $ctx->{cb}->($ctx) or return $ctx->close; - $ctx->smsg_blob($ctx->{smsg}); + my $smsg = $ctx->{cb}->($ctx, $http) // return $ctx->close; + $smsg and $ctx->smsg_blob($ctx->{smsg} = $smsg); }; warn "E: $@" if $@; } @@ -22,7 +22,6 @@ sub async_next ($) { sub mbox_gz { my ($self, $cb, $fn) = @_; $self->{cb} = $cb; - $self->{base_url} = $self->{-inbox}->base_url($self->{env}); $self->{gz} = PublicInbox::GzipFilter::gzip_or_die(); $fn = to_filename($fn // '') // 'no-subject'; # http://www.iana.org/assignments/media-types/application/gzip @@ -37,9 +36,8 @@ sub getline { my ($self) = @_; my $cb = $self->{cb} or return; while (my $smsg = $cb->($self)) { - my $eml = $self->{-inbox}->smsg_eml($smsg) or next; - $self->zmore(msg_hdr($self, $eml, $smsg->{mid})); - return $self->translate(msg_body($eml)); + my $eml = $self->{ibx}->smsg_eml($smsg) or next; + return $self->translate(msg_hdr($self, $eml), msg_body($eml)); } # signal that we're done and can return undef next call: delete $self->{cb}; diff --git a/lib/PublicInbox/MboxLock.pm b/lib/PublicInbox/MboxLock.pm new file mode 100644 index 00000000..5e373873 --- /dev/null +++ b/lib/PublicInbox/MboxLock.pm @@ -0,0 +1,134 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Various mbox locking methods +package PublicInbox::MboxLock; +use v5.12; +use PublicInbox::OnDestroy (); +use Fcntl qw(:flock F_SETLK F_SETLKW F_RDLCK F_WRLCK + O_CREAT O_EXCL O_WRONLY SEEK_SET); +use Carp qw(croak); +use PublicInbox::DS qw(now); # ugh... +use autodie qw(chdir opendir unlink); + +our $TMPL = do { + if ($^O eq 'linux') { + \'s @32' + } elsif ($^O =~ /bsd/ || $^O eq 'dragonfly') { + \'@20 s @256' # n.b. @32 may be enough... + } else { + eval { require File::FcntlLock; 1 } + } +}; + +# This order matches Debian policy on Linux systems. +# See policy/ch-customized-programs.rst in +# https://salsa.debian.org/dbnpolicy/policy.git +sub defaults { [ qw(fcntl dotlock) ] } + +sub acq_fcntl { + my ($self) = @_; + my $op = $self->{nb} ? F_SETLK : F_SETLKW; + my $t = $self->{rw} ? F_WRLCK : F_RDLCK; + my $end = now + $self->{timeout}; + $TMPL or die <<EOF; +"struct flock" layout not available on $^O, install File::FcntlLock? +EOF + do { + if (ref $TMPL) { + return if fcntl($self->{fh}, $op, pack($$TMPL, $t)); + } else { + my $fl = File::FcntlLock->new; + $fl->l_type($t); + $fl->l_whence(SEEK_SET); + $fl->l_start(0); + $fl->l_len(0); + return if $fl->lock($self->{fh}, $op); + } + select(undef, undef, undef, $self->{delay}); + } while (now < $end); + die "fcntl lock timeout $self->{f}: $!\n"; +} + +sub acq_dotlock { + my ($self) = @_; + my $dot_lock = "$self->{f}.lock"; + my ($pfx, $base) = ($self->{f} =~ m!(\A.*?/)?([^/]+)\z!); + $pfx //= ''; + my $pid = $$; + my $end = now + $self->{timeout}; + do { + my $tmp = "$pfx.$base-".sprintf('%x,%x,%x', + rand(0xffffffff), $pid, time); + if (sysopen(my $fh, $tmp, O_CREAT|O_EXCL|O_WRONLY)) { + if (link($tmp, $dot_lock)) { + unlink($tmp); + $self->{".lock$pid"} = $dot_lock; + substr($dot_lock, 0, 1) eq '/' or + opendir($self->{dh}, '.'); + return; + } + unlink($tmp); + select(undef, undef, undef, $self->{delay}); + } else { + croak "open $tmp (for $dot_lock): $!" if !$!{EXIST}; + } + } while (now < $end); + die "dotlock timeout $dot_lock\n"; +} + +sub acq_flock { + my ($self) = @_; + my $op = $self->{rw} ? LOCK_EX : LOCK_SH; + $op |= LOCK_NB if $self->{nb}; + my $end = now + $self->{timeout}; + do { + return if flock($self->{fh}, $op); + if ($!{EWOULDBLOCK}) { + select(undef, undef, undef, $self->{delay}); + } elsif (!$!{EINTR}) { + croak "flock($self->{f} ($self->{fh}): $!"; + } + } while (now < $end); + die "flock timeout $self->{f}: $!\n"; +} + +sub acq { + my ($cls, $f, $rw, $methods) = @_; + my $self = bless { f => $f, rw => $rw }, $cls; + my $ok = open $self->{fh}, $rw ? '+>>' : '<', $f; + croak "open($f): $!" if !$ok && ($rw || !$!{ENOENT}); + my $m = "@$methods"; + if ($m ne 'none') { + my @m = map { + if (/\A(timeout|delay)=([0-9\.]+)s?\z/) { + $self->{$1} = $2 + 0; + (); + } else { + $cls->can("acq_$_") // $_ + } + } split(/[, ]/, $m); + my @bad = grep { !ref } @m; + croak "Unsupported lock methods: @bad\n" if @bad; + croak "No lock methods supplied with $m\n" if !@m; + $self->{nb} = $#m || defined($self->{timeout}); + $self->{delay} //= 0.1; + $self->{timeout} //= 5; + $_->($self) for @m; + } + $self; +} + +sub DESTROY { + my ($self) = @_; + my $f = $self->{".lock$$"} or return; + my $od; + if (my $dh = delete $self->{dh}) { + opendir my $c, '.'; + $od = PublicInbox::OnDestroy::all \&chdir, $c; + chdir($dh); + } + CORE::unlink($f) or die "unlink($f): $! (lock stolen?)"; +} + +1; diff --git a/lib/PublicInbox/MboxReader.pm b/lib/PublicInbox/MboxReader.pm new file mode 100644 index 00000000..3d78ca23 --- /dev/null +++ b/lib/PublicInbox/MboxReader.pm @@ -0,0 +1,188 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# reader for mbox variants we support (and also sets up commands for writing) +package PublicInbox::MboxReader; +use strict; +use v5.10.1; # check regexps before v5.12 +use Data::Dumper; +$Data::Dumper::Useqq = 1; # should've been the default, for bad data + +my $from_strict = + qr/^From \S+ +\S+ \S+ +\S+ [^\n:]+:[^\n:]+:[^\n:]+ [^\n:]+\n/sm; + +# cf: https://doc.dovecot.org/configuration_manual/mail_location/mbox/ +my %status2kw = (F => 'flagged', A => 'answered', R => 'seen', T => 'draft'); +# O (old/non-recent), and D (deleted) aren't in JMAP, +# so probably won't be supported by us. +sub mbox_keywords { + my $eml = $_[-1]; + my $s = "@{[$eml->header_raw('X-Status'),$eml->header_raw('Status')]}"; + my %kw; + $s =~ s/([FART])/$kw{$status2kw{$1}} = 1/sge; + [ sort(keys %kw) ]; +} + +sub _mbox_from { + my ($mbfh, $from_re, $eml_cb, @arg) = @_; + my $buf = ''; + my @raw; + while (defined(my $r = read($mbfh, $buf, 65536, length($buf)))) { + if ($r == 0) { # close here to check for "curl --fail" + $mbfh->close or die "error closing mbox: \$?=$? $!"; + @raw = ($buf); + } else { + @raw = split(/$from_strict/mos, $buf, -1); + next if scalar(@raw) == 0; + $buf = pop(@raw); # last bit may be incomplete + } + @raw = grep /[^ \t\r\n]/s, @raw; # skip empty messages + while (defined(my $raw = shift @raw)) { + $raw =~ s/^\r?\n\z//ms; + $raw =~ s/$from_re/$1/gms; + my $eml = PublicInbox::Eml->new(\$raw); + $eml_cb->($eml, @arg) if $eml->raw_size; + } + return if $r == 0; # EOF + } + die "error reading mboxo/mboxrd handle: $!"; +} + +sub mboxrd { + my (undef, $mbfh, $eml_cb, @arg) = @_; + _mbox_from($mbfh, qr/^>(>*From )/ms, $eml_cb, @arg); +} + +sub mboxo { + my (undef, $mbfh, $eml_cb, @arg) = @_; + _mbox_from($mbfh, qr/^>(From )/ms, $eml_cb, @arg); +} + +sub _cl_body { + my ($mbfh, $bref, $cl) = @_; + my $body = substr($$bref, 0, $cl, ''); + my $need = $cl - length($body); + if ($need > 0) { + $mbfh or die "E: needed $need bytes after EOF"; + defined(my $r = read($mbfh, $body, $need, length($body))) or + die "E: read error: $!\n"; + $r == $need or die "E: read $r of $need bytes\n"; + } + \$body; +} + +sub _extract_hdr { + my ($ref) = @_; + if (index($$ref, "\r\n") < 0 && (my $pos = index($$ref, "\n\n")) >= 0) { + # likely on *nix + \substr($$ref, 0, $pos + 2, ''); # sv_chop on $$ref + } elsif ($$ref =~ /\r?\n\r?\n/s) { + \substr($$ref, 0, $+[0], ''); # sv_chop on $$ref + } else { + undef + } +} + +sub _mbox_cl ($$$;@) { + my ($mbfh, $uxs_from, $eml_cb, @arg) = @_; + my $buf = ''; + while (defined(my $r = read($mbfh, $buf, 65536, length($buf)))) { + if ($r == 0) { # detect "curl --fail" + $mbfh->close or + die "error closing mboxcl/mboxcl2: \$?=$? $!"; + undef $mbfh; + } + while (my $hdr = _extract_hdr(\$buf)) { + PublicInbox::Eml::strip_from($$hdr) or + die "E: no 'From ' line in:\n", Dumper($hdr); + my $eml = PublicInbox::Eml->new($hdr); + next unless $eml->raw_size; + my @cl = $eml->header_raw('Content-Length'); + my $n = scalar(@cl); + $n == 0 and die "E: Content-Length missing in:\n", + Dumper($eml->as_string); + $n == 1 or die "E: multiple ($n) Content-Length in:\n", + Dumper($eml->as_string); + $cl[0] =~ /\A[0-9]+\z/ or die + "E: Content-Length `$cl[0]' invalid\n", + Dumper($eml->as_string); + if (($eml->{bdy} = _cl_body($mbfh, \$buf, $cl[0]))) { + $uxs_from and + ${$eml->{bdy}} =~ s/^>From /From /sgm; + } + $eml_cb->($eml, @arg); + } + if ($r == 0) { + $buf =~ /[^ \r\n\t]/ and + warn "W: leftover at end of mboxcl/mboxcl2:\n", + Dumper(\$buf); + return; + } + } + die "error reading mboxcl/mboxcl2 handle: $!"; +} + +sub mboxcl { + my (undef, $mbfh, $eml_cb, @arg) = @_; + _mbox_cl($mbfh, 1, $eml_cb, @arg); +} + +sub mboxcl2 { + my (undef, $mbfh, $eml_cb, @arg) = @_; + _mbox_cl($mbfh, undef, $eml_cb, @arg); +} + +sub new { bless \(my $x), __PACKAGE__ } + +sub reads { + my $ifmt = $_[-1]; + $ifmt =~ /\Ambox(?:rd|cl|cl2|o)\z/ ? __PACKAGE__->can($ifmt) : undef +} + +# all of these support -c for stdout and -d for decompression, +# mutt is commonly distributed with hooks for gz, bz2 and xz, at least +# { foo => '' } means "--foo" is passed to the command-line +my %zsfx2cmd = ( + gz => [ qw(GZIP pigz gzip), { rsyncable => '' } ], + bz2 => [ 'bzip2', {} ], + xz => [ 'xz', {} ], + # don't add new entries here unless MUA support is widely available +); + +sub zsfx ($) { + my ($pathname) = @_; + my $allow = join('|', keys %zsfx2cmd); + $pathname =~ /\.($allow)\z/ ? $1 : undef; +} + +sub zsfx2cmd ($$$) { + my ($zsfx, $decompress, $lei) = @_; + my $x = $zsfx2cmd{$zsfx} // die "BUG: no support for suffix=.$zsfx"; + my @info = @$x; + my $cmd_opt = ref($info[-1]) ? pop(@info) : undef; + my @cmd = (undef, $decompress ? qw(-dc) : qw(-c)); + require PublicInbox::Spawn; + for my $exe (@info) { + # I think respecting client's ENV{GZIP} is OK, not sure + # about ENV overrides for other, less-common compressors + if ($exe eq uc($exe)) { + $exe = $lei->{env}->{$exe} or next; + } + $cmd[0] = PublicInbox::Spawn::which($exe) and last; + } + $cmd[0] // die join(' or ', @info)." missing for .$zsfx"; + + # only for --rsyncable. TODO: support compression level? + for my $key (keys %$cmd_opt) { + push @cmd, '--'.$key if $lei->{opt}->{$key}; + } + \@cmd; +} + +sub zsfxcat ($$$) { + my ($in, $zsfx, $lei) = @_; + my $cmd = zsfx2cmd($zsfx, 1, $lei); + PublicInbox::Spawn::popen_rd($cmd, undef, { 0 => $in, 2 => $lei->{2} }); +} + +1; diff --git a/lib/PublicInbox/MdirReader.pm b/lib/PublicInbox/MdirReader.pm new file mode 100644 index 00000000..2981b058 --- /dev/null +++ b/lib/PublicInbox/MdirReader.pm @@ -0,0 +1,108 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Maildirs only (PublicInbox::MHreader exists, now) +# ref: https://cr.yp.to/proto/maildir.html +# https://wiki2.dovecot.org/MailboxFormat/Maildir +package PublicInbox::MdirReader; +use strict; +use v5.10.1; +use PublicInbox::InboxWritable qw(eml_from_path); +use PublicInbox::SHA qw(sha256_hex); + +# returns Maildir flags from a basename ('' for no flags, undef for invalid) +sub maildir_basename_flags { + my (@f) = split(/:/, $_[0], -1); + return if (scalar(@f) > 2 || substr($f[0], 0, 1) eq '.'); + $f[1] // return ''; # "new" + $f[1] =~ /\A2,([A-Za-z]*)\z/ ? $1 : undef; # "cur" +} + +# same as above, but for full path name +sub maildir_path_flags { + my ($f) = @_; + my $i = rindex($f, '/'); + $i >= 0 ? maildir_basename_flags(substr($f, $i + 1)) : undef; +} + +sub shard_ok ($$$) { + my ($bn, $mod, $shard) = @_; + # can't get dirent.d_ino w/ pure Perl readdir, so we extract + # the OID if it looks like one instead of doing stat(2) + my $hex = $bn =~ m!\A([a-f0-9]{40,})! ? $1 : sha256_hex($bn); + my $recno = hex(substr($hex, 0, 8)); + ($recno % $mod) == $shard; +} + +sub maildir_each_file { + my ($self, $dir, $cb, @arg) = @_; + $dir .= '/' unless substr($dir, -1) eq '/'; + my ($mod, $shard) = @{$self->{shard_info} // []}; + for my $d (qw(new/ cur/)) { + my $pfx = $dir.$d; + opendir my $dh, $pfx or next; + while (defined(my $bn = readdir($dh))) { + my $fl = maildir_basename_flags($bn) // next; + next if defined($mod) && !shard_ok($bn, $mod, $shard); + next if index($fl, 'T') >= 0; # no Trashed messages + $cb->($pfx.$bn, $fl, @arg); + } + } +} + +my %c2kw = ('D' => 'draft', F => 'flagged', P => 'forwarded', + R => 'answered', S => 'seen'); + +sub maildir_each_eml { + my ($self, $dir, $cb, @arg) = @_; + $dir .= '/' unless substr($dir, -1) eq '/'; + my ($mod, $shard) = @{$self->{shard_info} // []}; + my $pfx = $dir . 'new/'; + if (opendir(my $dh, $pfx)) { + while (defined(my $bn = readdir($dh))) { + next if substr($bn, 0, 1) eq '.'; + my @f = split(/:/, $bn, -1); + + # mbsync and offlineimap both use "2," in "new/" + next if ($f[1] // '2,') ne '2,' || defined($f[2]); + + next if defined($mod) && !shard_ok($bn, $mod, $shard); + my $f = $pfx.$bn; + my $eml = eml_from_path($f) or next; + $cb->($f, [], $eml, @arg); + } + } + $pfx = $dir . 'cur/'; + opendir my $dh, $pfx or return; + while (defined(my $bn = readdir($dh))) { + my $fl = maildir_basename_flags($bn) // next; + next if index($fl, 'T') >= 0; + next if defined($mod) && !shard_ok($bn, $mod, $shard); + my $f = $pfx.$bn; + my $eml = eml_from_path($f) or next; + my @kw = sort(map { $c2kw{$_} // () } split(//, $fl)); + $cb->($f, \@kw, $eml, @arg); + } +} + +sub new { bless {}, __PACKAGE__ } + +sub flags2kw ($) { + if (wantarray) { + my @unknown; + my %kw; + for (split(//, $_[0])) { + my $k = $c2kw{$_}; + if (defined($k)) { + $kw{$k} = 1; + } else { + push @unknown, $_; + } + } + (\%kw, \@unknown); + } else { + [ sort(map { $c2kw{$_} // () } split(//, $_[0])) ]; + } +} + +1; diff --git a/lib/PublicInbox/MdirSort.pm b/lib/PublicInbox/MdirSort.pm new file mode 100644 index 00000000..6bd9fb6c --- /dev/null +++ b/lib/PublicInbox/MdirSort.pm @@ -0,0 +1,46 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# used for sorting MH (and (TODO) Maildir) names +# TODO: consider sort(1) to parallelize sorting of gigantic directories +package PublicInbox::MdirSort; +use v5.12; +use Time::HiRes (); +use parent qw(Exporter); +use Fcntl qw(S_ISREG); +our @EXPORT = qw(mdir_sort); +my %ST = (sequence => 0, size => 1, atime => 2, mtime => 3, ctime => 4); + +sub mdir_sort ($$;$) { + my ($ent, $sort, $max) = @_; + my @st; + my @ent = map { + @st = Time::HiRes::stat $_; + # name, size, {a,m,c}time + S_ISREG($st[2]) ? [ $_, @st[7..10] ] : (); + } @$ent; + @ent = grep { $_->[1] <= $max } @ent if $max; + use sort 'stable'; + for my $s (@$sort) { + if ($s =~ /\A(\-|\+|)name\z/) { + if ($1 eq '-') { + @ent = sort { $b->[0] cmp $a->[0] } @ent; + } else { + @ent = sort { $a->[0] cmp $b->[0] } @ent; + } + } elsif ($s =~ /\A(\-|\+|) + (sequence|size|ctime|mtime|atime)\z/x) { + my $key = $ST{$2}; + if ($1 eq '-') { + @ent = sort { $b->[$key] <=> $a->[$key] } @ent; + } else { + @ent = sort { $a->[$key] <=> $b->[$key] } @ent; + } + } else { + die "E: unrecognized sort parameter: `$s'"; + } + } + @$ent = map { $_->[0] } @ent; +} + +1; diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm new file mode 100644 index 00000000..6708527d --- /dev/null +++ b/lib/PublicInbox/MiscIdx.pm @@ -0,0 +1,161 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# like PublicInbox::SearchIdx, but for searching for non-mail messages. +# Things indexed include: +# * inboxes themselves +# * epoch information +# * (maybe) git code repository information (not commits) +# Expect ~100K-1M documents with no parallelism opportunities, +# so no sharding, here. +# +# See MiscSearch for read-only counterpart +package PublicInbox::MiscIdx; +use strict; +use v5.10.1; +use PublicInbox::InboxWritable; +use PublicInbox::Search; # for SWIG Xapian and Search::Xapian compat +use PublicInbox::SearchIdx qw(index_text term_generator add_val); +use Carp qw(croak); +use File::Path (); +use PublicInbox::MiscSearch; +use PublicInbox::Config; +use PublicInbox::Syscall; +my $json; + +sub new { + my ($class, $eidx) = @_; + PublicInbox::SearchIdx::load_xapian_writable(); + my $mi_dir = "$eidx->{xpfx}/misc"; + File::Path::mkpath($mi_dir); + PublicInbox::Syscall::nodatacow_dir($mi_dir); + my $flags = $PublicInbox::SearchIdx::DB_CREATE_OR_OPEN; + $flags |= $PublicInbox::SearchIdx::DB_NO_SYNC if $eidx->{-no_fsync}; + $flags |= $PublicInbox::SearchIdx::DB_DANGEROUS if $eidx->{-dangerous}; + $json //= PublicInbox::Config::json(); + bless { + mi_dir => $mi_dir, + flags => $flags, + indexlevel => 'full', # small DB, no point in medium? + }, $class; +} + +sub _begin_txn ($) { + my ($self) = @_; + my $wdb = $PublicInbox::Search::X{WritableDatabase}; + my $xdb = eval { $wdb->new($self->{mi_dir}, $self->{flags}) }; + croak "Failed opening $self->{mi_dir}: $@" if $@; + $xdb->begin_transaction; + $xdb; +} + +sub commit_txn { + my ($self) = @_; + my $xdb = delete $self->{xdb} or return; + $xdb->commit_transaction; +} + +sub create_xdb { + my ($self) = @_; + $self->{xdb} //= _begin_txn($self); + commit_txn($self); +} + +sub remove_eidx_key { + my ($self, $eidx_key) = @_; + my $xdb = $self->{xdb} //= _begin_txn($self); + my $head = $xdb->postlist_begin('Q'.$eidx_key); + my $tail = $xdb->postlist_end('Q'.$eidx_key); + my @docids; # only one, unless we had bugs + for (; $head != $tail; $head++) { + push @docids, $head->get_docid; + } + for my $docid (@docids) { + $xdb->delete_document($docid); + warn "# remove inbox docid #$docid ($eidx_key)\n"; + } +} + +# adds or updates according to $eidx_key +sub index_ibx { + my ($self, $ibx) = @_; + my $eidx_key = $ibx->eidx_key; + my $xdb = $self->{xdb} //= _begin_txn($self); + # Q = uniQue in Xapian terminology + my $head = $xdb->postlist_begin('Q'.$eidx_key); + my $tail = $xdb->postlist_end('Q'.$eidx_key); + my ($docid, @drop); + for (; $head != $tail; $head++) { + if (defined $docid) { + my $i = $head->get_docid; + push @drop, $i; + warn <<EOF; +W: multiple inboxes keyed to `$eidx_key', deleting #$i +EOF + } else { + $docid = $head->get_docid; + } + } + $xdb->delete_document($_) for @drop; # just in case + + my $doc = $PublicInbox::Search::X{Document}->new; + term_generator($self)->set_document($doc); + + # allow sorting by modified and uidvalidity (created at) + add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified); + add_val($doc, $PublicInbox::MiscSearch::UIDVALIDITY, $ibx->uidvalidity); + + $doc->add_boolean_term('Q'.$eidx_key); # uniQue id + $doc->add_boolean_term('T'.'inbox'); # Type + + # force reread from disk, {description} could be loaded from {misc} + delete @$ibx{qw(-art_min -art_max description)}; + if (defined($ibx->{newsgroup}) && $ibx->nntp_usable) { + $doc->add_boolean_term('T'.'newsgroup'); # additional Type + my $n = $ibx->art_min; + add_val($doc, $PublicInbox::MiscSearch::ART_MIN, $n) if $n; + $n = $ibx->art_max; + add_val($doc, $PublicInbox::MiscSearch::ART_MAX, $n) if $n; + } + + my $desc = $ibx->description; + + # description = S/Subject (or title) + # address = A/Author + index_text($self, $desc, 1, 'S'); + index_text($self, $ibx->{name}, 1, 'XNAME'); + my %map = ( + address => 'A', + listid => 'XLISTID', + infourl => 'XINFOURL', + url => 'XURL' + ); + while (my ($f, $pfx) = each %map) { + for my $v (@{$ibx->{$f} // []}) { + index_text($self, $v, 1, $pfx); + } + } + my $data = {}; + if (defined(my $max = $ibx->max_git_epoch)) { # v2 + my $pfx = "/$ibx->{name}/git/"; + for my $epoch (0..$max) { + my $git = $ibx->git_epoch($epoch) or return; + if (my $ent = $git->manifest_entry($epoch, $desc)) { + $data->{"$pfx$epoch.git"} = $ent; + $ent->{git_dir} = $git->{git_dir}; + } + $git->cleanup; # ->modified starts cat-file --batch + } + } elsif (my $ent = $ibx->git->manifest_entry) { # v1 + $ent->{git_dir} = $ibx->{inboxdir}; + $data->{"/$ibx->{name}"} = $ent; + } + $doc->set_data($json->encode($data)); + if (defined $docid) { + $xdb->replace_document($docid, $doc); + } else { + $xdb->add_document($doc); + } +} + +1; diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm new file mode 100644 index 00000000..5fb47d03 --- /dev/null +++ b/lib/PublicInbox/MiscSearch.pm @@ -0,0 +1,154 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# read-only counterpart to MiscIdx +package PublicInbox::MiscSearch; +use strict; +use v5.10.1; +use PublicInbox::Search qw(retry_reopen int_val xap_terms); +my $json; + +# Xapian value columns: +our $MODIFIED = 0; +our $UIDVALIDITY = 1; # (created time) +our $ART_MIN = 2; # NNTP article number +our $ART_MAX = 3; # NNTP article number + +# avoid conflicting with message Search::prob_prefix for UI/UX reasons +my %PROB_PREFIX = ( + description => 'S', # $INBOX_DIR/description + address => 'A', + listid => 'XLISTID', + url => 'XURL', + infourl => 'XINFOURL', + name => 'XNAME', + '' => 'S A XLISTID XNAME XURL XINFOURL' +); + +sub new { + my ($class, $dir) = @_; + PublicInbox::Search::load_xapian(); + $json //= PublicInbox::Config::json(); + bless { + xdb => $PublicInbox::Search::X{Database}->new($dir) + }, $class; +} + +# read-only +sub mi_qp_new ($) { + my ($self) = @_; + my $xdb = $self->{xdb}; + my $qp = $PublicInbox::Search::X{QueryParser}->new; + $qp->set_default_op(PublicInbox::Search::OP_AND()); + $qp->set_database($xdb); + $qp->set_stemmer(PublicInbox::Search::stemmer($self)); + $qp->set_stemming_strategy(PublicInbox::Search::STEM_SOME()); + my $cb = $qp->can('set_max_wildcard_expansion') // + $qp->can('set_max_expansion'); # Xapian 1.5.0+ + $cb->($qp, 100); + $cb = $qp->can('add_valuerangeprocessor') // + $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ + while (my ($name, $prefix) = each %PROB_PREFIX) { + $qp->add_prefix($name, $_) for split(/ /, $prefix); + } + $qp->add_boolean_prefix('type', 'T'); + $qp; +} + +sub misc_enquire_once { # retry_reopen callback + my ($self, $qr, $opt) = @_; + my $eq = $PublicInbox::Search::X{Enquire}->new($self->{xdb}); + $eq->set_query($qr); + my $desc = !$opt->{asc}; + my $rel = $opt->{relevance} // 0; + if ($rel == -1) { # ORDER BY docid + $eq->set_docid_order($PublicInbox::Search::ENQ_ASCENDING); + $eq->set_weighting_scheme($PublicInbox::Search::X{BoolWeight}->new); + } elsif ($rel) { + $eq->set_sort_by_relevance_then_value($MODIFIED, $desc); + } else { + $eq->set_sort_by_value_then_relevance($MODIFIED, $desc); + } + $eq->get_mset($opt->{offset} || 0, $opt->{limit} || 200); +} + +sub mset { + my ($self, $qs, $opt) = @_; + $opt ||= {}; + reopen($self); + my $qp = $self->{qp} //= mi_qp_new($self); + $qs = 'type:inbox' if $qs eq ''; + my $qr = $qp->parse_query($qs, $PublicInbox::Search::QP_FLAGS); + $opt->{relevance} = 1 unless exists $opt->{relevance}; + retry_reopen($self, \&misc_enquire_once, $qr, $opt); +} + +sub ibx_data_once { + my ($self, $ibx) = @_; + my $xdb = $self->{xdb}; + my $term = 'Q'.$ibx->eidx_key; # may be {inboxdir}, so private + my $head = $xdb->postlist_begin($term); + my $tail = $xdb->postlist_end($term); + return if $head == $tail; + my $doc = $xdb->get_document($head->get_docid); + $ibx->{uidvalidity} //= int_val($doc, $UIDVALIDITY); + $ibx->{-modified} = int_val($doc, $MODIFIED); + $ibx->{-art_min} = int_val($doc, $ART_MIN); + $ibx->{-art_max} = int_val($doc, $ART_MAX); + $doc->get_data; +} + +sub doc2ibx_cache_ent { # @_ == ($self, $doc) OR ($doc) + my ($doc) = $_[-1]; + my $d; + my $data = $json->decode($doc->get_data); + for (values %$data) { + $d = $_->{description} // next; + $d =~ s/ \[epoch [0-9]+\]\z// or next; + last; + } + { + uidvalidity => int_val($doc, $UIDVALIDITY), + -modified => int_val($doc, $MODIFIED), + -art_min => int_val($doc, $ART_MIN), # may be undef + -art_max => int_val($doc, $ART_MAX), # may be undef + # extract description from manifest.js.gz epoch description + description => $d + }; +} + +sub inbox_data { + my ($self, $ibx) = @_; + retry_reopen($self, \&ibx_data_once, $ibx); +} + +sub ibx_cache_load { + my ($doc, $cache) = @_; + my ($eidx_key) = xap_terms('Q', $doc); + return unless defined($eidx_key); # expired + $cache->{$eidx_key} = doc2ibx_cache_ent($doc); +} + +sub _nntpd_cache_load { # retry_reopen callback + my ($self) = @_; + my $opt = { limit => $self->{xdb}->get_doccount * 10, relevance => -1 }; + my $mset = mset($self, 'type:newsgroup type:inbox', $opt); + my $cache = {}; + for my $it ($mset->items) { + ibx_cache_load($it->get_document, $cache); + } + $cache +} + +# returns { newsgroup => $cache_entry } mapping, $cache_entry contains +# anything which may trigger seeks at startup, currently: description, +# -modified, and uidvalidity. +sub nntpd_cache_load { + my ($self) = @_; + retry_reopen($self, \&_nntpd_cache_load); +} + +no warnings 'once'; +*reopen = \&PublicInbox::Search::reopen; + +1; diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm index bb1dfead..dd28417b 100644 --- a/lib/PublicInbox/MsgIter.pm +++ b/lib/PublicInbox/MsgIter.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # read-only utilities for Email::MIME @@ -84,18 +84,30 @@ sub msg_part_text ($$) { # If forcing charset=UTF-8 failed, # caller will warn further down... $s = $part->body if $@; + } elsif ($err && $ct =~ m!\bapplication/octet-stream\b!i) { + # Some unconfigured/poorly-configured MUAs will set + # application/octet-stream even for all text attachments. + # Try to see if it's printable text that we can index + # and display: + $s = $part->body; + utf8::decode($s); + undef($s =~ /[^\p{XPosixPrint}\s]/s ? $s : $err); } ($s, $err); } # returns an array of quoted or unquoted sections sub split_quotes { + # some editors don't put trailing newlines at the end, + # make sure split_quotes can work: + $_[0] .= "\n" if substr($_[0], -1) ne "\n"; + # Quiet "Complex regular subexpression recursion limit" warning # in case an inconsiderate sender quotes 32K of text at once. # The warning from Perl is harmless for us since our callers can # tolerate less-than-ideal matches which work within Perl limits. no warnings 'regexp'; - split(/((?:^>[^\n]*\n)+)/sm, shift); + split(/((?:^>[^\n]*\n)+)/sm, $_[0]); } 1; diff --git a/lib/PublicInbox/MsgTime.pm b/lib/PublicInbox/MsgTime.pm index 8596f01c..bbc9a007 100644 --- a/lib/PublicInbox/MsgTime.pm +++ b/lib/PublicInbox/MsgTime.pm @@ -1,11 +1,11 @@ -# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Various date/time-related functions package PublicInbox::MsgTime; +use v5.10.1; # unicode_strings in 5.12 may not work... use strict; -use warnings; -use base qw(Exporter); +use parent qw(Exporter); our @EXPORT_OK = qw(msg_timestamp msg_datestamp); use Time::Local qw(timegm); my @MoY = qw(january february march april may june @@ -125,10 +125,7 @@ sub str2date_zone ($) { # but we want to keep "git fsck" happy. # "-1200" is the furthest westermost zone offset, # but git fast-import is liberal so we use "-1400" - if ($zone >= 1400 || $zone <= -1400) { - warn "bogus TZ offset: $zone, ignoring and assuming +0000\n"; - $zone = '+0000'; - } + $zone = '+0000' if $zone >= 1400 || $zone <= -1400; [$ts, $zone]; } @@ -138,50 +135,38 @@ sub time_response ($) { } sub msg_received_at ($) { - my ($hdr) = @_; # PublicInbox::Eml - my @recvd = $hdr->header_raw('Received'); - my ($ts); - foreach my $r (@recvd) { + my ($eml) = @_; + my $ts; + for my $r ($eml->header_raw('Received')) { $r =~ /\s*([0-9]+\s+[a-zA-Z]+\s+[0-9]{2,4}\s+ [0-9]+[^0-9][0-9]+(?:[^0-9][0-9]+) - \s+([\+\-][0-9]+))/sx or next; + \s+(?:[\+\-][0-9]+))/sx or next; $ts = eval { str2date_zone($1) } and return $ts; - my $mid = $hdr->header_raw('Message-ID'); - warn "no date in $mid Received: $r\n"; } undef; } sub msg_date_only ($) { - my ($hdr) = @_; # PublicInbox::Eml - my @date = $hdr->header_raw('Date'); - my ($ts); - foreach my $d (@date) { + my ($eml) = @_; + my $ts; + for my $d ($eml->header_raw('Date')) { $ts = eval { str2date_zone($d) } and return $ts; - if ($@) { - my $mid = $hdr->header_raw('Message-ID'); - warn "bad Date: $d in $mid: $@\n"; - } } undef; } # Favors Received header for sorting globally sub msg_timestamp ($;$) { - my ($hdr, $fallback) = @_; # PublicInbox::Eml - my $ret; - $ret = msg_received_at($hdr) and return time_response($ret); - $ret = msg_date_only($hdr) and return time_response($ret); - time_response([ $fallback // time, '+0000' ]); + my ($eml, $fallback) = @_; + time_response(msg_received_at($eml) // msg_date_only($eml) // + [ $fallback // time, '+0000' ]); } # Favors the Date: header for display and sorting within a thread sub msg_datestamp ($;$) { - my ($hdr, $fallback) = @_; # PublicInbox::Eml - my $ret; - $ret = msg_date_only($hdr) and return time_response($ret); - $ret = msg_received_at($hdr) and return time_response($ret); - time_response([ $fallback // time, '+0000' ]); + my ($eml, $fallback) = @_; # PublicInbox::Eml + time_response(msg_date_only($eml) // msg_received_at($eml) // + [ $fallback // time, '+0000' ]); } 1; diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm index f15875e3..cb4bb295 100644 --- a/lib/PublicInbox/Msgmap.pm +++ b/lib/PublicInbox/Msgmap.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # bidirectional Message-ID <-> Article Number mapping for the NNTP @@ -13,20 +13,17 @@ use v5.10.1; use DBI; use DBD::SQLite; use PublicInbox::Over; -use PublicInbox::Spawn; - -sub new { - my ($class, $git_dir, $writable) = @_; - my $d = "$git_dir/public-inbox"; - if ($writable && !-d $d && !mkdir $d) { - my $err = $!; - -d $d or die "$d not created: $err"; - } - new_file($class, "$d/msgmap.sqlite3", $writable); -} +use Scalar::Util qw(blessed); sub new_file { - my ($class, $f, $rw) = @_; + my ($class, $ibx, $rw) = @_; + my $f; + if (blessed($ibx)) { + $f = $ibx->mm_file; + $rw = 2 if $rw && $ibx->{-no_fsync}; + } else { + $f = $ibx; + } return if !$rw && !-r $f; my $self = bless { filename => $f }, $class; @@ -34,10 +31,16 @@ sub new_file { if ($rw) { $dbh->begin_work; create_tables($dbh); - $self->created_at(time) unless $self->created_at; + unless ($self->created_at) { + my $t; - my $max = $self->max // 0; - $self->num_highwater($max); + if (blessed($ibx) && + -f "$ibx->{inboxdir}/inbox.config.example") { + $t = (stat(_))[9]; # mtime set by "curl -R" + } + $self->created_at($t // time); + } + $self->num_highwater(max($self)); $dbh->commit; } $self; @@ -47,9 +50,10 @@ sub new_file { sub tmp_clone { my ($self, $dir) = @_; require File::Temp; - my $tmp = "mm_tmp-$$-XXXXXX"; + my $tmp = "mm_tmp-$$-XXXX"; my ($fh, $fn) = File::Temp::tempfile($tmp, EXLOCK => 0, DIR => $dir); - PublicInbox::Spawn::nodatacow_fd(fileno($fh)); + require PublicInbox::Syscall; + PublicInbox::Syscall::nodatacow_fh($fh); $self->{dbh}->sqlite_backup_to_file($fn); $tmp = ref($self)->new_file($fn, 2); $tmp->{dbh}->do('PRAGMA journal_mode = MEMORY'); @@ -62,18 +66,15 @@ sub meta_accessor { my ($self, $key, $value) = @_; my $sql = 'SELECT val FROM meta WHERE key = ? LIMIT 1'; - my $dbh = $self->{dbh}; - my $prev; - defined $value or return $dbh->selectrow_array($sql, undef, $key); - - $prev = $dbh->selectrow_array($sql, undef, $key); + my $prev = $self->{dbh}->selectrow_array($sql, undef, $key); + $value // return $prev; if (defined $prev) { $sql = 'UPDATE meta SET val = ? WHERE key = ?'; - $dbh->do($sql, undef, $value, $key); + $self->{dbh}->do($sql, undef, $value, $key); } else { $sql = 'INSERT INTO meta (key,val) VALUES (?,?)'; - $dbh->do($sql, undef, $key, $value); + $self->{dbh}->do($sql, undef, $key, $value); } $prev; } @@ -99,44 +100,40 @@ sub created_at { sub num_highwater { my ($self, $num) = @_; - my $high = $self->{num_highwater} ||= - $self->meta_accessor('num_highwater'); + my $high = $self->meta_accessor('num_highwater'); if (defined($num) && (!defined($high) || ($num > $high))) { - $self->{num_highwater} = $num; + $high = $num; $self->meta_accessor('num_highwater', $num); } - $self->{num_highwater}; + $high } sub mid_insert { my ($self, $mid) = @_; - my $dbh = $self->{dbh}; - my $sth = $dbh->prepare_cached(<<''); + my $sth = $self->{dbh}->prepare_cached(<<''); INSERT INTO msgmap (mid) VALUES (?) return unless eval { $sth->execute($mid) }; - my $num = $dbh->last_insert_id(undef, undef, 'msgmap', 'num'); + my $num = $self->{dbh}->last_insert_id(undef, undef, 'msgmap', 'num'); $self->num_highwater($num) if defined($num); $num; } sub mid_for { my ($self, $num) = @_; - my $dbh = $self->{dbh}; - my $sth = $self->{mid_for} ||= - $dbh->prepare('SELECT mid FROM msgmap WHERE num = ? LIMIT 1'); - $sth->bind_param(1, $num); - $sth->execute; + my $sth = $self->{dbh}->prepare_cached(<<"", undef, 1); +SELECT mid FROM msgmap WHERE num = ? LIMIT 1 + + $sth->execute($num); $sth->fetchrow_array; } sub num_for { my ($self, $mid) = @_; - my $dbh = $self->{dbh}; - my $sth = $self->{num_for} ||= - $dbh->prepare('SELECT num FROM msgmap WHERE mid = ? LIMIT 1'); - $sth->bind_param(1, $mid); - $sth->execute; + my $sth = $self->{dbh}->prepare_cached(<<"", undef, 1); +SELECT num FROM msgmap WHERE mid = ? LIMIT 1 + + $sth->execute($mid); $sth->fetchrow_array; } @@ -144,32 +141,30 @@ sub max { my $sth = $_[0]->{dbh}->prepare_cached('SELECT MAX(num) FROM msgmap', undef, 1); $sth->execute; - $sth->fetchrow_array; + $sth->fetchrow_array // 0; } -sub minmax { - # breaking MIN and MAX into separate queries speeds up from 250ms - # to around 700us with 2.7million messages. +sub min { my $sth = $_[0]->{dbh}->prepare_cached('SELECT MIN(num) FROM msgmap', undef, 1); $sth->execute; - ($sth->fetchrow_array, max($_[0])); + $sth->fetchrow_array // 0; +} + +sub minmax { + # breaking MIN and MAX into separate queries speeds up from 250ms + # to around 700us with 2.7million messages. + (min($_[0]), max($_[0])); } sub mid_delete { my ($self, $mid) = @_; - my $dbh = $self->{dbh}; - my $sth = $dbh->prepare('DELETE FROM msgmap WHERE mid = ?'); - $sth->bind_param(1, $mid); - $sth->execute; + $self->{dbh}->do('DELETE FROM msgmap WHERE mid = ?', undef, $mid); } sub num_delete { my ($self, $num) = @_; - my $dbh = $self->{dbh}; - my $sth = $dbh->prepare('DELETE FROM msgmap WHERE num = ?'); - $sth->bind_param(1, $num); - $sth->execute; + $self->{dbh}->do('DELETE FROM msgmap WHERE num = ?', undef, $num); } sub create_tables { @@ -190,23 +185,11 @@ CREATE TABLE IF NOT EXISTS meta ( } -# used by NNTP.pm -sub ids_after { - my ($self, $num) = @_; - my $ids = $self->{dbh}->selectcol_arrayref(<<'', undef, $$num); -SELECT num FROM msgmap WHERE num > ? -ORDER BY num ASC LIMIT 1000 - - $$num = $ids->[-1] if @$ids; - $ids; -} - sub msg_range { my ($self, $beg, $end, $cols) = @_; $cols //= 'num,mid'; - my $dbh = $self->{dbh}; my $attr = { Columns => [] }; - my $mids = $dbh->selectall_arrayref(<<"", $attr, $$beg, $end); + my $mids = $self->{dbh}->selectall_arrayref(<<"", $attr, $$beg, $end); SELECT $cols FROM msgmap WHERE num >= ? AND num <= ? ORDER BY num ASC LIMIT 1000 @@ -218,10 +201,9 @@ ORDER BY num ASC LIMIT 1000 # see scripts/xhdr-num2mid or PublicInbox::Filter::RubyLang for usage sub mid_set { my ($self, $num, $mid) = @_; - my $sth = $self->{mid_set} ||= do { - $self->{dbh}->prepare( - 'INSERT OR IGNORE INTO msgmap (num,mid) VALUES (?,?)'); - }; + my $sth = $self->{dbh}->prepare_cached(<<""); +INSERT OR IGNORE INTO msgmap (num,mid) VALUES (?,?) + my $result = $sth->execute($num, $mid); $self->num_highwater($num) if (defined($result) && $result == 1); $result; @@ -283,21 +265,10 @@ sub skip_artnum { sub check_inodes { my ($self) = @_; - # no filename if in-:memory: - my $f = $self->{dbh}->sqlite_db_filename // return; - if (my @st = stat($f)) { # did st_dev, st_ino change? - my $st = pack('dd', $st[0], $st[1]); - if ($st ne ($self->{st} // $st)) { - my $tmp = eval { ref($self)->new_file($f) }; - if ($@) { - warn "E: DBI->connect($f): $@\n"; - } else { - %$self = %$tmp; - } - } - } else { - warn "W: stat $f: $!\n"; - } + $self->{dbh} // return; + my $rw = !$self->{dbh}->{ReadOnly}; + PublicInbox::Over::check_inodes($self); + $self->{dbh} //= PublicInbox::Over::dbh_new($self, !$rw); } 1; diff --git a/lib/PublicInbox/MultiGit.pm b/lib/PublicInbox/MultiGit.pm new file mode 100644 index 00000000..b7691806 --- /dev/null +++ b/lib/PublicInbox/MultiGit.pm @@ -0,0 +1,145 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# common git alternates + all.git||ALL.git management code +package PublicInbox::MultiGit; +use strict; +use v5.10.1; +use PublicInbox::Spawn qw(run_die run_qx); +use PublicInbox::Import; +use File::Temp 0.19; +use List::Util qw(max); +use PublicInbox::IO qw(read_all); +use autodie qw(chmod close rename); + +sub new { + my ($cls, $topdir, $all, $epfx) = @_; + bless { + topdir => $topdir, # inboxdir || extindex.*.topdir + all => $all, # all.git or ALL.git + epfx => $epfx, # "git" (inbox) or "local" (lei/store) + }, $cls; +} + +sub read_alternates { + my ($self, $moderef, $prune) = @_; + my $objpfx = "$self->{topdir}/$self->{all}/objects/"; + my $f = "${objpfx}info/alternates"; + my %alt; # line => score + my %seen; # $st_dev\0$st_ino => count + my $other = 0; + if (open(my $fh, '<', $f)) { + my $is_edir = defined($self->{epfx}) ? + qr!\A\Q../../$self->{epfx}\E/([0-9]+)\.git/objects\z! : + undef; + $$moderef = (stat($fh))[2] & 07777; + for my $rel (split(/^/m, read_all($fh, -s _))) { + chomp(my $dir = $rel); + my $score; + if (defined($is_edir) && $dir =~ $is_edir) { + $score = $1 + 0; + substr($dir, 0, 0) = $objpfx; + } else { # absolute paths, if any (extindex) + $score = --$other; + } + if (my @st = stat($dir)) { + next if $seen{"$st[0]\0$st[1]"}++; + $alt{$rel} = $score; + } else { + warn "W: stat($dir) failed: $! ($f)"; + if ($prune) { + ++$$prune; + } else { + $alt{$rel} = $score; + } + } + } + } elsif (!$!{ENOENT}) { + die "E: open($f): $!"; + } + (\%alt, \%seen); +} + +sub epoch_dir { "$_[0]->{topdir}/$_[0]->{epfx}" } + +sub write_alternates { + my ($self, $mode, $alt, @new) = @_; + my $all_dir = "$self->{topdir}/$self->{all}"; + PublicInbox::Import::init_bare($all_dir); + my $out = join('', sort { $alt->{$b} <=> $alt->{$a} } keys %$alt); + my $info_dir = "$all_dir/objects/info"; + my $fh = File::Temp->new(TEMPLATE => 'alt-XXXX', DIR => $info_dir); + print $fh $out, @new; + chmod($mode, $fh); + close $fh; + rename($fh->filename, "$info_dir/alternates"); + $fh->unlink_on_destroy(0); +} + +# returns true if new epochs exist +sub merge_epochs { + my ($self, $alt, $seen) = @_; + my $epoch_dir = epoch_dir($self); + if (opendir my $dh, $epoch_dir) { + my $has_new; + for my $bn (grep(/\A[0-9]+\.git\z/, readdir($dh))) { + my $rel = "../../$self->{epfx}/$bn/objects\n"; + next if exists($alt->{$rel}); + if (my @st = stat("$epoch_dir/$bn/objects")) { + next if $seen->{"$st[0]\0$st[1]"}++; + $alt->{$rel} = substr($bn, 0, -4) + 0; + $has_new = 1; + } else { + warn "E: stat($epoch_dir/$bn/objects): $!"; + } + } + $has_new; + } else { + $!{ENOENT} ? undef : die "opendir($epoch_dir): $!"; + } +} + +sub fill_alternates { + my ($self) = @_; + my ($alt, $seen) = read_alternates($self, \(my $mode = 0644)); + merge_epochs($self, $alt, $seen) and + write_alternates($self, $mode, $alt); +} + +sub epoch_cfg_set { + my ($self, $epoch_nr) = @_; + my $f = epoch_dir($self)."/$epoch_nr.git/config"; + my $v = "../../$self->{all}/config"; + if (-r $f) { + chomp(my $x = run_qx([qw(git config -f), $f, 'include.path'])); + return if $x eq $v; + } + run_die([qw(git config -f), $f, 'include.path', $v ]); +} + +sub add_epoch { + my ($self, $epoch_nr) = @_; + my $git_dir = epoch_dir($self)."/$epoch_nr.git"; + my $f = "$git_dir/config"; + my $existing = -f $f; + PublicInbox::Import::init_bare($git_dir); + epoch_cfg_set($self, $epoch_nr) unless $existing; + fill_alternates($self); + $git_dir; +} + +sub git_epochs { + my ($self) = @_; + if (opendir(my $dh, epoch_dir($self))) { + my @epochs = map { + substr($_, 0, -4) + 0; # drop ".git" suffix + } grep(/\A[0-9]+\.git\z/, readdir($dh)); + wantarray ? sort { $b <=> $a } @epochs : (max(@epochs) // 0); + } elsif ($!{ENOENT}) { + wantarray ? () : 0; + } else { + die(epoch_dir($self).": $!"); + } +} + +1; diff --git a/lib/PublicInbox/NDC_PP.pm b/lib/PublicInbox/NDC_PP.pm deleted file mode 100644 index 10a7ee2a..00000000 --- a/lib/PublicInbox/NDC_PP.pm +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> -# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> - -# Pure-perl class for Linux non-Inline::C users to disable COW for btrfs -package PublicInbox::NDC_PP; -use strict; -use v5.10.1; - -sub nodatacow_dir ($) { - my ($path) = @_; - open my $mh, '<', '/proc/self/mounts' or return; - for (grep(/ btrfs /, <$mh>)) { - my (undef, $mnt_path, $type) = split(/ /); - next if $type ne 'btrfs'; # in case of false-positive from grep - - # weird chars are escaped as octal - $mnt_path =~ s/\\(0[0-9]{2})/chr(oct($1))/egs; - $mnt_path .= '/' unless $mnt_path =~ m!/\z!; - if (index($path, $mnt_path) == 0) { - # error goes to stderr, but non-fatal for us - system('chattr', '+C', $path); - last; - } - } -} - -sub nodatacow_fd ($) { - my ($fd) = @_; - return if $^O ne 'linux'; - defined(my $path = readlink("/proc/self/fd/$fd")) or return; - nodatacow_dir($path); -} - -1; diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index 88fe2bb0..603cf094 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -1,39 +1,40 @@ -# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Each instance of this represents a NNTP client socket # fields: # nntpd: PublicInbox::NNTPD ref # article: per-session current article number -# ng: PublicInbox::Inbox ref +# ibx: PublicInbox::Inbox ref # long_cb: long_response private data package PublicInbox::NNTP; use strict; +use v5.10.1; use parent qw(PublicInbox::DS); use PublicInbox::MID qw(mid_escape $MID_EXTRACT); use PublicInbox::Eml; use POSIX qw(strftime); use PublicInbox::DS qw(now); -use Digest::SHA qw(sha1_hex); +use PublicInbox::SHA qw(sha1_hex); use Time::Local qw(timegm timelocal); use PublicInbox::GitAsyncCat; +use PublicInbox::Address; + use constant { LINE_MAX => 512, # RFC 977 section 2.3 - r501 => '501 command syntax error', - r502 => '502 Command unavailable', - r221 => '221 Header follows', - r224 => '224 Overview information follows (multi-line)', - r225 => '225 Headers follow (multi-line)', - r430 => '430 No article with that message-id', + r501 => "501 command syntax error\r\n", + r502 => "502 Command unavailable\r\n", + r221 => "221 Header follows\r\n", + r225 => "225 Headers follow (multi-line)\r\n", + r430 => "430 No article with that message-id\r\n", }; -use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); use Errno qw(EAGAIN); my $ONE_MSGID = qr/\A$MID_EXTRACT\z/; my @OVERVIEW = qw(Subject From Date Message-ID References); my $OVERVIEW_FMT = join(":\r\n", @OVERVIEW, qw(Bytes Lines), '') . - "Xref:full\r\n"; + "Xref:full\r\n.\r\n"; my $LIST_HEADERS = join("\r\n", @OVERVIEW, - qw(:bytes :lines Xref To Cc)) . "\r\n"; + qw(:bytes :lines Xref To Cc)) . "\r\n.\r\n"; my $CAPABILITIES = <<""; 101 Capability list:\r VERSION 2\r @@ -44,32 +45,17 @@ HDR\r OVER\r COMPRESS DEFLATE\r -sub greet ($) { $_[0]->write($_[0]->{nntpd}->{greet}) }; - -sub new ($$$) { - my ($class, $sock, $nntpd) = @_; - my $self = bless { nntpd => $nntpd }, $class; - my $ev = EPOLLIN; - my $wbuf; - if ($sock->can('accept_SSL') && !$sock->accept_SSL) { - return CORE::close($sock) if $! != EAGAIN; - $ev = PublicInbox::TLS::epollbit(); - $wbuf = [ \&PublicInbox::DS::accept_tls_step, \&greet ]; - } - $self->SUPER::new($sock, $ev | EPOLLONESHOT); - if ($wbuf) { - $self->{wbuf} = $wbuf; - } else { - greet($self); - } - $self->update_idle_time; - $self; +sub do_greet ($) { $_[0]->write($_[0]->{nntpd}->{greet}) }; + +sub new { + my ($cls, $sock, $nntpd) = @_; + (bless { nntpd => $nntpd }, $cls)->greet($sock) } sub args_ok ($$) { my ($cb, $argc) = @_; my $tot = prototype $cb; - my ($nreq, undef) = split(';', $tot); + my ($nreq, undef) = split(/;/, $tot); $nreq = ($nreq =~ tr/$//) - 1; $tot = ($tot =~ tr/$//) - 1; ($argc <= $tot && $argc >= $nreq); @@ -80,20 +66,17 @@ sub process_line ($$) { my ($self, $l) = @_; my ($req, @args) = split(/[ \t]+/, $l); return 1 unless defined($req); # skip blank line - $req = $self->can('cmd_'.lc($req)); - return res($self, '500 command not recognized') unless $req; - return res($self, r501) unless args_ok($req, scalar @args); - + $req = $self->can('cmd_'.lc($req)) // + return $self->write(\"500 command not recognized\r\n"); + return $self->write(\r501) unless args_ok($req, scalar @args); my $res = eval { $req->($self, @args) }; my $err = $@; if ($err && $self->{sock}) { - local $/ = "\n"; - chomp($l); - err($self, 'error from: %s (%s)', $l, $err); - $res = '503 program fault - command not performed'; + $l =~ s/\r?\n//s; + warn("error from: $l ($err)\n"); + $res = \"503 program fault - command not performed\r\n"; } - return 0 unless defined $res; - res($self, $res); + defined($res) ? $self->write($res) : 0; } # The keyword argument is not used (rfc3977 5.2.2) @@ -101,65 +84,86 @@ sub cmd_capabilities ($;$) { my ($self, undef) = @_; my $res = $CAPABILITIES; if (!$self->{sock}->can('accept_SSL') && - $self->{nntpd}->{accept_tls}) { + $self->{nntpd}->{ssl_ctx_opt}) { $res .= "STARTTLS\r\n"; } - $res .= '.'; + $res .= ".\r\n"; } sub cmd_mode ($$) { my ($self, $arg) = @_; - $arg = uc $arg; - return r501 unless $arg eq 'READER'; - '201 Posting prohibited'; + uc($arg) eq 'READER' ? \"201 Posting prohibited\r\n" : \r501; } -sub cmd_slave ($) { '202 slave status noted' } +sub cmd_slave ($) { \"202 slave status noted\r\n" } sub cmd_xgtitle ($;$) { my ($self, $wildmat) = @_; - more($self, '282 list of groups and descriptions follows'); + $self->msg_more("282 list of groups and descriptions follows\r\n"); list_newsgroups($self, $wildmat); - '.' } -sub list_overview_fmt ($) { - my ($self) = @_; - $self->msg_more($OVERVIEW_FMT); +sub list_overview_fmt ($) { $OVERVIEW_FMT } + +sub list_headers ($;$) { $LIST_HEADERS } + +sub names2ibx ($;$) { + my ($self, $names) = @_; + my $groups = $self->{nntpd}->{pi_cfg}->{-by_newsgroup}; + if ($names) { # modify arrayref in-place + $_ = $groups->{$_} for @$names; + $names; # now an arrayref of ibx + } else { + my @ret = map { $groups->{$_} } @{$self->{nntpd}->{groupnames}}; + \@ret; + } } -sub list_headers ($;$) { - my ($self) = @_; - $self->msg_more($LIST_HEADERS); +sub list_active_i { # "LIST ACTIVE" and also just "LIST" (no args) + my ($self, $ibxs) = @_; + my @window = splice(@$ibxs, 0, 1000); + emit_group_lines($self, \@window); + scalar @$ibxs; # continue if there's more } -sub list_active ($;$) { +sub list_active ($;$) { # called by cmd_list my ($self, $wildmat) = @_; wildmat2re($wildmat); - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{newsgroup} =~ $wildmat or next; - group_line($self, $ng); - } + my @names = grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}); + $self->long_response(\&list_active_i, names2ibx($self, \@names)); } -sub list_active_times ($;$) { +sub list_active_times_i { + my ($self, $ibxs) = @_; + my @window = splice(@$ibxs, 0, 1000); + $self->msg_more(join('', map { + my $c = eval { $_->uidvalidity } // time; + "$_->{newsgroup} $c <$_->{-primary_address}>\r\n"; + } @window)); + scalar @$ibxs; # continue if there's more +} + +sub list_active_times ($;$) { # called by cmd_list my ($self, $wildmat) = @_; wildmat2re($wildmat); - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{newsgroup} =~ $wildmat or next; - my $c = eval { $ng->mm->created_at } || time; - more($self, "$ng->{newsgroup} $c $ng->{-primary_address}"); - } + my @names = grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}); + $self->long_response(\&list_active_times_i, names2ibx($self, \@names)); } -sub list_newsgroups ($;$) { +sub list_newsgroups_i { + my ($self, $ibxs) = @_; + my @window = splice(@$ibxs, 0, 1000); + $self->msg_more(join('', map { + "$_->{newsgroup} ".$_->description."\r\n" + } @window)); + scalar @$ibxs; # continue if there's more +} + +sub list_newsgroups ($;$) { # called by cmd_list my ($self, $wildmat) = @_; wildmat2re($wildmat); - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{newsgroup} =~ $wildmat or next; - my $d = $ng->description; - more($self, "$ng->{newsgroup} $d"); - } + my @names = grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}); + $self->long_response(\&list_newsgroups_i, names2ibx($self, \@names)); } # LIST SUBSCRIPTIONS, DISTRIB.PATS are not supported @@ -168,33 +172,31 @@ sub cmd_list ($;$$) { if (scalar @args) { my $arg = shift @args; $arg =~ tr/A-Z./a-z_/; + my $ret = $arg eq 'active'; $arg = "list_$arg"; $arg = $self->can($arg); return r501 unless $arg && args_ok($arg, scalar @args); - more($self, '215 information follows'); + $self->msg_more("215 information follows\r\n"); $arg->($self, @args); } else { - more($self, '215 list of newsgroups follows'); - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - group_line($self, $ng); - } + $self->msg_more("215 list of newsgroups follows\r\n"); + $self->long_response(\&list_active_i, names2ibx($self)); } - '.' } sub listgroup_range_i { my ($self, $beg, $end) = @_; - my $r = $self->{ng}->mm->msg_range($beg, $end, 'num'); + my $r = $self->{ibx}->mm(1)->msg_range($beg, $end, 'num'); scalar(@$r) or return; - more($self, join("\r\n", map { $_->[0] } @$r)); + $self->msg_more(join('', map { "$_->[0]\r\n" } @$r)); 1; } sub listgroup_all_i { my ($self, $num) = @_; - my $ary = $self->{ng}->mm->ids_after($num); + my $ary = $self->{ibx}->over(1)->ids_after($num); scalar(@$ary) or return; - more($self, join("\r\n", @$ary)); + $self->msg_more(join("\r\n", @$ary, '')); 1; } @@ -202,16 +204,16 @@ sub cmd_listgroup ($;$$) { my ($self, $group, $range) = @_; if (defined $group) { my $res = cmd_group($self, $group); - return $res if ($res !~ /\A211 /); - more($self, $res); + return $res if ref($res); # error if const strref + $self->msg_more($res); } - $self->{ng} or return '412 no newsgroup selected'; + $self->{ibx} or return \"412 no newsgroup selected\r\n"; if (defined $range) { my $r = get_range($self, $range); return $r unless ref $r; - long_response($self, \&listgroup_range_i, @$r); + $self->long_response(\&listgroup_range_i, @$r); } else { # grab every article number - long_response($self, \&listgroup_all_i, \(my $num = 0)); + $self->long_response(\&listgroup_all_i, \(my $num = 0)); } } @@ -223,7 +225,7 @@ sub parse_time ($$;$) { $gmt = 1; } my ($YYYY, $MM, $DD); - if (bytes::length($date) == 8) { # RFC 3977 allows YYYYMMDD + if (length($date) == 8) { # RFC 3977 allows YYYYMMDD ($YYYY, $MM, $DD) = unpack('A4A2A2', $date); } else { # legacy clients send YYMMDD my $YY; @@ -241,10 +243,27 @@ sub parse_time ($$;$) { } } -sub group_line ($$) { - my ($self, $ng) = @_; - my ($min, $max) = $ng->mm->minmax; - more($self, "$ng->{newsgroup} $max $min n") if defined $min && defined $max; +sub emit_group_lines { + my ($self, $ibxs) = @_; + my ($min, $max); + my $ALL = $self->{nntpd}->{pi_cfg}->ALL; + my $misc = $ALL->misc if $ALL; + my $buf = ''; + for my $ibx (@$ibxs) { + $misc ? $misc->inbox_data($ibx) : + delete(@$ibx{qw(-art_min -art_max)}); + ($min, $max) = ($ibx->art_min, $ibx->art_max); + $buf .= "$ibx->{newsgroup} $max $min n\r\n"; + } + $self->msg_more($buf); +} + +sub newgroups_i { + my ($self, $ts, $ibxs) = @_; + my @window = splice(@$ibxs, 0, 1000); + @window = grep { (eval { $_->uidvalidity } // 0) > $ts } @window; + emit_group_lines($self, \@window); + scalar @$ibxs; # any more? } sub cmd_newgroups ($$$;$$) { @@ -253,13 +272,8 @@ sub cmd_newgroups ($$$;$$) { return r501 if $@; # TODO dists - more($self, '231 list of new newsgroups follows'); - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - my $c = eval { $ng->mm->created_at } || 0; - next unless $c > $ts; - group_line($self, $ng); - } - '.' + $self->msg_more("231 list of new newsgroups follows\r\n"); + $self->long_response(\&newgroups_i, $ts, names2ibx($self)); } sub wildmat2re (;$) { @@ -294,77 +308,69 @@ sub ngpat2re (;$) { } sub newnews_i { - my ($self, $overs, $ts, $prev) = @_; - my $over = $overs->[0]; - my $msgs = $over->query_ts($ts, $$prev); - if (scalar @$msgs) { - more($self, '<' . - join(">\r\n<", map { $_->{mid} } @$msgs ). - '>'); - $$prev = $msgs->[-1]->{num}; - } else { - shift @$overs; - if (@$overs) { # continue onto next newsgroup - $$prev = 0; - return 1; - } else { # break out of the long response. - return; + my ($self, $ibxs, $ts, $prev) = @_; + if (my $over = $ibxs->[0]->over) { + my $msgs = $over->query_ts($ts, $$prev); + if (scalar @$msgs) { + $self->msg_more(join('', map { + "<$_->{mid}>\r\n"; + } @$msgs)); + $$prev = $msgs->[-1]->{num}; + return 1; # continue on current group } } + shift @$ibxs; + if (@$ibxs) { # continue onto next newsgroup + $$prev = 0; + 1; + } else { # all done, break out of the long_response + undef; + } } sub cmd_newnews ($$$$;$$) { my ($self, $newsgroups, $date, $time, $gmt, $dists) = @_; my $ts = eval { parse_time($date, $time, $gmt) }; return r501 if $@; - more($self, '230 list of new articles by message-id follows'); - my ($keep, $skip) = split('!', $newsgroups, 2); + $self->msg_more("230 list of new articles by message-id follows\r\n"); + my ($keep, $skip) = split(/!/, $newsgroups, 2); ngpat2re($keep); ngpat2re($skip); - my @overs; - foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{newsgroup} =~ $keep or next; - $ng->{newsgroup} =~ $skip and next; - my $over = $ng->over or next; - push @overs, $over; - }; - return '.' unless @overs; - + my @names = grep(/$keep/, @{$self->{nntpd}->{groupnames}}); + @names = grep(!/$skip/, @names); + return \".\r\n" unless scalar(@names); my $prev = 0; - long_response($self, \&newnews_i, \@overs, $ts, \$prev); + $self->long_response(\&newnews_i, names2ibx($self, \@names), + $ts, \$prev); } sub cmd_group ($$) { my ($self, $group) = @_; - my $no_such = '411 no such news group'; my $nntpd = $self->{nntpd}; - my $ng = $nntpd->{groups}->{$group} or return $no_such; + my $ibx = $nntpd->{pi_cfg}->{-by_newsgroup}->{$group} or + return \"411 no such news group\r\n"; $nntpd->idler_start; - $self->{ng} = $ng; - my ($min, $max) = $ng->mm->minmax; - $min ||= 0; - $max ||= 0; + $self->{ibx} = $ibx; + my ($min, $max) = $ibx->mm(1)->minmax; $self->{article} = $min; my $est_size = $max - $min; - "211 $est_size $min $max $group"; + "211 $est_size $min $max $group\r\n"; } sub article_adj ($$) { my ($self, $off) = @_; - my $ng = $self->{ng} or return '412 no newsgroup selected'; - - my $n = $self->{article}; - defined $n or return '420 no current article has been selected'; + my $ibx = $self->{ibx} // return \"412 no newsgroup selected\r\n"; + my $n = $self->{article} // + return \"420 no current article has been selected\r\n"; $n += $off; - my $mid = $ng->mm->mid_for($n); - unless ($mid) { + my $mid = $ibx->mm(1)->mid_for($n) // do { $n = $off > 0 ? 'next' : 'previous'; - return "421 no $n article in this group"; - } + return "421 no $n article in this group\r\n"; + }; $self->{article} = $n; - "223 $n <$mid> article retrieved - request text separately"; + "223 $n <$mid> article retrieved - request text separately\r\n"; } sub cmd_next ($) { article_adj($_[0], 1) } @@ -374,39 +380,53 @@ sub cmd_last ($) { article_adj($_[0], -1) } # the single-point-of-failure a single server provides. sub cmd_post ($) { my ($self) = @_; - my $ng = $self->{ng}; - $ng ? "440 mailto:$ng->{-primary_address} to post" - : '440 posting not allowed' + my $ibx = $self->{ibx}; + $ibx ? "440 mailto:$ibx->{-primary_address} to post\r\n" + : \"440 posting not allowed\r\n" } sub cmd_quit ($) { my ($self) = @_; - res($self, '205 closing connection - goodbye!'); + $self->write(\"205 closing connection - goodbye!\r\n"); $self->shutdn; undef; } -sub header_append ($$$) { - my ($hdr, $k, $v) = @_; - my @v = $hdr->header_raw($k); - foreach (@v) { - return if $v eq $_; +sub xref_by_tc ($$$) { + my ($xref, $pi_cfg, $smsg) = @_; + my $by_addr = $pi_cfg->{-by_addr}; + my $mid = $smsg->{mid}; + for my $f (qw(to cc)) { + my @ibxs = map { + $by_addr->{lc($_)} // () + } (PublicInbox::Address::emails($smsg->{$f} // '')); + for my $ibx (@ibxs) { + $xref->{$ibx->{newsgroup}} //= + $ibx->mm(1)->num_for($mid); + } } - $hdr->header_set($k, @v, $v); } -sub xref ($$$$) { - my ($self, $ng, $n, $mid) = @_; - my $ret = $self->{nntpd}->{servername} . " $ng->{newsgroup}:$n"; - - # num_for is pretty cheap and sometimes we'll lookup the existence - # of an article without getting even the OVER info. In other words, - # I'm not sure if its worth optimizing by scanning To:/Cc: and - # PublicInbox::ExtMsg on the PSGI end is just as expensive - foreach my $other (@{$self->{nntpd}->{grouplist}}) { - next if $ng eq $other; - my $num = eval { $other->mm->num_for($mid) } or next; - $ret .= " $other->{newsgroup}:$num"; +sub xref ($$$) { + my ($self, $cur_ibx, $smsg) = @_; + my $nntpd = $self->{nntpd}; + my $cur_ng = $cur_ibx->{newsgroup}; + my $xref; + if (my $ALL = $nntpd->{pi_cfg}->ALL) { + $xref = $ALL->nntp_xref_for($cur_ibx, $smsg); + xref_by_tc($xref, $nntpd->{pi_cfg}, $smsg); + } else { # slow path + $xref = { $cur_ng => $smsg->{num} }; + my $mid = $smsg->{mid}; + for my $ibx (values %{$nntpd->{pi_cfg}->{-by_newsgroup}}) { + $xref->{$ibx->{newsgroup}} //= + $ibx->mm(1)->num_for($mid); + } + } + my $ret = "$nntpd->{servername} $cur_ng:".delete($xref->{$cur_ng}); + for my $ng (sort keys %$xref) { + my $num = $xref->{$ng} // next; + $ret .= " $ng:$num"; } $ret; } @@ -415,10 +435,6 @@ sub set_nntp_headers ($$) { my ($hdr, $smsg) = @_; my ($mid) = $smsg->{mid}; - # why? leafnode requires a Path: header for some inexplicable - # reason. We'll fake the shortest one possible. - $hdr->header_set('Path', 'y'); - # leafnode (and maybe other NNTP clients) have trouble dealing # with v2 messages which have multiple Message-IDs (either due # to our own content-based dedupe or buggy git-send-email versions). @@ -432,68 +448,58 @@ sub set_nntp_headers ($$) { $hdr->header_set('X-Alt-Message-ID', @alt); } - # clobber some + # clobber some existing headers my $ibx = $smsg->{-ibx}; - my $xref = xref($smsg->{nntp}, $ibx, $smsg->{num}, $mid); + my $xref = xref($smsg->{nntp}, $ibx, $smsg); $hdr->header_set('Xref', $xref); - $xref =~ s/:[0-9]+//g; - $hdr->header_set('Newsgroups', (split(/ /, $xref, 2))[1]); - header_append($hdr, 'List-Post', "<mailto:$ibx->{-primary_address}>"); - if (my $url = $ibx->base_url) { - $mid = mid_escape($mid); - header_append($hdr, 'Archived-At', "<$url$mid/>"); - header_append($hdr, 'List-Archive', "<$url>"); - } + + # RFC 5536 3.1.4 + my ($server_name, $newsgroups) = split(/ /, $xref, 2); + $newsgroups =~ s/:[0-9]+\b//g; # drop NNTP article numbers + $newsgroups =~ tr/ /,/; + $hdr->header_set('Newsgroups', $newsgroups); + + # *something* here is required for leafnode, try to follow + # RFC 5536 3.1.5... + $hdr->header_set('Path', $server_name . '!not-for-mail'); } sub art_lookup ($$$) { my ($self, $art, $code) = @_; - my $ng = $self->{ng}; - my ($n, $mid); + my ($ibx, $n); my $err; if (defined $art) { if ($art =~ /\A[0-9]+\z/) { - $err = '423 no such article number in this group'; + $err = \"423 no such article number in this group\r\n"; $n = int($art); - goto find_mid; + goto find_ibx; } elsif ($art =~ $ONE_MSGID) { - $mid = $1; - $err = r430; - $n = $ng->mm->num_for($mid) if $ng; - goto found if defined $n; - foreach my $g (values %{$self->{nntpd}->{groups}}) { - $n = $g->mm->num_for($mid); - if (defined $n) { - $ng = $g; - goto found; - } - } - return $err; + ($ibx, $n) = mid_lookup($self, $1); + goto found if $ibx; + return \r430; } else { - return r501; + return \r501; } } else { - $err = '420 no current article has been selected'; - $n = $self->{article}; - defined $n or return $err; -find_mid: - $ng or return '412 no newsgroup has been selected'; - $mid = $ng->mm->mid_for($n); - defined $mid or return $err; + $err = \"420 no current article has been selected\r\n"; + $n = $self->{article} // return $err; +find_ibx: + $ibx = $self->{ibx} or + return \"412 no newsgroup has been selected\r\n"; } found: - my $smsg = $ng->over->get_art($n) or return $err; - $smsg->{-ibx} = $ng; + my $smsg = $ibx->over(1)->get_art($n) or return $err; + $smsg->{-ibx} = $ibx; if ($code == 223) { # STAT set_art($self, $n); "223 $n <$smsg->{mid}> article retrieved - " . - "request text separately"; + "request text separately\r\n"; } else { # HEAD | BODY | ARTICLE $smsg->{nntp} = $self; $smsg->{nntp_code} = $code; set_art($self, $art); # this dereferences to `undef' - ${git_async_cat($ng->git, $smsg->{blob}, \&blob_cb, $smsg)}; + ${ibx_async_cat($ibx, $smsg->{blob}, \&blob_cb, $smsg)}; } } @@ -503,7 +509,7 @@ sub msg_body_write ($$) { # these can momentarily double the memory consumption :< $$msg =~ s/^\./../smg; $$msg =~ s/(?<!\r)\n/\r\n/sg; # Alpine barfs without this - $$msg .= "\r\n" unless $$msg =~ /\r\n\z/s; + $$msg .= "\r\n" unless substr($$msg, -2, 2) eq "\r\n"; $self->msg_more($$msg); } @@ -517,8 +523,7 @@ sub msg_hdr_write ($$) { set_nntp_headers($eml, $smsg); my $hdr = $eml->{hdr} // \(my $x = ''); - # fixup old bug from import (pre-a0c07cba0e5d8b6a) - $$hdr =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + PublicInbox::Eml::strip_from($$hdr); $$hdr =~ s/(?<!\r)\n/\r\n/sg; # Alpine barfs without this # for leafnode compatibility, we need to ensure Message-ID headers @@ -527,11 +532,15 @@ sub msg_hdr_write ($$) { $smsg->{nntp}->msg_more($$hdr); } -sub blob_cb { # called by git->cat_async via git_async_cat +sub blob_cb { # called by git->cat_async via ibx_async_cat my ($bref, $oid, $type, $size, $smsg) = @_; my $self = $smsg->{nntp}; my $code = $smsg->{nntp_code}; - if (!defined($oid)) { + if (!defined($type)) { + warn "E: git aborted on $oid / $smsg->{blob} ". + $self->{-ibx}->{inboxdir}; + return $self->close; + } elsif ($type ne 'blob') { # it's possible to have TOCTOU if an admin runs # public-inbox-(edit|purge), just move onto the next message warn "E: $smsg->{blob} missing in $smsg->{-ibx}->{inboxdir}\n"; @@ -543,21 +552,21 @@ sub blob_cb { # called by git->cat_async via git_async_cat my $r = "$code $smsg->{num} <$smsg->{mid}> article retrieved - "; my $eml = PublicInbox::Eml->new($bref); if ($code == 220) { - more($self, $r .= 'head and body follow'); + $self->msg_more($r .= "head and body follow\r\n"); msg_hdr_write($eml, $smsg); $self->msg_more("\r\n"); msg_body_write($self, $bref); } elsif ($code == 221) { - more($self, $r .= 'head follows'); + $self->msg_more($r .= "head follows\r\n"); msg_hdr_write($eml, $smsg); } elsif ($code == 222) { - more($self, $r .= 'body follows'); + $self->msg_more($r .= "body follows\r\n"); msg_body_write($self, $bref); } else { $self->close; die "BUG: bad code: $r"; } - $self->write(\".\r\n"); # flushes (includes ->zflush) + $self->write(\".\r\n"); # flushes (includes ->dflush) $self->requeue; } @@ -581,22 +590,20 @@ sub cmd_stat ($;$) { art_lookup($self, $art, 223); # art may be msgid } -sub cmd_ihave ($) { '435 article not wanted - do not send it' } +sub cmd_ihave ($) { \"435 article not wanted - do not send it\r\n" } -sub cmd_date ($) { '111 '.strftime('%Y%m%d%H%M%S', gmtime(time)) } +sub cmd_date ($) { '111 '.strftime('%Y%m%d%H%M%S', gmtime(time))."\r\n" } -sub cmd_help ($) { - my ($self) = @_; - more($self, '100 help text follows'); - '.' -} +sub cmd_help ($) { \"100 help text follows\r\n.\r\n" } +# returns a ref on success sub get_range ($$) { my ($self, $range) = @_; - my $ng = $self->{ng} or return '412 no news group has been selected'; - defined $range or return '420 No article(s) selected'; + my $ibx = $self->{ibx} // + return "412 no news group has been selected\r\n"; + $range // return "420 No article(s) selected\r\n"; my ($beg, $end); - my ($min, $max) = $ng->mm->minmax; + my ($min, $max) = $ibx->mm(1)->minmax; if ($range =~ /\A([0-9]+)\z/) { $beg = $end = $1; } elsif ($range =~ /\A([0-9]+)-\z/) { @@ -608,67 +615,16 @@ sub get_range ($$) { } $beg = $min if ($beg < $min); $end = $max if ($end > $max); - return '420 No article(s) selected' if ($beg > $end); - [ \$beg, $end ]; + $beg > $end ? "420 No article(s) selected\r\n" : [ \$beg, $end ]; } -sub long_step { - my ($self) = @_; - # wbuf is unset or empty, here; {long} may add to it - my ($fd, $cb, $t0, @args) = @{$self->{long_cb}}; - my $more = eval { $cb->($self, @args) }; - if ($@ || !$self->{sock}) { # something bad happened... - delete $self->{long_cb}; - my $elapsed = now() - $t0; - if ($@) { - err($self, - "%s during long response[$fd] - %0.6f", - $@, $elapsed); - } - out($self, " deferred[$fd] aborted - %0.6f", $elapsed); - $self->close; - } elsif ($more) { # $self->{wbuf}: - $self->update_idle_time; - - # COMPRESS users all share the same DEFLATE context. - # Flush it here to ensure clients don't see - # each other's data - $self->zflush; - - # no recursion, schedule another call ASAP, but only after - # all pending writes are done. autovivify wbuf: - my $new_size = push(@{$self->{wbuf}}, \&long_step); - - # wbuf may be populated by $cb, no need to rearm if so: - $self->requeue if $new_size == 1; - } else { # all done! - delete $self->{long_cb}; - res($self, '.'); - my $elapsed = now() - $t0; - my $fd = fileno($self->{sock}); - out($self, " deferred[$fd] done - %0.6f", $elapsed); - my $wbuf = $self->{wbuf}; # do NOT autovivify - $self->requeue unless $wbuf && @$wbuf; - } -} - -sub long_response ($$;@) { - my ($self, $cb, @args) = @_; # cb returns true if more, false if done - - my $sock = $self->{sock} or return; - # make sure we disable reading during a long response, - # clients should not be sending us stuff and making us do more - # work while we are stream a response to them - $self->{long_cb} = [ fileno($sock), $cb, now(), @args ]; - long_step($self); # kick off! - undef; -} +sub long_response_done { $_[0]->write(\".\r\n") } # overrides superclass sub hdr_msgid_range_i { my ($self, $beg, $end) = @_; - my $r = $self->{ng}->mm->msg_range($beg, $end); + my $r = $self->{ibx}->mm(1)->msg_range($beg, $end); @$r or return; - more($self, join("\r\n", map { "$_->[0] <$_->[1]>" } @$r)); + $self->msg_more(join('', map { "$_->[0] <$_->[1]>\r\n" } @$r)); 1; } @@ -676,42 +632,68 @@ sub hdr_message_id ($$$) { # optimize XHDR Message-ID [range] for slrnpull. my ($self, $xhdr, $range) = @_; if (defined $range && $range =~ $ONE_MSGID) { - my ($ng, $n) = mid_lookup($self, $1); + my ($ibx, $n) = mid_lookup($self, $1); return r430 unless $n; - hdr_mid_response($self, $xhdr, $ng, $n, $range, $range); + hdr_mid_response($self, $xhdr, $ibx, $n, $range, $range); } else { # numeric range $range = $self->{article} unless defined $range; my $r = get_range($self, $range); return $r unless ref $r; - more($self, $xhdr ? r221 : r225); - long_response($self, \&hdr_msgid_range_i, @$r); + $self->msg_more($xhdr ? r221 : r225); + $self->long_response(\&hdr_msgid_range_i, @$r); } } sub mid_lookup ($$) { my ($self, $mid) = @_; - my $self_ng = $self->{ng}; - if ($self_ng) { - my $n = $self_ng->mm->num_for($mid); - return ($self_ng, $n) if defined $n; - } - foreach my $ng (values %{$self->{nntpd}->{groups}}) { - next if defined $self_ng && $ng eq $self_ng; - my $n = $ng->mm->num_for($mid); - return ($ng, $n) if defined $n; + my $cur_ibx = $self->{ibx}; + if ($cur_ibx) { + my $n = $cur_ibx->mm(1)->num_for($mid); + return ($cur_ibx, $n) if defined $n; + } + my $pi_cfg = $self->{nntpd}->{pi_cfg}; + if (my $ALL = $pi_cfg->ALL) { + my ($id, $prev); + while (my $smsg = $ALL->over->next_by_mid($mid, \$id, \$prev)) { + my $xr3 = $ALL->over->get_xref3($smsg->{num}); + if (my @x = grep(/:$smsg->{blob}\z/, @$xr3)) { + my ($ngname, $xnum) = split(/:/, $x[0]); + my $ibx = $pi_cfg->{-by_newsgroup}->{$ngname}; + return ($ibx, $xnum) if $ibx; + # fall through to trying all xref3s + } else { + warn <<EOF; +W: xref3 missing for <$mid> ($smsg->{blob}) in $ALL->{topdir}, -extindex bug? +EOF + } + # try all xref3s + for my $x (@$xr3) { + my ($ngname, $xnum) = split(/:/, $x); + my $ibx = $pi_cfg->{-by_newsgroup}->{$ngname}; + return ($ibx, $xnum) if $ibx; + warn "W: `$ngname' does not exist for #$xnum\n"; + } + } + # no warning here, $mid is just invalid + } else { # slow path for non-ALL users + for my $ibx (values %{$pi_cfg->{-by_newsgroup}}) { + next if defined $cur_ibx && $ibx eq $cur_ibx; + my $n = $ibx->mm(1)->num_for($mid); + return ($ibx, $n) if defined $n; + } } (undef, undef); } sub xref_range_i { my ($self, $beg, $end) = @_; - my $ng = $self->{ng}; - my $r = $ng->mm->msg_range($beg, $end); - @$r or return; - more($self, join("\r\n", map { - my $num = $_->[0]; - "$num ".xref($self, $ng, $num, $_->[1]); - } @$r)); + my $ibx = $self->{ibx}; + my $msgs = $ibx->over(1)->query_xover($$beg, $end); + scalar(@$msgs) or return; + $$beg = $msgs->[-1]->{num} + 1; + $self->msg_more(join('', map { + "$_->{num} ".xref($self, $ibx, $_) . "\r\n"; + } @$msgs)); 1; } @@ -720,30 +702,30 @@ sub hdr_xref ($$$) { # optimize XHDR Xref [range] for rtin if (defined $range && $range =~ $ONE_MSGID) { my $mid = $1; - my ($ng, $n) = mid_lookup($self, $mid); + my ($ibx, $n) = mid_lookup($self, $mid); return r430 unless $n; - hdr_mid_response($self, $xhdr, $ng, $n, $range, - xref($self, $ng, $n, $mid)); + my $smsg = $ibx->over(1)->get_art($n) or return; + hdr_mid_response($self, $xhdr, $ibx, $n, $range, + xref($self, $ibx, $smsg)); } else { # numeric range $range = $self->{article} unless defined $range; my $r = get_range($self, $range); return $r unless ref $r; - more($self, $xhdr ? r221 : r225); - long_response($self, \&xref_range_i, @$r); + $self->msg_more($xhdr ? r221 : r225); + $self->long_response(\&xref_range_i, @$r); } } sub over_header_for { - my ($over, $num, $field) = @_; - my $smsg = $over->get_art($num) or return; + my ($ibx, $num, $field) = @_; + my $smsg = $ibx->over(1)->get_art($num) or return; return PublicInbox::Smsg::date($smsg) if $field eq 'date'; $smsg->{$field}; } sub smsg_range_i { my ($self, $beg, $end, $field) = @_; - my $over = $self->{ng}->over; - my $msgs = $over->query_xover($$beg, $end); + my $msgs = $self->{ibx}->over(1)->query_xover($$beg, $end); scalar(@$msgs) or return; my $tmp = ''; @@ -765,16 +747,16 @@ sub smsg_range_i { sub hdr_smsg ($$$$) { my ($self, $xhdr, $field, $range) = @_; if (defined $range && $range =~ $ONE_MSGID) { - my ($ng, $n) = mid_lookup($self, $1); + my ($ibx, $n) = mid_lookup($self, $1); return r430 unless defined $n; - my $v = over_header_for($ng->over, $n, $field); - hdr_mid_response($self, $xhdr, $ng, $n, $range, $v); + my $v = over_header_for($ibx, $n, $field); + hdr_mid_response($self, $xhdr, $ibx, $n, $range, $v); } else { # numeric range $range = $self->{article} unless defined $range; my $r = get_range($self, $range); return $r unless ref $r; - more($self, $xhdr ? r221 : r225); - long_response($self, \&smsg_range_i, @$r, $field); + $self->msg_more($xhdr ? r221 : r225); + $self->long_response(\&smsg_range_i, @$r, $field); } } @@ -791,7 +773,7 @@ sub do_hdr ($$$;$) { } elsif ($sub =~ /\A:(bytes|lines)\z/) { hdr_smsg($self, $xhdr, $1, $range); } else { - $xhdr ? (r221 . "\r\n.") : "503 HDR not permitted on $header"; + $xhdr ? (r221.".\r\n") : "503 HDR not permitted on $header\r\n"; } } @@ -808,57 +790,50 @@ sub cmd_xhdr ($$;$) { } sub hdr_mid_prefix ($$$$$) { - my ($self, $xhdr, $ng, $n, $mid) = @_; + my ($self, $xhdr, $ibx, $n, $mid) = @_; return $mid if $xhdr; # HDR for RFC 3977 users - if (my $self_ng = $self->{ng}) { - ($self_ng eq $ng) ? $n : '0'; + if (my $cur_ibx = $self->{ibx}) { + ($cur_ibx eq $ibx) ? $n : '0'; } else { '0'; } } sub hdr_mid_response ($$$$$$) { - my ($self, $xhdr, $ng, $n, $mid, $v) = @_; - my $res = ''; - if ($xhdr) { - $res .= r221 . "\r\n"; - $res .= "$mid $v\r\n"; - } else { - $res .= r225 . "\r\n"; - my $pfx = hdr_mid_prefix($self, $xhdr, $ng, $n, $mid); - $res .= "$pfx $v\r\n"; - } - res($self, $res .= '.'); + my ($self, $xhdr, $ibx, $n, $mid, $v) = @_; + $self->write(($xhdr ? r221.$mid : + r225.hdr_mid_prefix($self, $xhdr, $ibx, $n, $mid)) . + " $v\r\n.\r\n"); undef; } sub xrover_i { my ($self, $beg, $end) = @_; - my $h = over_header_for($self->{ng}->over, $$beg, 'references'); - more($self, "$$beg $h") if defined($h); + my $h = over_header_for($self->{ibx}, $$beg, 'references'); + $self->msg_more("$$beg $h\r\n") if defined($h); $$beg++ < $end; } sub cmd_xrover ($;$) { my ($self, $range) = @_; - my $ng = $self->{ng} or return '412 no newsgroup selected'; + my $ibx = $self->{ibx} or return \"412 no newsgroup selected\r\n"; (defined $range && $range =~ /[<>]/) and - return '420 No article(s) selected'; # no message IDs + return \"420 No article(s) selected\r\n"; # no message IDs $range = $self->{article} unless defined $range; my $r = get_range($self, $range); return $r unless ref $r; - more($self, '224 Overview information follows'); - long_response($self, \&xrover_i, @$r); + $self->msg_more("224 Overview information follows\r\n"); + $self->long_response(\&xrover_i, @$r); } -sub over_line ($$$$) { - my ($self, $ng, $num, $smsg) = @_; +sub over_line ($$$) { + my ($self, $ibx, $smsg) = @_; # n.b. field access and procedural calls can be # 10%-15% faster than OO method calls: - my $s = join("\t", $num, + my $s = join("\t", $smsg->{num}, $smsg->{subject}, $smsg->{from}, PublicInbox::Smsg::date($smsg), @@ -866,24 +841,29 @@ sub over_line ($$$$) { $smsg->{references}, $smsg->{bytes}, $smsg->{lines}, - "Xref: " . xref($self, $ng, $num, $smsg->{mid})); + "Xref: " . xref($self, $ibx, $smsg)); utf8::encode($s); - $s + $s .= "\r\n"; } sub cmd_over ($;$) { my ($self, $range) = @_; if ($range && $range =~ $ONE_MSGID) { - my ($ng, $n) = mid_lookup($self, $1); + my ($ibx, $n) = mid_lookup($self, $1); defined $n or return r430; - my $smsg = $ng->over->get_art($n) or return r430; - more($self, '224 Overview information follows (multi-line)'); + my $smsg = $ibx->over(1)->get_art($n) or return r430; + $self->msg_more( + "224 Overview information follows (multi-line)\r\n"); # Only set article number column if it's the current group - my $self_ng = $self->{ng}; - $n = 0 if (!$self_ng || $self_ng ne $ng); - more($self, over_line($self, $ng, $n, $smsg)); - '.'; + # (RFC 3977 8.3.2) + my $cur_ibx = $self->{ibx}; + if (!$cur_ibx || $cur_ibx ne $ibx) { + # set {-orig_num} for nntp_xref_for + $smsg->{-orig_num} = $smsg->{num}; + $smsg->{num} = 0; + } + over_line($self, $ibx, $smsg).".\r\n"; } else { cmd_xover($self, $range); } @@ -891,13 +871,13 @@ sub cmd_over ($;$) { sub xover_i { my ($self, $beg, $end) = @_; - my $ng = $self->{ng}; - my $msgs = $ng->over->query_xover($$beg, $end); + my $ibx = $self->{ibx}; + my $msgs = $ibx->over(1)->query_xover($$beg, $end); my $nr = scalar @$msgs or return; # OVERVIEW.FMT - more($self, join("\r\n", map { - over_line($self, $ng, $_->{num}, $_); + $self->msg_more(join('', map { + over_line($self, $ibx, $_); } @$msgs)); $$beg = $msgs->[-1]->{num} + 1; } @@ -908,21 +888,20 @@ sub cmd_xover ($;$) { my $r = get_range($self, $range); return $r unless ref $r; my ($beg, $end) = @$r; - more($self, "224 Overview information follows for $$beg to $end"); - long_response($self, \&xover_i, @$r); + $self->msg_more( + "224 Overview information follows for $$beg to $end\r\n"); + $self->long_response(\&xover_i, @$r); } -sub compressed { undef } - sub cmd_starttls ($) { my ($self) = @_; - my $sock = $self->{sock} or return; # RFC 4642 2.2.1 - return r502 if ($sock->can('accept_SSL') || $self->compressed); - my $opt = $self->{nntpd}->{accept_tls} or - return '580 can not initiate TLS negotiation'; - res($self, '382 Continue with TLS negotiation'); - $self->{sock} = IO::Socket::SSL->start_SSL($sock, %$opt); + (($self->{sock} // return)->can('stop_SSL') || $self->compressed) and + return r502; + $self->{nntpd}->{ssl_ctx_opt} or + return \"580 can not initiate TLS negotiation\r\n"; + $self->write(\"382 Continue with TLS negotiation\r\n"); + PublicInbox::TLS::start($self->{sock}, $self->{nntpd}); $self->requeue if PublicInbox::DS::accept_tls_step($self); undef; } @@ -930,43 +909,42 @@ sub cmd_starttls ($) { # RFC 8054 sub cmd_compress ($$) { my ($self, $alg) = @_; - return '503 Only DEFLATE is supported' if uc($alg) ne 'DEFLATE'; + return "503 Only DEFLATE is supported\r\n" if uc($alg) ne 'DEFLATE'; return r502 if $self->compressed; - PublicInbox::NNTPdeflate->enable($self); + PublicInbox::NNTPdeflate->enable($self) or return + \"403 Unable to activate compression\r\n"; + PublicInbox::DS::write($self, \"206 Compression active\r\n"); $self->requeue; undef } -sub zflush {} # overridden by NNTPdeflate - sub cmd_xpath ($$) { my ($self, $mid) = @_; return r501 unless $mid =~ $ONE_MSGID; $mid = $1; my @paths; - foreach my $ng (values %{$self->{nntpd}->{groups}}) { - my $n = $ng->mm->num_for($mid); - push @paths, "$ng->{newsgroup}/$n" if defined $n; + my $pi_cfg = $self->{nntpd}->{pi_cfg}; + my $groups = $pi_cfg->{-by_newsgroup}; + if (my $ALL = $pi_cfg->ALL) { + my ($id, $prev, %seen); + while (my $smsg = $ALL->over->next_by_mid($mid, \$id, \$prev)) { + my $xr3 = $ALL->over->get_xref3($smsg->{num}); + for my $x (@$xr3) { + my ($ngname, $n) = split(/:/, $x); + $x = "$ngname/$n"; + if ($groups->{$ngname} && !$seen{$x}++) { + push(@paths, $x); + } + } + } + } else { # slow path, no point in using long_response + for my $ibx (values %$groups) { + my $n = $ibx->mm(1)->num_for($mid) // next; + push @paths, "$ibx->{newsgroup}/$n"; + } } - return '430 no such article on server' unless @paths; - '223 '.join(' ', @paths); -} - -sub res ($$) { do_write($_[0], $_[1] . "\r\n") } - -sub more ($$) { $_[0]->msg_more($_[1] . "\r\n") } - -sub do_write ($$) { - my $self = $_[0]; - my $done = $self->write(\($_[1])); - return 0 unless $self->{sock}; - - $done; -} - -sub err ($$;@) { - my ($self, $fmt, @args) = @_; - printf { $self->{nntpd}->{err} } $fmt."\n", @args; + return \"430 no such article on server\r\n" unless @paths; + '223 '.join(' ', sort(@paths))."\r\n"; } sub out ($$;@) { @@ -977,10 +955,9 @@ sub out ($$;@) { # callback used by PublicInbox::DS for any (e)poll (in/out/hup/err) sub event_step { my ($self) = @_; - + local $SIG{__WARN__} = $self->{nntpd}->{warn_cb}; return unless $self->flush_write && $self->{sock} && !$self->{long_cb}; - $self->update_idle_time; # only read more requests if we've drained the write buffer, # otherwise we can be buffering infinitely w/o backpressure @@ -1002,17 +979,19 @@ sub event_step { out($self, "[$fd] %s - %0.6f$pending", $line, now() - $t0); return $self->close if $r < 0; $self->rbuf_idle($rbuf); - $self->update_idle_time; # maybe there's more pipelined data, or we'll have # to register it for socket-readiness notifications $self->requeue unless $pending; } -# for graceful shutdown in PublicInbox::Daemon: -sub busy { - my ($self, $now) = @_; - ($self->{rbuf} || $self->{wbuf} || $self->not_idle_long($now)); +sub busy { # for graceful shutdown in PublicInbox::Daemon: + my ($self) = @_; + defined($self->{rbuf}) || defined($self->{wbuf}) } +package PublicInbox::NNTPdeflate; +use PublicInbox::DSdeflate; +our @ISA = qw(PublicInbox::DSdeflate PublicInbox::NNTP); + 1; diff --git a/lib/PublicInbox/NNTPD.pm b/lib/PublicInbox/NNTPD.pm index 6b762d89..4401a29b 100644 --- a/lib/PublicInbox/NNTPD.pm +++ b/lib/PublicInbox/NNTPD.pm @@ -1,74 +1,64 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # represents an NNTPD (currently a singleton), # see script/public-inbox-nntpd for how it is used package PublicInbox::NNTPD; use strict; -use warnings; +use v5.10.1; use Sys::Hostname; use PublicInbox::Config; use PublicInbox::InboxIdle; +use PublicInbox::NNTP; sub new { my ($class) = @_; - my $pi_config = PublicInbox::Config->new; - my $name = $pi_config->{'publicinbox.nntpserver'}; - if (!defined($name) or $name eq '') { - $name = hostname; - } elsif (ref($name) eq 'ARRAY') { - $name = $name->[0]; - } - bless { - groups => {}, err => \*STDERR, out => \*STDOUT, - grouplist => [], - pi_config => $pi_config, - servername => $name, - greet => \"201 $name ready - post via email\r\n", - # accept_tls => { SSL_server => 1, ..., SSL_reuse_ctx => ... } + # pi_cfg => $pi_cfg, + # ssl_ctx_opt => { SSL_cert_file => ..., SSL_key_file => ... } # idler => PublicInbox::InboxIdle }, $class; } sub refresh_groups { my ($self, $sig) = @_; - my $pi_config = $sig ? PublicInbox::Config->new : $self->{pi_config}; - my $new = {}; - my @list; - $pi_config->each_inbox(sub { - my ($ng) = @_; - my $ngname = $ng->{newsgroup} or return; - if (ref $ngname) { - warn 'multiple newsgroups not supported: '. - join(', ', @$ngname). "\n"; - # Newsgroup name needs to be compatible with RFC 3977 - # wildmat-exact and RFC 3501 (IMAP) ATOM-CHAR. - # Leave out a few chars likely to cause problems or conflicts: - # '|', '<', '>', ';', '#', '$', '&', - } elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]!) { - warn "newsgroup name invalid: `$ngname'\n"; - } elsif ($ng->nntp_usable) { - # Only valid if msgmap and search works - $new->{$ngname} = $ng; - push @list, $ng; - + my $pi_cfg = PublicInbox::Config->new; + my $name = $pi_cfg->{'publicinbox.nntpserver'}; + if (!defined($name) or $name eq '') { + $name = hostname; + } elsif (ref($name) eq 'ARRAY') { + $name = $name->[0]; + } + if ($name ne ($self->{servername} // '')) { + $self->{servername} = $name; + $self->{greet} = \"201 $name ready - post via email\r\n"; + } + my $groups = $pi_cfg->{-by_newsgroup}; # filled during each_inbox + my $cache = eval { $pi_cfg->ALL->misc->nntpd_cache_load } // {}; + $pi_cfg->each_inbox(sub { + my ($ibx) = @_; + my $ngname = $ibx->{newsgroup} // return; + my $ce = $cache->{$ngname}; + if (($ce and (%$ibx = (%$ibx, %$ce))) || $ibx->nntp_usable) { + # only valid if msgmap and over works # preload to avoid fragmentation: - $ng->description; - $ng->base_url; + $ibx->description; + } else { + delete $groups->{$ngname}; + # Note: don't be tempted to delete more for memory + # savings just yet: NNTP, IMAP, and WWW may all + # run in the same process someday. } }); - @list = sort { $a->{newsgroup} cmp $b->{newsgroup} } @list; - $self->{grouplist} = \@list; - $self->{pi_config} = $pi_config; + @{$self->{groupnames}} = sort(keys %$groups); # this will destroy old groups that got deleted - %{$self->{groups}} = %$new; + $self->{pi_cfg} = $pi_cfg; } sub idler_start { - $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_config}); + $_[0]->{idler} //= PublicInbox::InboxIdle->new($_[0]->{pi_cfg}); } 1; diff --git a/lib/PublicInbox/NetNNTPSocks.pm b/lib/PublicInbox/NetNNTPSocks.pm new file mode 100644 index 00000000..d27efba1 --- /dev/null +++ b/lib/PublicInbox/NetNNTPSocks.pm @@ -0,0 +1,36 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# wrap Net::NNTP client with SOCKS support. Convoluted, but AFAIK this +# is the only way to get SOCKS working with Net::NNTP w/o LD_PRELOAD. +package PublicInbox::NetNNTPSocks; +use v5.12; +use Net::NNTP; +our %OPT; # used to pass options between ->new_socks and our ->new +our @ISA = qw(IO::Socket::Socks); + +# use this instead of Net::NNTP->new if using Proxy* +sub new_socks { + my (undef, %opt) = @_; + require IO::Socket::Socks; + local @Net::NNTP::ISA = (qw(Net::Cmd), __PACKAGE__); + local %OPT = map {; + defined($opt{$_}) ? ($_ => $opt{$_}) : () + } qw(ProxyAddr ProxyPort SocksVersion SocksDebug SocksResolve); + no warnings 'uninitialized'; # needed for $SOCKS_ERROR + my $ret = Net::NNTP->new(%opt); # calls PublicInbox::NetNNTPSocks::new + return $ret if $ret || $!{EINTR}; + $ret // die "errors: \$!=$! SOCKS=", + eval('$IO::Socket::Socks::SOCKS_ERROR // ""'), + ', SSL=', + (eval('IO::Socket::SSL->errstr') // ''), "\n"; +} + +# called by Net::NNTP->new +sub new { + my ($self, %opt) = @_; + @OPT{qw(ConnectAddr ConnectPort)} = @opt{qw(PeerAddr PeerPort)}; + $self->SUPER::new(%OPT); +} + +1; diff --git a/lib/PublicInbox/NetReader.pm b/lib/PublicInbox/NetReader.pm new file mode 100644 index 00000000..ec18818b --- /dev/null +++ b/lib/PublicInbox/NetReader.pm @@ -0,0 +1,896 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# common reader code for IMAP and NNTP (and maybe JMAP) +package PublicInbox::NetReader; +use v5.12; +use parent qw(Exporter PublicInbox::IPC); +use PublicInbox::Eml; +use PublicInbox::Config; +our %IMAPflags2kw = map {; "\\\u$_" => $_ } qw(seen answered flagged draft); +$IMAPflags2kw{'$Forwarded'} = 'forwarded'; # RFC 5550 + +our @EXPORT = qw(uri_section imap_uri nntp_uri); + +sub ndump { + require Data::Dumper; + Data::Dumper->new([ $_[-1] ])->Useqq(1)->Terse(1)->Dump; +} + +# returns the git config section name, e.g [imap "imaps://user@example.com"] +# without the mailbox, so we can share connections between different inboxes +sub uri_section ($) { + my ($uri) = @_; + $uri->scheme . '://' . $uri->authority; +} + +sub socks_args ($) { + my ($val) = @_; + return if ($val // '') eq ''; + if ($val =~ m!\Asocks5h:// (?: \[ ([^\]]+) \] | ([^:/]+) ) + (?::([0-9]+))?/*\z!ix) { + my ($h, $p) = ($1 // $2, $3 + 0); + $h = '127.0.0.1' if $h eq '0'; + eval { require IO::Socket::Socks } or die <<EOM; +IO::Socket::Socks missing for socks5h://$h:$p +EOM + # for IO::Socket::Socks + return { ProxyAddr => $h, ProxyPort => $p }; + } + die "$val not understood (only socks5h:// is supported)\n"; +} + +# gives an arrayref suitable for the Mail::IMAPClient Ssl or Starttls arg +sub mic_tls_opt ($$) { + my ($o, $hostname) = @_; + require IO::Socket::SSL; + $o = {} if !ref($o); + $o->{SSL_hostname} //= $hostname; + [ map { ($_, $o->{$_}) } keys %$o ]; +} + +sub set_ssl_verify_mode ($$) { + my ($o, $bool) = @_; + require IO::Socket::SSL; + $o->{SSL_verify_mode} = $bool ? IO::Socket::SSL::SSL_VERIFY_PEER() : + IO::Socket::SSL::SSL_VERIFY_NONE(); +} + +sub mic_new ($$$$) { + my ($self, $mic_arg, $sec, $uri) = @_; + my %mic_arg = (%$mic_arg, Keepalive => 1); + my $sa = $self->{cfg_opt}->{$sec}->{-proxy_cfg} || $self->{-proxy_cli}; + my ($mic, $s, $t); + if ($sa) { + # this `require' needed for worker[1..Inf], since socks_args + # only got called in worker[0] + require IO::Socket::Socks; + my %opt = (%$sa, Keepalive => 1); + $opt{SocksDebug} = 1 if $mic_arg{Debug}; + $opt{ConnectAddr} = delete $mic_arg{Server}; + $opt{ConnectPort} = delete $mic_arg{Port}; + do { + $! = 0; + $s = IO::Socket::Socks->new(%opt); + } until ($s || !$!{EINTR} || $self->{quit}); + return if $self->{quit}; + $s or die "E: <$uri> ".eval('$IO::Socket::Socks::SOCKS_ERROR'); + $mic_arg{Socket} = $s; + if (my $o = delete $mic_arg{Ssl}) { # for imaps:// + $o = mic_tls_opt($o, $opt{ConnectAddr}); + do { + $! = 0; + $t = IO::Socket::SSL->start_SSL($s, @$o); + } until ($t || !$!{EINTR} || $self->{quit}); + return if $self->{quit}; + $t or die "E: <$uri> ".(IO::Socket::SSL->errstr // ''); + $mic_arg{Socket} = $t; + } elsif ($o = $mic_arg{Starttls}) { + # Mail::IMAPClient will use this: + $mic_arg{Starttls} = mic_tls_opt($o, $opt{ConnectAddr}); + } + } elsif ($mic_arg{Ssl} || $mic_arg{Starttls}) { + for my $f (qw(Ssl Starttls)) { + my $o = $mic_arg{$f} or next; + $mic_arg{$f} = mic_tls_opt($o, $mic_arg{Server}); + } + } + do { + $! = 0; + $mic = PublicInbox::IMAPClient->new(%mic_arg); + } until ($mic || !$!{EINTR} || $self->{quit}); + $mic; +} + +sub auth_anon_cb { '' }; # for Mail::IMAPClient::Authcallback + +sub onion_hint ($$) { + my ($lei, $uri) = @_; + $uri->host =~ /\.onion\z/i or return "\n"; + my $t = $uri->isa('PublicInbox::URIimap') ? 'imap' : 'nntp'; + my $url = PublicInbox::Config::squote_maybe(uri_section($uri)); + my $scheme = $uri->scheme; + my $set_cfg = 'lei config'; + if (!$lei) { # public-inbox-watch + my $f = PublicInbox::Config::squote_maybe( + $ENV{PI_CONFIG} || '~/.public-inbox/config'); + $set_cfg = "git config -f $f"; + } + my $dq = substr($url, 0, 1) eq "'" ? '"' : ''; + <<EOM + +Assuming you have Tor configured and running locally on port 9050, +try configuring a socks5h:// proxy: + + url=$url + $set_cfg $t.$dq\$url$dq.proxy socks5h://127.0.0.1:9050 + +git 2.26+ users may instead rely on `*' to match all .onion URLs: + + $set_cfg '$t.$scheme://*.onion.proxy' socks5h://127.0.0.1:9050 + +...before retrying your current command +EOM +} + +# Net::NNTP doesn't support CAPABILITIES, yet; and both IMAP+NNTP +# servers may have multiple listen sockets. +sub try_starttls ($) { + my ($host) = @_; + return if $host =~ /\.onion\z/si; + return if $host =~ /\A127\.[0-9]+\.[0-9]+\.[0-9]+\z/s; + return if $host eq '::1'; + 1; +} + +# mic_for may prompt the user and store auth info, prepares mic_get +sub mic_for ($$$$) { # mic = Mail::IMAPClient + my ($self, $uri, $mic_common, $lei) = @_; + require PublicInbox::GitCredential; + my $cred = bless { + url => "$uri", + protocol => $uri->scheme, + host => $uri->host, + username => $uri->user, + password => $uri->password, + }, 'PublicInbox::GitCredential'; + my $sec = uri_section($uri); + my $common = $mic_common->{$sec} // {}; + # IMAPClient and Net::Netrc both mishandles `0', so we pass `127.0.0.1' + my $host = $cred->{host}; + $host = '127.0.0.1' if $host eq '0'; + my $mic_arg = { + Port => $uri->port, + Server => $host, + %$common, # may set Starttls, Compress, Debug .... + }; + require PublicInbox::IMAPClient; + my $mic = mic_new($self, $mic_arg, $sec, $uri); + return if $self->{quit}; + ($mic && $mic->IsConnected) or + die "E: <$uri> new: $@".onion_hint($lei, $uri); + + # default to using STARTTLS if it's available, but allow + # it to be disabled since I usually connect to localhost + if (!$mic_arg->{Ssl} && !defined($mic_arg->{Starttls}) && + $mic->has_capability('STARTTLS') && + try_starttls($host) && + $mic->can('starttls')) { + $mic->starttls or die "E: <$uri> STARTTLS: $@\n"; + } + + # do we even need credentials? + if (!defined($cred->{username}) && + $mic->has_capability('AUTH=ANONYMOUS')) { + $cred = undef; + } + if ($cred) { + my $p = $cred->{password} // $cred->check_netrc($lei); + $cred->fill($lei) unless defined($p); # may prompt user here + $mic->User($mic_arg->{User} = $cred->{username}); + $mic->Password($mic_arg->{Password} = $cred->{password}); + } else { # AUTH=ANONYMOUS + $mic->Authmechanism($mic_arg->{Authmechanism} = 'ANONYMOUS'); + $mic_arg->{Authcallback} = 'auth_anon_cb'; + $mic->Authcallback(\&auth_anon_cb); + } + my $err; + if ($mic->login && $mic->IsAuthenticated) { + # success! keep IMAPClient->new arg in case we get disconnected + $self->{net_arg}->{$sec} = $mic_arg; + if ($cred) { + $uri->user($cred->{username}) if !defined($uri->user); + } elsif ($mic_arg->{Authmechanism} eq 'ANONYMOUS') { + $uri->auth('ANONYMOUS') if !defined($uri->auth); + } + } else { + $err = "E: <$uri> LOGIN: $@\n"; + if ($cred && defined($cred->{password})) { + $err =~ s/\Q$cred->{password}\E/*******/g; + } + $mic = undef; + } + $cred->run($mic ? 'approve' : 'reject') if $cred && $cred->{filled}; + if ($err) { + $lei ? $lei->fail($err) : warn($err); + } + $mic; +} + +sub nn_new ($$$$) { + my ($self, $nn_arg, $nntp_cfg, $uri) = @_; + my $nn; + my ($Net_NNTP, $new) = qw(Net::NNTP new); + if (defined $nn_arg->{ProxyAddr}) { + require PublicInbox::NetNNTPSocks; + ($Net_NNTP, $new) = qw(PublicInbox::NetNNTPSocks new_socks); + $nn_arg->{SocksDebug} = 1 if $nn_arg->{Debug}; + } + do { + $! = 0; + $nn = $Net_NNTP->$new(%$nn_arg); + } until ($nn || !$!{EINTR} || $self->{quit}); + $nn // return; + setsockopt($nn, Socket::SOL_SOCKET(), Socket::SO_KEEPALIVE(), 1); + + # default to using STARTTLS if it's available, but allow + # it to be disabled for localhost/VPN users + if (!$nn_arg->{SSL} && $nn->can('starttls')) { + if (!defined($nntp_cfg->{starttls}) && + try_starttls($nn_arg->{Host})) { + # soft fail by default + $nn->starttls or warn <<""; +W: <$uri> STARTTLS tried and failed (not requested): ${\(ndump($nn->message))} + + } elsif ($nntp_cfg->{starttls}) { + # hard fail if explicitly configured + $nn->starttls or die <<""; +E: <$uri> STARTTLS requested and failed: ${\(ndump($nn->message))} + + } + } elsif ($nntp_cfg->{starttls}) { + $nn->can('starttls') or + die "E: <$uri> Net::NNTP too old for STARTTLS\n"; + $nn->starttls or die <<""; +E: <$uri> STARTTLS requested and failed: ${\(ndump($nn->message))} + + } + $nn; +} + +sub nn_for ($$$$) { # nn = Net::NNTP + my ($self, $uri, $nn_common, $lei) = @_; + my $sec = uri_section($uri); + my $nntp_cfg = $self->{cfg_opt}->{$sec} //= {}; + my $host = $uri->host; + # Net::NNTP and Net::Netrc both mishandle `0', so we pass `127.0.0.1' + $host = '127.0.0.1' if $host eq '0'; + my $cred; + my ($u, $p); + if (defined(my $ui = $uri->userinfo)) { + require PublicInbox::GitCredential; + $cred = bless { + url => $sec, + protocol => $uri->scheme, + host => $host, + }, 'PublicInbox::GitCredential'; + ($u, $p) = split(/:/, $ui, 2); + ($cred->{username}, $cred->{password}) = ($u, $p); + $p //= $cred->check_netrc($lei); + } + my $common = $nn_common->{$sec} // {}; + my $nn_arg = { + Port => $uri->port, + Host => $host, + %$common, # may Debug .... + }; + $nn_arg->{SSL} = 1 if $uri->secure; # snews == nntps + my $sa = $self->{-proxy_cli}; + %$nn_arg = (%$nn_arg, %$sa) if $sa; + my $nn = nn_new($self, $nn_arg, $nntp_cfg, $uri); + return if $self->{quit}; + $nn // die "E: <$uri> new: $@".onion_hint($lei, $uri); + if ($cred) { + $p //= do { + $cred->fill($lei); # may prompt user here + $cred->{password}; + }; + if ($nn->authinfo($u, $p)) { + push @{$nntp_cfg->{-postconn}}, [ 'authinfo', $u, $p ]; + } else { + warn <<EOM; +E: <$uri> AUTHINFO $u XXXX: ${\(ndump($nn->message))} +EOM + $nn = undef; + } + } + if ($nn && $nntp_cfg->{compress}) { + # https://rt.cpan.org/Ticket/Display.html?id=129967 + if ($nn->can('compress')) { + if ($nn->compress) { + push @{$nntp_cfg->{-postconn}}, [ 'compress' ]; + } else { + warn <<EOM; +W: <$uri> COMPRESS: ${\(ndump($nn->message))} +EOM + } + } else { + delete $nntp_cfg->{compress}; + warn <<""; +W: <$uri> COMPRESS not supported by Net::NNTP +W: see https://rt.cpan.org/Ticket/Display.html?id=129967 for updates + + } + } + + $self->{net_arg}->{$sec} = $nn_arg; + $cred->run($nn ? 'approve' : 'reject') if $cred && $cred->{filled}; + $nn; +} + +sub imap_uri { + my ($url, $ls_ok) = @_; + require PublicInbox::URIimap; + my $uri = PublicInbox::URIimap->new($url); + $uri && ($ls_ok || $uri->mailbox) ? $uri->canonical : undef; +} + +my %IS_NNTP = (news => 1, snews => 1, nntp => 1, nntps => 1); +sub nntp_uri { + my ($url, $ls_ok) = @_; + require PublicInbox::URInntps; + my $uri = PublicInbox::URInntps->new($url); + $uri && $IS_NNTP{$uri->scheme} && ($ls_ok || $uri->group) ? + $uri->canonical : undef; +} + +sub cfg_intvl ($$$) { + my ($cfg, $key, $url) = @_; + my $v = $cfg->urlmatch($key, $url) // return; + $v =~ /\A[0-9]+(?:\.[0-9]+)?\z/s and return $v + 0; + if (ref($v) eq 'ARRAY') { + $v = join(', ', @$v); + warn "W: $key has multiple values: $v\nW: $key ignored\n"; + } else { + warn "W: $key=$v is not a numeric value in seconds\n"; + } +} + +# flesh out common IMAP-specific data structures +sub imap_common_init ($;$) { + my ($self, $lei) = @_; + return unless $self->{imap_order}; + $self->{quiet} = 1 if $lei && $lei->{opt}->{quiet}; + eval { require PublicInbox::IMAPClient } or + die "Mail::IMAPClient is required for IMAP:\n$@\n"; + ($lei || eval { require PublicInbox::IMAPTracker }) or + die "DBD::SQLite is required for IMAP\n:$@\n"; + require PublicInbox::URIimap; + my $cfg = $self->{pi_cfg} // $lei->_lei_cfg; + my $mic_common = {}; # scheme://authority => Mail:IMAPClient arg + for my $uri (@{$self->{imap_order}}) { + my $sec = uri_section($uri); + + # knobs directly for Mail::IMAPClient->new + for my $k (qw(Starttls Debug Compress)) { + my $v = $cfg->urlmatch('--bool', "imap.$k", $$uri); + $mic_common->{$sec}->{$k} = $v if defined $v; + } + my $to = cfg_intvl($cfg, 'imap.timeout', $$uri); + $mic_common->{$sec}->{Timeout} = $to if $to; + $mic_common->{$sec}->{Ssl} = 1 if $uri->scheme eq 'imaps'; + + # knobs we use ourselves: + my $sa = socks_args($cfg->urlmatch('imap.Proxy', $$uri)); + $self->{cfg_opt}->{$sec}->{-proxy_cfg} = $sa if $sa; + for my $k (qw(pollInterval idleInterval)) { + $to = cfg_intvl($cfg, "imap.$k", $$uri) // next; + $self->{cfg_opt}->{$sec}->{$k} = $to; + } + my $k = 'imap.fetchBatchSize'; + if (defined(my $bs = $cfg->urlmatch($k, $$uri))) { + ($bs =~ /\A([0-9]+)\z/ && $bs > 0) ? + ($self->{cfg_opt}->{$sec}->{batch_size} = $bs) : + warn("$k=$bs is not a positive integer\n"); + } + my $v = $cfg->urlmatch(qw(--bool imap.sslVerify), $$uri); + if (defined $v) { + my $cur = $mic_common->{$sec} //= {}; + $cur->{Starttls} //= 1 if !$cur->{Ssl}; + for my $f (grep { $cur->{$_} } qw(Ssl Starttls)) { + set_ssl_verify_mode($cur->{$f} = {}, $v); + } + } + } + # make sure we can connect and cache the credentials in memory + my $mics = {}; # schema://authority => IMAPClient obj + for my $orig_uri (@{$self->{imap_order}}) { + my $sec = uri_section($orig_uri); + my $uri = PublicInbox::URIimap->new("$sec/"); + my $mic = $mics->{$sec} //= + mic_for($self, $uri, $mic_common, $lei); + return if $self->{quit}; + $mic // die "Unable to continue\n"; + next unless $self->isa('PublicInbox::NetWriter'); + next if $self->{-skip_creat}; + my $dst = $orig_uri->mailbox // next; + next if $mic->exists($dst); # already exists + $mic->create($dst) or die "CREATE $dst failed <$orig_uri>: $@"; + } + $mics; +} + +# flesh out common NNTP-specific data structures +sub nntp_common_init ($;$) { + my ($self, $lei) = @_; + return unless $self->{nntp_order}; + $self->{quiet} = 1 if $lei && $lei->{opt}->{quiet}; + eval { require Net::NNTP } or + die "Net::NNTP is required for NNTP:\n$@\n"; + ($lei || eval { require PublicInbox::IMAPTracker }) or + die "DBD::SQLite is required for NNTP\n:$@\n"; + my $cfg = $self->{pi_cfg} // $lei->_lei_cfg; + my $nn_common = {}; # scheme://authority => Net::NNTP->new arg + for my $uri (@{$self->{nntp_order}}) { + my $sec = uri_section($uri); + my $args = $nn_common->{$sec} //= {}; + + # Debug and Timeout are passed to Net::NNTP->new + my $v = $cfg->urlmatch(qw(--bool nntp.Debug), $$uri); + $args->{Debug} = $v if defined $v; + my $to = cfg_intvl($cfg, 'nntp.Timeout', $$uri); + $args->{Timeout} = $to if $to; + my $sa = socks_args($cfg->urlmatch('nntp.Proxy', $$uri)); + %$args = (%$args, %$sa) if $sa; + + # Net::NNTP post-connect commands + for my $k (qw(starttls compress)) { + $v = $cfg->urlmatch('--bool', "nntp.$k", $$uri); + $self->{cfg_opt}->{$sec}->{$k} = $v if defined $v; + } + $v = $cfg->urlmatch(qw(--bool nntp.sslVerify), $$uri); + set_ssl_verify_mode($args, $v) if defined $v; + + # -watch internal option + for my $k (qw(pollInterval)) { + $to = cfg_intvl($cfg, "nntp.$k", $$uri) // next; + $self->{cfg_opt}->{$sec}->{$k} = $to; + } + } + # make sure we can connect and cache the credentials in memory + my %nn; # schema://authority => Net::NNTP object + for my $uri (@{$self->{nntp_order}}) { + my $sec = uri_section($uri); + $nn{$sec} //= nn_for($self, $uri, $nn_common, $lei); + } + \%nn; # for optional {nn_cached} +} + +sub add_url { + my ($self, $arg, $ls_ok) = @_; + my $uri; + if ($uri = imap_uri($arg, $ls_ok)) { + $_[1] = $$uri; # canonicalized + push @{$self->{imap_order}}, $uri; + } elsif ($uri = nntp_uri($arg, $ls_ok)) { + $_[1] = $$uri; # canonicalized + push @{$self->{nntp_order}}, $uri; + } else { + push @{$self->{unsupported_url}}, $arg; + } +} + +sub errors { + my ($self, $lei) = @_; + if (my $u = $self->{unsupported_url}) { + return "Unsupported URL(s): @$u"; + } + if ($self->{imap_order}) { + eval { require PublicInbox::IMAPClient } or + die "Mail::IMAPClient is required for IMAP:\n$@\n"; + } + if ($self->{nntp_order}) { + eval { require Net::NNTP } or + die "Net::NNTP is required for NNTP:\n$@\n"; + } + my $sa = socks_args($lei ? $lei->{opt}->{proxy} : undef); + $self->{-proxy_cli} = $sa if $sa; + undef; +} + +sub flags2kw ($$$$) { + my ($self, $uri, $uid, $flags) = @_; + my $kw = []; + for my $f (split(/ /, $flags)) { + if (my $k = $IMAPflags2kw{$f}) { + push @$kw, $k; + } elsif ($f eq "\\Recent") { # not in JMAP + } elsif ($f eq "\\Deleted") { # not in JMAP + return; + } elsif ($self->{verbose}) { + warn "# unknown IMAP flag $f <$uri/;UID=$uid>\n"; + } + } + @$kw = sort @$kw; # for LeiSearch->kw_changed and UI/UX purposes + $kw; +} + +sub _imap_do_msg ($$$$$) { + my ($self, $uri, $uid, $raw, $flags) = @_; + # our target audience expects LF-only, save storage + $$raw =~ s/\r\n/\n/sg; + my $kw = defined($flags) ? + (flags2kw($self, $uri, $uid, $flags) // return) : undef; + my ($eml_cb, @args) = @{$self->{eml_each}}; + $eml_cb->($uri, $uid, $kw, PublicInbox::Eml->new($raw), @args); +} + +sub run_commit_cb ($) { + my ($self) = @_; + my $cmt_cb_args = $self->{on_commit} or return; + my ($cb, @args) = @$cmt_cb_args; + $cb->(@args); +} + +sub itrk_last ($$;$$) { + my ($self, $uri, $r_uidval, $mic) = @_; + return (undef, undef, $r_uidval) unless $self->{incremental}; + my ($itrk, $l_uid, $l_uidval); + if (defined(my $lms = $self->{-lms_rw})) { # LeiMailSync or 0 + $uri->uidvalidity($r_uidval) if defined $r_uidval; + if ($mic) { + my $auth = $mic->Authmechanism // ''; + $uri->auth($auth) if $auth eq 'ANONYMOUS'; + my $user = $mic->User; + $uri->user($user) if defined($user); + } + my $x; + $l_uid = ($lms && ($x = $lms->location_stats($$uri))) ? + $x->{'uid.max'} : undef; + # itrk remains undef, lei/store worker writes to + # mail_sync.sqlite3 + } else { + $itrk = PublicInbox::IMAPTracker->new($$uri); + ($l_uidval, $l_uid) = $itrk->get_last($$uri); + } + ($itrk, $l_uid, $l_uidval //= $r_uidval); +} + +# import flags of already-seen messages +sub each_old_flags ($$$$) { + my ($self, $mic, $uri, $l_uid) = @_; + $l_uid ||= 1; + my $sec = uri_section($uri); + my $bs = ($self->{cfg_opt}->{$sec}->{batch_size} // 1) * 10000; + my ($eml_cb, @args) = @{$self->{eml_each}}; + $self->{quiet} or warn "# $uri syncing flags 1:$l_uid\n"; + for (my $n = 1; $n <= $l_uid; $n += $bs) { + my $end = $n + $bs; + $end = $l_uid if $end > $l_uid; + my $r = $mic->fetch_hash("$n:$end", 'FLAGS'); + if (!$r) { + return if $!{EINTR} && $self->{quit}; + return "E: $uri UID FETCH $n:$end error: $!"; + } + while (my ($uid, $per_uid) = each %$r) { + my $kw = flags2kw($self, $uri, $uid, $per_uid->{FLAGS}) + // next; + # LeiImport->input_net_cb + $eml_cb->($uri, $uid, $kw, undef, @args); + } + } +} + +# returns true if PERMANENTFLAGS indicates FLAGS of already imported +# messages are meaningful +sub perm_fl_ok ($) { + my ($perm_fl) = @_; + return if !defined($perm_fl); + for my $f (split(/[ \t]+/, $perm_fl)) { + return 1 if $IMAPflags2kw{$f}; + } + undef; +} + +# may be overridden in NetWriter or Watch +sub folder_select { $_[0]->{each_old} ? 'select' : 'examine' } + +sub _imap_fetch_bodies ($$$$) { + my ($self, $mic, $uri, $uids) = @_; + my $req = $mic->imap4rev1 ? 'BODY.PEEK[]' : 'RFC822.PEEK'; + my $key = $req; + $key =~ s/\.PEEK//; + my $sec = uri_section($uri); + my $mbx = $uri->mailbox; + my $bs = $self->{cfg_opt}->{$sec}->{batch_size} // 1; + my ($last_uid, $err); + my $use_fl = $self->{-use_fl}; + + while (scalar @$uids) { + my @batch = splice(@$uids, 0, $bs); + my $batch = join(',', @batch); + local $0 = "UID:$batch $mbx $sec"; + my $r = $mic->fetch_hash($batch, $req, 'FLAGS'); + unless ($r) { # network error? + last if $!{EINTR} && $self->{quit}; + $err = "E: $uri UID FETCH $batch error: $!"; + last; + } + for my $uid (@batch) { + # messages get deleted, so holes appear + my $per_uid = delete $r->{$uid} // next; + my $raw = delete($per_uid->{$key}) // next; + my $fl = $use_fl ? $per_uid->{FLAGS} : undef; + _imap_do_msg($self, $uri, $uid, \$raw, $fl); + $last_uid = $uid; + last if $self->{quit}; + } + last if $self->{quit}; + } + ($last_uid, $err); +} + +sub _imap_fetch_all ($$$) { + my ($self, $mic, $orig_uri) = @_; + my $sec = uri_section($orig_uri); + my $mbx = $orig_uri->mailbox; + $mic->Clear(1); # trim results history + + # we need to check for mailbox writability to see if we care about + # FLAGS from already-imported messages. + my $cmd = $self->folder_select; + $mic->$cmd($mbx) or return "E: \U$cmd\E $mbx ($sec) failed: $!"; + + my ($r_uidval, $r_uidnext, $perm_fl); + for ($mic->Results) { + /^\* OK \[PERMANENTFLAGS \(([^\)]*)\)\].*/ and $perm_fl = $1; + /^\* OK \[UIDVALIDITY ([0-9]+)\].*/ and $r_uidval = $1; + /^\* OK \[UIDNEXT ([0-9]+)\].*/ and $r_uidnext = $1; + } + $r_uidval //= $mic->uidvalidity($mbx) // + return "E: $orig_uri cannot get UIDVALIDITY"; + $r_uidnext //= $mic->uidnext($mbx) // + return "E: $orig_uri cannot get UIDNEXT"; + my $expect = $orig_uri->uidvalidity // $r_uidval; + return <<EOF if $expect != $r_uidval; +E: $orig_uri UIDVALIDITY mismatch (got $r_uidval) +EOF + + my $uri = $orig_uri->clone; + my $single_uid = $uri->uid; + my ($itrk, $l_uid, $l_uidval) = itrk_last($self, $uri, $r_uidval, $mic); + if (defined($single_uid)) { + $itrk = $l_uid = undef; + $uri->uid(undef); # for eml_cb + } + return <<EOF if $l_uidval != $r_uidval; +E: $uri UIDVALIDITY mismatch +E: local=$l_uidval != remote=$r_uidval +EOF + $uri->uidvalidity($r_uidval); + $l_uid //= 0; + my $r_uid = $r_uidnext - 1; + return <<EOF if $l_uid > $r_uid; +E: $uri local UID exceeds remote ($l_uid > $r_uid) +E: $uri strangely, UIDVALIDLITY matches ($l_uidval) +EOF + $mic->Uid(1); # the default, we hope + my $err; + my $use_fl = perm_fl_ok($perm_fl); + local $self->{-use_fl} = $use_fl; + if (!defined($single_uid) && $self->{each_old} && $use_fl) { + $err = each_old_flags($self, $mic, $uri, $l_uid); + return $err if $err; + } + return if $l_uid >= $r_uid; # nothing to do + $l_uid ||= 1; + my ($mod, $shard) = @{$self->{shard_info} // []}; + unless ($self->{quiet}) { + my $m = $mod ? " [(UID % $mod) == $shard]" : ''; + warn "# $uri fetching UID $l_uid:$r_uid$m\n"; + } + my $fetch_cb = \&_imap_fetch_bodies; + do { + # I wish "UID FETCH $START:*" could work, but: + # 1) servers do not need to return results in any order + # 2) Mail::IMAPClient doesn't offer a streaming API + my $uids; + if (defined $single_uid) { + $uids = [ $single_uid ]; + } elsif (!($uids = $mic->search("UID $l_uid:*"))) { + return if $!{EINTR} && $self->{quit}; + return "E: $uri UID SEARCH $l_uid:* error: $!"; + } + return if scalar(@$uids) == 0; + + # RFC 3501 doesn't seem to indicate order of UID SEARCH + # responses, so sort it ourselves. Order matters so + # IMAPTracker can store the newest UID. + @$uids = sort { $a <=> $b } @$uids; + + # Did we actually get new messages? + return if $uids->[0] < $l_uid; + + $l_uid = $uids->[-1] + 1; # for next search + @$uids = grep { ($_ % $mod) == $shard } @$uids if $mod; + (my $last_uid, $err) = $fetch_cb->($self, $mic, $uri, $uids); + run_commit_cb($self); + $itrk->update_last($r_uidval, $last_uid) if $itrk; + } until ($err || $self->{quit} || defined($single_uid)); + $err; +} + +# uses cached auth info prepared by mic_for +sub mic_get { + my ($self, $uri) = @_; + my $sec = uri_section($uri); + # see if caller saved result of imap_common_init + my $cached = $self->{mics_cached}; + if ($cached) { + my $mic = $cached->{$sec}; + return $mic if $mic && $mic->IsConnected; + delete $cached->{$sec}; + } + my $mic_arg = $self->{net_arg}->{$sec} or + die "BUG: no Mail::IMAPClient->new arg for $sec"; + if (defined(my $cb_name = $mic_arg->{Authcallback})) { + if (ref($cb_name) ne 'CODE') { + $mic_arg->{Authcallback} = $self->can($cb_name); + } + } + my $mic = mic_new($self, $mic_arg, $sec, $uri); + $cached //= {}; # invalid placeholder if no cache enabled + if ($mic && $mic->IsConnected) { + $cached->{$sec} = $mic; + } else { + warn 'IMAP LastError: ',$mic->LastError, "\n" if $mic; + warn "IMAP errno: $!\n" if $!; + undef; + } +} + +sub imap_each { + my ($self, $url, $eml_cb, @args) = @_; + my $uri = ref($url) ? $url : PublicInbox::URIimap->new($url); + my $sec = uri_section($uri); + local $0 = $uri->mailbox." $sec"; + my $mic = mic_get($self, $uri); + my $err; + if ($mic) { + local $self->{eml_each} = [ $eml_cb, @args ]; + $err = _imap_fetch_all($self, $mic, $uri); + } else { + $err = "E: <$uri> not connected: $!"; + } + die $err if $err && $self->{-can_die}; + warn $err if $err; + $mic; +} + +# may used cached auth info prepared by nn_for once +sub nn_get { + my ($self, $uri) = @_; + my $sec = uri_section($uri); + # see if caller saved result of nntp_common_init + my $cached = $self->{nn_cached} // {}; + my $nn; + $nn = delete($cached->{$sec}) and return $nn; + my $nn_arg = $self->{net_arg}->{$sec} or + die "BUG: no Net::NNTP->new arg for $sec"; + my $nntp_cfg = $self->{cfg_opt}->{$sec}; + $nn = nn_new($self, $nn_arg, $nntp_cfg, $uri) or return; + if (my $postconn = $nntp_cfg->{-postconn}) { + for my $m_arg (@$postconn) { + my ($method, @args) = @$m_arg; + $nn->$method(@args) and next; + die "E: <$uri> $method failed\n"; + return; + } + } + $nn; +} + +sub _nntp_fetch_all ($$$) { + my ($self, $nn, $uri) = @_; + my ($group, $num_a, $num_b) = $uri->group; + my $sec = uri_section($uri); + my ($nr, $beg, $end) = $nn->group($group); + unless (defined($nr)) { + my $msg = ndump($nn->message); + return "E: GROUP $group <$sec> $msg"; + } + (defined($num_a) && defined($num_b) && $num_a > $num_b) and + return "E: $uri: backwards range: $num_a > $num_b"; + if (defined($num_a)) { # no article numbers in mail_sync.sqlite3 + $uri = $uri->clone; + $uri->group($group); + } + # IMAPTracker is also used for tracking NNTP, UID == article number + # LIST.ACTIVE can get the equivalent of UIDVALIDITY, but that's + # expensive. So we assume newsgroups don't change: + my ($itrk, $l_art) = itrk_last($self, $uri); + + if (defined($l_art) && !defined($num_a)) { + return if $l_art >= $end; # nothing to do + $beg = $l_art + 1; + } + # allow users to specify articles to refetch + # cf. https://tools.ietf.org/id/draft-gilman-news-url-01.txt + # nntp://example.com/inbox.foo/$num_a-$num_b + $beg = $num_a if defined($num_a) && $num_a > $beg && $num_a <= $end; + $end = $num_b if defined($num_b) && $num_b >= $beg && $num_b < $end; + $end = $beg if defined($num_a) && !defined($num_b); + my ($err, $last_art, $kw); # kw stays undef, no keywords in NNTP + warn "# $uri fetching ARTICLE $beg..$end\n" if !$self->{quiet}; + my $n = $self->{max_batch}; + for my $art ($beg..$end) { + last if $self->{quit}; + local $0 = "#$art $group $sec"; + if (--$n < 0) { + run_commit_cb($self); + $itrk->update_last(0, $last_art) if $itrk; + $n = $self->{max_batch}; + } + my $raw = $nn->article($art); + unless (defined($raw)) { + my $msg = ndump($nn->message); + if ($nn->code == 421) { # pseudo response from Net::Cmd + $err = "E: $msg"; + last; + } else { # probably just a deleted message (spam) + warn "W: $msg"; + next; + } + } + $raw = join('', @$raw); + $raw =~ s/\r\n/\n/sg; + my ($eml_cb, @args) = @{$self->{eml_each}}; + $eml_cb->($uri, $art, $kw, PublicInbox::Eml->new(\$raw), @args); + $last_art = $art; + } + run_commit_cb($self); + $itrk->update_last(0, $last_art) if $itrk; + $err; +} + +sub nntp_each { + my ($self, $url, $eml_cb, @args) = @_; + my $uri = ref($url) ? $url : PublicInbox::URInntps->new($url); + my $sec = uri_section($uri); + local $0 = $uri->group ." $sec"; + my $nn = nn_get($self, $uri); + return if $self->{quit}; + my $err; + if ($nn) { + local $self->{eml_each} = [ $eml_cb, @args ]; + $err = _nntp_fetch_all($self, $nn, $uri); + } else { + $err = "E: <$uri> not connected: $!"; + } + die $err if $err && $self->{-can_die}; + warn $err if $err; + $nn; +} + +sub new { bless {}, shift }; + +# updates $uri with UIDVALIDITY +sub mic_for_folder { + my ($self, $uri) = @_; + my $mic = $self->mic_get($uri) or die "E: not connected: $@"; + my $m = $self->isa('PublicInbox::NetWriter') ? 'select' : 'examine'; + $mic->$m($uri->mailbox) or return; + my $uidval; + for ($mic->Results) { + /^\* OK \[UIDVALIDITY ([0-9]+)\].*/ or next; + $uidval = $1; + last; + } + $uidval //= $mic->uidvalidity($uri->mailbox) or + die "E: failed to get uidvalidity from <$uri>: $@"; + $uri->uidvalidity($uidval); + $mic; +} + + +1; diff --git a/lib/PublicInbox/NetWriter.pm b/lib/PublicInbox/NetWriter.pm new file mode 100644 index 00000000..7917ef89 --- /dev/null +++ b/lib/PublicInbox/NetWriter.pm @@ -0,0 +1,67 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# common writer code for IMAP (and later, JMAP) +package PublicInbox::NetWriter; +use v5.12; +use parent qw(PublicInbox::NetReader); +use PublicInbox::Smsg; +use PublicInbox::MsgTime qw(msg_timestamp); + +my %IMAPkw2flags; +@IMAPkw2flags{values %PublicInbox::NetReader::IMAPflags2kw} = + keys %PublicInbox::NetReader::IMAPflags2kw; + +sub kw2flags ($) { join(' ', map { $IMAPkw2flags{$_} } @{$_[0]}) } + +sub imap_append { + my ($mic, $folder, $bref, $smsg, $eml) = @_; + $bref //= \($eml->as_string); + $smsg //= bless {}, 'PublicInbox::Smsg'; + bless($smsg, 'PublicInbox::Smsg') if ref($smsg) eq 'HASH'; + $smsg->{ts} //= msg_timestamp($eml // PublicInbox::Eml->new($$bref)); + my $f = kw2flags($smsg->{kw}); + $mic->append_string($folder, $$bref, $f, $smsg->internaldate) or + die "APPEND $folder: $@"; +} + +sub folder_select { 'select' } # for PublicInbox::NetReader + +sub imap_delete_all { + my ($self, $uri) = @_; + my $mic = $self->mic_for_folder($uri) or return; + my $sec = $self->can('uri_section')->($uri); + local $0 = $uri->mailbox." $sec"; + if ($mic->delete_message('1:*')) { + $mic->expunge; + } +} + +sub imap_delete_1 { + my ($self, $uri, $uid, $delete_mic) = @_; + $$delete_mic //= $self->mic_for_folder($uri) or return; + $$delete_mic->delete_message($uid); +} + +sub imap_add_kw { + my ($self, $mic, $uid, $kw) = @_; + $mic->store($uid, '+FLAGS.SILENT', '('.kw2flags($kw).')'); + $mic; # caller must ->expunge +} + +sub imap_set_kw { + my ($self, $mic, $uid, $kw) = @_; + $mic->store($uid, 'FLAGS.SILENT', '('.kw2flags($kw).')'); + $mic; # caller must ->expunge +} + +sub can_store_flags { + my ($self, $mic) = @_; + for ($mic->Results) { + /^\* OK \[PERMANENTFLAGS \(([^\)]*)\)\].*/ and + return $self->can('perm_fl_ok')->($1); + } + undef; +} + +1; diff --git a/lib/PublicInbox/NewsWWW.pm b/lib/PublicInbox/NewsWWW.pm index 6bed0103..d13731ae 100644 --- a/lib/PublicInbox/NewsWWW.pm +++ b/lib/PublicInbox/NewsWWW.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Plack app redirector for mapping /$NEWSGROUP requests to @@ -13,9 +13,8 @@ use PublicInbox::MID qw(mid_escape); use PublicInbox::Hval qw(prurl); sub new { - my ($class, $pi_config) = @_; - $pi_config ||= PublicInbox::Config->new; - bless { pi_config => $pi_config }, $class; + my ($class, $pi_cfg) = @_; + bless { pi_cfg => $pi_cfg // PublicInbox::Config->new }, $class; } sub redirect ($$) { @@ -46,9 +45,11 @@ sub call { # some links may have the article number in them: # /inbox.foo.bar/123456 my (undef, @parts) = split(m!/!, $env->{PATH_INFO}); + @parts or return + [ 404, [qw(Content-Type text/plain)], ["404 Not Found\n"] ]; my ($ng, $article) = @parts; - my $pi_config = $self->{pi_config}; - if (my $ibx = $pi_config->lookup_newsgroup($ng)) { + my $pi_cfg = $self->{pi_cfg}; + if (my $ibx = $pi_cfg->lookup_newsgroup($ng)) { my $url = prurl($env, $ibx->{url}); my $code = 301; if (defined $article && $article =~ /\A[0-9]+\z/) { @@ -63,7 +64,6 @@ sub call { return redirect($code, $url); } - my $res; my @try = (join('/', @parts)); # trailing slash is in the rest of our WWW, so maybe some users @@ -72,13 +72,31 @@ sub call { pop @parts; push @try, join('/', @parts); } - - foreach my $mid (@try) { - my $arg = [ $mid ]; - $pi_config->each_inbox(\&try_inbox, $arg); - defined($res = $arg->[1]) and last; + my $ALL = $pi_cfg->ALL; + if (my $over = $ALL ? $ALL->over : undef) { + my $by_eidx_key = $pi_cfg->{-by_eidx_key}; + for my $mid (@try) { + my ($id, $prev); + while (my $x = $over->next_by_mid($mid, \$id, \$prev)) { + my $xr3 = $over->get_xref3($x->{num}); + for (@$xr3) { + s/:[0-9]+:$x->{blob}\z// or next; + my $ibx = $by_eidx_key->{$_} // next; + my $url = $ALL->base_url($env) // + $ibx->base_url // next; + $url .= mid_escape($mid) . '/'; + return redirect(302, $url); + } + } + } + } else { # slow path, scan every inbox + for my $mid (@try) { + my $arg = [ $mid ]; # [1] => result + $pi_cfg->each_inbox(\&try_inbox, $arg); + return $arg->[1] if $arg->[1]; + } } - $res || [ 404, [qw(Content-Type text/plain)], ["404 Not Found\n"] ]; + [ 404, [qw(Content-Type text/plain)], ["404 Not Found\n"] ]; } 1; diff --git a/lib/PublicInbox/OnDestroy.pm b/lib/PublicInbox/OnDestroy.pm new file mode 100644 index 00000000..4301edff --- /dev/null +++ b/lib/PublicInbox/OnDestroy.pm @@ -0,0 +1,31 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +package PublicInbox::OnDestroy; +use v5.12; +use parent qw(Exporter); +use autodie qw(fork); +our @EXPORT = qw(on_destroy); +our $fork_gen = 0; + +# either parent or child is expected to exit or exec shortly after this: +sub fork_tmp () { + my $pid = fork; + ++$fork_gen if $pid == 0; + $pid; +} + +# all children +sub all (@) { bless [ undef, @_ ], __PACKAGE__ } + +# same process +sub on_destroy (@) { bless [ $fork_gen, @_ ], __PACKAGE__ } + +sub cancel { @{$_[0]} = () } + +sub DESTROY { + my ($fgen, $cb, @args) = @{$_[0]}; + $cb->(@args) if ($cb && ($fgen // $fork_gen) == $fork_gen); +} + +1; diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm index 08112386..3b7d49f5 100644 --- a/lib/PublicInbox/Over.pm +++ b/lib/PublicInbox/Over.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI @@ -7,20 +7,22 @@ package PublicInbox::Over; use strict; use v5.10.1; -use DBI; +use DBI qw(:sql_types); # SQL_BLOB use DBD::SQLite; use PublicInbox::Smsg; use Compress::Zlib qw(uncompress); use constant DEFAULT_LIMIT => 1000; +use List::Util (); # for max sub dbh_new { my ($self, $rw) = @_; my $f = delete $self->{filename}; - if (!-f $f) { # SQLite defaults mode to 0644, we want 0666 + if (!-s $f) { # SQLite defaults mode to 0644, we want 0666 if ($rw) { - require PublicInbox::Spawn; + require PublicInbox::Syscall; + my ($dir) = ($f =~ m!(.+)/[^/]+\z!); + PublicInbox::Syscall::nodatacow_dir($dir); open my $fh, '+>>', $f or die "failed to open $f: $!"; - PublicInbox::Spawn::nodatacow_fd(fileno($fh)); } else { $self->{filename} = $f; # die on stat() below: } @@ -80,7 +82,13 @@ sub dbh_close { } } -sub dbh ($) { $_[0]->{dbh} //= $_[0]->dbh_new } # dbh_new may be subclassed +sub dbh ($) { + my ($self) = @_; + $self->{dbh} // do { + my $dbh = $self->dbh_new; # dbh_new may be subclassed + $self->{dbh} = $dbh; + } +} sub load_from_row ($;$) { my ($smsg, $cull) = @_; @@ -106,8 +114,8 @@ sub do_get { } sub query_xover { - my ($self, $beg, $end) = @_; - do_get($self, <<'', {}, $beg, $end); + my ($self, $beg, $end, $opt) = @_; + do_get($self, <<'', $opt, $beg, $end); SELECT num,ts,ds,ddd FROM over WHERE num >= ? AND num <= ? ORDER BY num ASC @@ -192,15 +200,22 @@ ORDER BY $sort_col DESC # TODO separate strict and loose matches here once --reindex # is fixed to preserve `tid' properly push @$msgs, @$loose; + + # we wanted to retrieve the latest loose messages; but preserve + # chronological ordering for threading /$INBOX/$MSGID/[tT]/ + $sort_col eq 'ds' and + @$msgs = sort { $a->{ds} <=> $b->{ds} } @$msgs; } ($nr, $msgs); } # strict `tid' matches, only, for thread-expanded mbox.gz search results -# and future CLI interface +# and lei # returns true if we have IDs, undef if not sub expand_thread { my ($self, $ctx) = @_; + # previous maxuid for LeiSavedSearch is our min: + my $lss_min = $ctx->{min} // 0; my $dbh = dbh($self); do { defined(my $num = $ctx->{ids}->[0]) or return; @@ -213,7 +228,7 @@ SELECT num FROM over WHERE tid = ? AND num > ? ORDER BY num ASC LIMIT 1000 my $xids = $dbh->selectcol_arrayref($sql, undef, $tid, - $ctx->{prev} // 0); + List::Util::max($ctx->{prev} // 0, $lss_min)); if (scalar(@$xids)) { $ctx->{prev} = $xids->[-1]; $ctx->{xids} = $xids; @@ -252,21 +267,67 @@ SELECT ts,ds,ddd FROM over WHERE $s sub get_art { my ($self, $num) = @_; # caching $sth ourselves is faster than prepare_cached - my $sth = $self->{-get_art} //= dbh($self)->prepare(<<''); + my $sth = $self->{-get_art} // do { + my $sth = dbh($self)->prepare(<<''); SELECT num,tid,ds,ts,ddd FROM over WHERE num = ? LIMIT 1 + $self->{-get_art} = $sth; + }; $sth->execute($num); my $smsg = $sth->fetchrow_hashref; $smsg ? load_from_row($smsg) : undef; } +sub get_xref3 { + my ($self, $num, $raw) = @_; + my $dbh = dbh($self); + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT ibx_id,xnum,oidbin FROM xref3 WHERE docid = ? ORDER BY ibx_id,xnum ASC + + $sth->execute($num); + my $rows = $sth->fetchall_arrayref; + return $rows if $raw; + my $eidx_key_sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT eidx_key FROM inboxes WHERE ibx_id = ? + + for my $r (@$rows) { + $eidx_key_sth->execute($r->[0]); + my $eidx_key = $eidx_key_sth->fetchrow_array; + $eidx_key //= "missing://ibx_id=$r->[0]"; + $r = "$eidx_key:$r->[1]:".unpack('H*', $r->[2]); + } + $rows; +} + +sub mid2tid { + my ($self, $mid) = @_; + my $dbh = dbh($self); + + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT id FROM msgid WHERE mid = ? LIMIT 1 + + $sth->execute($mid); + my $id = $sth->fetchrow_array or return; + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT num FROM id2num WHERE id = ? AND num > ? +ORDER BY num ASC LIMIT 1 + + $sth->execute($id, 0); + my $num = $sth->fetchrow_array or return; + $sth = $dbh->prepare(<<''); +SELECT tid FROM over WHERE num = ? LIMIT 1 + + $sth->execute($num); + $sth->fetchrow_array; +} + sub next_by_mid { my ($self, $mid, $id, $prev) = @_; my $dbh = dbh($self); unless (defined $$id) { my $sth = $dbh->prepare_cached(<<'', undef, 1); - SELECT id FROM msgid WHERE mid = ? LIMIT 1 +SELECT id FROM msgid WHERE mid = ? LIMIT 1 $sth->execute($mid); $$id = $sth->fetchrow_array; @@ -328,4 +389,37 @@ sub check_inodes { } } +sub oidbin_exists { + my ($self, $oidbin) = @_; + if (wantarray) { + my $sth = $self->dbh->prepare_cached(<<'', undef, 1); +SELECT docid FROM xref3 WHERE oidbin = ? ORDER BY docid ASC + + $sth->bind_param(1, $oidbin, SQL_BLOB); + $sth->execute; + my $tmp = $sth->fetchall_arrayref; + map { $_->[0] } @$tmp; + } else { + my $sth = $self->dbh->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM xref3 WHERE oidbin = ? + + $sth->bind_param(1, $oidbin, SQL_BLOB); + $sth->execute; + $sth->fetchrow_array; + } +} + +sub blob_exists { oidbin_exists($_[0], pack('H*', $_[1])) } + +# used by NNTP.pm +sub ids_after { + my ($self, $num) = @_; + my $ids = dbh($self)->selectcol_arrayref(<<'', undef, $$num); +SELECT num FROM over WHERE num > ? +ORDER BY num ASC LIMIT 1000 + + $$num = $ids->[-1] if @$ids; + $ids; +} + 1; diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index db4b7738..4f8533f7 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # for XOVER, OVER in NNTP, and feeds/homepage/threads in PSGI @@ -17,6 +17,7 @@ use PublicInbox::MID qw/id_compress mids_for_index references/; use PublicInbox::Smsg qw(subject_normalized); use Compress::Zlib qw(compress); use Carp qw(croak); +use bytes (); # length sub dbh_new { my ($self) = @_; @@ -79,6 +80,11 @@ SELECT $id_col FROM $tbl WHERE $val_col = ? LIMIT 1 } } +sub ibx_id { + my ($self, $eidx_key) = @_; + id_for($self, 'inboxes', 'ibx_id', eidx_key => $eidx_key); +} + sub sid { my ($self, $path) = @_; return unless defined $path && $path ne ''; @@ -153,7 +159,8 @@ SELECT $cols FROM over WHERE over.num = ? LIMIT 1 foreach (@$nums) { $sth->execute($_->[0]); - my $smsg = $sth->fetchrow_hashref; + # $cb may delete rows and invalidate nums + my $smsg = $sth->fetchrow_hashref // next; $smsg = PublicInbox::Over::load_from_row($smsg); $cb->($self, $smsg, @arg) or return; } @@ -170,8 +177,14 @@ sub _resolve_mid_to_tid { $$tid = $cur_tid; } else { # rethreading, queue up dead ghosts $$tid = next_tid($self); - my $num = $smsg->{num}; - push(@{$self->{-ghosts_to_delete}}, $num) if $num < 0; + my $n = $smsg->{num}; + if ($n > 0) { + $self->{dbh}->prepare_cached(<<'')->execute($$tid, $n); +UPDATE over SET tid = ? WHERE num = ? + + } elsif ($n < 0) { + push(@{$self->{-ghosts_to_delete}}, $n); + } } 1; } @@ -187,7 +200,7 @@ sub resolve_mid_to_tid { $tid // do { # create a new ghost my $id = mid2id($self, $mid); my $num = next_ghost_num($self); - $num < 0 or die "ghost num is non-negative: $num\n"; + $num < 0 or croak "BUG: ghost num is non-negative: $num\n"; $tid = next_tid($self); my $dbh = $self->{dbh}; $dbh->prepare_cached(<<'')->execute($num, $tid); @@ -232,57 +245,49 @@ sub link_refs { $tid; } -sub parse_references ($$$) { - my ($smsg, $hdr, $mids) = @_; - my $refs = references($hdr); - push(@$refs, @$mids) if scalar(@$mids) > 1; - return $refs if scalar(@$refs) == 0; - - # prevent circular references here: - my %seen = ( $smsg->{mid} => 1 ); - my @keep; - foreach my $ref (@$refs) { - if (length($ref) > PublicInbox::MID::MAX_MID_SIZE) { - warn "References: <$ref> too long, ignoring\n"; - next; - } - push(@keep, $ref) unless $seen{$ref}++; - } - $smsg->{references} = '<'.join('> <', @keep).'>' if @keep; - \@keep; -} - -# normalize subjects so they are suitable as pathnames for URLs -# XXX: consider for removal +# normalize subjects somewhat, they used to be ASCII-only but now +# we use \w for UTF-8 support. We may still drop it entirely and +# rely on Xapian for subject matches... sub subject_path ($) { my ($subj) = @_; $subj = subject_normalized($subj); - $subj =~ s![^a-zA-Z0-9_\.~/\-]+!_!g; + $subj =~ s![^\w\.~/\-]+!_!g; lc($subj); } +sub ddd_for ($) { + my ($smsg) = @_; + my $dd = $smsg->to_doc_data; + utf8::encode($dd); + compress($dd); +} + sub add_overview { my ($self, $eml, $smsg) = @_; - $smsg->{lines} = $eml->body_raw =~ tr!\n!\n!; + my $raw = $eml->body_raw; + $smsg->{lines} = $raw =~ tr!\n!\n!; + $smsg->{bytes} //= bytes::length $raw; + undef $raw; my $mids = mids_for_index($eml); - my $refs = parse_references($smsg, $eml, $mids); + my $refs = $smsg->parse_references($eml, $mids); + $mids->[0] //= do { + $smsg->{mid} //= ''; + $eml->{-lei_fake_mid}; + }; my $subj = $smsg->{subject}; my $xpath; if ($subj ne '') { $xpath = subject_path($subj); $xpath = id_compress($xpath); } - my $dd = $smsg->to_doc_data; - utf8::encode($dd); - $dd = compress($dd); - add_over($self, $smsg, $mids, $refs, $xpath, $dd); + add_over($self, $smsg, $mids, $refs, $xpath, ddd_for($smsg)); } sub _add_over { my ($self, $smsg, $mid, $refs, $old_tid, $v) = @_; my $cur_tid = $smsg->{tid}; my $n = $smsg->{num}; - die "num must not be zero for $mid" if !$n; + croak "BUG: num must not be zero for $mid" if !$n; my $cur_valid = $cur_tid > $self->{min_tid}; if ($n > 0) { # regular mail @@ -294,7 +299,7 @@ sub _add_over { } } elsif ($n < 0) { # ghost $$old_tid //= $cur_valid ? $cur_tid : next_tid($self); - link_refs($self, $refs, $$old_tid); + $$old_tid = link_refs($self, $refs, $$old_tid); delete_by_num($self, $n); $$v++; } @@ -379,13 +384,12 @@ sub create_tables { $dbh->do(<<''); CREATE TABLE IF NOT EXISTS over ( - num INTEGER NOT NULL, /* NNTP article number == IMAP UID */ + num INTEGER PRIMARY KEY NOT NULL, /* NNTP article number == IMAP UID */ tid INTEGER NOT NULL, /* THREADID (IMAP REFERENCES threading, JMAP) */ sid INTEGER, /* Subject ID (IMAP ORDEREDSUBJECT "threading") */ ts INTEGER, /* IMAP INTERNALDATE (Received: header, git commit time) */ ds INTEGER, /* RFC-2822 sent Date: header, git author time */ - ddd VARBINARY, /* doc-data-deflated (->to_doc_data, ->load_from_data) */ - UNIQUE (num) + ddd VARBINARY /* doc-data-deflated (->to_doc_data, ->load_from_data) */ ) $dbh->do('CREATE INDEX IF NOT EXISTS idx_tid ON over (tid)'); @@ -434,6 +438,7 @@ sub commit_lazy { my ($self) = @_; delete $self->{txn} or return; $self->{dbh}->commit; + eval { $self->{dbh}->do('PRAGMA optimize') }; } sub begin_lazy { @@ -453,16 +458,20 @@ sub rollback_lazy { sub dbh_close { my ($self) = @_; - die "in transaction" if $self->{txn}; + Carp::confess('BUG: in transaction') if $self->{txn}; $self->SUPER::dbh_close; } sub create { my ($self) = @_; - unless (-r $self->{filename}) { + my $fn = $self->{filename} // do { + croak('BUG: no {filename}') unless $self->{dbh}; + return; + }; + unless (-r $fn) { require File::Path; - require File::Basename; - File::Path::mkpath(File::Basename::dirname($self->{filename})); + my ($dir) = ($fn =~ m!(.*?/)[^/]+\z!); + File::Path::mkpath($dir); } # create the DB: PublicInbox::Over::dbh($self); @@ -504,12 +513,178 @@ EOF next; } $pr->(<<EOM) if $pr; -I: ghost $r->{num} <$mid> THREADID=$r->{tid} culled +# ghost $r->{num} <$mid> THREADID=$r->{tid} culled EOM } delete_by_num($self, $r->{num}); } - $pr->("I: rethread culled $total ghosts\n") if $pr && $total; + $pr->("# rethread culled $total ghosts\n") if $pr && $total; +} + +# used for cross-inbox search +sub eidx_prep ($) { + my ($self) = @_; + $self->{-eidx_prep} // do { + my $dbh = $self->dbh; + $dbh->do(<<''); +INSERT OR IGNORE INTO counter (key) VALUES ('eidx_docid') + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS inboxes ( + ibx_id INTEGER PRIMARY KEY AUTOINCREMENT, + eidx_key VARCHAR(255) NOT NULL, /* {newsgroup} // {inboxdir} */ + UNIQUE (eidx_key) +) + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS xref3 ( + docid INTEGER NOT NULL, /* <=> over.num */ + ibx_id INTEGER NOT NULL, /* <=> inboxes.ibx_id */ + xnum INTEGER NOT NULL, /* NNTP article number in ibx */ + oidbin VARBINARY NOT NULL, /* 20-byte SHA-1 or 32-byte SHA-256 */ + UNIQUE (docid, ibx_id, xnum, oidbin) +) + + $dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)'); + + # performance critical, this is not UNIQUE since we may need to + # tolerate some old bugs from indexing mirrors. n.b. we used + # to index oidbin here, but leaving it out speeds up reindexing + # and "XHDR Xref <$MSGID>" isn't any slower w/o oidbin + $dbh->do('CREATE INDEX IF NOT EXISTS idx_reindex ON '. + 'xref3 (xnum,ibx_id)'); + + $dbh->do('CREATE INDEX IF NOT EXISTS idx_oidbin ON xref3 (oidbin)'); + + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS eidx_meta ( + key VARCHAR(255) PRIMARY KEY, + val VARCHAR(255) NOT NULL +) + + # A queue of current docids which need reindexing. + # eidxq persists across aborted -extindex invocations + # Currently used for "-extindex --reindex" for Xapian + # data, but may be used in more places down the line. + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS eidxq (docid INTEGER PRIMARY KEY NOT NULL) + + $self->{-eidx_prep} = 1; + }; +} + +sub eidx_meta { # requires transaction + my ($self, $key, $val) = @_; + + my $sql = 'SELECT val FROM eidx_meta WHERE key = ? LIMIT 1'; + my $dbh = $self->{dbh}; + defined($val) or return $dbh->selectrow_array($sql, undef, $key); + + my $prev = $dbh->selectrow_array($sql, undef, $key); + if (defined $prev) { + $sql = 'UPDATE eidx_meta SET val = ? WHERE key = ?'; + $dbh->do($sql, undef, $val, $key); + } else { + $sql = 'INSERT INTO eidx_meta (key,val) VALUES (?,?)'; + $dbh->do($sql, undef, $key, $val); + } + $prev; +} + +sub eidx_max { + my ($self) = @_; + get_counter($self->{dbh}, 'eidx_docid'); +} + +sub add_xref3 { + my ($self, $docid, $xnum, $oidhex, $eidx_key) = @_; + begin_lazy($self); + my $ibx_id = ibx_id($self, $eidx_key); + my $oidbin = pack('H*', $oidhex); + my $sth = $self->{dbh}->prepare_cached(<<''); +INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?) + + $sth->bind_param(1, $docid); + $sth->bind_param(2, $ibx_id); + $sth->bind_param(3, $xnum); + $sth->bind_param(4, $oidbin, SQL_BLOB); + $sth->execute; +} + +# for when an xref3 goes missing, this does NOT update {ts} +sub update_blob { + my ($self, $smsg, $oidhex) = @_; + my $sth = $self->{dbh}->prepare(<<''); +UPDATE over SET ddd = ? WHERE num = ? + + $smsg->{blob} = $oidhex; + $sth->bind_param(1, ddd_for($smsg), SQL_BLOB); + $sth->bind_param(2, $smsg->{num}); + $sth->execute; +} + +sub merge_xref3 { # used for "-extindex --dedupe" + my ($self, $keep_docid, $drop_docid, $oidbin) = @_; + my $sth = $self->{dbh}->prepare_cached(<<''); +UPDATE OR IGNORE xref3 SET docid = ? WHERE docid = ? AND oidbin = ? + + $sth->bind_param(1, $keep_docid); + $sth->bind_param(2, $drop_docid); + $sth->bind_param(3, $oidbin, SQL_BLOB); + $sth->execute; + + # drop anything that conflicted + $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM xref3 WHERE docid = ? AND oidbin = ? + + $sth->bind_param(1, $drop_docid); + $sth->bind_param(2, $oidbin, SQL_BLOB); + $sth->execute; +} + +sub eidxq_add { + my ($self, $docid) = @_; + $self->dbh->prepare_cached(<<'')->execute($docid); +INSERT OR IGNORE INTO eidxq (docid) VALUES (?) + +} + +sub eidxq_del { + my ($self, $docid) = @_; + $self->dbh->prepare_cached(<<'')->execute($docid); +DELETE FROM eidxq WHERE docid = ? + +} + +# returns true if we're vivifying a message for lei/store that was +# previously external-metadata only +sub vivify_xvmd { + my ($self, $smsg) = @_; + my @docids = $self->blob_exists($smsg->{blob}); + my @vivify_xvmd; + for my $id (@docids) { + if (my $cur = $self->get_art($id)) { + # already indexed if bytes > 0 + return if $cur->{bytes} > 0; + push @vivify_xvmd, $id; + } else { + warn "W: $smsg->{blob} #$id gone (bug?)\n"; + } + } + $smsg->{-vivify_xvmd} = \@vivify_xvmd; +} + +sub fork_ok { + state $fork_ok = eval("v$DBD::SQLite::sqlite_version") ge v3.8.3; + return 1 if $fork_ok; + my ($opt) = @_; + my @j = split(/,/, $opt->{jobs} // ''); + state $warned; + grep { $_ > 1 } @j and $warned //= warn(<<EOM); +DBD::SQLite version is v$DBD::SQLite::sqlite_version, need >= v3.8.3 for --jobs > 1 +EOM + $opt->{jobs} = '1,1'; + undef; } 1; diff --git a/lib/PublicInbox/POP3.pm b/lib/PublicInbox/POP3.pm new file mode 100644 index 00000000..06772069 --- /dev/null +++ b/lib/PublicInbox/POP3.pm @@ -0,0 +1,428 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Each instance of this represents a POP3 client connected to +# public-inbox-{netd,pop3d}. Much of this was taken from IMAP.pm and NNTP.pm +# +# POP3 is one mailbox per-user, so the "USER" command is like the +# format of -imapd and is mapped to $NEWSGROUP.$SLICE (large inboxes +# are sliced into 50K mailboxes in both POP3 and IMAP to avoid overloading +# clients) +# +# Unlike IMAP, the "$NEWSGROUP" mailbox (without $SLICE) is a rolling +# window of the latest messages. We can do this for POP3 since the +# typical POP3 session is short-lived while long-lived IMAP sessions +# would cause slices to grow on the server side without bounds. +# +# Like IMAP, POP3 also has per-session message sequence numbers (MSN), +# which require mapping to UIDs. The offset of an entry into our +# per-client cache is: (MSN-1) +# +# fields: +# - uuid - 16-byte (binary) UUID representation (before successful login) +# - cache - one-dimentional arrayref of (UID, bytesize, oidhex) +# - nr_dele - number of deleted messages +# - expire - string of packed unsigned short offsets +# - user_id - user-ID mapped to UUID (on successful login + lock) +# - txn_max_uid - for storing max deleted UID persistently +# - ibx - PublicInbox::Inbox object +# - slice - unsigned integer slice number (0..Inf), -1 => latest +# - salt - pre-auth for APOP +# - uid_dele - maximum deleted from previous session at login (NNTP ARTICLE) +# - uid_base - base UID for mailbox slice (0-based) (same as IMAP) +package PublicInbox::POP3; +use v5.12; +use parent qw(PublicInbox::DS); +use PublicInbox::GitAsyncCat; +use PublicInbox::DS qw(now); +use Errno qw(EAGAIN); +use Digest::MD5 qw(md5); +use PublicInbox::IMAP; # for UID slice stuff + +use constant { + LINE_MAX => 512, # XXX unsure + UID_SLICE => PublicInbox::IMAP::UID_SLICE, +}; + +# XXX FIXME: duplicated stuff from NNTP.pm and IMAP.pm + +sub out ($$;@) { + my ($self, $fmt, @args) = @_; + printf { $self->{pop3d}->{out} } $fmt."\n", @args; +} + +sub do_greet { + my ($self) = @_; + my $s = $self->{salt} = sprintf('%x.%x', int(rand(0x7fffffff)), time); + $self->write("+OK POP3 server ready <$s\@public-inbox>\r\n"); +} + +sub new { + my ($cls, $sock, $pop3d) = @_; + (bless { pop3d => $pop3d }, $cls)->greet($sock) +} + +# POP user is $UUID1@$NEWSGROUP[.$SLICE][?QUERY_ARGS] +sub cmd_user ($$) { + my ($self, $mailbox) = @_; + $self->{salt} // return \"-ERR already authed\r\n"; + $mailbox =~ s/\A([a-f0-9\-]+)\@//i or + return \"-ERR no UUID@ in mailbox name\r\n"; + my $user = $1; + $user =~ tr/-//d; # most have dashes, some (dbus-uuidgen) don't + $user =~ m!\A[a-f0-9]{32}\z!i or return \"-ERR user has no UUID\r\n"; + + my %l; + if ($mailbox =~ s/\?(.*)\z//) { # query args + for (split(/&+/, $1)) { + /\A(initial_limit|limit)=([0-9]+)\z/ and $l{$1} = $2; + } + $self->{limits} = \%l; + } + my $slice = $mailbox =~ s/\.([0-9]+)\z// ? $1 + 0 : undef; + + my $ibx = $self->{pop3d}->{pi_cfg}->lookup_newsgroup($mailbox) // + return \"-ERR $mailbox does not exist\r\n"; + my $uidmax = $self->{uidmax} = $ibx->mm(1)->num_highwater // 0; + if (defined $slice) { + my $max = int($uidmax / UID_SLICE); + my $tip = "$mailbox.$max"; + return \"-ERR $mailbox.$slice does not exist ($tip does)\r\n" + if $slice > $max; + $self->{slice} = $slice; + } else { # latest messages: + $self->{slice} = -1; + } + $self->{ibx} = $ibx; + $self->{uuid} = pack('H*', $user); # deleted by _login_ok + $slice //= '(latest)'; + \"+OK $ibx->{newsgroup} slice=$slice selected\r\n"; +} + +sub _login_ok ($) { + my ($self) = @_; + $self->{pop3d}->lock_mailbox($self) or + return \"-ERR [IN-USE] unable to lock maildrop\r\n"; + + my $l = delete $self->{limits}; + $l = defined($self->{uid_dele}) ? $l->{limit} + : ($l->{initial_limit} // $l->{limit}); + my $uidmax = delete $self->{uidmax}; + if ($self->{slice} >= 0) { + $self->{uid_base} = $self->{slice} * UID_SLICE; + if (defined $l) { # n.b: the last slice is not full: + my $max = int($uidmax/UID_SLICE) == $self->{slice} ? + ($uidmax % UID_SLICE) : UID_SLICE; + my $off = $max - $l; + $self->{uid_base} += $off if $off > 0; + } + } else { # latest $l messages, or 1k if unspecified + my $base = $uidmax - ($l // 1000); + $self->{uid_base} = $base < 0 ? 0 : $base; + } + $self->{uid_max} = $self->{ibx}->over(1)->max; + \"+OK logged in\r\n"; +} + +sub cmd_apop { + my ($self, $mailbox, $hex) = @_; + my $res = cmd_user($self, $mailbox); # sets {uuid} + return $res if substr($$res, 0, 1) eq '-'; + my $s = delete($self->{salt}) // die 'BUG: salt missing'; + return _login_ok($self) if md5("<$s\@public-inbox>anonymous") eq + pack('H*', $hex); + $self->{salt} = $s; + \"-ERR APOP password mismatch\r\n"; +} + +sub cmd_pass { + my ($self, $pass) = @_; + $self->{ibx} // return \"-ERR mailbox unspecified\r\n"; + my $s = delete($self->{salt}) // return \"-ERR already authed\r\n"; + return _login_ok($self) if $pass eq 'anonymous'; + $self->{salt} = $s; + \"-ERR password is not `anonymous'\r\n"; +} + +sub cmd_stls { + my ($self) = @_; + ($self->{sock} // return)->can('stop_SSL') and + return \"-ERR TLS already enabled\r\n"; + $self->{pop3d}->{ssl_ctx_opt} or + return \"-ERR can't start TLS negotiation\r\n"; + $self->write(\"+OK begin TLS negotiation now\r\n"); + PublicInbox::TLS::start($self->{sock}, $self->{pop3d}); + $self->requeue if PublicInbox::DS::accept_tls_step($self); + undef; +} + +sub need_txn ($) { + exists($_[0]->{salt}) ? \"-ERR not in TRANSACTION\r\n" : undef; +} + +sub _stat_cache ($) { + my ($self) = @_; + my ($beg, $end) = (($self->{uid_dele} // -1) + 1, $self->{uid_max}); + PublicInbox::IMAP::uid_clamp($self, \$beg, \$end); + my (@cache, $m); + my $sth = $self->{ibx}->over(1)->dbh->prepare_cached(<<'', undef, 1); +SELECT num,ddd FROM over WHERE num >= ? AND num <= ? +ORDER BY num ASC + + $sth->execute($beg, $end); + my $tot = 0; + while (defined($m = $sth->fetchall_arrayref({}, 1000))) { + for my $x (@$m) { + PublicInbox::Over::load_from_row($x); + push(@cache, $x->{num}, $x->{bytes} + 0, $x->{blob}); + undef $x; # saves ~1.5M memory w/ 50k messages + $tot += $cache[-2]; + } + } + $self->{total_bytes} = $tot; + $self->{cache} = \@cache; +} + +sub cmd_stat { + my ($self) = @_; + my $err; $err = need_txn($self) and return $err; + my $cache = $self->{cache} // _stat_cache($self); + my $nr = @$cache / 3 - ($self->{nr_dele} // 0); + "+OK $nr $self->{total_bytes}\r\n"; +} + +# for LIST and UIDL +sub _list { + my ($desc, $idx, $self, $msn) = @_; + my $err; $err = need_txn($self) and return $err; + my $cache = $self->{cache} // _stat_cache($self); + if (defined $msn) { + my $base_off = ($msn - 1) * 3; + my $val = $cache->[$base_off + $idx] // + return \"-ERR no such message\r\n"; + "+OK $desc listing follows\r\n$msn $val\r\n.\r\n"; + } else { # always +OK, even if no messages + my $res = "+OK $desc listing follows\r\n"; + my $msn = 0; + for (my $i = 0; $i < scalar(@$cache); $i += 3) { + ++$msn; + defined($cache->[$i]) and + $res .= "$msn $cache->[$i + $idx]\r\n"; + } + $res .= ".\r\n"; + } +} + +sub cmd_list { _list('scan', 1, @_) } +sub cmd_uidl { _list('unique-id', 2, @_) } + +sub mark_dele ($$) { + my ($self, $off) = @_; + my $base_off = $off * 3; + my $cache = $self->{cache}; + my $uid = $cache->[$base_off] // return; # already deleted + + my $old = $self->{txn_max_uid} //= $uid; + $self->{txn_max_uid} = $uid if $uid > $old; + + $self->{total_bytes} -= $cache->[$base_off + 1]; + $cache->[$base_off] = undef; # clobber UID + $cache->[$base_off + 1] = undef; # clobber bytes + $cache->[$base_off + 2] = undef; # clobber oidhex + ++$self->{nr_dele}; +} + +sub retr_cb { # called by git->cat_async via ibx_async_cat + my ($bref, $oid, $type, $size, $args) = @_; + my ($self, $off, $top_nr) = @$args; + my $hex = $self->{cache}->[$off * 3 + 2] // + die "BUG: no hex (oid=$oid)"; + if (!defined($type)) { + warn "E: git aborted on $oid / $hex $self->{ibx}->{inboxdir}"; + return $self->close; + } elsif ($type ne 'blob') { + # it's possible to have TOCTOU if an admin runs + # public-inbox-(edit|purge), just move onto the next message + warn "E: $hex missing in $self->{ibx}->{inboxdir}\n"; + $self->write(\"-ERR no such message\r\n"); + return $self->requeue; + } elsif ($hex ne $oid) { + $self->close; + die "BUG: $hex != $oid"; + } + PublicInbox::IMAP::to_crlf_full($bref); + if (defined $top_nr) { + my ($hdr, $bdy) = split(/\r\n\r\n/, $$bref, 2); + $bref = \$hdr; + $hdr .= "\r\n\r\n"; + my @tmp = split(/^/m, $bdy); + $hdr .= join('', splice(@tmp, 0, $top_nr)); + } elsif (exists $self->{expire}) { + $self->{expire} .= pack('S', $off); + } + $$bref =~ s/^\./../gms; + $$bref .= substr($$bref, -2, 2) eq "\r\n" ? ".\r\n" : "\r\n.\r\n"; + $self->msg_more("+OK message follows\r\n"); + $self->write($bref); + $self->requeue; +} + +sub cmd_retr { + my ($self, $msn, $top_nr) = @_; + return \"-ERR lines must be a non-negative number\r\n" if + (defined($top_nr) && $top_nr !~ /\A[0-9]+\z/); + my $err; $err = need_txn($self) and return $err; + my $cache = $self->{cache} // _stat_cache($self); + my $off = $msn - 1; + my $hex = $cache->[$off * 3 + 2] // return \"-ERR no such message\r\n"; + ${ibx_async_cat($self->{ibx}, $hex, \&retr_cb, + [ $self, $off, $top_nr ])}; +} + +sub cmd_noop { $_[0]->write(\"+OK\r\n") } + +sub cmd_rset { + my ($self) = @_; + my $err; $err = need_txn($self) and return $err; + delete $self->{cache}; + delete $self->{txn_max_uid}; + \"+OK\r\n"; +} + +sub cmd_dele { + my ($self, $msn) = @_; + my $err; $err = need_txn($self) and return $err; + $self->{cache} // _stat_cache($self); + $msn =~ /\A[1-9][0-9]*\z/ or return \"-ERR no such message\r\n"; + mark_dele($self, $msn - 1) ? \"+OK\r\n" : \"-ERR no such message\r\n"; +} + +# RFC 2449 +sub cmd_capa { + my ($self) = @_; + my $STLS = !$self->{ibx} && !$self->{sock}->can('stop_SSL') && + $self->{pop3d}->{ssl_ctx_opt} ? "\nSTLS\r" : ''; + $self->{expire} = ''; # "EXPIRE 0" allows clients to avoid DELE commands + <<EOM; ++OK Capability list follows\r +TOP\r +USER\r +PIPELINING\r +UIDL\r +EXPIRE 0\r +RESP-CODES\r$STLS +.\r +EOM +} + +sub close { + my ($self) = @_; + $self->{pop3d}->unlock_mailbox($self); + $self->SUPER::close; +} + +# must be called inside a state_dbh transaction with flock held +sub __cleanup_state { + my ($self, $txn_id) = @_; + my $user_id = $self->{user_id} // die 'BUG: no {user_id}'; + $self->{pop3d}->{-state_dbh}->prepare_cached(<<'')->execute($txn_id); +DELETE FROM deletes WHERE txn_id = ? AND uid_dele = -1 + + my $sth = $self->{pop3d}->{-state_dbh}->prepare_cached(<<'', undef, 1); +SELECT COUNT(*) FROM deletes WHERE user_id = ? + + $sth->execute($user_id); + my $nr = $sth->fetchrow_array; + if ($nr == 0) { + $sth = $self->{pop3d}->{-state_dbh}->prepare_cached(<<''); +DELETE FROM users WHERE user_id = ? + + $sth->execute($user_id); + } + $nr; +} + +sub cmd_quit { + my ($self) = @_; + if (defined(my $txn_id = $self->{txn_id})) { + my $user_id = $self->{user_id} // die 'BUG: no {user_id}'; + if (my $exp = delete $self->{expire}) { + mark_dele($self, $_) for unpack('S*', $exp); + } + my $keep = 1; + my $dbh = $self->{pop3d}->{-state_dbh}; + my $lk = $self->{pop3d}->lock_for_scope; + $dbh->begin_work; + + if (defined(my $max = $self->{txn_max_uid})) { + $dbh->prepare_cached(<<'')->execute($max, $txn_id, $max) +UPDATE deletes SET uid_dele = ? WHERE txn_id = ? AND uid_dele < ? + + } else { + $keep = $self->__cleanup_state($txn_id); + } + $dbh->prepare_cached(<<'')->execute(time, $user_id) if $keep; +UPDATE users SET last_seen = ? WHERE user_id = ? + + $dbh->commit; + # we MUST do txn_id F_UNLCK here inside ->lock_for_scope: + $self->{did_quit} = 1; + $self->{pop3d}->unlock_mailbox($self); + } + $self->write(\"+OK public-inbox POP3 server signing off\r\n"); + $self->shutdn; + undef; +} + +# returns 1 if we can continue, 0 if not due to buffered writes or disconnect +sub process_line ($$) { + my ($self, $l) = @_; + my ($req, @args) = split(/[ \t]+/, $l); + return 1 unless defined($req); # skip blank line + $req = $self->can('cmd_'.lc($req)); + my $res = $req ? eval { $req->($self, @args) } : + \"-ERR command not recognized\r\n"; + my $err = $@; + if ($err && $self->{sock}) { + $l =~ s/\r?\n//s; + warn("error from: $l ($err)\n"); + $res = \"-ERR program fault - command not performed\r\n"; + } + defined($res) ? $self->write($res) : 0; +} + +# callback used by PublicInbox::DS for any (e)poll (in/out/hup/err) +sub event_step { + my ($self) = @_; + local $SIG{__WARN__} = $self->{pop3d}->{warn_cb}; + return unless $self->flush_write && $self->{sock} && !$self->{long_cb}; + + # only read more requests if we've drained the write buffer, + # otherwise we can be buffering infinitely w/o backpressure + my $rbuf = $self->{rbuf} // \(my $x = ''); + my $line = index($$rbuf, "\n"); + while ($line < 0) { + return $self->close if length($$rbuf) >= LINE_MAX; + $self->do_read($rbuf, LINE_MAX, length($$rbuf)) or return; + $line = index($$rbuf, "\n"); + } + $line = substr($$rbuf, 0, $line + 1, ''); + $line =~ s/\r?\n\z//s; + return $self->close if $line =~ /[[:cntrl:]]/s; + my $t0 = now(); + my $fd = fileno($self->{sock}); # may become invalid after process_line + my $r = eval { process_line($self, $line) }; + my $pending = $self->{wbuf} ? ' pending' : ''; + out($self, "[$fd] %s - %0.6f$pending - $r", $line, now() - $t0); + return $self->close if $r < 0; + $self->rbuf_idle($rbuf); + + # maybe there's more pipelined data, or we'll have + # to register it for socket-readiness notifications + $self->requeue unless $pending; +} + +no warnings 'once'; +*cmd_top = \&cmd_retr; + +1; diff --git a/lib/PublicInbox/POP3D.pm b/lib/PublicInbox/POP3D.pm new file mode 100644 index 00000000..bd440434 --- /dev/null +++ b/lib/PublicInbox/POP3D.pm @@ -0,0 +1,277 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# represents an POP3D +package PublicInbox::POP3D; +use v5.12; +use parent qw(PublicInbox::Lock); +use DBI qw(:sql_types); # SQL_BLOB +use Carp (); +use File::Temp 0.19 (); # 0.19 for ->newdir +use PublicInbox::Config; +use PublicInbox::POP3; +use PublicInbox::Syscall; +use File::Temp 0.19 (); # 0.19 for ->newdir +use Fcntl qw(F_SETLK F_UNLCK F_WRLCK SEEK_SET); +my ($FLOCK_TMPL, @FLOCK_ORDER); +# are all BSDs the same "struct flock"? tested Free+Net+Open... +if ($^O =~ /\A(?:linux|dragonfly)\z/ || $^O =~ /bsd/) { + require Config; + my $off_t; + my @LE_pad = ('', ''); + my $sz = $Config::Config{lseeksize}; + if ($sz == 8) { + if (eval('length(pack("q", 1)) == 8')) { + $off_t = 'q'; + } elsif ($Config::Config{byteorder} == 1234) { # OpenBSD i386 + $off_t = 'l'; + @LE_pad = ('@8', '@16'); + } else { # I have no 32-bit BE machine to test on... + warn <<EOM; +Perl built with 64-bit file support but not 64-bit int (pack("q")) support. +byteorder=$Config::Config{byteorder} +EOM + } + } elsif ($sz == 4) { + $off_t = 'l'; + } else { + warn "sizeof(off_t)=$sz requires File::FcntlLock\n" + } + if (defined($off_t)) { + if ($^O eq 'linux') { + $FLOCK_TMPL = 'ss@8'.$off_t.$LE_pad[0].$off_t.'@32'; + @FLOCK_ORDER = qw(l_type l_whence l_start l_len); + } else { # *bsd including dragonfly + $FLOCK_TMPL = $off_t.$LE_pad[0].$off_t.$LE_pad[1]. + 'lss@256'; + @FLOCK_ORDER = qw(l_start l_len l_pid l_type l_whence); + } + } +} +@FLOCK_ORDER or eval { require File::FcntlLock } or + die "File::FcntlLock required for POP3 on $^O: $@\n"; + +sub new { + my ($cls) = @_; + bless { + err => \*STDERR, + out => \*STDOUT, + # pi_cfg => PublicInbox::Config + # lock_path => ... + # interprocess lock is the $pop3state/txn.locks file + # txn_locks => {}, # intraworker locks + # ssl_ctx_opt => { SSL_cert_file => ..., SSL_key_file => ... } + }, $cls; +} + +sub refresh_groups { # PublicInbox::Daemon callback + my ($self, $sig) = @_; + # TODO share pi_cfg with nntpd/imapd inside -netd + my $new = PublicInbox::Config->new; + my $d = $new->{'publicinbox.pop3state'} // + die "publicinbox.pop3state undefined ($new->{-f})\n"; + -d $d or do { + require File::Path; + File::Path::make_path($d, { mode => 0700 }); + PublicInbox::Syscall::nodatacow_dir($d); + }; + $self->{lock_path} //= "$d/db.lock"; + if (my $old = $self->{pi_cfg}) { + my $s = 'publicinbox.pop3state'; + $new->{$s} //= $old->{$s}; + return warn <<EOM if $new->{$s} ne $old->{$s}; +$s changed: `$old->{$s}' => `$new->{$s}', config reload ignored +EOM + } + $self->{pi_cfg} = $new; +} + +# persistent tables +sub create_state_tables ($$) { + my ($self, $dbh) = @_; + + $dbh->do(<<''); # map publicinbox.<name>.newsgroup to integers +CREATE TABLE IF NOT EXISTS newsgroups ( + newsgroup_id INTEGER PRIMARY KEY NOT NULL, + newsgroup VARBINARY NOT NULL, + UNIQUE (newsgroup) ) + + # the $NEWSGROUP_NAME.$SLICE_INDEX is part of the POP3 username; + # POP3 has no concept of folders/mailboxes like IMAP/JMAP + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS mailboxes ( + mailbox_id INTEGER PRIMARY KEY NOT NULL, + newsgroup_id INTEGER NOT NULL REFERENCES newsgroups, + slice INTEGER NOT NULL, /* -1 for most recent slice */ + UNIQUE (newsgroup_id, slice) ) + + $dbh->do(<<''); # actual users are differentiated by their UUID +CREATE TABLE IF NOT EXISTS users ( + user_id INTEGER PRIMARY KEY NOT NULL, + uuid VARBINARY NOT NULL, + last_seen INTEGER NOT NULL, /* to expire idle accounts */ + UNIQUE (uuid) ) + + # we only track the highest-numbered deleted message per-UUID@mailbox + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS deletes ( + txn_id INTEGER PRIMARY KEY NOT NULL, /* -1 == txn lock offset */ + user_id INTEGER NOT NULL REFERENCES users, + mailbox_id INTEGER NOT NULL REFERENCES mailboxes, + uid_dele INTEGER NOT NULL DEFAULT -1, /* IMAP UID, NNTP article */ + UNIQUE(user_id, mailbox_id) ) + +} + +sub state_dbh_new { + my ($self) = @_; + my $f = "$self->{pi_cfg}->{'publicinbox.pop3state'}/db.sqlite3"; + my $creat = !-s $f; + if ($creat) { + open my $fh, '+>>', $f or Carp::croak "open($f): $!"; + PublicInbox::Syscall::nodatacow_fh($fh); + } + + my $dbh = DBI->connect("dbi:SQLite:dbname=$f",'','', { + AutoCommit => 1, + RaiseError => 1, + PrintError => 0, + sqlite_use_immediate_transaction => 1, + sqlite_see_if_its_a_number => 1, + }); + $dbh->do('PRAGMA journal_mode = WAL') if $creat; + $dbh->do('PRAGMA foreign_keys = ON'); # don't forget this + + # ensure the interprocess fcntl lock file exists + $f = "$self->{pi_cfg}->{'publicinbox.pop3state'}/txn.locks"; + open my $fh, '+>>', $f or Carp::croak("open($f): $!"); + $self->{txn_fh} = $fh; + + create_state_tables($self, $dbh); + $dbh; +} + +sub _setlk ($%) { + my ($self, %lk) = @_; + $lk{l_pid} = 0; # needed for *BSD + $lk{l_whence} = SEEK_SET; + if (@FLOCK_ORDER) { + fcntl($self->{txn_fh}, F_SETLK, + pack($FLOCK_TMPL, @lk{@FLOCK_ORDER})); + } else { + my $fs = File::FcntlLock->new(%lk); + $fs->lock($self->{txn_fh}, F_SETLK); + } +} + +sub lock_mailbox { + my ($self, $pop3) = @_; # pop3 - PublicInbox::POP3 client object + my $lk = $self->lock_for_scope; # lock the SQLite DB, only + my $dbh = $self->{-state_dbh} //= state_dbh_new($self); + my ($user_id, $ngid, $mbid, $txn_id); + my $uuid = delete $pop3->{uuid}; + $dbh->begin_work; + my $creat = 0; + + # 1. make sure the user exists, update `last_seen' + my $sth = $dbh->prepare_cached(<<''); +INSERT OR IGNORE INTO users (uuid, last_seen) VALUES (?,?) + + $sth->bind_param(1, $uuid, SQL_BLOB); + $sth->bind_param(2, time); + if ($sth->execute == 0) { # existing user + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT user_id FROM users WHERE uuid = ? + + $sth->bind_param(1, $uuid, SQL_BLOB); + $sth->execute; + $user_id = $sth->fetchrow_array // + die 'BUG: user '.unpack('H*', $uuid).' not found'; + $sth = $dbh->prepare_cached(<<''); +UPDATE users SET last_seen = ? WHERE user_id = ? + + $sth->execute(time, $user_id); + } else { # new user + $user_id = $dbh->last_insert_id(undef, undef, + 'users', 'user_id') + } + + # 2. make sure the newsgroup has an integer ID + $sth = $dbh->prepare_cached(<<''); +INSERT OR IGNORE INTO newsgroups (newsgroup) VALUES (?) + + my $ng = $pop3->{ibx}->{newsgroup}; + $sth->bind_param(1, $ng, SQL_BLOB); + if ($sth->execute == 0) { + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT newsgroup_id FROM newsgroups WHERE newsgroup = ? + + $sth->bind_param(1, $ng, SQL_BLOB); + $sth->execute; + $ngid = $sth->fetchrow_array // die "BUG: `$ng' not found"; + } else { + $ngid = $dbh->last_insert_id(undef, undef, + 'newsgroups', 'newsgroup_id'); + } + + # 3. ensure the mailbox exists + $sth = $dbh->prepare_cached(<<''); +INSERT OR IGNORE INTO mailboxes (newsgroup_id, slice) VALUES (?,?) + + if ($sth->execute($ngid, $pop3->{slice}) == 0) { + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT mailbox_id FROM mailboxes WHERE newsgroup_id = ? AND slice = ? + + $sth->execute($ngid, $pop3->{slice}); + $mbid = $sth->fetchrow_array // + die "BUG: mailbox_id for $ng.$pop3->{slice} not found"; + } else { + $mbid = $dbh->last_insert_id(undef, undef, + 'mailboxes', 'mailbox_id'); + } + + # 4. ensure the (max) deletes row exists for locking + $sth = $dbh->prepare_cached(<<''); +INSERT OR IGNORE INTO deletes (user_id,mailbox_id) VALUES (?,?) + + if ($sth->execute($user_id, $mbid) == 0) { # fetching into existing + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT txn_id,uid_dele FROM deletes WHERE user_id = ? AND mailbox_id = ? + + $sth->execute($user_id, $mbid); + ($txn_id, $pop3->{uid_dele}) = $sth->fetchrow_array; + } else { # new user/mailbox combo + $txn_id = $dbh->last_insert_id(undef, undef, + 'deletes', 'txn_id'); + } + $dbh->commit; + + # see if it's locked by the same worker: + return if $self->{txn_locks}->{$txn_id}; + + # see if it's locked by another worker: + _setlk($self, l_type => F_WRLCK, l_start => $txn_id - 1, l_len => 1) + or return; + + $pop3->{user_id} = $user_id; + $pop3->{txn_id} = $txn_id; + $self->{txn_locks}->{$txn_id} = 1; +} + +sub unlock_mailbox { + my ($self, $pop3) = @_; + my $txn_id = delete($pop3->{txn_id}) // return; + if (!$pop3->{did_quit}) { # deal with QUIT-less disconnects + my $lk = $self->lock_for_scope; + $self->{-state_dbh}->begin_work; + $pop3->__cleanup_state($txn_id); + $self->{-state_dbh}->commit; + } + delete $self->{txn_locks}->{$txn_id}; # same worker + + # other workers + _setlk($self, l_type => F_UNLCK, l_start => $txn_id - 1, l_len => 1) + or die "F_UNLCK: $!"; +} + +1; diff --git a/lib/PublicInbox/PktOp.pm b/lib/PublicInbox/PktOp.pm new file mode 100644 index 00000000..1bcdd799 --- /dev/null +++ b/lib/PublicInbox/PktOp.pm @@ -0,0 +1,72 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# op dispatch socket, reads a message, runs a sub +# There may be multiple producers, but (for now) only one consumer +# Used for lei_xsearch and maybe other things +# "command" => [ $sub, @fixed_operands ] +package PublicInbox::PktOp; +use v5.12; +use parent qw(PublicInbox::DS); +use Errno qw(EAGAIN ECONNRESET); +use PublicInbox::Syscall qw(EPOLLIN); +use Socket qw(AF_UNIX SOCK_SEQPACKET); +use PublicInbox::IPC qw(ipc_freeze ipc_thaw); +use Scalar::Util qw(blessed); + +sub new { + my ($cls, $r) = @_; + my $self = bless { sock => $r }, $cls; + $r->blocking(0); + $self->SUPER::new($r, EPOLLIN); +} + +# returns a blessed objects as the consumer and producer +sub pair { + my ($cls) = @_; + my ($c, $p); + socketpair($c, $p, AF_UNIX, SOCK_SEQPACKET, 0) or die "socketpair: $!"; + (new($cls, $c), bless { op_p => $p }, $cls); +} + +sub pkt_do { # for the producer to trigger event_step in consumer + my ($self, $cmd, @args) = @_; + send($self->{op_p}, @args ? "$cmd\0".ipc_freeze(\@args) : $cmd, 0) +} + +sub event_step { + my ($self) = @_; + my $c = $self->{sock}; + my $n = recv($c, my $msg, 4096, 0); + unless (defined $n) { + return if $! == EAGAIN; + die "recv: $!" if $! != ECONNRESET; # we may be bidirectional + } + my ($cmd, @pargs); + if (index($msg, "\0") > 0) { + ($cmd, my $pargs) = split(/\0/, $msg, 2); + @pargs = @{ipc_thaw($pargs)}; + } else { + # for compatibility with the script/lei in client mode, + # it doesn't load Sereal||Storable for startup speed + ($cmd, @pargs) = split(/ /, $msg); + } + my $op = $self->{ops}->{$cmd //= $msg}; + if ($op) { + my ($obj, @args) = (@$op, @pargs); + if (blessed($args[0]) && $args[0]->can('do_env')) { + my $lei = shift @args; + $lei->do_env($obj, @args); + } elsif (blessed($obj)) { + $obj->can('do_env') ? $obj->do_env($cmd, @args) + : $obj->$cmd(@args); + } else { + $obj->(@args); + } + } elsif ($msg ne '') { + die "BUG: unknown message: `$cmd'"; + } + $self->close if $msg eq ''; # close on EOF +} + +1; diff --git a/lib/PublicInbox/ProcessPipe.pm b/lib/PublicInbox/ProcessPipe.pm deleted file mode 100644 index 2ce7eb8f..00000000 --- a/lib/PublicInbox/ProcessPipe.pm +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> -# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> - -# a tied handle for auto reaping of children tied to a pipe, see perltie(1) -package PublicInbox::ProcessPipe; -use strict; -use warnings; - -sub TIEHANDLE { - my ($class, $pid, $fh) = @_; - bless { pid => $pid, fh => $fh }, $class; -} - -sub READ { read($_[0]->{fh}, $_[1], $_[2], $_[3] || 0) } - -sub READLINE { readline($_[0]->{fh}) } - -sub CLOSE { - my $fh = delete($_[0]->{fh}); - my $ret = defined $fh ? close($fh) : ''; - my $pid = delete $_[0]->{pid}; - if (defined $pid) { - # PublicInbox::DS may not be loaded - eval { PublicInbox::DS::dwaitpid($pid, undef, undef) }; - - if ($@) { # ok, not in the event loop, work synchronously - waitpid($pid, 0); - $ret = '' if $?; - } - } - $ret; -} - -sub FILENO { fileno($_[0]->{fh}) } - -sub DESTROY { - CLOSE(@_); - undef; -} - -sub pid { $_[0]->{pid} } - -1; diff --git a/lib/PublicInbox/Qspawn.pm b/lib/PublicInbox/Qspawn.pm index 88b6d390..0bf857c6 100644 --- a/lib/PublicInbox/Qspawn.pm +++ b/lib/PublicInbox/Qspawn.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Like most Perl modules in public-inbox, this is internal and @@ -12,21 +12,28 @@ # operate in. This can be useful to ensure smaller inboxes can # be cloned while cloning of large inboxes is maxed out. # -# This does not depend on PublicInbox::DS or any other external -# scheduling mechanism, you just need to call start() and finish() -# appropriately. However, public-inbox-httpd (which uses PublicInbox::DS) -# will be able to schedule this based on readability of stdout from -# the spawned process. See GitHTTPBackend.pm and SolverGit.pm for -# usage examples. It does not depend on any form of threading. +# This does not depend on the PublicInbox::DS::event_loop or any +# other external scheduling mechanism, you just need to call +# start() and finish() appropriately. However, public-inbox-httpd +# (which uses PublicInbox::DS) will be able to schedule this +# based on readability of stdout from the spawned process. +# See GitHTTPBackend.pm and SolverGit.pm for usage examples. +# It does not depend on any form of threading. # # This is useful for scheduling CGI execution of both long-lived # git-http-backend(1) process (for "git clone") as well as short-lived # processes such as git-apply(1). package PublicInbox::Qspawn; -use strict; +use v5.12; use PublicInbox::Spawn qw(popen_rd); use PublicInbox::GzipFilter; +use Scalar::Util qw(blessed); +use PublicInbox::Limiter; +use PublicInbox::Aspawn qw(run_await); +use PublicInbox::Syscall qw(EPOLLIN); +use PublicInbox::InputPipe; +use Carp qw(carp confess); # n.b.: we get EAGAIN with public-inbox-httpd, and EINTR on other PSGI servers use Errno qw(EAGAIN EINTR); @@ -37,57 +44,45 @@ my $def_limiter; # $cmd is the command to spawn # $cmd_env is the environ for the child process (not PSGI env) # $opt can include redirects and perhaps other process spawning options -sub new ($$$;) { +# {qsp_err} is an optional error buffer callers may access themselves +sub new { my ($class, $cmd, $cmd_env, $opt) = @_; - bless { args => [ $cmd, $cmd_env, $opt ] }, $class; + bless { args => [ $cmd, $cmd_env, $opt ? { %$opt } : {} ] }, $class; } sub _do_spawn { my ($self, $start_cb, $limiter) = @_; - my $err; - my ($cmd, $cmd_env, $opt) = @{delete $self->{args}}; + my ($cmd, $cmd_env, $opt) = @{$self->{args}}; my %o = %{$opt || {}}; $self->{limiter} = $limiter; - foreach my $k (@PublicInbox::Spawn::RLIMITS) { - if (defined(my $rlimit = $limiter->{$k})) { - $o{$k} = $rlimit; - } + for my $k (@PublicInbox::Spawn::RLIMITS) { + $opt->{$k} = $limiter->{$k} // next; + } + $self->{-quiet} = 1 if $o{quiet}; + $limiter->{running}++; + if ($start_cb) { + eval { # popen_rd may die on EMFILE, ENFILE + $self->{rpipe} = popen_rd($cmd, $cmd_env, $opt, + \&waitpid_err, $self); + $start_cb->($self); # EPOLL_CTL_ADD may ENOSPC/ENOMEM + }; + } else { + eval { run_await($cmd, $cmd_env, $opt, \&wait_await, $self) }; + warn "E: $@" if $@; } - $self->{cmd} = $o{quiet} ? undef : $cmd; - eval { - # popen_rd may die on EMFILE, ENFILE - ($self->{rpipe}, $self->{pid}) = popen_rd($cmd, $cmd_env, \%o); - - die "E: $!" unless defined($self->{pid}); - - $limiter->{running}++; - $start_cb->($self); # EPOLL_CTL_ADD may ENOSPC/ENOMEM - }; finish($self, $@) if $@; } -sub child_err ($) { - my ($child_error) = @_; # typically $? - my $exitstatus = ($child_error >> 8) or return; - my $sig = $child_error & 127; - my $msg = "exit status=$exitstatus"; - $msg .= " signal=$sig" if $sig; - $msg; -} - -sub log_err ($$) { - my ($env, $msg) = @_; - $env->{'psgi.errors'}->print($msg, "\n"); +sub psgi_status_err { # Qspawn itself is useful w/o PSGI + require PublicInbox::WwwStatic; + PublicInbox::WwwStatic::r($_[0] // 500); } -sub finalize ($$) { - my ($self, $err) = @_; - - my ($env, $qx_cb, $qx_arg, $qx_buf) = - delete @$self{qw(psgi_env qx_cb qx_arg qx_buf)}; +sub finalize ($) { + my ($self) = @_; - # done, spawn whatever's in the queue - my $limiter = $self->{limiter}; + # process is done, spawn whatever's in the queue + my $limiter = delete $self->{limiter} or return; my $running = --$limiter->{running}; if ($running < $limiter->{max}) { @@ -95,61 +90,69 @@ sub finalize ($$) { _do_spawn(@$next, $limiter); } } - - if ($err) { - if (defined $self->{err}) { - $self->{err} .= "; $err"; - } else { - $self->{err} = $err; - } - if ($env && $self->{cmd}) { - log_err($env, join(' ', @{$self->{cmd}}) . ": $err"); + if (my $err = $self->{_err}) { # set by finish or waitpid_err + utf8::decode($err); + if (my $dst = $self->{qsp_err}) { + $$dst .= $$dst ? " $err" : "; $err"; } + warn "E: @{$self->{args}->[0]}: $err\n" if !$self->{-quiet}; } - if ($qx_cb) { - eval { $qx_cb->($qx_buf, $qx_arg) }; - } elsif (my $wcb = delete $env->{'qspawn.wcb'}) { + + my ($env, $qx_cb_arg) = delete @$self{qw(psgi_env qx_cb_arg)}; + if ($qx_cb_arg) { + my $cb = shift @$qx_cb_arg; + eval { $cb->($self->{args}->[2]->{1}, @$qx_cb_arg) }; + return unless $@; + warn "E: $@"; # hope qspawn.wcb can handle it + } + return if $self->{passed}; # another command chained it + if (my $wcb = delete $env->{'qspawn.wcb'}) { # have we started writing, yet? - require PublicInbox::WwwStatic; - $wcb->(PublicInbox::WwwStatic::r(500)); + $wcb->(psgi_status_err($env->{'qspawn.fallback'})); } } -# callback for dwaitpid -sub waitpid_err ($$) { - my ($self, $pid) = @_; - my $xpid = delete $self->{pid}; - my $err; - if (defined $pid) { - if ($pid > 0) { # success! - $err = child_err($?); - } elsif ($pid < 0) { # ??? does this happen in our case? - $err = "W: waitpid($xpid, 0) => $pid: $!"; - } # else should not be called with pid == 0 +sub waitpid_err { # callback for awaitpid + my (undef, $self) = @_; # $_[0]: pid + $self->{_err} = ''; # for defined check in ->finish + if ($?) { # XXX this may be redundant + my $status = $? >> 8; + my $sig = $? & 127; + $self->{_err} .= "exit status=$status"; + $self->{_err} .= " signal=$sig" if $sig; } - finalize($self, $err); + finalize($self) if !$self->{rpipe}; } -sub do_waitpid ($) { - my ($self) = @_; - my $pid = $self->{pid}; - # PublicInbox::DS may not be loaded - eval { PublicInbox::DS::dwaitpid($pid, \&waitpid_err, $self) }; - # done if we're running in PublicInbox::DS::EventLoop - if ($@) { - # non public-inbox-{httpd,nntpd} callers may block: - my $ret = waitpid($pid, 0); - waitpid_err($self, $ret); - } +sub wait_await { # run_await cb + my ($pid, $cmd, $cmd_env, $opt, $self) = @_; + waitpid_err($pid, $self); +} + +sub yield_chunk { # $_[-1] is sysread buffer (or undef) + my ($self, $ipipe) = @_; + if (!defined($_[-1])) { + warn "error reading body: $!"; + } elsif ($_[-1] eq '') { # normal EOF + $self->finish; + $self->{qfh}->close; + } elsif (defined($self->{qfh}->write($_[-1]))) { + return; # continue while HTTP client is reading our writes + } # else { # HTTP client disconnected + delete $self->{rpipe}; + $ipipe->close; } sub finish ($;$) { my ($self, $err) = @_; - if (delete $self->{rpipe}) { - do_waitpid($self); - } else { - finalize($self, $err); - } + $self->{_err} //= $err; # only for $@ + + # we can safely finalize if pipe was closed before, or if + # {_err} is defined by waitpid_err. Deleting {rpipe} will + # trigger PublicInbox::IO::DESTROY -> waitpid_err, + # but it may not fire right away if inside the event loop. + my $closed_before = !delete($self->{rpipe}); + finalize($self) if $closed_before || defined($self->{_err}); } sub start ($$$) { @@ -161,138 +164,92 @@ sub start ($$$) { } } -sub psgi_qx_init_cb { - my ($self) = @_; - my $async = delete $self->{async}; - my ($r, $buf); - my $qx_fh = $self->{qx_fh}; -reread: - $r = sysread($self->{rpipe}, $buf, 65536); - if ($async) { - $async->async_pass($self->{psgi_env}->{'psgix.io'}, - $qx_fh, \$buf); - } elsif (defined $r) { - $r ? (print $qx_fh $buf) : event_step($self, undef); - } else { - return if $! == EAGAIN; # try again when notified - goto reread if $! == EINTR; - event_step($self, $!); - } -} - -sub psgi_qx_start { - my ($self) = @_; - if (my $async = $self->{psgi_env}->{'pi-httpd.async'}) { - # PublicInbox::HTTPD::Async->new(rpipe, $cb, cb_arg, $end_obj) - $self->{async} = $async->($self->{rpipe}, - \&psgi_qx_init_cb, $self, $self); - # init_cb will call ->async_pass or ->close - } else { # generic PSGI - psgi_qx_init_cb($self) while $self->{qx_fh}; - } -} - -# Similar to `backtick` or "qx" ("perldoc -f qx"), it calls $qx_cb with +# Similar to `backtick` or "qx" ("perldoc -f qx"), it calls @qx_cb_arg with # the stdout of the given command when done; but respects the given limiter # $env is the PSGI env. As with ``/qx; only use this when output is small # and safe to slurp. sub psgi_qx { - my ($self, $env, $limiter, $qx_cb, $qx_arg) = @_; + my ($self, $env, $limiter, @qx_cb_arg) = @_; $self->{psgi_env} = $env; - my $qx_buf = ''; - open(my $qx_fh, '+>', \$qx_buf) or die; # PerlIO::scalar - $self->{qx_cb} = $qx_cb; - $self->{qx_arg} = $qx_arg; - $self->{qx_fh} = $qx_fh; - $self->{qx_buf} = \$qx_buf; - $limiter ||= $def_limiter ||= PublicInbox::Qspawn::Limiter->new(32); - start($self, $limiter, \&psgi_qx_start); + $self->{qx_cb_arg} = \@qx_cb_arg; + $limiter ||= $def_limiter ||= PublicInbox::Limiter->new(32); + start($self, $limiter, undef); } -# this is called on pipe EOF to reap the process, may be called -# via PublicInbox::DS event loop OR via GetlineBody for generic -# PSGI servers. -sub event_step { - my ($self, $err) = @_; # $err: $! - log_err($self->{psgi_env}, "psgi_{return,qx} $err") if defined($err); - finish($self); - my ($fh, $qx_fh) = delete(@$self{qw(fh qx_fh)}); - $fh->close if $fh; # async-only (psgi_return) -} +sub yield_pass { + my ($self, $ipipe, $res) = @_; # $ipipe = InputPipe + my $env = $self->{psgi_env}; + my $wcb = delete $env->{'qspawn.wcb'} // confess('BUG: no qspawn.wcb'); + if (ref($res) eq 'CODE') { # chain another command + delete $self->{rpipe}; + $ipipe->close if $ipipe; + $res->($wcb); + $self->{passed} = 1; + return; # all done + } + confess("BUG: $res unhandled") if ref($res) ne 'ARRAY'; -sub rd_hdr ($) { - my ($self) = @_; - # typically used for reading CGI headers - # we must loop until EAGAIN for EPOLLET in HTTPD/Async.pm - # We also need to check EINTR for generic PSGI servers. - my $ret; - my $total_rd = 0; - my $hdr_buf = $self->{hdr_buf}; - my ($ph_cb, $ph_arg) = @{$self->{parse_hdr}}; - do { - my $r = sysread($self->{rpipe}, $$hdr_buf, 4096, - length($$hdr_buf)); - if (defined($r)) { - $total_rd += $r; - eval { $ret = $ph_cb->($total_rd, $hdr_buf, $ph_arg) }; - if ($@) { - log_err($self->{psgi_env}, "parse_hdr: $@"); - $ret = [ 500, [], [ "Internal error\n" ] ]; - } - } else { - # caller should notify us when it's ready: - return if $! == EAGAIN; - next if $! == EINTR; # immediate retry - log_err($self->{psgi_env}, "error reading header: $!"); - $ret = [ 500, [], [ "Internal error\n" ] ]; - } - } until (defined $ret); - delete $self->{parse_hdr}; # done parsing headers - $ret; + my $filter = blessed($res->[2]) && $res->[2]->can('attach') ? + pop(@$res) : delete($env->{'qspawn.filter'}); + $filter //= PublicInbox::GzipFilter::qsp_maybe($res->[1], $env); + + if (scalar(@$res) == 3) { # done early (likely error or static file) + delete $self->{rpipe}; + $ipipe->close if $ipipe; + $wcb->($res); # all done + return; + } + scalar(@$res) == 2 or confess("BUG: scalar(res) != 2: @$res"); + return ($wcb, $filter) if !$ipipe; # generic PSGI + # streaming response + my $qfh = $wcb->($res); # get PublicInbox::HTTP::(Chunked|Identity) + $qfh = $filter->attach($qfh) if $filter; + my ($bref) = @{delete $self->{yield_parse_hdr}}; + $qfh->write($$bref) if $$bref ne ''; + $self->{qfh} = $qfh; # keep $ipipe open } -sub psgi_return_init_cb { +sub parse_hdr_done ($$) { my ($self) = @_; - my $r = rd_hdr($self) or return; - my $env = $self->{psgi_env}; - my $filter = delete $env->{'qspawn.filter'} // - PublicInbox::GzipFilter::qsp_maybe($r->[1], $env); - - my $wcb = delete $env->{'qspawn.wcb'}; - my $async = delete $self->{async}; - if (scalar(@$r) == 3) { # error - if ($async) { - # calls rpipe->close && ->event_step - $async->close; - } else { - $self->{rpipe}->close; - event_step($self); + my ($ret, $err); + if (defined $_[-1]) { + my ($bref, $ph_cb, @ph_arg) = @{$self->{yield_parse_hdr}}; + $$bref .= $_[-1]; + $ret = eval { $ph_cb->(length($_[-1]), $bref, @ph_arg) }; + if (($err = $@)) { + $ret = psgi_status_err(); + } elsif (!$ret && $_[-1] eq '') { + $err = 'EOF'; + $ret = psgi_status_err(); } - $wcb->($r); - } elsif ($async) { - # done reading headers, handoff to read body - my $fh = $wcb->($r); # scalar @$r == 2 - $fh = $filter->attach($fh) if $filter; - $self->{fh} = $fh; - $async->async_pass($env->{'psgix.io'}, $fh, - delete($self->{hdr_buf})); - } else { # for synchronous PSGI servers - require PublicInbox::GetlineBody; - $r->[2] = PublicInbox::GetlineBody->new($self->{rpipe}, - \&event_step, $self, - ${$self->{hdr_buf}}, $filter); - $wcb->($r); + } else { + $err = "$!"; + $ret = psgi_status_err(); } + carp <<EOM if $err; +E: $err @{$self->{args}->[0]} ($self->{psgi_env}->{REQUEST_URI}) +EOM + $ret; # undef if headers incomplete } -sub psgi_return_start { # may run later, much later... +sub ipipe_cb { # InputPipe callback + my ($ipipe, $self) = @_; # $_[-1] rbuf + if ($self->{qfh}) { # already streaming + yield_chunk($self, $ipipe, $_[-1]); + } elsif (my $res = parse_hdr_done($self, $_[-1])) { + yield_pass($self, $ipipe, $res); + } # else: headers incomplete, keep reading +} + +sub _yield_start { # may run later, much later... my ($self) = @_; - if (my $async = $self->{psgi_env}->{'pi-httpd.async'}) { - # PublicInbox::HTTPD::Async->new(rpipe, $cb, $cb_arg, $end_obj) - $self->{async} = $async->($self->{rpipe}, - \&psgi_return_init_cb, $self, $self); - } else { # generic PSGI - psgi_return_init_cb($self) while $self->{parse_hdr}; + if ($self->{psgi_env}->{'pi-httpd.async'}) { + my $rpipe = $self->{rpipe}; + $rpipe->blocking(0); + PublicInbox::InputPipe::consume($rpipe, \&ipipe_cb, $self); + } else { + require PublicInbox::GetlineResponse; + PublicInbox::GetlineResponse::response($self); } } @@ -303,7 +260,7 @@ sub psgi_return_start { # may run later, much later... # $env->{'qspawn.wcb'} - the write callback from the PSGI server # optional, use this if you've already # captured it elsewhere. If not given, -# psgi_return will return an anonymous +# psgi_yield will return an anonymous # sub for the PSGI server to call # # $env->{'qspawn.filter'} - filter object, responds to ->attach for @@ -312,76 +269,33 @@ sub psgi_return_start { # may run later, much later... # # $limiter - the Limiter object to use (uses the def_limiter if not given) # -# $parse_hdr - Initial read function; often for parsing CGI header output. +# @parse_hdr_arg - Initial read cb+args; often for parsing CGI header output. # It will be given the return value of sysread from the pipe # and a string ref of the current buffer. Returns an arrayref # for PSGI responses. 2-element arrays in PSGI mean the # body will be streamed, later, via writes (push-based) to # psgix.io. 3-element arrays means the body is available # immediately (or streamed via ->getline (pull-based)). -sub psgi_return { - my ($self, $env, $limiter, $parse_hdr, $hdr_arg) = @_; + +sub psgi_yield { + my ($self, $env, $limiter, @parse_hdr_arg)= @_; $self->{psgi_env} = $env; - $self->{hdr_buf} = \(my $hdr_buf = ''); - $self->{parse_hdr} = [ $parse_hdr, $hdr_arg ]; - $limiter ||= $def_limiter ||= PublicInbox::Qspawn::Limiter->new(32); + $self->{yield_parse_hdr} = [ \(my $buf = ''), @parse_hdr_arg ]; + $limiter ||= $def_limiter ||= PublicInbox::Limiter->new(32); # the caller already captured the PSGI write callback from # the PSGI server, so we can call ->start, here: - $env->{'qspawn.wcb'} and - return start($self, $limiter, \&psgi_return_start); - - # the caller will return this sub to the PSGI server, so - # it can set the response callback (that is, for - # PublicInbox::HTTP, the chunked_wcb or identity_wcb callback), - # but other HTTP servers are supported: - sub { + $env->{'qspawn.wcb'} ? start($self, $limiter, \&_yield_start) : sub { + # the caller will return this sub to the PSGI server, so + # it can set the response callback (that is, for + # PublicInbox::HTTP, the chunked_wcb or identity_wcb callback), + # but other HTTP servers are supported: $env->{'qspawn.wcb'} = $_[0]; - start($self, $limiter, \&psgi_return_start); + start($self, $limiter, \&_yield_start); } } -package PublicInbox::Qspawn::Limiter; -use strict; -use warnings; - -sub new { - my ($class, $max) = @_; - bless { - # 32 is same as the git-daemon connection limit - max => $max || 32, - running => 0, - run_queue => [], - # RLIMIT_CPU => undef, - # RLIMIT_DATA => undef, - # RLIMIT_CORE => undef, - }, $class; -} - -sub setup_rlimit { - my ($self, $name, $config) = @_; - foreach my $rlim (@PublicInbox::Spawn::RLIMITS) { - my $k = lc($rlim); - $k =~ tr/_//d; - $k = "publicinboxlimiter.$name.$k"; - defined(my $v = $config->{$k}) or next; - my @rlimit = split(/\s*,\s*/, $v); - if (scalar(@rlimit) == 1) { - push @rlimit, $rlimit[0]; - } elsif (scalar(@rlimit) != 2) { - warn "could not parse $k: $v\n"; - } - eval { require BSD::Resource }; - if ($@) { - warn "BSD::Resource missing for $rlim"; - next; - } - foreach my $i (0..$#rlimit) { - next if $rlimit[$i] ne 'INFINITY'; - $rlimit[$i] = BSD::Resource::RLIM_INFINITY(); - } - $self->{$rlim} = \@rlimit; - } -} +no warnings 'once'; +*DESTROY = \&finalize; # ->finalize is idempotent 1; diff --git a/lib/PublicInbox/Reply.pm b/lib/PublicInbox/Reply.pm index 5058ff8c..091f20bc 100644 --- a/lib/PublicInbox/Reply.pm +++ b/lib/PublicInbox/Reply.pm @@ -1,23 +1,17 @@ -# Copyright (C) 2014-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # For reply instructions and address generation in WWW UI package PublicInbox::Reply; use strict; -use warnings; +use v5.10.1; use URI::Escape qw/uri_escape_utf8/; use PublicInbox::Hval qw(ascii_html obfuscate_addrs mid_href); use PublicInbox::Address; use PublicInbox::MID qw(mid_clean); +use PublicInbox::Config; -sub squote_maybe ($) { - my ($val) = @_; - if ($val =~ m{([^\w@\./,\%\+\-])}) { - $val =~ s/(['!])/'\\$1'/g; # '!' for csh - return "'$val'"; - } - $val; -} +*squote_maybe = \&PublicInbox::Config::squote_maybe; sub add_addrs { my ($to, $cc, @addrs) = @_; @@ -34,7 +28,7 @@ my $reply_headers = join('|', @reply_headers); sub mailto_arg_link { my ($ibx, $hdr) = @_; my $cc = {}; # everyone else - my $to; # this is the From address by defaultq + my $to; # this is the From address by default my $reply_to_all = 'reply-to-all'; # the only good default :P my $reply_to_cfg = $ibx->{replyto}; @@ -77,6 +71,7 @@ sub mailto_arg_link { my $mid = $hdr->header_raw('Message-ID'); push @arg, '--in-reply-to='.squote_maybe(mid_clean($mid)); my $irt = mid_href($mid); + add_addrs(\$to, $cc, $ibx->{-primary_address}) unless defined($to); delete $cc->{$to}; if ($obfs) { my $arg_to = $to; @@ -85,7 +80,6 @@ sub mailto_arg_link { # no $subj for $href below } else { push @arg, "--to=$to"; - $to = uri_escape_utf8($to); $subj = uri_escape_utf8($subj); } my @cc = sort values %$cc; @@ -108,6 +102,10 @@ sub mailto_arg_link { # anyways. return (\@arg, '', $reply_to_all) if $obfs; + # keep `@' instead of using `%40' for RFC 6068 + utf8::encode($to); + $to =~ s!([^A-Za-z0-9\-\._~\@])!$URI::Escape::escapes{$1}!ge; + # order matters, Subject is the least important header, # so it is last in case it's lost/truncated in a copy+paste my $href = "mailto:$to?In-Reply-To=$irt${cc}&Subject=$subj"; diff --git a/lib/PublicInbox/RepoAtom.pm b/lib/PublicInbox/RepoAtom.pm new file mode 100644 index 00000000..eb0ed3c7 --- /dev/null +++ b/lib/PublicInbox/RepoAtom.pm @@ -0,0 +1,126 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# git log => Atom feed (cgit-compatible: $REPO/atom/[PATH]?h=$tip +package PublicInbox::RepoAtom; +use v5.12; +use parent qw(PublicInbox::GzipFilter); +use POSIX qw(strftime); +use URI::Escape qw(uri_escape); +use Scalar::Util (); +use PublicInbox::Hval qw(ascii_html utf8_maybe); + +# git for-each-ref and log use different format fields :< +my $ATOM_FMT = '--pretty=tformat:'.join('%n', + map { "%$_" } qw(H ct an ae at s b)).'%x00'; + +my $EACH_REF_FMT = '--format='.join(';', map { "\$r{'$_'}=%($_)" } qw( + objectname refname:short creator contents:subject contents:body + *subject *body)).'%00'; + +sub atom_ok { # parse_hdr for qspawn + my ($r, $bref, $ctx) = @_; + return [ 404, [], [ "Not Found\n"] ] if $r == 0; + bless $ctx, __PACKAGE__; + my $h = [ 'Content-Type' => 'application/atom+xml; charset=UTF-8' ]; + $ctx->{gz} = $ctx->can('gz_or_noop')->($h, $ctx->{env}); + my $title = ascii_html(delete $ctx->{-feed_title}); + my $desc = ascii_html($ctx->{git}->description); + my $url = ascii_html($ctx->{git}->base_url($ctx->{env})); + $ctx->{-base_url} = $url; + $ctx->zmore(<<EOM); +<?xml version="1.0"?> +<feed xmlns="http://www.w3.org/2005/Atom"> +<title>$title</title><subtitle>$desc</subtitle><link +rel="alternate" type="text/html" href="$url"/> +EOM + [ 200, $h, $ctx ]; # [2] is qspawn.filter +} + +# called by GzipFilter->close +sub zflush { $_[0]->SUPER::zflush('</feed>') } + +# called by GzipFilter->write or GetlineResponse->getline +sub translate { + my $self = shift; + $_[0] // return zflush($self); # getline caller + my @out; + my $lbuf = delete($self->{lbuf}) // shift; + $lbuf .= shift while @_; + my $is_tag = $self->{-is_tag}; + my ($H, $ct, $an, $ae, $at, $s, $bdy); + while ($lbuf =~ s/\A([^\0]+)\0\n//s) { + utf8_maybe($bdy = $1); + if ($is_tag) { + my %r; + eval "$bdy"; + for (qw(contents:subject contents:body)) { + $r{$_} =~ /\S/ or delete($r{$_}) + } + $H = $r{objectname}; + $s = $r{'contents:subject'} // $r{'*subject'}; + $bdy = $r{'contents:body'} // $r{'*body'}; + $s .= " ($r{'refname:short'})"; + $_ = ascii_html($_) for ($s, $bdy, $r{creator}); + ($an, $ae, $at) = split(/\s*&[gl]t;\s*/, $r{creator}); + $at =~ s/ .*\z//; # no TZ + $ct = $at = strftime('%Y-%m-%dT%H:%M:%SZ', gmtime($at)); + } else { + $bdy = ascii_html($bdy); + ($H, $ct, $an, $ae, $at, $s, $bdy) = + split(/\n/, $bdy, 7); + $at = strftime('%Y-%m-%dT%H:%M:%SZ', gmtime($at)); + $ct = strftime('%Y-%m-%dT%H:%M:%SZ', gmtime($ct)); + } + $bdy //= ''; + push @out, <<""; +<entry><title>$s</title><updated>$ct</updated><author><name>$an</name> +<email>$ae</email></author><published>$at</published><link +rel="alternate" type="text/html" href="$self->{-base_url}$H/s/" +/><id>$H</id> + + push @out, <<'', $bdy, '</pre></div></content>' if $bdy ne ''; +<content type="xhtml"><div +xmlns="http://www.w3.org/1999/xhtml"><pre style="white-space:pre-wrap"> + + push @out, '</entry>'; + } + $self->{lbuf} = $lbuf; + chomp @out; + @out ? $self->SUPER::translate(@out) : ''; # not EOF, yet +} + +# $REPO/tags.atom endpoint +sub srv_tags_atom { + my ($ctx) = @_; + my $max = 50; # TODO configurable + my $cmd = $ctx->{git}->cmd(qw(for-each-ref --sort=-creatordate), + "--count=$max", '--perl', $EACH_REF_FMT, 'refs/tags'); + $ctx->{-feed_title} = "$ctx->{git}->{nick} tags"; + my $qsp = PublicInbox::Qspawn->new($cmd); + $ctx->{-is_tag} = 1; + $qsp->psgi_yield($ctx->{env}, undef, \&atom_ok, $ctx); +} + +sub srv_atom { + my ($ctx, $path) = @_; + return if index($path, '//') >= 0 || index($path, '/') == 0; + my $max = 50; # TODO configurable + my $cmd = $ctx->{git}->cmd(qw(log --no-notes --no-color --no-abbrev), + $ATOM_FMT, "-$max"); + my $tip = $ctx->{qp}->{h}; # same as cgit + $ctx->{-feed_title} = $ctx->{git}->{nick}; + $ctx->{-feed_title} .= " $path" if $path ne ''; + if (defined($tip)) { + push @$cmd, $tip; + $ctx->{-feed_title} .= ", $tip"; + } + # else: let git decide based on HEAD if $tip isn't defined + push @$cmd, '--'; + push @$cmd, $path if $path ne ''; + my $qsp = PublicInbox::Qspawn->new($cmd, undef, + { quiet => 1, 2 => $ctx->{lh} }); + $qsp->psgi_yield($ctx->{env}, undef, \&atom_ok, $ctx); +} + +1; diff --git a/lib/PublicInbox/RepoList.pm b/lib/PublicInbox/RepoList.pm new file mode 100644 index 00000000..39dc9c0b --- /dev/null +++ b/lib/PublicInbox/RepoList.pm @@ -0,0 +1,39 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +package PublicInbox::RepoList; +use v5.12; +use parent qw(PublicInbox::WwwStream); +use PublicInbox::Hval qw(ascii_html prurl fmt_ts); +require PublicInbox::CodeSearch; + +sub html_top_fallback { # WwwStream->html_repo_top + my ($ctx) = @_; + my $title = delete($ctx->{-title_html}) // + ascii_html("$ctx->{env}->{PATH_INFO}*"); + my $upfx = $ctx->{-upfx} // ''; + "<html><head><title>$title</title>" . + $ctx->{www}->style($upfx) . '</head><body>'; +} + +sub html ($$$) { + my ($wcr, $ctx, $re) = @_; + my $cr = $wcr->{pi_cfg}->{-coderepos}; + my @nicks = grep(m!$re!, keys %$cr) or return; # 404 + __PACKAGE__->html_init($ctx); + my $zfh = $ctx->zfh; + print $zfh "<pre>matching coderepos\n"; + my @recs = PublicInbox::CodeSearch::repos_sorted($wcr->{pi_cfg}, + @$cr{@nicks}); + my $env = $ctx->{env}; + for (@recs) { + my ($t, $git) = @$_; + my $nick = ascii_html("$git->{nick}"); + for my $u ($git->pub_urls($env)) { + $u = prurl($env, $u); + print $zfh "\n".fmt_ts($t).qq{ <a\nhref="$u">$nick</a>} + } + } + $ctx->html_done('</pre>'); +} + +1; diff --git a/lib/PublicInbox/RepoSnapshot.pm b/lib/PublicInbox/RepoSnapshot.pm new file mode 100644 index 00000000..bff97bc8 --- /dev/null +++ b/lib/PublicInbox/RepoSnapshot.pm @@ -0,0 +1,85 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# cgit-compatible /snapshot/ endpoint for WWW coderepos +package PublicInbox::RepoSnapshot; +use v5.12; +use PublicInbox::Qspawn; +use PublicInbox::ViewVCS; +use PublicInbox::WwwStatic qw(r); + +# Not using standard mime types since the compressed tarballs are +# special or do not match my /etc/mime.types. Choose what gitweb +# and cgit agree on for compatibility. +our %FMT_TYPES = ( + 'tar' => 'application/x-tar', + 'tar.gz' => 'application/x-gzip', + 'tar.bz2' => 'application/x-bzip2', + 'tar.xz' => 'application/x-xz', + 'zip' => 'application/x-zip', +); + +our %FMT_CFG = ( + 'tar.xz' => 'xz -c', + 'tar.bz2' => 'bzip2 -c', + # not supporting lz nor zstd for now to avoid format proliferation + # and increased cache overhead required to handle extra formats. +); + +my $SUFFIX = join('|', map { quotemeta } keys %FMT_TYPES); + +# TODO deal with tagged blobs + +sub archive_hdr { # parse_hdr for Qspawn + my ($r, $bref, $ctx) = @_; + $r or return [500, [qw(Content-Type text/plain Content-Length 0)], []]; + my $fn = "$ctx->{snap_pfx}.$ctx->{snap_fmt}"; + my $type = $FMT_TYPES{$ctx->{snap_fmt}} // + die "BUG: bad fmt: $ctx->{snap_fmt}"; + [ 200, [ 'Content-Type', "$type; charset=UTF-8", + 'Content-Disposition', qq(inline; filename="$fn"), + 'ETag', qq("$ctx->{etag}") ] ]; +} + +sub ver_check { # git->check_async callback + my (undef, $oid, $type, $size, $ctx) = @_; + return if defined $ctx->{etag}; + my $treeish = shift @{$ctx->{-try}} // die 'BUG: no {-try}'; + if ($type eq 'missing') { + scalar(@{$ctx->{-try}}) or + delete($ctx->{env}->{'qspawn.wcb'})->(r(404)); + } else { # found, done: + $ctx->{etag} = $oid; + my $cmd = $ctx->{git}->cmd; + if (my $cmd = $FMT_CFG{$ctx->{snap_fmt}}) { + push @$cmd, '-c', "tar.$ctx->{snap_fmt}.command=$cmd"; + } + push @$cmd, 'archive', "--prefix=$ctx->{snap_pfx}/", + "--format=$ctx->{snap_fmt}", $treeish; + my $qsp = PublicInbox::Qspawn->new($cmd, undef, { quiet => 1 }); + $qsp->psgi_yield($ctx->{env}, undef, \&archive_hdr, $ctx); + } +} + +sub srv { + my ($ctx, $fn) = @_; + return if $fn =~ /["\s]/s; + my $fmt = $ctx->{wcr}->{snapshots}; # TODO per-repo snapshots + $fn =~ s/\.($SUFFIX)\z//o and $fmt->{$1} or return; + $ctx->{snap_fmt} = $1; + my $pfx = $ctx->{git}->local_nick // return; + $pfx =~ s/(?:\.git)?\z/-/; + ($pfx) = ($pfx =~ m!([^/]+)\z!); + substr($fn, 0, length($pfx)) eq $pfx or return; + $ctx->{snap_pfx} = $fn; + my $v = $ctx->{snap_ver} = substr($fn, length($pfx), length($fn)); + # try without [vV] prefix, first + my @try = map { "$_$v" } ('', 'v', 'V'); # cf. cgit:ui-snapshot.c + @{$ctx->{-try}} = @try; + sub { + $ctx->{env}->{'qspawn.wcb'} = $_[0]; + PublicInbox::ViewVCS::do_check_async($ctx, \&ver_check, @try); + } +} + +1; diff --git a/lib/PublicInbox/RepoTree.pm b/lib/PublicInbox/RepoTree.pm new file mode 100644 index 00000000..4c85f9a8 --- /dev/null +++ b/lib/PublicInbox/RepoTree.pm @@ -0,0 +1,99 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# cgit-compatible $REPO/tree/[PATH]?h=$tip redirector +package PublicInbox::RepoTree; +use v5.12; +use PublicInbox::ViewDiff qw(uri_escape_path); +use PublicInbox::WwwStatic qw(r); +use PublicInbox::Qspawn; +use PublicInbox::WwwStream qw(html_oneshot); +use PublicInbox::Hval qw(ascii_html utf8_maybe); + +sub rd_404_log { + my ($bref, $ctx) = @_; + my $path = $ctx->{-q_value_html} = ascii_html($ctx->{-path}); + my $tip = 'HEAD'; + $tip = ascii_html($ctx->{qp}->{h}) if defined($ctx->{qp}->{h}); + PublicInbox::WwwStream::html_init($ctx); + my $zfh = $ctx->{zfh}; + print $zfh "<pre>\$ git log -1 $tip -- $path\n"; + my $code = 200; + if ($$bref eq '') { + say $zfh "found no record of `$path' in git history in `$tip'"; + $ctx->{-has_srch} and + say $zfh 'perhaps try searching mail (above)'; + $code = 404; + } else { + my ($H, $h, $s_as) = split(/ /, $$bref, 3); + utf8_maybe($s_as); + my $x = uri_escape_path($ctx->{-path}); + $s_as = ascii_html($s_as); + print $zfh <<EOM; +found last record of `$path' in the following commit: + +<a href="$ctx->{-upfx}$H/s/?b=$x">$h</a> $s_as +EOM + } + my $res = $ctx->html_done; + $res->[0] = $code; + delete($ctx->{-wcb})->($res); +} + +sub find_missing { + my ($ctx) = @_; + if ($ctx->{-path} eq '') { + my $tip = 'HEAD'; + $tip = ascii_html($ctx->{qp}->{h}) if defined($ctx->{qp}->{h}); + PublicInbox::WwwStream::html_init($ctx); + print { $ctx->{zfh} } "<pre>`$tip' ref not found</pre>"; + my $res = $ctx->html_done; + $res->[0] = 404; + return delete($ctx->{-wcb})->($res); + } + my $cmd = $ctx->{git}->cmd(qw(log --no-color -1), + '--pretty=%H %h %s (%as)'); + push @$cmd, $ctx->{qp}->{h} if defined($ctx->{qp}->{h}); + push @$cmd, '--'; + push @$cmd, $ctx->{-path}; + my $qsp = PublicInbox::Qspawn->new($cmd, undef, + { quiet => 1, 2 => $ctx->{lh} }); + $qsp->psgi_qx($ctx->{env}, undef, \&rd_404_log, $ctx); +} + +sub tree_show { # git check_async callback + my (undef, $oid, $type, $size, $ctx) = @_; + return find_missing($ctx) if $type eq 'missing'; + + my $res = [ $ctx->{git}, $oid, $type, $size ]; + my ($bn) = ($ctx->{-path} =~ m!/?([^/]+)\z!); + if ($type eq 'blob') { + my $obj = ascii_html($ctx->{-obj}); + $ctx->{-q_value_html} = 'dfn:'.ascii_html($ctx->{-path}) . + ' dfpost:'.substr($oid, 0, 7); + $ctx->{-paths} = [ $bn, qq[(<a +href="$ctx->{-upfx}$oid/s/$bn">raw</a>) +\$ git show $obj\t# shows this blob on the CLI] ]; + } + PublicInbox::ViewVCS::solve_result($res, $ctx); +} + +sub srv_tree { + my ($ctx, $path) = @_; + return if index($path, '//') >= 0 || index($path, '/') == 0; + my $tip = $ctx->{qp}->{h} // 'HEAD'; + $ctx->{-upfx} = '../' x (($path =~ tr!/!/!) + 1); + $path =~ s!/\z!!; + my $obj = $ctx->{-obj} = "$tip:$path"; + $ctx->{-path} = $path; + + # "\n" breaks with `git cat-file --batch-check', and there's no + # legitimate use of "\n" in filenames anyways. + return if index($obj, "\n") >= 0; + sub { + $ctx->{-wcb} = $_[0]; # HTTP::{Chunked,Identity} + PublicInbox::ViewVCS::do_check_async($ctx, \&tree_show, $obj); + }; +} + +1; diff --git a/lib/PublicInbox/SHA.pm b/lib/PublicInbox/SHA.pm new file mode 100644 index 00000000..3fa8530e --- /dev/null +++ b/lib/PublicInbox/SHA.pm @@ -0,0 +1,67 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# OpenSSL exception added in commit 22711f81f4e79da6b796820e37803a05cae14645 +# (README: add OpenSSL exception, 2015-10-05) + +# Replaces most uses of Digest::SHA with OpenSSL via Net::SSLeay if +# possible. OpenSSL SHA-256 is nearly twice as fast as Digest::SHA on +# x86-64, and SHA-1 is a bit faster as well. +# I don't think we can implement Digest::SHA->clone with what Net::SSLeay +# gives us... (maybe EVP_MD_CTX_copy+EVP_MD_CTX_copy_ex need to be added +# to Net::SSLeay?) +package PublicInbox::SHA; +use v5.12; +require Exporter; +our @EXPORT_OK = qw(sha1_hex sha256_hex sha256 sha_all); +use autodie qw(sysread); +our @ISA; + +BEGIN { + push @ISA, 'Exporter'; + unless (eval(<<'EOM')) { +use Net::SSLeay 1.43; +my %SHA = ( + 1 => Net::SSLeay::EVP_sha1(), + 256 => Net::SSLeay::EVP_sha256(), +); + +sub new { + my ($cls, $n) = @_; + my $mdctx = Net::SSLeay::EVP_MD_CTX_create(); + Net::SSLeay::EVP_DigestInit($mdctx, $SHA{$n}) or + die "EVP_DigestInit $n: $!"; + bless \$mdctx, $cls; +} + +sub add { + my $self = shift; + Net::SSLeay::EVP_DigestUpdate($$self, $_) for @_; + $self; +} + +sub digest { Net::SSLeay::EVP_DigestFinal(${$_[0]}) }; +sub hexdigest { unpack('H*', Net::SSLeay::EVP_DigestFinal(${$_[0]})) } +sub DESTROY { Net::SSLeay::EVP_MD_CTX_destroy(${$_[0]}) }; + +sub sha1_hex { unpack('H*', Net::SSLeay::SHA1($_[0])) }; +sub sha256_hex { unpack('H*', Net::SSLeay::SHA256($_[0])) }; +*sha256 = \&Net::SSLeay::SHA256; +# end of eval +EOM + require Digest::SHA; # stdlib fallback + push @ISA, 'Digest::SHA'; + *sha1_hex = \&Digest::SHA::sha1_hex; + *sha256_hex = \&Digest::SHA::sha256_hex; + *sha256 = \&Digest::SHA::sha256; +} + +} # /BEGIN + +sub sha_all ($$) { + my ($n, $fh) = @_; + my ($dig, $buf) = (PublicInbox::SHA->new($n)); + while (sysread($fh, $buf, 65536)) { $dig->add($buf) } + $dig +} + +1; diff --git a/lib/PublicInbox/SaPlugin/ListMirror.pm b/lib/PublicInbox/SaPlugin/ListMirror.pm index a2a54944..06903cad 100644 --- a/lib/PublicInbox/SaPlugin/ListMirror.pm +++ b/lib/PublicInbox/SaPlugin/ListMirror.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # SpamAssassin rules useful for running a mailing list mirror. We want to: @@ -39,7 +39,11 @@ sub check_list_mirror_received { my $v = $pms->get($hdr) or next; local $/ = "\n"; chomp $v; - next if $v ne $hval; + if (ref($hval)) { + next if $v !~ $hval; + } else { + next if $v ne $hval; + } return 1 if $recvd !~ $host_re; } @@ -91,6 +95,8 @@ sub config_list_mirror { $host_glob =~ s!(.)!$patmap{$1} || "\Q$1"!ge; my $host_re = qr/\A\s*from\s+$host_glob(?:\s|$)/si; + (lc($hdr) eq 'list-id' && $hval =~ /<([^>]+)>/) and + $hval = qr/\A<\Q$1\E>\z/; push @{$self->{list_mirror_check}}, [ $hdr, $hval, $host_re, $addr ]; } diff --git a/lib/PublicInbox/SaPlugin/ListMirror.pod b/lib/PublicInbox/SaPlugin/ListMirror.pod index 3c4ec8c1..e6a6c2ad 100644 --- a/lib/PublicInbox/SaPlugin/ListMirror.pod +++ b/lib/PublicInbox/SaPlugin/ListMirror.pod @@ -6,11 +6,11 @@ PublicInbox::SaPlugin::ListMirror - SpamAssassin plugin for mailing list mirrors loadplugin PublicInbox::SaPlugin::ListMirror -Declare some mailing lists based on the expected List-Id value, +Declare some mailing lists based on the expected List-ID value, expected servers, and mailing list address: - list_mirror List-Id <foo.example.com> *.example.com foo@example.com - list_mirror List-Id <bar.example.com> *.example.com bar@example.com + list_mirror List-ID <foo.example.com> *.example.com foo@example.com + list_mirror List-ID <bar.example.com> *.example.com bar@example.com Bump the score for messages which come from unexpected servers: @@ -43,14 +43,25 @@ C<allow_user_rules 1> =item list_mirror HEADER HEADER_VALUE HOSTNAME_GLOB [LIST_ADDRESS] -Declare a list based on an expected C<HEADER> matching C<HEADER_NAME> -exactly coming from C<HOSTNAME_GLOB>. C<LIST_ADDRESS> is optional, +Declare a list based on an expected C<HEADER> matching C<HEADER_VALUE> +coming from C<HOSTNAME_GLOB>. C<LIST_ADDRESS> is optional, but may specify the address of the mailing list being mirrored. -C<List-Id> or C<X-Mailing-List> are common values of C<HEADER> +C<List-ID> is the recommended value of C<HEADER> as most +mailing lists support it. An example of C<HEADER_VALUE> is C<E<lt>foo.example.orgE<gt>> -if C<HEADER> is C<List-Id>. +if C<HEADER> is C<List-ID>. + +As of public-inbox 2.0, using C<List-ID> as the C<HEADER> and a +C<HEADER_VALUE> contained by angle brackets (E<lt>list-idE<gt>), +matching is done in accordance with +L<RFC 2919|https://tools.ietf.org/html/rfc2919>. That is, +C<HEADER_VALUE> will be a case-insensitive substring match +and ignore the optional description C<phrase> as documented +in RFC 2919. + +All other C<HEADER> values use exact matches for backwards-compatibility. C<HOSTNAME_GLOB> may be a wildcard match for machines where mail can come from or an exact match. @@ -101,11 +112,11 @@ This rule allows users to assign a score to Bcc-ed messages Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> The mail archives are hosted at L<https://public-inbox.org/meta/> -and L<http://hjrcffqmbrq6wope.onion/meta/> +and L<http://4uok3hntl7oi7b4uf4rtfwefqeexfzil2w6kgk2jn5z2f764irre7byd.onion/meta/> =head1 COPYRIGHT -Copyright (C) 2016-2020 all contributors L<mailto:meta@public-inbox.org> +Copyright (C) all contributors L<mailto:meta@public-inbox.org> License: AGPL-3.0+ L<http://www.gnu.org/licenses/agpl-3.0.txt> diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 0321ca93..e5c5d6ab 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -1,20 +1,24 @@ -# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # based on notmuch, but with no concept of folders, files or flags # # Read-only search interface for use by the web and NNTP interfaces package PublicInbox::Search; use strict; +use v5.10.1; use parent qw(Exporter); -our @EXPORT_OK = qw(mdocid); +our @EXPORT_OK = qw(retry_reopen int_val get_pct xap_terms); use List::Util qw(max); +use POSIX qw(strftime); +use Carp (); +our $XHC = 0; # defined but false # values for searching, changing the numeric value breaks # compatibility with old indices (so don't change them it) use constant { - TS => 0, # Received: header in Unix time (IMAP INTERNALDATE) - YYYYMMDD => 1, # Date: header for searching in the WWW UI - DT => 2, # Date: YYYYMMDDHHMMSS + TS => 0, # Received: in Unix time (IMAP INTERNALDATE, JMAP receivedAt) + YYYYMMDD => 1, # redundant with DT below + DT => 2, # Date: YYYYMMDDHHMMSS (IMAP SENT*, JMAP sentAt) # added for public-inbox 1.6.0+ BYTES => 3, # IMAP RFC822.SIZE @@ -53,19 +57,48 @@ use constant { }; use PublicInbox::Smsg; -use PublicInbox::Over; -my $QP_FLAGS; -our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem); -our $Xap; # 'Search::Xapian' or 'Xapian' -my $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') -my $ENQ_ASCENDING; +eval { require PublicInbox::Over }; +our $QP_FLAGS; +our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem Query); +our $Xap; # 'Xapian' or 'Search::Xapian' +our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') + +# ENQ_DESCENDING and ENQ_ASCENDING weren't in SWIG Xapian.pm prior to 1.4.16, +# let's hope the ABI is stable +our $ENQ_DESCENDING = 0; +our $ENQ_ASCENDING = 1; +our @MAIL_VMAP = ( + [ YYYYMMDD, 'd:'], + [ TS, 'rt:' ], + # these are undocumented for WWW, but lei and IMAP use them + [ DT, 'dt:' ], + [ BYTES, 'z:' ], + [ UID, 'uid:' ] +); +our @MAIL_NRP; + +# Getopt::Long spec, only short options for portability in C++ implementation +our @XH_SPEC = ( + 'a', # ascending sort + 'c', # code search + 'd=s@', # shard dirs + 'g=s', # git dir (with -c) + 'k=i', # sort column (like sort(1)) + 'm=i', # maximum number of results + 'o=i', # offset + 'r', # 1=relevance then column + 't', # collapse threads + 'A=s@', # prefixes + 'K=i', # timeout kill after i seconds + 'O=s', # eidx_key + 'T=i', # threadid + 'Q=s@', # query prefixes "$user_prefix[:=]$XPREFIX" +); sub load_xapian () { return 1 if defined $Xap; - # n.b. PI_XAPIAN is intended for development use only. We still - # favor Search::Xapian since that's what's available in current - # Debian stable (10.x) and derived distros. - for my $x (($ENV{PI_XAPIAN} // 'Search::Xapian'), 'Xapian') { + # n.b. PI_XAPIAN is intended for development use only + for my $x (($ENV{PI_XAPIAN} // 'Xapian'), 'Search::Xapian') { eval "require $x"; next if $@; @@ -78,18 +111,13 @@ sub load_xapian () { # NumberRangeProcessor was added in Xapian 1.3.6, # NumberValueRangeProcessor was removed for 1.5.0+, - # favor the older /Value/ variant since that's what our - # (currently) preferred Search::Xapian supports + # continue with the older /Value/ variant for now... $NVRP = $x.'::'.($x eq 'Xapian' && $xver ge v1.5 ? 'NumberRangeProcessor' : 'NumberValueRangeProcessor'); $X{$_} = $Xap.'::'.$_ for (keys %X); - # ENQ_ASCENDING doesn't seem exported by SWIG Xapian.pm, - # so lets hope this part of the ABI is stable because it's - # just an integer: - $ENQ_ASCENDING = $x eq 'Xapian' ? - 1 : Search::Xapian::ENQ_ASCENDING(); - + *sortable_serialise = $x.'::sortable_serialise'; + *sortable_unserialise = $x.'::sortable_unserialise'; # n.b. FLAG_PURE_NOT is expensive not suitable for a public # website as it could become a denial-of-service vector # FLAG_PHRASE also seems to cause performance problems chert @@ -98,6 +126,7 @@ sub load_xapian () { # or make indexlevel=medium as default $QP_FLAGS = FLAG_PHRASE() | FLAG_BOOLEAN() | FLAG_LOVEHATE() | FLAG_WILDCARD(); + @MAIL_NRP = map { $NVRP->new(@$_) } @MAIL_VMAP; return 1; } undef; @@ -107,42 +136,50 @@ sub load_xapian () { # a prefix common in patch emails our $LANG = 'english'; +our %PATCH_BOOL_COMMON = ( + dfpre => 'XDFPRE', + dfpost => 'XDFPOST', + dfblob => 'XDFPRE XDFPOST', + patchid => 'XDFID', +); + # note: the non-X term prefix allocations are shared with # Xapian omega, see xapian-applications/omega/docs/termprefixes.rst my %bool_pfx_external = ( mid => 'Q', # Message-ID (full/exact), this is mostly uniQue lid => 'G', # newsGroup (or similar entity), just inside <> - dfpre => 'XDFPRE', - dfpost => 'XDFPOST', - dfblob => 'XDFPRE XDFPOST', + %PATCH_BOOL_COMMON ); -my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST'; -my %prob_prefix = ( - # for mairix compatibility +# for mairix compatibility +our $NON_QUOTED_BODY = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID'; +our %PATCH_PROB_COMMON = ( s => 'S', - m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial - l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial f => 'A', - t => 'XTO', - tc => 'XTO XCC', - c => 'XCC', - tcf => 'XTO XCC A', - a => 'XTO XCC A', - b => $non_quoted_body . ' XQUOT', - bs => $non_quoted_body . ' XQUOT S', + b => $NON_QUOTED_BODY . ' XQUOT', + bs => $NON_QUOTED_BODY . ' XQUOT S', n => 'XFN', q => 'XQUOT', - nq => $non_quoted_body, + nq => $NON_QUOTED_BODY, dfn => 'XDFN', dfa => 'XDFA', dfb => 'XDFB', dfhh => 'XDFHH', dfctx => 'XDFCTX', +); +my %prob_prefix = ( + m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial + l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial + t => 'XTO', + tc => 'XTO XCC', + c => 'XCC', + tcf => 'XTO XCC A', + a => 'XTO XCC A', + %PATCH_PROB_COMMON, # default: - '' => 'XM S A XQUOT XFN ' . $non_quoted_body, + '' => 'XM S A XQUOT XFN ' . $NON_QUOTED_BODY, ); # not documenting m: and mid: for now, the using the URLs works w/o Xapian @@ -152,12 +189,9 @@ my %prob_prefix = ( our @HELP = ( 's:' => 'match within Subject e.g. s:"a quick brown fox"', 'd:' => <<EOF, -date range as YYYYMMDD e.g. d:19931002..20101002 -Open-ended ranges such as d:19931002.. and d:..20101002 -are also supported -EOF - 'dt:' => <<EOF, -date-time range as YYYYMMDDhhmmss (e.g. dt:19931002011000..19931002011200) +match date-time range, git "approxidate" formats supported +Open-ended ranges such as `d:last.week..' and +`d:..2.days.ago' are supported EOF 'b:' => 'match within message body, including text attachments', 'nq:' => 'match non-quoted text within message body', @@ -178,6 +212,10 @@ EOF 'dfpre:' => 'match pre-image git blob ID', 'dfpost:' => 'match post-image git blob ID', 'dfblob:' => 'match either pre or post-image git blob ID', + 'patchid:' => "match `git patch-id --stable' output", + 'rt:' => <<EOF, +match received time, like `d:' if sender's clock was correct +EOF ); chomp @HELP; @@ -185,43 +223,38 @@ sub xdir ($;$) { my ($self, $rdonly) = @_; if ($rdonly || !defined($self->{shard})) { $self->{xpfx}; - } else { # v2 only: + } else { # v2, extindex, cindex only: "$self->{xpfx}/$self->{shard}"; } } -sub _xdb ($) { +# returns shard directories as an array of strings, does not verify existence +sub shard_dirs ($) { my ($self) = @_; - my $dir = xdir($self, 1); - my ($xdb, $slow_phrase); - my $qpf = \($self->{qp_flags} ||= $QP_FLAGS); - if ($self->{ibx_ver} >= 2) { - my @xdb; - opendir(my $dh, $dir) or return; # not initialized yet - + my $xpfx = $self->{xpfx}; + if ($xpfx =~ m!/xapian[0-9]+\z!) { # v1 inbox + ($xpfx); + } else { # v2 inbox, eidx, cidx + opendir(my $dh, $xpfx) or return (); # not initialized yet # We need numeric sorting so shard[0] is first for reading # Xapian metadata, if needed - my $last = max(grep(/\A[0-9]+\z/, readdir($dh))); - return if !defined($last); - for (0..$last) { - my $shard_dir = "$dir/$_"; - if (-d $shard_dir && -r _) { - push @xdb, $X{Database}->new($shard_dir); - $slow_phrase ||= -f "$shard_dir/iamchert"; - } else { # gaps from missing epochs throw off mdocid() - warn "E: $shard_dir missing or unreadable\n"; - return; - } - } - $self->{nshard} = scalar(@xdb); - $xdb = shift @xdb; - $xdb->add_database($_) for @xdb; - } else { - $slow_phrase = -f "$dir/iamchert"; - $xdb = $X{Database}->new($dir); + my $last = max(grep(/\A[0-9]+\z/, readdir($dh))) // return (); + map { "$xpfx/$_" } (0..$last); } - $$qpf |= FLAG_PHRASE() unless $slow_phrase; - $xdb; +} + +# returns all shards as separate Xapian::Database objects w/o combining +sub xdb_shards_flat ($) { + my ($self) = @_; + load_xapian(); + $self->{qp_flags} //= $QP_FLAGS; + my $slow_phrase; + my @xdb = map { + $slow_phrase ||= -f "$_/iamchert"; + $X{Database}->new($_); # raises if missing + } shard_dirs($self); + $self->{qp_flags} |= FLAG_PHRASE() if !$slow_phrase; + @xdb; } # v2 Xapian docids don't conflict, so they're identical to @@ -233,38 +266,36 @@ sub mdocid { int(($docid - 1) / $nshard) + 1; } +sub docids_to_artnums { + my $nshard = shift->{nshard}; + # XXX does array vs arrayref make a difference in modern Perls? + map { int(($_ - 1) / $nshard) + 1 } @_; +} + sub mset_to_artnums { my ($self, $mset) = @_; - my $nshard = $self->{nshard} // 1; + my $nshard = $self->{nshard}; [ map { mdocid($nshard, $_) } $mset->items ]; } sub xdb ($) { my ($self) = @_; - $self->{xdb} ||= do { - load_xapian(); - _xdb($self); + $self->{xdb} // do { + my @xdb = $self->xdb_shards_flat or return; + $self->{nshard} = scalar(@xdb); + my $xdb = shift @xdb; + $xdb->add_database($_) for @xdb; + $self->{xdb} = $xdb; }; } -sub xpfx_init ($) { - my ($self) = @_; - if ($self->{ibx_ver} == 1) { - $self->{xpfx} .= '/public-inbox/xapian' . SCHEMA_VERSION; - } else { - $self->{xpfx} .= '/xap'.SCHEMA_VERSION; - } -} - sub new { my ($class, $ibx) = @_; ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx"; - my $self = bless { - xpfx => $ibx->{inboxdir}, # for xpfx_init - altid => $ibx->{altid}, - ibx_ver => $ibx->version, - }, $class; - xpfx_init($self); + my $xap = $ibx->version > 1 ? 'xap' : 'public-inbox/xapian'; + my $xpfx = "$ibx->{inboxdir}/$xap".SCHEMA_VERSION; + my $self = bless { xpfx => $xpfx }, $class; + $self->{altid} = $ibx->{altid} if defined($ibx->{altid}); $self; } @@ -276,46 +307,237 @@ sub reopen { $self; # make chaining easier } -# read-only +# Convert git "approxidate" ranges to something usable with our +# Xapian indices. At the moment, Xapian only offers a C++-only API +# and neither the SWIG nor XS bindings allow us to use custom code +# to parse dates (and libgit2 doesn't expose git__date_parse, either, +# so we're running git-rev-parse(1)). +# This replaces things we need to send to $git->date_parse with +# "\0".$strftime_format.['+'|$idx]."\0" placeholders +sub date_parse_prepare { + my ($to_parse, $pfx, $range) = @_; + # are we inside a parenthesized statement? + my $end = $range =~ s/([\)\s]*)\z// ? $1 : ''; + my @r = split(/\.\./, $range, 2); + + # expand "dt:2010-10-02" => "dt:2010-10-02..2010-10-03" and like + # n.b. git doesn't do YYYYMMDD w/o '-', it needs YYYY-MM-DD + # We upgrade "d:" to "dt:" unconditionally + if ($pfx eq 'd') { + $pfx = 'dt'; + # upgrade YYYYMMDD to YYYYMMDDHHMMSS + $_ .= ' 00:00:00' for (grep(m!\A[0-9]{4}[^[:alnum:]] + [0-9]{2}[^[:alnum:]] + [0-9]{2}\z!x, @r)); + $_ .= '000000' for (grep(m!\A[0-9]{8}\z!, @r)); + } + if ($pfx eq 'dt') { + if (!defined($r[1])) { # git needs gaps and not /\d{14}/ + if ($r[0] =~ /\A([0-9]{4})([0-9]{2})([0-9]{2}) + ([0-9]{2})([0-9]{2})([0-9]{2})\z/x) { + push @$to_parse, "$1-$2-$3 $4:$5:$6"; + } else { + push @$to_parse, $r[0]; + } + $r[0] = "\0%Y%m%d%H%M%S$#$to_parse\0"; + $r[1] = "\0%Y%m%d%H%M%S+\0"; + } else { + for my $x (@r) { + next if $x eq '' || $x =~ /\A[0-9]{14}\z/; + push @$to_parse, $x; + $x = "\0%Y%m%d%H%M%S$#$to_parse\0"; + } + } + } else { # (rt|ct), let git interpret "YYYY", deal with Y10K later :P + for my $x (@r) { + next if $x eq '' || $x =~ /\A[0-9]{5,}\z/; + push @$to_parse, $x; + $x = "\0%s$#$to_parse\0"; + } + $r[1] //= "\0%s+\0"; # add 1 day + } + "$pfx:".join('..', @r).$end; +} + +sub date_parse_finalize { + my ($git, $to_parse) = @_; + # git-rev-parse can handle any number of args up to system + # limits (around (4096*32) bytes on Linux). + my @r = $git->date_parse(@$to_parse); + # n.b. git respects TZ, times stored in SQLite/Xapian are always UTC, + # and gmtime doesn't seem to do the right thing when TZ!=UTC + my ($i, $t); + $_[2] =~ s/\0(%[%YmdHMSs]+)([0-9\+]+)\0/ + $t = $2 eq '+' ? ($r[$i]+86400) : $r[$i=$2+0]; + $1 eq '%s' ? $t : strftime($1, gmtime($t))/sge; +} + +# n.b. argv never has NUL, though we'll need to filter it out +# if this $argv isn't from a command execution +sub query_argv_to_string { + my (undef, $git, $argv) = @_; + my $to_parse; + my $tmp = join(' ', map {; + if (s!\b(d|rt|dt):(\S+)\z!date_parse_prepare( + $to_parse //= [], $1, $2)!sge) { + $_; + } elsif (/\s/) { + s/(.*?)\b(\w+:)// ? qq{$1$2"$_"} : qq{"$_"}; + } else { + $_ + } + } @$argv); + date_parse_finalize($git, $to_parse, $tmp) if $to_parse; + $tmp +} + +# this is for the WWW "q=" query parameter and "lei q --stdin" +# it can't do d:"5 days ago", but it will do d:5.days.ago +sub query_approxidate { + my (undef, $git) = @_; # $_[2] = $query_string (modified in-place) + my $DQ = qq<"\x{201c}\x{201d}>; # Xapian can use curly quotes + $_[2] =~ tr/\x00/ /; # Xapian doesn't do NUL, we use it as a placeholder + my ($terms, $phrase, $to_parse); + $_[2] =~ s{([^$DQ]*)([$DQ][^$DQ]*[$DQ])?}{ + ($terms, $phrase) = ($1, $2); + $terms =~ s!\b(d|rt|dt):(\S+)! + date_parse_prepare($to_parse //= [], $1, $2)!sge; + $terms.($phrase // ''); + }sge; + date_parse_finalize($git, $to_parse, $_[2]) if $to_parse; +} + +# read-only, for mail only (codesearch has different rules) sub mset { - my ($self, $query_string, $opts) = @_; - $opts ||= {}; - my $qp = $self->{qp} //= qparse_new($self); - my $query = $qp->parse_query($query_string, $self->{qp_flags}); - $opts->{relevance} = 1 unless exists $opts->{relevance}; - _do_enquire($self, $query, $opts); + my ($self, $qry_str, $opt) = @_; + my $qp = $self->{qp} //= $self->qparse_new; + my $qry = $qp->parse_query($qry_str, $self->{qp_flags}); + if (defined(my $eidx_key = $opt->{eidx_key})) { + $qry = $X{Query}->new(OP_FILTER(), $qry, 'O'.$eidx_key); + } + if (defined(my $uid_range = $opt->{uid_range})) { + my $range = $X{Query}->new(OP_VALUE_RANGE(), UID, + sortable_serialise($uid_range->[0]), + sortable_serialise($uid_range->[1])); + $qry = $X{Query}->new(OP_FILTER(), $qry, $range); + } + if (defined(my $tid = $opt->{threadid})) { + $tid = sortable_serialise($tid); + $qry = $X{Query}->new(OP_FILTER(), $qry, + $X{Query}->new(OP_VALUE_RANGE(), THREADID, $tid, $tid)); + } + do_enquire($self, $qry, $opt, TS); +} + +sub xhc_start_maybe (@) { + require PublicInbox::XapClient; + my $xhc = PublicInbox::XapClient::start_helper(@_); + require PublicInbox::XhcMset if $xhc; + $xhc; +} + +sub xh_opt ($$) { + my ($self, $opt) = @_; + my $lim = $opt->{limit} || 50; + my @ret; + push @ret, '-o', $opt->{offset} if $opt->{offset}; + push @ret, '-m', $lim; + my $rel = $opt->{relevance} // 0; + if ($rel == -2) { # ORDER BY docid/UID (highest first) + push @ret, '-k', '-1'; + } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first) + push @ret, '-k', '-1'; + push @ret, '-a'; + } elsif ($rel == 0) { + push @ret, '-k', $opt->{sort_col} // TS; + push @ret, '-a' if $opt->{asc}; + } else { # rel > 0 + push @ret, '-r'; + push @ret, '-k', $opt->{sort_col} // TS; + push @ret, '-a' if $opt->{asc}; + } + push @ret, '-t' if $opt->{threads}; + push @ret, '-T', $opt->{threadid} if defined $opt->{threadid}; + push @ret, '-O', $opt->{eidx_key} if defined $opt->{eidx_key}; + my $apfx = $self->{-alt_pfx} //= do { + my @tmp; + for (grep /\Aserial:/, @{$self->{altid} // []}) { + my (undef, $pfx) = split /:/, $_; + push @tmp, '-Q', "$pfx=X\U$pfx"; + } + # TODO: arbitrary header indexing goes here + \@tmp; + }; + (@ret, @$apfx); +} + +# returns a true value if actually handled asynchronously, +# and a falsy value if handled synchronously +sub async_mset { + my ($self, $qry_str, $opt, $cb, @args) = @_; + if ($XHC) { # unconditionally retrieving pct + rank for now + xdb($self); # populate {nshards} + my @margs = ($self->xh_args, xh_opt($self, $opt)); + my $ret = eval { + my $rd = $XHC->mkreq(undef, 'mset', @margs, $qry_str); + PublicInbox::XhcMset->maybe_new($rd, $self, $cb, @args); + }; + $cb->(@args, undef, $@) if $@; + $ret; + } else { # synchronous + my $mset = $self->mset($qry_str, $opt); + $cb->(@args, $mset); + undef; + } +} + +sub do_enquire { # shared with CodeSearch + my ($self, $qry, $opt, $col) = @_; + my $enq = $X{Enquire}->new(xdb($self)); + $enq->set_query($qry); + my $rel = $opt->{relevance} // 0; + if ($rel == -2) { # ORDER BY docid/UID (highest first) + $enq->set_weighting_scheme($X{BoolWeight}->new); + $enq->set_docid_order($ENQ_DESCENDING); + } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first) + $enq->set_weighting_scheme($X{BoolWeight}->new); + $enq->set_docid_order($ENQ_ASCENDING); + } elsif ($rel == 0) { + $enq->set_sort_by_value_then_relevance($col, !$opt->{asc}); + } else { # rel > 0 + $enq->set_sort_by_relevance_then_value($col, !$opt->{asc}); + } + + # `lei q -t / --threads' or JMAP collapseThreads; but don't collapse + # on `-tt' ({threads} > 1) which sets the Flagged|Important keyword + (($opt->{threads} // 0) == 1 && has_threadid($self)) and + $enq->set_collapse_key(THREADID); + retry_reopen($self, \&enquire_once, $enq, + $opt->{offset} || 0, $opt->{limit} || 50); } sub retry_reopen { - my ($self, $cb, $arg) = @_; + my ($self, $cb, @arg) = @_; for my $i (1..10) { if (wantarray) { - my @ret; - eval { @ret = $cb->($arg) }; + my @ret = eval { $cb->($self, @arg) }; return @ret unless $@; } else { - my $ret; - eval { $ret = $cb->($arg) }; + my $ret = eval { $cb->($self, @arg) }; return $ret unless $@; } # Exception: The revision being read has been discarded - # you should call Xapian::Database::reopen() if (ref($@) =~ /\bDatabaseModifiedError\b/) { - warn "reopen try #$i on $@\n"; reopen($self); } else { # let caller decide how to spew, because ExtMsg queries # get wonky and trigger: # "something terrible happened at .../Xapian/Enquire.pm" - die; + Carp::croak($@); } } - die "Too many Xapian database modifications in progress\n"; -} - -sub _do_enquire { - my ($self, $query, $opts) = @_; - retry_reopen($self, \&_enquire_once, [ $self, $query, $opts ]); + Carp::croak("Too many Xapian database modifications in progress\n"); } # returns true if all docs have the THREADID value @@ -324,32 +546,14 @@ sub has_threadid ($) { (xdb($self)->get_metadata('has_threadid') // '') eq '1'; } -sub _enquire_once { # retry_reopen callback - my ($self, $query, $opts) = @{$_[0]}; - my $xdb = xdb($self); - my $enquire = $X{Enquire}->new($xdb); - $enquire->set_query($query); - $opts ||= {}; - my $desc = !$opts->{asc}; - if (($opts->{mset} || 0) == 2) { # mset == 2: ORDER BY docid/UID - $enquire->set_docid_order($ENQ_ASCENDING); - $enquire->set_weighting_scheme($X{BoolWeight}->new); - } elsif ($opts->{relevance}) { - $enquire->set_sort_by_relevance_then_value(TS, $desc); - } else { - $enquire->set_sort_by_value_then_relevance(TS, $desc); - } - - # `mairix -t / --threads' or JMAP collapseThreads - if ($opts->{thread} && has_threadid($self)) { - $enquire->set_collapse_key(THREADID); - } - $enquire->get_mset($opts->{offset} || 0, $opts->{limit} || 50); +sub enquire_once { # retry_reopen callback + my (undef, $enq, $offset, $limit) = @_; + $enq->get_mset($offset, $limit); } sub mset_to_smsg { my ($self, $ibx, $mset) = @_; - my $nshard = $self->{nshard} // 1; + my $nshard = $self->{nshard}; my $i = 0; my %order = map { mdocid($nshard, $_) => ++$i } $mset->items; my @msgs = sort { @@ -361,29 +565,27 @@ sub mset_to_smsg { # read-write sub stemmer { $X{Stem}->new($LANG) } -# read-only -sub qparse_new ($) { +sub qp_init_common { my ($self) = @_; - - my $xdb = xdb($self); my $qp = $X{QueryParser}->new; $qp->set_default_op(OP_AND()); - $qp->set_database($xdb); + $qp->set_database(xdb($self)); $qp->set_stemmer(stemmer($self)); $qp->set_stemming_strategy(STEM_SOME()); my $cb = $qp->can('set_max_wildcard_expansion') // $qp->can('set_max_expansion'); # Xapian 1.5.0+ $cb->($qp, 100); - $cb = $qp->can('add_valuerangeprocessor') // - $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ - $cb->($qp, $NVRP->new(YYYYMMDD, 'd:')); - $cb->($qp, $NVRP->new(DT, 'dt:')); + $qp; +} - # for IMAP, undocumented for WWW and may be split off go away - $cb->($qp, $NVRP->new(BYTES, 'bytes:')); - $cb->($qp, $NVRP->new(TS, 'ts:')); - $cb->($qp, $NVRP->new(UID, 'uid:')); +# read-only +sub qparse_new { + my ($self) = @_; + my $qp = qp_init_common($self); + my $cb = $qp->can('add_valuerangeprocessor') // + $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ + $cb->($qp, $_) for @MAIL_NRP; while (my ($name, $prefix) = each %bool_pfx_external) { $qp->add_boolean_prefix($name, $_) foreach split(/ /, $prefix); } @@ -413,9 +615,43 @@ EOF $qp; } +sub generate_cxx () { # generates snippet for xap_helper.h + my $ret = <<EOM; +# line ${\__LINE__} "${\__FILE__}" +static NRP *mail_nrp[${\scalar(@MAIL_VMAP)}]; +static void mail_nrp_init(void) +{ +EOM + for (0..$#MAIL_VMAP) { + my $x = $MAIL_VMAP[$_]; + $ret .= qq{\tmail_nrp[$_] = new NRP($x->[0], "$x->[1]");\n} + } +$ret .= <<EOM; +} + +# line ${\__LINE__} "${\__FILE__}" +static void qp_init_mail_search(Xapian::QueryParser *qp) +{ + for (size_t i = 0; i < MY_ARRAY_SIZE(mail_nrp); i++) + qp->ADD_RP(mail_nrp[i]); +EOM + for my $name (sort keys %bool_pfx_external) { + for (split(/ /, $bool_pfx_external{$name})) { + $ret .= qq{\tqp->add_boolean_prefix("$name", "$_");\n} + } + } + # altid support is handled in xh_opt and srch_init_extra in XH + for my $name (sort keys %prob_prefix) { + for (split(/ /, $prob_prefix{$name})) { + $ret .= qq{\tqp->add_prefix("$name", "$_");\n} + } + } + $ret .= "}\n"; +} + sub help { my ($self) = @_; - $self->{qp} //= qparse_new($self); # parse altids + $self->{qp} //= $self->qparse_new; # parse altids my @ret = @HELP; if (my $user_pfx = $self->{-user_pfx}) { push @ret, @$user_pfx; @@ -423,4 +659,74 @@ sub help { \@ret; } +# always returns a scalar value +sub int_val ($$) { + my ($doc, $col) = @_; + my $val = $doc->get_value($col) or return undef; # undef is '' in Xapian + sortable_unserialise($val) + 0; # PV => IV conversion +} + +sub get_pct ($) { # mset item + # Capped at "99%" since "100%" takes an extra column in the + # thread skeleton view. <xapian/mset.h> says the value isn't + # very meaningful, anyways. + my $n = $_[0]->get_percent; + $n > 99 ? 99 : $n; +} + +sub xap_terms ($$;@) { + my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty () + my $end = $xdb_or_doc->termlist_end(@docid); + my $cur = $xdb_or_doc->termlist_begin(@docid); + $cur->skip_to($pfx); + my (@ret, $tn); + my $pfxlen = length($pfx); + for (; $cur != $end; $cur++) { + $tn = $cur->get_termname; + index($tn, $pfx) ? last : push(@ret, substr($tn, $pfxlen)); + } + wantarray ? @ret : +{ map { $_ => undef } @ret }; +} + +# get combined docid from over.num: +# (not generic Xapian, only works with our sharding scheme for mail) +sub num2docid ($$) { + my ($self, $num) = @_; + my $nshard = $self->{nshard}; + ($num - 1) * $nshard + $num % $nshard + 1; +} + +sub all_terms { + my ($self, $pfx) = @_; + my $cur = xdb($self)->allterms_begin($pfx); + my $end = $self->{xdb}->allterms_end($pfx); + my $pfxlen = length($pfx); + my @ret; + for (; $cur != $end; $cur++) { + push @ret, substr($cur->get_termname, $pfxlen); + } + wantarray ? @ret : +{ map { $_ => undef } @ret }; +} + +sub xh_args { # prep getopt args to feed to xap_helper.h socket + map { ('-d', $_) } shard_dirs($_[0]); +} + +sub docids_by_postlist ($$) { + my ($self, $q) = @_; + my $cur = $self->xdb->postlist_begin($q); + my $end = $self->{xdb}->postlist_end($q); + my @ids; + for (; $cur != $end; $cur++) { push(@ids, $cur->get_docid) }; + @ids; +} + +sub get_doc ($$) { + my ($self, $docid) = @_; + eval { $self->{xdb}->get_document($docid) } // do { + die $@ if $@ && ref($@) !~ /\bDocNotFoundError\b/; + undef; + } +} + 1; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 803494f5..4fd493d9 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -1,6 +1,6 @@ -# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# based on notmuch, but with no concept of folders, files or flags +# based on notmuch, but with no concept of folders, files # # Indexes mail with Xapian and our (SQLite-based) ::Msgmap for use # with the web and NNTP interfaces. This index maintains thread @@ -9,34 +9,46 @@ package PublicInbox::SearchIdx; use strict; use v5.10.1; -use parent qw(PublicInbox::Search PublicInbox::Lock Exporter); +use parent qw(PublicInbox::Search PublicInbox::Lock PublicInbox::Umask + Exporter); use PublicInbox::Eml; +use PublicInbox::Search qw(xap_terms); use PublicInbox::InboxWritable; use PublicInbox::MID qw(mids_for_index mids); use PublicInbox::MsgIter; use PublicInbox::IdxStack; -use Carp qw(croak); +use Carp qw(croak carp); use POSIX qw(strftime); +use Fcntl qw(SEEK_SET); use Time::Local qw(timegm); use PublicInbox::OverIdx; -use PublicInbox::Spawn qw(spawn nodatacow_dir); +use PublicInbox::Spawn qw(run_wait popen_rd); use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); -our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size); +use PublicInbox::Address; +use Config; +our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack + index_text term_generator add_val is_bad_blob); my $X = \%PublicInbox::Search::X; -my ($DB_CREATE_OR_OPEN, $DB_OPEN); +our ($DB_CREATE_OR_OPEN, $DB_OPEN); our $DB_NO_SYNC = 0; -our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff : 1_000_000; +our $DB_DANGEROUS = 0; +our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff : + # assume a typical 64-bit system has 8x more RAM than a + # typical 32-bit system: + (($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024); use constant DEBUG => !!$ENV{DEBUG}; - +my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/; my $xapianlevels = qr/\A(?:full|medium)\z/; my $hex = '[a-f0-9]'; my $OID = $hex .'{40,}'; +my @VMD_MAP = (kw => 'K', L => 'L'); # value order matters +our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/; +our $PATCHID_BROKEN; sub new { my ($class, $ibx, $creat, $shard) = @_; ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx"; - my $levels = qr/\A(?:full|medium|basic)\z/; my $inboxdir = $ibx->{inboxdir}; my $version = $ibx->version; my $indexlevel = 'full'; @@ -46,27 +58,24 @@ sub new { $altid = [ map { PublicInbox::AltId->new($ibx, $_); } @$altid ]; } if ($ibx->{indexlevel}) { - if ($ibx->{indexlevel} =~ $levels) { + if ($ibx->{indexlevel} =~ $INDEXLEVELS) { $indexlevel = $ibx->{indexlevel}; } else { die("Invalid indexlevel $ibx->{indexlevel}\n"); } } + undef $PATCHID_BROKEN; # retry on new instances in case of upgrades $ibx = PublicInbox::InboxWritable->new($ibx); - my $self = bless { - ibx => $ibx, - xpfx => $inboxdir, # for xpfx_init - -altid => $altid, - ibx_ver => $version, - indexlevel => $indexlevel, - }, $class; - $self->xpfx_init; + my $self = PublicInbox::Search->new($ibx); + bless $self, $class; + $self->{ibx} = $ibx; + $self->{-altid} = $altid; + $self->{indexlevel} = $indexlevel; $self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium'; if ($ibx->{-skip_docdata}) { $self->{-set_skip_docdata_once} = 1; $self->{-skip_docdata} = 1; } - $ibx->umask_prepare; if ($version == 1) { $self->{lock_path} = "$inboxdir/ssoma.lock"; my $dir = $self->xdir; @@ -84,12 +93,12 @@ sub new { $self; } -sub need_xapian ($) { $_[0]->{indexlevel} =~ $xapianlevels } +sub need_xapian ($) { ($_[0]->{indexlevel} // 'full') =~ $xapianlevels } sub idx_release { my ($self, $wake) = @_; if (need_xapian($self)) { - my $xdb = delete $self->{xdb} or croak 'not acquired'; + my $xdb = delete $self->{xdb} or croak '{xdb} not acquired'; $xdb->close; } $self->lock_release($wake) if $self->{creat}; @@ -98,19 +107,24 @@ sub idx_release { sub load_xapian_writable () { return 1 if $X->{WritableDatabase}; - PublicInbox::Search::load_xapian() or return; + PublicInbox::Search::load_xapian() or die "failed to load Xapian: $@\n"; my $xap = $PublicInbox::Search::Xap; for (qw(Document TermGenerator WritableDatabase)) { $X->{$_} = $xap.'::'.$_; } eval 'require '.$X->{WritableDatabase} or die; *sortable_serialise = $xap.'::sortable_serialise'; - *sortable_unserialise = $xap.'::sortable_unserialise'; $DB_CREATE_OR_OPEN = eval($xap.'::DB_CREATE_OR_OPEN()'); $DB_OPEN = eval($xap.'::DB_OPEN()'); - my $ver = (eval($xap.'::major_version()') << 16) | - (eval($xap.'::minor_version()') << 8); - $DB_NO_SYNC = 0x4 if $ver >= 0x10400; + my $ver = eval 'v'.join('.', eval($xap.'::major_version()'), + eval($xap.'::minor_version()'), + eval($xap.'::revision()')); + if ($ver ge v1.4) { # new flags in Xapian 1.4 + $DB_NO_SYNC = 0x4; + $DB_DANGEROUS = 0x10; + } + # Xapian v1.2.21..v1.2.24 were missing close-on-exec on OFD locks + $X->{CLOEXEC_UNSET} = 1 if $ver ge v1.2.21 && $ver le v1.2.24; 1; } @@ -123,6 +137,7 @@ sub idx_acquire { load_xapian_writable(); $flag = $self->{creat} ? $DB_CREATE_OR_OPEN : $DB_OPEN; } + my $owner = $self->{ibx} // $self->{eidx} // $self; if ($self->{creat}) { require File::Path; $self->lock_acquire; @@ -132,12 +147,15 @@ sub idx_acquire { if (!-d $dir && (!$is_shard || ($is_shard && need_xapian($self)))) { File::Path::mkpath($dir); - nodatacow_dir($dir); - $self->{-set_has_threadid_once} = 1; + require PublicInbox::Syscall; + PublicInbox::Syscall::nodatacow_dir($dir); + # owner == self for CodeSearchIdx + $self->{-set_has_threadid_once} = 1 if $owner != $self; + $flag |= $DB_DANGEROUS if $owner->{-dangerous}; } } return unless defined $flag; - $flag |= $DB_NO_SYNC if $self->{ibx}->{-no_fsync}; + $flag |= $DB_NO_SYNC if $owner->{-no_fsync}; my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) }; croak "Failed opening $dir: $@" if $@; $self->{xdb} = $xdb; @@ -154,27 +172,48 @@ sub term_generator ($) { # write-only $self->{term_generator} //= do { my $tg = $X->{TermGenerator}->new; - $tg->set_stemmer($self->stemmer); + $tg->set_stemmer(PublicInbox::Search::stemmer($self)); $tg; } } +sub index_phrase ($$$$) { + my ($self, $text, $wdf_inc, $prefix) = @_; + + term_generator($self)->index_text($text, $wdf_inc, $prefix); + $self->{term_generator}->increase_termpos; +} + sub index_text ($$$$) { my ($self, $text, $wdf_inc, $prefix) = @_; - my $tg = term_generator($self); # man Search::Xapian::TermGenerator if ($self->{indexlevel} eq 'full') { - $tg->index_text($text, $wdf_inc, $prefix); - $tg->increase_termpos; + index_phrase($self, $text, $wdf_inc, $prefix); } else { - $tg->index_text_without_positions($text, $wdf_inc, $prefix); + term_generator($self)->index_text_without_positions( + $text, $wdf_inc, $prefix); } } sub index_headers ($$) { my ($self, $smsg) = @_; - my @x = (from => 'A', # Author - subject => 'S', to => 'XTO', cc => 'XCC'); + my @x = (from => 'A', to => 'XTO', cc => 'XCC'); # A: Author + while (my ($field, $pfx) = splice(@x, 0, 2)) { + my $val = $smsg->{$field}; + next if $val eq ''; + # include "(comments)" after the address, too, so not using + # PublicInbox::Address::names or pairs + index_text($self, $val, 1, $pfx); + + # we need positional info for email addresses since they + # can be considered phrases + if ($self->{indexlevel} eq 'medium') { + for my $addr (PublicInbox::Address::emails($val)) { + index_phrase($self, $addr, 1, $pfx); + } + } + } + @x = (subject => 'S'); while (my ($field, $pfx) = splice(@x, 0, 2)) { my $val = $smsg->{$field}; index_text($self, $val, 1, $pfx) if $val ne ''; @@ -187,7 +226,11 @@ sub index_diff_inc ($$$$) { index_text($self, join("\n", @$xnq), 1, 'XNQ'); @$xnq = (); } - index_text($self, $text, 1, $pfx); + if ($pfx eq 'XDFN') { + index_phrase($self, $text, 1, $pfx); + } else { + index_text($self, $text, 1, $pfx); + } } sub index_old_diff_fn { @@ -195,8 +238,8 @@ sub index_old_diff_fn { # no renames or space support for traditional diffs, # find the number of leading common paths to strip: - my @fa = split('/', $fa); - my @fb = split('/', $fb); + my @fa = split(m'/', $fa); + my @fb = split(m'/', $fb); while (scalar(@fa) && scalar(@fb)) { $fa = join('/', @fa); $fb = join('/', @fb); @@ -216,37 +259,59 @@ sub index_diff ($$$) { my ($self, $txt, $doc) = @_; my %seen; my $in_diff; - my @xnq; - my $xnq = \@xnq; - foreach (split(/\n/, $txt)) { - if ($in_diff && s/^ //) { # diff context + my $xnq = []; + my @l = split(/\n/, $$txt); + undef $$txt; + while (defined($_ = shift @l)) { + if ($in_diff && /^GIT binary patch/) { + push @$xnq, $_; + while (@l && $l[0] =~ /^(?:literal|delta) /) { + # TODO allow searching by size range? + # allows searching by exact size via: + # "literal $SIZE" or "delta $SIZE" + push @$xnq, shift(@l); + + # skip base85 and empty lines + while (@l && ($l[0] =~ /\A$BASE85\h*\z/o || + $l[0] !~ /\S/)) { + shift @l; + } + # loop hits trailing "literal 0\nHcmV?d00001\n" + } + } elsif ($in_diff && s/^ //) { # diff context index_diff_inc($self, $_, 'XDFCTX', $xnq); } elsif (/^-- $/) { # email signature begins $in_diff = undef; - } elsif (m!^diff --git "?[^/]+/.+ "?[^/]+/.+\z!) { - # wait until "---" and "+++" to capture filenames + } elsif (m!^diff --git ("?[^/]+/.+) ("?[^/]+/.+)\z!) { + # capture filenames here for binary diffs: + my ($fa, $fb) = ($1, $2); + push @$xnq, $_; $in_diff = 1; + $fa = (split(m'/', git_unquote($fa), 2))[1]; + $fb = (split(m'/', git_unquote($fb), 2))[1]; + $seen{$fa}++ or index_diff_inc($self, $fa, 'XDFN', $xnq); + $seen{$fb}++ or index_diff_inc($self, $fb, 'XDFN', $xnq); # traditional diff: } elsif (m/^diff -(.+) (\S+) (\S+)$/) { my ($opt, $fa, $fb) = ($1, $2, $3); - push @xnq, $_; + push @$xnq, $_; # only support unified: next unless $opt =~ /[uU]/; $in_diff = index_old_diff_fn($self, \%seen, $fa, $fb, $xnq); } elsif (m!^--- ("?[^/]+/.+)!) { my $fn = $1; - $fn = (split('/', git_unquote($fn), 2))[1]; + $fn = (split(m'/', git_unquote($fn), 2))[1]; $seen{$fn}++ or index_diff_inc($self, $fn, 'XDFN', $xnq); $in_diff = 1; } elsif (m!^\+\+\+ ("?[^/]+/.+)!) { my $fn = $1; - $fn = (split('/', git_unquote($fn), 2))[1]; + $fn = (split(m'/', git_unquote($fn), 2))[1]; $seen{$fn}++ or index_diff_inc($self, $fn, 'XDFN', $xnq); $in_diff = 1; } elsif (/^--- (\S+)/) { - $in_diff = $1; - push @xnq, $_; + $in_diff = $1; # old diff filename + push @$xnq, $_; } elsif (defined $in_diff && /^\+\+\+ (\S+)/) { $in_diff = index_old_diff_fn($self, \%seen, $in_diff, $1, $xnq); @@ -272,19 +337,65 @@ sub index_diff ($$$) { /^(?:dis)?similarity index / || /^\\ No newline at end of file/ || /^Binary files .* differ/) { - push @xnq, $_; + push @$xnq, $_; } elsif ($_ eq '') { # possible to be in diff context, some mail may be # stripped by MUA or even GNU diff(1). "git apply" # treats a bare "\n" as diff context, too } else { - push @xnq, $_; + push @$xnq, $_; warn "non-diff line: $_\n" if DEBUG && $_ ne ''; $in_diff = undef; } } - index_text($self, join("\n", @xnq), 1, 'XNQ'); + index_text($self, join("\n", @$xnq), 1, 'XNQ'); +} + +sub index_body_text { + my ($self, $doc, $sref) = @_; + my $rd; + # start patch-id in parallel + if ($$sref =~ /^(?:diff|---|\+\+\+) /ms && !$PATCHID_BROKEN) { + my $git = ($self->{ibx} // $self->{eidx} // $self)->git; + my $fh = PublicInbox::IO::write_file '+>:utf8', undef, $$sref; + $fh->flush or die "flush: $!"; + sysseek($fh, 0, SEEK_SET); + $rd = popen_rd($git->cmd(qw(patch-id --stable)), undef, + { 0 => $fh }); + } + + # split off quoted and unquoted blocks: + my @sections = PublicInbox::MsgIter::split_quotes($$sref); + undef $$sref; # free memory + for my $txt (@sections) { + if ($txt =~ /\A>/) { + if ($txt =~ /^[>\t ]+GIT binary patch\r?/sm) { + # get rid of Base-85 noise + $txt =~ s/^([>\h]+(?:literal|delta) + \x20[0-9]+\r?\n) + (?:[>\h]+$BASE85\h*\r?\n)+/$1/gsmx; + } + index_text($self, $txt, 0, 'XQUOT'); + } else { # does it look like a diff? + if ($txt =~ /^(?:diff|---|\+\+\+) /ms) { + index_diff($self, \$txt, $doc); + } else { + index_text($self, $txt, 1, 'XNQ'); + } + } + undef $txt; # free memory + } + if (defined $rd) { # reap `git patch-id' + (readline($rd) // '') =~ /\A([a-f0-9]{40,})/ and + $doc->add_term('XDFID'.$1); + if (!$rd->close) { + my $c = 'git patch-id --stable'; + $PATCHID_BROKEN = ($? >> 8) == 129; + $PATCHID_BROKEN ? warn("W: $c requires git v2.1.0+\n") + : warn("W: $c failed: \$?=$? (non-fatal)"); + } + } } sub index_xapian { # msg_iter callback @@ -293,7 +404,7 @@ sub index_xapian { # msg_iter callback my $ct = $part->content_type || 'text/plain'; my $fn = $part->filename; if (defined $fn && $fn ne '') { - index_text($self, $fn, 1, 'XFN'); + index_phrase($self, $fn, 1, 'XFN'); } if ($part->{is_submsg}) { my $mids = mids_for_index($part); @@ -306,48 +417,39 @@ sub index_xapian { # msg_iter callback my ($s, undef) = msg_part_text($part, $ct); defined $s or return; $_[0]->[0] = $part = undef; # free memory + index_body_text($self, $doc, \$s); +} - # split off quoted and unquoted blocks: - my @sections = PublicInbox::MsgIter::split_quotes($s); - undef $s; # free memory - for my $txt (@sections) { - if ($txt =~ /\A>/) { - index_text($self, $txt, 0, 'XQUOT'); - } else { - # does it look like a diff? - if ($txt =~ /^(?:diff|---|\+\+\+) /ms) { - index_diff($self, $txt, $doc); - } else { - index_text($self, $txt, 1, 'XNQ'); - } - } - undef $txt; # free memory +sub index_list_id ($$$) { + my ($self, $doc, $hdr) = @_; + for my $l ($hdr->header_raw('List-Id')) { + $l =~ /<([^>]+)>/ or next; + my $lid = lc $1; + $lid =~ tr/\n\t\r\0//d; # same rules as Message-ID + $doc->add_boolean_term('G' . $lid); + index_phrase($self, $lid, 1, 'XL'); # probabilistic } } sub index_ids ($$$$) { my ($self, $doc, $hdr, $mids) = @_; for my $mid (@$mids) { - index_text($self, $mid, 1, 'XM'); + index_phrase($self, $mid, 1, 'XM'); # because too many Message-IDs are prefixed with # "Pine.LNX."... if ($mid =~ /\w{12,}/) { my @long = ($mid =~ /(\w{3,}+)/g); - index_text($self, join(' ', @long), 1, 'XM'); + index_phrase($self, join(' ', @long), 1, 'XM'); } } $doc->add_boolean_term('Q' . $_) for @$mids; - for my $l ($hdr->header_raw('List-Id')) { - $l =~ /<([^>]+)>/ or next; - my $lid = $1; - $doc->add_boolean_term('G' . $lid); - index_text($self, $lid, 1, 'XL'); # probabilistic - } + index_list_id($self, $doc, $hdr); } -sub add_xapian ($$$$) { +sub eml2doc ($$$;$) { my ($self, $eml, $smsg, $mids) = @_; + $mids //= mids_for_index($eml); my $doc = $X->{Document}->new; add_val($doc, PublicInbox::Search::TS(), $smsg->{ts}); my @ds = gmtime($smsg->{ds}); @@ -359,20 +461,22 @@ sub add_xapian ($$$$) { add_val($doc, PublicInbox::Search::UID(), $smsg->{num}); add_val($doc, PublicInbox::Search::THREADID, $smsg->{tid}); - my $tg = term_generator($self); - $tg->set_document($doc); + term_generator($self)->set_document($doc); index_headers($self, $smsg); + if (defined(my $eidx_key = $smsg->{eidx_key})) { + $doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.'; + } msg_iter($eml, \&index_xapian, [ $self, $doc ]); index_ids($self, $doc, $eml, $mids); # by default, we maintain compatibility with v1.5.0 and earlier - # by writing to docdata.glass, users who never exect to downgrade can + # by writing to docdata.glass, users who never expect to downgrade can # use --skip-docdata if (!$self->{-skip_docdata}) { # WWW doesn't need {to} or {cc}, only NNTP $smsg->{to} = $smsg->{cc} = ''; - PublicInbox::OverIdx::parse_references($smsg, $eml, $mids); + $smsg->parse_references($eml, $mids); my $data = $smsg->to_doc_data; $doc->set_data($data); } @@ -387,22 +491,38 @@ sub add_xapian ($$$$) { } } } + $doc; +} + +sub add_xapian ($$$$) { + my ($self, $eml, $smsg, $mids) = @_; + begin_txn_lazy($self); + my $merge_vmd = delete $smsg->{-merge_vmd}; + my $doc = eml2doc($self, $eml, $smsg, $mids); + if (my $old = $merge_vmd ? _get_doc($self, $smsg->{num}) : undef) { + my @x = @VMD_MAP; + while (my ($field, $pfx) = splice(@x, 0, 2)) { + for my $term (xap_terms($pfx, $old)) { + $doc->add_boolean_term($pfx.$term); + } + } + } $self->{xdb}->replace_document($smsg->{num}, $doc); } sub _msgmap_init ($) { my ($self) = @_; - die "BUG: _msgmap_init is only for v1\n" if $self->{ibx_ver} != 1; - $self->{mm} //= eval { + die "BUG: _msgmap_init is only for v1\n" if $self->{ibx}->version != 1; + $self->{mm} //= do { require PublicInbox::Msgmap; - my $rw = $self->{ibx}->{-no_fsync} ? 2 : 1; - PublicInbox::Msgmap->new($self->{ibx}->{inboxdir}, $rw); + PublicInbox::Msgmap->new_file($self->{ibx}, 1); }; } sub add_message { # mime = PublicInbox::Eml or Email::MIME object my ($self, $mime, $smsg, $sync) = @_; + begin_txn_lazy($self); my $mids = mids_for_index($mime); $smsg //= bless { blob => '' }, 'PublicInbox::Smsg'; # test-only compat $smsg->{mid} //= $mids->[0]; # v1 compatibility @@ -436,54 +556,172 @@ sub add_message { $smsg->{num}; } -sub get_val ($$) { - my ($doc, $col) = @_; - sortable_unserialise($doc->get_value($col)); +sub _get_doc ($$) { + my ($self, $docid) = @_; + $self->get_doc($docid) // do { + warn "E: #$docid missing in Xapian\n"; + undef; + } +} + +sub add_eidx_info { + my ($self, $docid, $eidx_key, $eml) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + term_generator($self)->set_document($doc); + + # '.' is special for lei_store + $doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.'; + + index_list_id($self, $doc, $eml); + $self->{xdb}->replace_document($docid, $doc); } -sub smsg_from_doc ($) { - my ($doc) = @_; - my $data = $doc->get_data or return; - my $smsg = bless {}, 'PublicInbox::Smsg'; - $smsg->{ts} = get_val($doc, PublicInbox::Search::TS()); - my $dt = get_val($doc, PublicInbox::Search::DT()); - my ($yyyy, $mon, $dd, $hh, $mm, $ss) = unpack('A4A2A2A2A2A2', $dt); - $smsg->{ds} = timegm($ss, $mm, $hh, $dd, $mon - 1, $yyyy); - $smsg->load_from_data($data); - $smsg; +sub get_terms { + my ($self, $pfx, $docid) = @_; + begin_txn_lazy($self); + xap_terms($pfx, $self->{xdb}, $docid); } -sub xdb_remove { - my ($self, $oid, @removed) = @_; - my $xdb = $self->{xdb} or return; - for my $num (@removed) { - my $doc = eval { $xdb->get_document($num) }; - unless ($doc) { - warn "E: $@\n" if $@; - warn "E: #$num $oid missing in Xapian\n"; - next; +sub remove_eidx_info { + my ($self, $docid, $eidx_key, $eml) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + eval { $doc->remove_term('O'.$eidx_key) }; + warn "W: ->remove_term O$eidx_key: $@\n" if $@; + for my $l ($eml ? $eml->header_raw('List-Id') : ()) { + $l =~ /<([^>]+)>/ or next; + my $lid = lc $1; + eval { $doc->remove_term('G' . $lid) }; + warn "W: ->remove_term G$lid: $@\n" if $@; + + # nb: we don't remove the XL probabilistic terms + # since terms may overlap if cross-posted. + # + # IOW, a message which has both <foo.example.com> + # and <bar.example.com> would have overlapping + # "XLexample" and "XLcom" as terms and which we + # wouldn't know if they're safe to remove if we just + # unindex <foo.example.com> while preserving + # <bar.example.com>. + # + # In any case, this entire sub is will likely never + # be needed and users using the "l:" prefix are probably + # rarer. + } + $self->{xdb}->replace_document($docid, $doc); +} + +sub set_vmd { + my ($self, $docid, $vmd) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + my ($v, @rm, @add); + my @x = @VMD_MAP; + my ($cur, $end) = ($doc->termlist_begin, $doc->termlist_end); + while (my ($field, $pfx) = splice(@x, 0, 2)) { + my $set = $vmd->{$field} // next; + my %keep = map { $_ => 1 } @$set; + my %add = %keep; + $cur->skip_to($pfx); # works due to @VMD_MAP order + for (; $cur != $end; $cur++) { + $v = $cur->get_termname; + $v =~ s/\A$pfx//s or next; + $keep{$v} ? delete($add{$v}) : push(@rm, $pfx.$v); } - my $smsg = smsg_from_doc($doc); - my $blob = $smsg->{blob}; # may be undef if --skip-docdata - if (!defined($blob) || $blob eq $oid) { - $xdb->delete_document($num); - } else { - warn "E: #$num $oid != $blob in Xapian\n"; + push(@add, map { $pfx.$_ } keys %add); + } + return unless scalar(@rm) || scalar(@add); + $doc->remove_term($_) for @rm; + $doc->add_boolean_term($_) for @add; + $self->{xdb}->replace_document($docid, $doc); +} + +sub apply_vmd_mod ($$) { + my ($doc, $vmd_mod) = @_; + my $updated = 0; + my @x = @VMD_MAP; + while (my ($field, $pfx) = splice(@x, 0, 2)) { + # field: "L" or "kw" + for my $val (@{$vmd_mod->{"-$field"} // []}) { + eval { + $doc->remove_term($pfx . $val); + ++$updated; + }; + } + for my $val (@{$vmd_mod->{"+$field"} // []}) { + $doc->add_boolean_term($pfx . $val); + ++$updated; + } + } + $updated; +} + +sub add_vmd { + my ($self, $docid, $vmd) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + my @x = @VMD_MAP; + my $updated = 0; + while (my ($field, $pfx) = splice(@x, 0, 2)) { + my $add = $vmd->{$field} // next; + $doc->add_boolean_term($pfx . $_) for @$add; + $updated += scalar(@$add); + } + $updated += apply_vmd_mod($doc, $vmd); + $self->{xdb}->replace_document($docid, $doc) if $updated; +} + +sub remove_vmd { + my ($self, $docid, $vmd) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + my $replace; + my @x = @VMD_MAP; + while (my ($field, $pfx) = splice(@x, 0, 2)) { + my $rm = $vmd->{$field} // next; + for (@$rm) { + eval { + $doc->remove_term($pfx . $_); + $replace = 1; + }; } } + $self->{xdb}->replace_document($docid, $doc) if $replace; +} + +sub update_vmd { + my ($self, $docid, $vmd_mod) = @_; + begin_txn_lazy($self); + my $doc = _get_doc($self, $docid) or return; + my $updated = apply_vmd_mod($doc, $vmd_mod); + $self->{xdb}->replace_document($docid, $doc) if $updated; + $updated; +} + +sub xdb_remove { + my ($self, @docids) = @_; + begin_txn_lazy($self); + my $xdb = $self->{xdb} // die 'BUG: missing {xdb}'; + for my $docid (@docids) { + eval { $xdb->delete_document($docid) }; + warn "E: #$docid not in Xapian? $@\n" if $@; + } } -sub remove_by_oid { - my ($self, $oid, $num) = @_; - die "BUG: remove_by_oid is v2-only\n" if $self->{oidx}; - $self->begin_txn_lazy; - xdb_remove($self, $oid, $num) if need_xapian($self); +sub xdb_remove_quiet { + my ($self, $docid) = @_; + begin_txn_lazy($self); + my $xdb = $self->{xdb} // die 'BUG: missing {xdb}'; + eval { $xdb->delete_document($docid) }; + ++$self->{-quiet_rm} unless $@; } +sub nr_quiet_rm { delete($_[0]->{-quiet_rm}) // 0 } + sub index_git_blob_id { my ($doc, $pfx, $objid) = @_; - my $len = length($objid); for (my $len = length($objid); $len >= 7; ) { $doc->add_term($pfx.$objid); $objid = substr($objid, 0, --$len); @@ -502,8 +740,8 @@ sub unindex_eml { $tmp{$_}++ for @removed; } if (!$nr) { - $mids = join('> <', @$mids); - warn "W: <$mids> missing for removal from overview\n"; + my $m = join('> <', @$mids); + warn "W: <$m> missing for removal from overview\n"; } while (my ($num, $nr) = each %tmp) { warn "BUG: $num appears >1 times ($nr) for $oid\n" if $nr != 1; @@ -513,7 +751,7 @@ sub unindex_eml { } else { # just in case msgmap and over.sqlite3 become desynched: $self->{mm}->mid_delete($mids->[0]); } - xdb_remove($self, $oid, keys %tmp) if need_xapian($self); + xdb_remove($self, keys %tmp) if need_xapian($self); } sub index_mm { @@ -533,53 +771,73 @@ sub index_mm { } } -# returns the number of bytes to add if given a non-CRLF arg -sub crlf_adjust ($) { - if (index($_[0], "\r\n") < 0) { - # common case is LF-only, every \n needs an \r; - # so favor a cheap tr// over an expensive m//g - $_[0] =~ tr/\n/\n/; - } else { # count number of '\n' w/o '\r', expensive: - scalar(my @n = ($_[0] =~ m/(?<!\r)\n/g)); +sub is_bad_blob ($$$$) { + my ($oid, $type, $size, $expect_oid) = @_; + if ($type ne 'blob') { + carp "W: $expect_oid is not a blob (type=$type)"; + return 1; } + croak "BUG: $oid != $expect_oid" if $oid ne $expect_oid; + $size == 0 ? 1 : 0; # size == 0 means purged } sub index_both { # git->cat_async callback my ($bref, $oid, $type, $size, $sync) = @_; + return if is_bad_blob($oid, $type, $size, $sync->{oid}); my ($nr, $max) = @$sync{qw(nr max)}; ++$$nr; $$max -= $size; - $size += crlf_adjust($$bref); - my $smsg = bless { bytes => $size, blob => $oid }, 'PublicInbox::Smsg'; + my $smsg = bless { blob => $oid }, 'PublicInbox::Smsg'; + $smsg->set_bytes($$bref, $size); my $self = $sync->{sidx}; + local $self->{current_info} = "$self->{current_info}: $oid"; my $eml = PublicInbox::Eml->new($bref); $smsg->{num} = index_mm($self, $eml, $oid, $sync) or die "E: could not generate NNTP article number for $oid"; add_message($self, $eml, $smsg, $sync); + ++$self->{nidx}; + my $cur_cmt = $sync->{cur_cmt} // die 'BUG: {cur_cmt} missing'; + ${$sync->{latest_cmt}} = $cur_cmt; } sub unindex_both { # git->cat_async callback - my ($bref, $oid, $type, $size, $self) = @_; + my ($bref, $oid, $type, $size, $sync) = @_; + return if is_bad_blob($oid, $type, $size, $sync->{oid}); + my $self = $sync->{sidx}; + local $self->{current_info} = "$self->{current_info}: $oid"; unindex_eml($self, $oid, PublicInbox::Eml->new($bref)); + # may be undef if leftover + if (defined(my $cur_cmt = $sync->{cur_cmt})) { + ${$sync->{latest_cmt}} = $cur_cmt; + } + ++$self->{nidx}; +} + +sub with_umask { + my $self = shift; + my $owner = $self->{ibx} // $self->{eidx}; + $owner ? $owner->with_umask(@_) : $self->SUPER::with_umask(@_) } # called by public-inbox-index sub index_sync { my ($self, $opt) = @_; delete $self->{lock_path} if $opt->{-skip_lock}; - $self->{ibx}->with_umask(\&_index_sync, $self, $opt); - if ($opt->{reindex}) { + $self->with_umask(\&_index_sync, $self, $opt); + if ($opt->{reindex} && !$opt->{quit} && + !grep(defined, @$opt{qw(since until)})) { my %again = %$opt; delete @again{qw(rethread reindex)}; index_sync($self, \%again); + $opt->{quit} = $again{quit}; # propagate to caller } } sub check_size { # check_async cb for -index --max-size=... - my ($oid, $type, $size, $arg, $git) = @_; - (($type // '') eq 'blob') or die "E: bad $oid in $git->{git_dir}"; + my (undef, $oid, $type, $size, $arg) = @_; + ($type // '') eq 'blob' or die "E: bad $oid in $arg->{git}->{git_dir}"; if ($size <= $arg->{max_size}) { - $git->cat_async($oid, $arg->{index_oid}, $arg); + $arg->{git}->cat_async($oid, $arg->{index_oid}, $arg); } else { warn "W: skipping $oid ($size > $arg->{max_size})\n"; } @@ -587,46 +845,45 @@ sub check_size { # check_async cb for -index --max-size=... sub v1_checkpoint ($$;$) { my ($self, $sync, $stk) = @_; - $self->{ibx}->git->check_async_wait; - $self->{ibx}->git->cat_async_wait; - - # latest_cmt may be undef - my $newest = $stk ? $stk->{latest_cmt} : undef; - if ($newest) { - my $cur = $self->{mm}->last_commit || ''; - if (need_update($self, $cur, $newest)) { + $self->{ibx}->git->async_wait_all; + + # $newest may be undef + my $newest = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}}; + if (defined($newest)) { + my $cur = $self->{mm}->last_commit; + if (need_update($self, $sync, $cur, $newest)) { $self->{mm}->last_commit($newest); } - } else { - ${$sync->{max}} = $self->{batch_bytes}; } + ${$sync->{max}} = $self->{batch_bytes}; $self->{mm}->{dbh}->commit; - if ($newest && need_xapian($self)) { - my $xdb = $self->{xdb}; + eval { $self->{mm}->{dbh}->do('PRAGMA optimize') }; + my $xdb = $self->{xdb}; + if ($newest && $xdb) { my $cur = $xdb->get_metadata('last_commit'); - if (need_update($self, $cur, $newest)) { + if (need_update($self, $sync, $cur, $newest)) { $xdb->set_metadata('last_commit', $newest); } - + } + if ($stk) { # all done if $stk is passed # let SearchView know a full --reindex was done so it can # generate ->has_threadid-dependent links - if ($sync->{reindex} && !ref($sync->{reindex})) { + if ($xdb && $sync->{reindex} && !ref($sync->{reindex})) { my $n = $xdb->get_metadata('has_threadid'); $xdb->set_metadata('has_threadid', '1') if $n ne '1'; } + $self->{oidx}->rethread_done($sync->{-opt}); # all done } - - $self->{oidx}->rethread_done($sync->{-opt}) if $newest; # all done commit_txn_lazy($self); - $self->{ibx}->git->cleanup; + $sync->{ibx}->git->cleanup; my $nr = ${$sync->{nr}}; idx_release($self, $nr); # let another process do some work... if (my $pr = $sync->{-opt}->{-progress}) { $pr->("indexed $nr/$sync->{ntodo}\n") if $nr; } - if (!$stk) { # more to come + if (!$stk && !$sync->{quit}) { # more to come begin_txn_lazy($self); $self->{mm}->{dbh}->begin_work; } @@ -635,45 +892,51 @@ sub v1_checkpoint ($$;$) { # only for v1 sub process_stack { my ($self, $sync, $stk) = @_; - my $git = $self->{ibx}->git; + my $git = $sync->{ibx}->git; my $max = $self->{batch_bytes}; my $nr = 0; $sync->{nr} = \$nr; $sync->{max} = \$max; $sync->{sidx} = $self; + $sync->{latest_cmt} = \(my $latest_cmt); $self->{mm}->{dbh}->begin_work; if (my @leftovers = keys %{delete($sync->{D}) // {}}) { warn('W: unindexing '.scalar(@leftovers)." leftovers\n"); for my $oid (@leftovers) { + last if $sync->{quit}; $oid = unpack('H*', $oid); - $git->cat_async($oid, \&unindex_both, $self); + $git->cat_async($oid, \&unindex_both, $sync); } } if ($sync->{max_size} = $sync->{-opt}->{max_size}) { $sync->{index_oid} = \&index_both; } - while (my ($f, $at, $ct, $oid) = $stk->pop_rec) { + while (my ($f, $at, $ct, $oid, $cur_cmt) = $stk->pop_rec) { + my $arg = { %$sync, cur_cmt => $cur_cmt, oid => $oid }; + last if $sync->{quit}; if ($f eq 'm') { - my $arg = { %$sync, autime => $at, cotime => $ct }; + $arg->{autime} = $at; + $arg->{cotime} = $ct; if ($sync->{max_size}) { + $arg->{git} = $git; $git->check_async($oid, \&check_size, $arg); } else { $git->cat_async($oid, \&index_both, $arg); } v1_checkpoint($self, $sync) if $max <= 0; } elsif ($f eq 'd') { - $git->cat_async($oid, \&unindex_both, $self); + $git->cat_async($oid, \&unindex_both, $arg); } } - v1_checkpoint($self, $sync, $stk); + v1_checkpoint($self, $sync, $sync->{quit} ? undef : $stk); } -sub log2stack ($$$$) { - my ($sync, $git, $range, $ibx) = @_; +sub log2stack ($$$) { + my ($sync, $git, $range) = @_; my $D = $sync->{D}; # OID_BIN => NR (if reindexing, undef otherwise) my ($add, $del); - if ($ibx->version == 1) { + if ($sync->{ibx}->version == 1) { my $path = $hex.'{2}/'.$hex.'{38}'; $add = qr!\A:000000 100644 \S+ ($OID) A\t$path$!; $del = qr!\A:100644 000000 ($OID) \S+ D\t$path$!; @@ -684,43 +947,47 @@ sub log2stack ($$$$) { # Count the new files so they can be added newest to oldest # and still have numbers increasing from oldest to newest - my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H - --no-notes --no-color --no-renames --no-abbrev), - $range); - my ($at, $ct, $stk); - while (<$fh>) { - if (/\A([0-9]+)-([0-9]+)-($OID)$/o) { - ($at, $ct) = ($1 + 0, $2 + 0); - $stk //= PublicInbox::IdxStack->new($3); - } elsif (/$del/) { + my @cmd = qw(log --raw -r --pretty=tformat:%at-%ct-%H + --no-notes --no-color --no-renames --no-abbrev); + for my $k (qw(since until)) { + my $v = $sync->{-opt}->{$k} // next; + next if !$sync->{-opt}->{reindex}; + push @cmd, "--$k=$v"; + } + my $fh = $git->popen(@cmd, $range); + my ($at, $ct, $stk, $cmt, $l); + while (defined($l = <$fh>)) { + return if $sync->{quit}; + if ($l =~ /\A([0-9]+)-([0-9]+)-($OID)$/o) { + ($at, $ct, $cmt) = ($1 + 0, $2 + 0, $3); + $stk //= PublicInbox::IdxStack->new($cmt); + } elsif ($l =~ /$del/) { my $oid = $1; if ($D) { # reindex case $D->{pack('H*', $oid)}++; } else { # non-reindex case: - $stk->push_rec('d', $at, $ct, $oid); + $stk->push_rec('d', $at, $ct, $oid, $cmt); } - } elsif (/$add/) { + } elsif ($l =~ /$add/) { my $oid = $1; if ($D) { my $oid_bin = pack('H*', $oid); my $nr = --$D->{$oid_bin}; delete($D->{$oid_bin}) if $nr <= 0; - # nr < 0 (-1) means it never existed - $stk->push_rec('m', $at, $ct, $oid) if $nr < 0; - } else { - $stk->push_rec('m', $at, $ct, $oid); + next if $nr >= 0; } + $stk->push_rec('m', $at, $ct, $oid, $cmt); } } - close $fh or die "git log failed: \$?=$?"; + $fh->close or die "git log failed: \$?=$?"; $stk //= PublicInbox::IdxStack->new; $stk->read_prepare; } -sub prepare_stack ($$$) { - my ($self, $sync, $range) = @_; - my $git = $self->{ibx}->git; +sub prepare_stack ($$) { + my ($sync, $range) = @_; + my $git = $sync->{ibx}->git; if (index($range, '..') < 0) { # don't show annoying git errors to users who run -index @@ -729,24 +996,28 @@ sub prepare_stack ($$$) { return PublicInbox::IdxStack->new->read_prepare if $?; } $sync->{D} = $sync->{reindex} ? {} : undef; # OID_BIN => NR - log2stack($sync, $git, $range, $self->{ibx}); + log2stack($sync, $git, $range); } # --is-ancestor requires git 1.8.0+ sub is_ancestor ($$$) { my ($git, $cur, $tip) = @_; return 0 unless $git->check($cur); - my $cmd = [ 'git', "--git-dir=$git->{git_dir}", - qw(merge-base --is-ancestor), $cur, $tip ]; - my $pid = spawn($cmd); - waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish'; - $? == 0; + my $cmd = $git->cmd(qw(merge-base --is-ancestor), $cur, $tip); + run_wait($cmd) == 0; } -sub need_update ($$$) { - my ($self, $cur, $new) = @_; +sub need_update ($$$$) { + my ($self, $sync, $cur, $new) = @_; my $git = $self->{ibx}->git; - return 1 if $cur && !is_ancestor($git, $cur, $new); + $cur //= ''; # XS Search::Xapian ->get_metadata doesn't give undef + + # don't rewind if --{since,until,before,after} are in use + return if $cur ne '' && + grep(defined, @{$sync->{-opt}}{qw(since until)}) && + is_ancestor($git, $new, $cur); + + return 1 if $cur ne '' && !is_ancestor($git, $cur, $new); my $range = $cur eq '' ? $new : "$cur..$new"; chomp(my $n = $git->qx(qw(rev-list --count), $range)); ($n eq '' || $n > 0); @@ -777,15 +1048,34 @@ sub reindex_from ($$) { ref($reindex) eq 'HASH' ? $reindex->{from} : ''; } +sub quit_cb ($) { + my ($sync) = @_; + sub { + # we set {-opt}->{quit} too, so ->index_sync callers + # can abort multi-inbox loops this way + $sync->{quit} = $sync->{-opt}->{quit} = 1; + warn "gracefully quitting\n"; + } +} + # indexes all unindexed messages (v1 only) sub _index_sync { my ($self, $opt) = @_; my $tip = $opt->{ref} || 'HEAD'; - my $git = $self->{ibx}->git; + my $ibx = $self->{ibx}; + local $self->{current_info} = "$ibx->{inboxdir}"; $self->{batch_bytes} = $opt->{batch_size} // $BATCH_BYTES; - $git->batch_prepare; + + if ($X->{CLOEXEC_UNSET}) { + $ibx->git->cat_file($tip); + $ibx->git->check($tip); + } my $pr = $opt->{-progress}; - my $sync = { reindex => $opt->{reindex}, -opt => $opt }; + my $sync = { reindex => $opt->{reindex}, -opt => $opt, ibx => $ibx }; + my $quit = quit_cb($sync); + local $SIG{QUIT} = $quit; + local $SIG{INT} = $quit; + local $SIG{TERM} = $quit; my $xdb = $self->begin_txn_lazy; $self->{oidx}->rethread_prepare($opt); my $mm = _msgmap_init($self); @@ -803,10 +1093,10 @@ sub _index_sync { my $lx = reindex_from($sync->{reindex}, $last_commit); my $range = $lx eq '' ? $tip : "$lx..$tip"; $pr->("counting changes\n\t$range ... ") if $pr; - my $stk = prepare_stack($self, $sync, $range); + my $stk = prepare_stack($sync, $range); $sync->{ntodo} = $stk ? $stk->num_records : 0; $pr->("$sync->{ntodo}\n") if $pr; # continue previous line - process_stack($self, $sync, $stk); + process_stack($self, $sync, $stk) if !$sync->{quit}; } sub DESTROY { @@ -815,8 +1105,10 @@ sub DESTROY { $_[0]->{lockfh} = undef; } -sub _begin_txn { +sub begin_txn_lazy { my ($self) = @_; + return if $self->{txn}; + my $restore = $self->with_umask; my $xdb = $self->{xdb} || idx_acquire($self); $self->{oidx}->begin_lazy if $self->{oidx}; $xdb->begin_transaction if $xdb; @@ -824,13 +1116,8 @@ sub _begin_txn { $xdb; } -sub begin_txn_lazy { - my ($self) = @_; - $self->{ibx}->with_umask(\&_begin_txn, $self) if !$self->{txn}; -} - # store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard) -# This metadata is read by Admin::detect_indexlevel: +# This metadata is read by InboxWritable->detect_indexlevel: sub set_metadata_once { my ($self) = @_; @@ -852,8 +1139,14 @@ sub set_metadata_once { } } -sub _commit_txn { +sub commit_txn_lazy { my ($self) = @_; + return unless delete($self->{txn}); + my $restore = $self->with_umask; + if (my $eidx = $self->{eidx}) { + $eidx->git->async_wait_all; + $eidx->{transact_bytes} = 0; + } if (my $xdb = $self->{xdb}) { set_metadata_once($self); $xdb->commit_transaction; @@ -861,18 +1154,18 @@ sub _commit_txn { $self->{oidx}->commit_lazy if $self->{oidx}; } -sub commit_txn_lazy { - my ($self) = @_; - delete($self->{txn}) and - $self->{ibx}->with_umask(\&_commit_txn, $self); -} - -sub worker_done { - my ($self) = @_; - if (need_xapian($self)) { - die "$$ $0 xdb not released\n" if $self->{xdb}; - } - die "$$ $0 still in transaction\n" if $self->{txn}; +sub eidx_shard_new { + my ($class, $eidx, $shard) = @_; + my $self = bless { + eidx => $eidx, + xpfx => $eidx->{xpfx}, + indexlevel => $eidx->{indexlevel}, + -skip_docdata => 1, + shard => $shard, + creat => 1, + }, $class; + $self->{-set_indexlevel_once} = 1 if $self->{indexlevel} eq 'medium'; + $self; } 1; diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index f23d23d0..ea261bda 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -1,155 +1,75 @@ -# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Internal interface for a single Xapian shard in V2 inboxes. # See L<public-inbox-v2-format(5)> for more info on how we shard Xapian package PublicInbox::SearchIdxShard; -use strict; -use v5.10.1; -use parent qw(PublicInbox::SearchIdx); -use IO::Handle (); # autoflush -use PublicInbox::Eml; +use v5.12; +use parent qw(PublicInbox::SearchIdx PublicInbox::IPC); +use PublicInbox::OnDestroy; +use PublicInbox::Syscall qw($F_SETPIPE_SZ); sub new { - my ($class, $v2w, $shard) = @_; + my ($class, $v2w, $shard) = @_; # v2w may be ExtSearchIdx my $ibx = $v2w->{ibx}; - my $self = $class->SUPER::new($ibx, 1, $shard); + my $self = $ibx ? $class->SUPER::new($ibx, 1, $shard) + : $class->eidx_shard_new($v2w, $shard); # create the DB before forking: $self->idx_acquire; $self->set_metadata_once; $self->idx_release; - $self->spawn_worker($v2w, $shard) if $v2w->{parallel}; + if ($v2w->{parallel}) { + local $self->{-v2w_afc} = $v2w; + $self->ipc_worker_spawn("shard[$shard]"); + # Increasing the pipe size for requests speeds V2 batch imports + # across 8 cores by nearly 20%. Since many of our responses + # are small, make the response pipe as small as possible + if ($F_SETPIPE_SZ) { + fcntl($self->{-ipc_req}, $F_SETPIPE_SZ, 1048576); + fcntl($self->{-ipc_res}, $F_SETPIPE_SZ, 4096); + } + } $self; } -sub spawn_worker { - my ($self, $v2w, $shard) = @_; - my ($r, $w); - pipe($r, $w) or die "pipe failed: $!\n"; - $w->autoflush(1); - my $pid = fork; - defined $pid or die "fork failed: $!\n"; - if ($pid == 0) { - my $bnote = $v2w->atfork_child; - close $w or die "failed to close: $!"; - - # F_SETPIPE_SZ = 1031 on Linux; increasing the pipe size here - # speeds V2Writable batch imports across 8 cores by nearly 20% - fcntl($r, 1031, 1048576) if $^O eq 'linux'; - - eval { shard_worker_loop($self, $v2w, $r, $shard, $bnote) }; - die "worker $shard died: $@\n" if $@; - die "unexpected MM $self->{mm}" if $self->{mm}; - exit; - } - $self->{pid} = $pid; - $self->{w} = $w; - close $r or die "failed to close: $!"; +sub _worker_done { # OnDestroy cb + my ($self) = @_; + die "BUG: $$ $0 xdb active" if $self->need_xapian && $self->{xdb}; + die "BUG: $$ $0 txn active" if $self->{txn}; } -# this reads all the writes to $self->{w} from the parent process -sub shard_worker_loop ($$$$$) { - my ($self, $v2w, $r, $shard, $bnote) = @_; - $0 = "pi-v2-shard[$shard]"; +sub ipc_atfork_child { # called automatically before ipc_worker_loop + my ($self) = @_; + my $v2w = delete $self->{-v2w_afc} or die 'BUG: {-v2w_afc} missing'; + $v2w->atfork_child; # calls ipc_sibling_atfork_child on our siblings + $v2w->{current_info} = "[$self->{shard}]"; # for $SIG{__WARN__} $self->begin_txn_lazy; - while (my $line = readline($r)) { - $v2w->{current_info} = "[$shard] $line"; - if ($line eq "commit\n") { - $self->commit_txn_lazy; - } elsif ($line eq "close\n") { - $self->idx_release; - } elsif ($line eq "barrier\n") { - $self->commit_txn_lazy; - # no need to lock < 512 bytes is atomic under POSIX - print $bnote "barrier $shard\n" or - die "write failed for barrier $!\n"; - } elsif ($line =~ /\AD ([a-f0-9]{40,}) ([0-9]+)\n\z/s) { - $self->remove_by_oid($1, $2 + 0); - } else { - chomp $line; - # n.b. $mid may contain spaces(!) - my ($to_read, $bytes, $num, $blob, $ds, $ts, $tid, $mid) - = split(/ /, $line, 8); - $self->begin_txn_lazy; - my $n = read($r, my $msg, $to_read) or die "read: $!\n"; - $n == $to_read or die "short read: $n != $to_read\n"; - my $mime = PublicInbox::Eml->new(\$msg); - my $smsg = bless { - bytes => $bytes, - num => $num + 0, - blob => $blob, - mid => $mid, - tid => $tid, - ds => $ds, - ts => $ts, - }, 'PublicInbox::Smsg'; - $self->add_message($mime, $smsg); - } - } - $self->worker_done; -} - -sub index_raw { - my ($self, $msgref, $eml, $smsg) = @_; - if (my $w = $self->{w}) { - # mid must be last, it can contain spaces (but not LF) - print $w join(' ', @$smsg{qw(raw_bytes bytes - num blob ds ts tid mid)}), - "\n", $$msgref or die "failed to write shard $!\n"; - } else { - if ($eml) { - undef $$msgref; - } else { # --xapian-only + --sequential-shard: - $eml = PublicInbox::Eml->new($msgref); - } - $self->begin_txn_lazy; - $self->add_message($eml, $smsg); - } + # caller (ipc_worker_spawn) must capture this: + on_destroy \&_worker_done, $self; } -sub atfork_child { - close $_[0]->{w} or die "failed to close write pipe: $!\n"; +sub index_eml { + my ($self, $eml, $smsg, $eidx_key) = @_; + $smsg->{eidx_key} = $eidx_key if defined $eidx_key; + $self->ipc_do('add_xapian', $eml, $smsg); } -sub shard_barrier { - my ($self) = @_; - if (my $w = $self->{w}) { - print $w "barrier\n" or die "failed to print: $!"; - } else { - $self->commit_txn_lazy; - } +# wait for return to determine when ipc_do('commit_txn_lazy') is done +sub echo { + shift; + "@_"; } -sub shard_commit { +sub idx_close { my ($self) = @_; - if (my $w = $self->{w}) { - print $w "commit\n" or die "failed to write commit: $!"; - } else { - $self->commit_txn_lazy; - } + die "BUG: $$ $0 txn active" if $self->{txn}; + $self->idx_release if $self->{xdb}; } sub shard_close { my ($self) = @_; - if (my $w = delete $self->{w}) { - my $pid = delete $self->{pid} or die "no process to wait on\n"; - print $w "close\n" or die "failed to write to pid:$pid: $!\n"; - close $w or die "failed to close pipe for pid:$pid: $!\n"; - waitpid($pid, 0) == $pid or die "remote process did not finish"; - $? == 0 or die ref($self)." pid:$pid exited with: $?"; - } else { - die "transaction in progress $self\n" if $self->{txn}; - $self->idx_release if $self->{xdb}; - } -} - -sub shard_remove { - my ($self, $oid, $num) = @_; - if (my $w = $self->{w}) { # triggers remove_by_oid in a shard child - print $w "D $oid $num\n" or die "failed to write remove $!"; - } else { # same process - $self->remove_by_oid($oid, $num); - } + $self->ipc_do('idx_close'); + $self->ipc_worker_stop; } 1; diff --git a/lib/PublicInbox/SearchQuery.pm b/lib/PublicInbox/SearchQuery.pm index 6724ae39..747e3249 100644 --- a/lib/PublicInbox/SearchQuery.pm +++ b/lib/PublicInbox/SearchQuery.pm @@ -1,12 +1,12 @@ -# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# used by PublicInbox::SearchView +# used by PublicInbox::SearchView and PublicInbox::WwwListing package PublicInbox::SearchQuery; use strict; use v5.10.1; use URI::Escape qw(uri_escape); -use PublicInbox::MID qw(MID_ESC); +use PublicInbox::Hval qw(ascii_html); our $LIM = 200; sub new { @@ -16,10 +16,11 @@ sub new { my $t = $qp->{t}; # collapse threads my ($l) = (($qp->{l} || '') =~ /([0-9]+)/); $l = $LIM if !$l || $l > $LIM; + my ($o) = (($qp->{o} || '0') =~ /(-?[0-9]+)/); bless { q => $qp->{'q'}, x => $qp->{x} || '', - o => (($qp->{o} || '0') =~ /(-?[0-9]+)/), + o => $o, l => $l, r => (defined $r && $r ne '0'), t => (defined $t && $t ne '0'), @@ -32,11 +33,16 @@ sub qs_html { if (scalar(keys(%override))) { $self = bless { (%$self, %override) }, ref($self); } - - my $q = uri_escape($self->{'q'}, MID_ESC); - $q =~ s/%20/+/g; # improve URL readability - my $qs = "q=$q"; - + my $qs = ''; + if (defined(my $q = $self->{'q'})) { + # not using MID_ESC since that's for the path component and + # this is for the query component. Unlike MID_ESC, + # this disallows [\&\'\+=] and allows slash [/] for + # nicer looking dfn: queries + $q = uri_escape($q, '^A-Za-z0-9\-\._~!\$\(\)\*,;:@/'); + $q =~ s/%20/+/g; # improve URL readability + $qs .= 'q='.ascii_html($q); + } if (my $o = $self->{o}) { # ignore o == 0 $qs .= "&o=$o"; } diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm index 60f692b2..00ae9fac 100644 --- a/lib/PublicInbox/SearchThread.pm +++ b/lib/PublicInbox/SearchThread.pm @@ -24,70 +24,74 @@ use PublicInbox::MID qw($MID_EXTRACT); sub thread { my ($msgs, $ordersub, $ctx) = @_; + my (%id_table, @imposters); + keys(%id_table) = scalar @$msgs; # pre-size - # A. put all current $msgs (non-ghosts) into %id_table - my %id_table = map {; + # A. put all current non-imposter $msgs (non-ghosts) into %id_table + # (imposters are messages with reused Message-IDs) + # Sadly, we sort here anyways since the fill-in-the-blanks References: + # can be shakier if somebody used In-Reply-To with multiple, disparate + # messages. So, take the client Date: into account since we can't + # always determine ordering when somebody uses multiple In-Reply-To. + my @kids = sort { $a->{ds} <=> $b->{ds} } grep { # this delete saves around 4K across 1K messages # TODO: move this to a more appropriate place, breaks tests # if we do it during psgi_cull delete $_->{num}; - - $_->{mid} => PublicInbox::SearchThread::Msg::cast($_); + bless $_, 'PublicInbox::SearchThread::Msg'; + if (exists $id_table{$_->{mid}}) { + $_->{children} = []; + push @imposters, $_; # we'll deal with them later + undef; + } else { + $_->{children} = {}; # will become arrayref later + $id_table{$_->{mid}} = $_; + defined($_->{references}); + } } @$msgs; + for my $smsg (@kids) { + # This loop exists to help fill in gaps left from missing + # messages. It is not needed in a perfect world where + # everything is perfectly referenced, only the last ref + # matters. + my $prev; + for my $ref ($smsg->{references} =~ m/$MID_EXTRACT/go) { + # Find a Container object for the given Message-ID + my $cont = $id_table{$ref} //= + PublicInbox::SearchThread::Msg::ghost($ref); + + # Link the References field's Containers together in + # the order implied by the References header + # + # * If they are already linked don't change the + # existing links + # * Do not add a link if adding that link would + # introduce a loop... + if ($prev && + !$cont->{parent} && # already linked + !$cont->has_descendent($prev) # would loop + ) { + $prev->add_child($cont); + } + $prev = $cont; + } - # Sadly, we sort here anyways since the fill-in-the-blanks References: - # can be shakier if somebody used In-Reply-To with multiple, disparate - # messages. So, take the client Date: into account since we can't - # always determine ordering when somebody uses multiple In-Reply-To. - # We'll trust the client Date: header here instead of the Received: - # time since this is for display (and not retrieval) - _set_parent(\%id_table, $_) for sort { $a->{ds} <=> $b->{ds} } @$msgs; - my $ibx = $ctx->{-inbox}; - my $rootset = [ grep { - !delete($_->{parent}) && $_->visible($ibx) - } values %id_table ]; - $rootset = $ordersub->($rootset); - $_->order_children($ordersub, $ctx) for @$rootset; - $rootset; -} - -sub _set_parent ($$) { - my ($id_table, $this) = @_; - - # B. For each element in the message's References field: - defined(my $refs = $this->{references}) or return; - - # This loop exists to help fill in gaps left from missing - # messages. It is not needed in a perfect world where - # everything is perfectly referenced, only the last ref - # matters. - my $prev; - foreach my $ref ($refs =~ m/$MID_EXTRACT/go) { - # Find a Container object for the given Message-ID - my $cont = $id_table->{$ref} //= - PublicInbox::SearchThread::Msg::ghost($ref); - - # Link the References field's Containers together in - # the order implied by the References header - # - # * If they are already linked don't change the - # existing links - # * Do not add a link if adding that link would - # introduce a loop... - if ($prev && - !$cont->{parent} && # already linked - !$cont->has_descendent($prev) # would loop - ) { - $prev->add_child($cont); + # C. Set the parent of this message to be the last element in + # References. + if (defined $prev && !$smsg->has_descendent($prev)) { + $prev->add_child($smsg); } - $prev = $cont; } + my $ibx = $ctx->{ibx}; + my @rootset = grep { # n.b.: delete prevents cyclic refs + !delete($_->{parent}) && $_->visible($ibx) + } values %id_table; + $ordersub->(\@rootset); + $_->order_children($ordersub, $ctx) for @rootset; - # C. Set the parent of this message to be the last element in - # References. - if (defined $prev && !$this->has_descendent($prev)) { # would loop - $prev->add_child($this); - } + # parent imposter messages with reused Message-IDs + unshift(@{$id_table{$_->{mid}}->{children}}, $_) for @imposters; + \@rootset; } package PublicInbox::SearchThread::Msg; @@ -104,13 +108,6 @@ sub ghost { }, __PACKAGE__; } -# give a existing smsg the methods of this class -sub cast { - my ($smsg) = @_; - $smsg->{children} = {}; - bless $smsg, __PACKAGE__; -} - sub topmost { my ($self) = @_; my @q = ($self); @@ -166,14 +163,14 @@ sub order_children { my %seen = ($cur => 1); # self-referential loop prevention my @q = ($cur); - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; while (defined($cur = shift @q)) { - my $c = $cur->{children}; # The hashref here... - - $c = [ grep { !$seen{$_}++ && visible($_, $ibx) } values %$c ]; - $c = $ordersub->($c) if scalar @$c > 1; - $cur->{children} = $c; # ...becomes an arrayref - push @q, @$c; + # the {children} hashref here... + my @c = grep { !$seen{$_}++ && visible($_, $ibx) } + values %{delete $cur->{children}}; + $ordersub->(\@c) if scalar(@c) > 1; + $cur->{children} = \@c; # ...becomes an arrayref + push @q, @c; } } diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index c482f1c9..9919e25c 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Displays search results for the web interface @@ -14,7 +14,7 @@ use PublicInbox::WwwAtomStream; use PublicInbox::WwwStream qw(html_oneshot); use PublicInbox::SearchThread; use PublicInbox::SearchQuery; -use PublicInbox::Search qw(mdocid); +use PublicInbox::Search qw(get_pct); my %rmap_inc; sub mbox_results { @@ -30,58 +30,67 @@ sub mbox_results { sub sres_top_html { my ($ctx) = @_; - my $srch = $ctx->{-inbox}->search or + my $srch = $ctx->{srch} = $ctx->{ibx}->isrch or return PublicInbox::WWW::need($ctx, 'Search'); my $q = PublicInbox::SearchQuery->new($ctx->{qp}); - my $x = $q->{x}; - my $query = $q->{'q'}; - my $o = $q->{o}; + my $o = $q->{o} // 0; my $asc; if ($o < 0) { $asc = 1; $o = -($o + 1); # so [-1] is the last element, like Perl lists } - my $code = 200; # double the limit for expanded views: - my $opts = { + my $opt = { limit => $q->{l}, offset => $o, relevance => $q->{r}, - thread => $q->{t}, + threads => $q->{t}, asc => $asc, }; - my ($mset, $total, $err, $html); -retry: - eval { - $mset = $srch->mset($query, $opts); - $total = $mset->get_matches_estimated; - }; - $err = $@; + my $qs = $q->{'q'}; + $srch->query_approxidate($ctx->{ibx}->git, $qs); + sub { + $ctx->{wcb} = $_[0]; # PSGI server supplied write cb + $srch->async_mset($qs, $opt, \&sres_html_cb, $ctx, $opt, $q); + } +} + +sub sres_html_cb { # async_mset cb + my ($ctx, $opt, $q, $mset, $err) = @_; + my $code = 200; + my $total = $mset ? $mset->get_matches_estimated : undef; ctx_prepare($q, $ctx); + my ($res, $html); if ($err) { $code = 400; $html = '<pre>'.err_txt($ctx, $err).'</pre><hr>'; } elsif ($total == 0) { - if (defined($ctx->{-uxs_retried})) { - # undo retry damage: + if (defined($ctx->{-uxs_retried})) { # undo retry damage: $q->{'q'} = $ctx->{-uxs_retried}; - } elsif (index($q->{'q'}, '%') >= 0) { + } elsif (index($q->{'q'}, '%') >= 0) { # retry unescaped $ctx->{-uxs_retried} = $q->{'q'}; - $q->{'q'} = uri_unescape($q->{'q'}); - goto retry; + my $qs = $q->{'q'} = uri_unescape($q->{'q'}); + $ctx->{srch}->query_approxidate($ctx->{ibx}->git, $qs); + return $ctx->{srch}->async_mset($qs, $opt, + \&sres_html_cb, $ctx, $opt, $q); } $code = 404; $html = "<pre>\n[No results found]</pre><hr>"; + } elsif ($q->{x} eq 'A') { + $res = adump($mset, $q, $ctx); } else { - return adump($_[0], $mset, $q, $ctx) if $x eq 'A'; - $ctx->{-html_tip} = search_nav_top($mset, $q, $ctx); - return mset_thread($ctx, $mset, $q) if $x eq 't'; - mset_summary($ctx, $mset, $q); # appends to {-html_tip} - $html = ''; + if ($q->{x} eq 't') { + $res = mset_thread($ctx, $mset, $q); + } else { + mset_summary($ctx, $mset, $q); # appends to {-html_tip} + $html = ''; + } } - html_oneshot($ctx, $code); + $res //= html_oneshot($ctx, $code, $html); + my $wcb = delete $ctx->{wcb}; + ref($res) eq 'CODE' ? $res->($wcb) : $wcb->($res); } # display non-nested search results similar to what users expect from @@ -93,24 +102,24 @@ sub mset_summary { my $pad = length("$total"); my $pfx = ' ' x $pad; my $res = \($ctx->{-html_tip}); - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $obfs_ibx = $ibx->{obfuscate} ? $ibx : undef; - my @nums = @{$ibx->search->mset_to_artnums($mset)}; + my @nums = @{$ibx->isrch->mset_to_artnums($mset)}; my %num2msg = map { $_->{num} => $_ } @{$ibx->over->get_all(@nums)}; - my ($min, $max); + my ($min, $max, %seen); foreach my $m ($mset->items) { - my $rank = sprintf("%${pad}d", $m->get_rank + 1); - my $pct = get_pct($m); my $num = shift @nums; - my $smsg = delete($num2msg{$num}) or do { - eval { - $m = "$m $num expired\n"; - $ctx->{env}->{'psgi.errors'}->print($m); - }; + my $smsg = delete($num2msg{$num}) // do { + warn "$m $num expired\n"; next; }; + my $mid = $smsg->{mid}; + next if $seen{$mid}++; + $mid = mid_href($mid); $ctx->{-t_max} //= $smsg->{ts}; + my $rank = sprintf("%${pad}d", $m->get_rank + 1); + my $pct = get_pct($m); # only when sorting by relevance, ->items is always # ordered descending: @@ -118,13 +127,12 @@ sub mset_summary { $min = $pct; my $s = ascii_html($smsg->{subject}); - my $f = ascii_html($smsg->{from_name}); + my $f = ascii_html(delete $smsg->{from_name}); if ($obfs_ibx) { obfuscate_addrs($obfs_ibx, $s); obfuscate_addrs($obfs_ibx, $f); } my $date = fmt_ts($smsg->{ds}); - my $mid = mid_href($smsg->{mid}); $s = '(no subject)' if $s eq ''; $$res .= qq{$rank. <b><a\nhref="$mid/">}. $s . "</a></b>\n"; @@ -134,7 +142,7 @@ sub mset_summary { $q->{-min_pct} = $min; $q->{-max_pct} = $max; } - $$res .= search_nav_bot($mset, $q); + $$res .= search_nav_bot($ctx, $mset, $q); undef; } @@ -156,7 +164,7 @@ sub path2inc ($) { sub err_txt { my ($ctx, $err) = @_; - my $u = $ctx->{-inbox}->base_url($ctx->{env}) . '_/text/help/'; + my $u = $ctx->{ibx}->base_url($ctx->{env}) . '_/text/help/'; $err =~ s/^\s*Exception:\s*//; # bad word to show users :P $err =~ s!(\S+)!path2inc($1)!sge; $err = ascii_html($err); @@ -167,7 +175,7 @@ sub err_txt { sub search_nav_top { my ($mset, $q, $ctx) = @_; my $m = $q->qs_html(x => 'm', r => undef, t => undef); - my $rv = qq{<form\naction="?$m"\nmethod="post"><pre>}; + my $rv = qq{<form\nid=d\naction="?$m"\nmethod=post><pre>}; my $initial_q = $ctx->{-uxs_retried}; if (defined $initial_q) { my $rewritten = $q->{'q'}; @@ -192,17 +200,25 @@ sub search_nav_top { $rv .= '] view['; my $x = $q->{x}; - if ($x eq '') { - my $t = $q->qs_html(x => 't'); - $rv .= qq{<b>summary</b>|<a\nhref="?$t">nested</a>} - } elsif ($q->{x} eq 't') { + my $pfx = "\t\t\t"; + if ($x eq 't') { my $s = $q->qs_html(x => ''); $rv .= qq{<a\nhref="?$s">summary</a>|<b>nested</b>}; + $pfx = "thread overview <a\nhref=#t>below</a> | "; + } else { + my $t = $q->qs_html(x => 't'); + $rv .= qq{<b>summary</b>|<a\nhref="?$t">nested</a>} } my $A = $q->qs_html(x => 'A', r => undef); - $rv .= qq{|<a\nhref="?$A">Atom feed</a>]}; - if ($ctx->{-inbox}->search->has_threadid) { - $rv .= qq{\n\t\t\tdownload mbox.gz: } . + $rv .= qq{|<a\nhref="?$A">Atom feed</a>]\n}; + $rv .= <<EOM if $x ne 't' && $q->{t}; +*** "t=1" collapses threads in summary, "full threads" requires mbox.gz *** +EOM + $rv .= <<EOM if $x eq 'm'; +*** "x=m" ignored for GET requests, use download buttons below *** +EOM + if ($ctx->{ibx}->isrch->has_threadid) { + $rv .= qq{${pfx}download mbox.gz: } . # we set name=z w/o using it since it seems required for # lynx (but works fine for w3m). qq{<input\ntype=submit\nname=z\n} . @@ -210,14 +226,14 @@ sub search_nav_top { qq{|<input\ntype=submit\nname=x\n} . q{value="full threads"/>}; } else { # BOFH needs to --reindex - $rv .= qq{\n\t\t\t\t\t\tdownload: } . + $rv .= qq{${pfx}download: } . qq{<input\ntype=submit\nname=z\nvalue="mbox.gz"/>} } $rv .= qq{</pre></form><pre>}; } -sub search_nav_bot { - my ($mset, $q) = @_; +sub search_nav_bot { # also used by WwwListing for searching extindex miscidx + my ($ctx, $mset, $q) = @_; my $total = $mset->get_matches_estimated; my $l = $q->{l}; my $rv = '</pre><hr><pre id=t>'; @@ -266,41 +282,37 @@ sub search_nav_bot { $rv .= qq{<a\nhref="?$prev"\nrel=prev>prev $pd</a>} if $prev; my $rev = $q->qs_html(o => $o < 0 ? 0 : -1); - $rv .= qq{ | <a\nhref="?$rev">reverse</a></pre>}; + $rv .= qq{ | <a\nhref="?$rev">reverse</a>}; + exists($ctx->{ibx}) and + $rv .= q{ | options <a href=#d>above</a></pre>}; + $rv; } sub sort_relevance { - [ sort { + @{$_[0]} = sort { (eval { $b->topmost->{pct} } // 0) <=> (eval { $a->topmost->{pct} } // 0) - } @{$_[0]} ] -} - -sub get_pct ($) { - # Capped at "99%" since "100%" takes an extra column in the - # thread skeleton view. <xapian/mset.h> says the value isn't - # very meaningful, anyways. - my $n = $_[0]->get_percent; - $n > 99 ? 99 : $n; + } @{$_[0]}; } sub mset_thread { my ($ctx, $mset, $q) = @_; - my $ibx = $ctx->{-inbox}; - my $nshard = $ibx->search->{nshard} // 1; - my %pct = map { mdocid($nshard, $_) => get_pct($_) } $mset->items; - my $msgs = $ibx->over->get_all(keys %pct); - $_->{pct} = $pct{$_->{num}} for @$msgs; + my $ibx = $ctx->{ibx}; + my @pct = map { get_pct($_) } $mset->items; + my $msgs = $ibx->isrch->mset_to_smsg($ibx, $mset); + my $i = 0; + $_->{pct} = $pct[$i++] for @$msgs; my $r = $q->{r}; if ($r) { # for descriptions in search_nav_bot - my @pct = values %pct; $q->{-min_pct} = min(@pct); $q->{-max_pct} = max(@pct); } my $rootset = PublicInbox::SearchThread::thread($msgs, $r ? \&sort_relevance : \&PublicInbox::View::sort_ds, $ctx); - my $skel = search_nav_bot($mset, $q). "<pre>"; + my $skel = search_nav_bot($ctx, $mset, $q).'<pre>'. <<EOM; +-- pct% links below jump to the message on this page, permalinks otherwise -- +EOM $ctx->{-upfx} = ''; $ctx->{anchor_idx} = 1; $ctx->{cur_level} = 0; @@ -318,20 +330,20 @@ sub mset_thread { # link $INBOX_DIR/description text to "recent" view around # the newest message in this result set: - $ctx->{-t_max} = max(map { delete $_->{ts} } @$msgs); + $ctx->{-t_max} = max(map { $_->{ts} } @$msgs); @$msgs = reverse @$msgs if $r; $ctx->{msgs} = $msgs; - PublicInbox::WwwStream::aresponse($ctx, 200, \&mset_thread_i); + PublicInbox::WwwStream::aresponse($ctx, \&mset_thread_i); } # callback for PublicInbox::WwwStream::getline sub mset_thread_i { my ($ctx, $eml) = @_; - $ctx->zmore($ctx->html_top) if exists $ctx->{-html_tip}; + print { $ctx->zfh } $ctx->html_top if exists $ctx->{-html_tip}; $eml and return PublicInbox::View::eml_entry($ctx, $eml); my $smsg = shift @{$ctx->{msgs}} or - $ctx->zmore(${delete($ctx->{skel})}); + print { $ctx->zfh } ${delete($ctx->{skel})}; $smsg; } @@ -353,17 +365,17 @@ sub ctx_prepare { } sub adump { - my ($cb, $mset, $q, $ctx) = @_; - $ctx->{ids} = $ctx->{-inbox}->search->mset_to_artnums($mset); + my ($mset, $q, $ctx) = @_; + $ctx->{ids} = $ctx->{ibx}->isrch->mset_to_artnums($mset); $ctx->{search_query} = $q; # used by WwwAtomStream::atom_header - PublicInbox::WwwAtomStream->response($ctx, 200, \&adump_i); + PublicInbox::WwwAtomStream->response($ctx, \&adump_i); } # callback for PublicInbox::WwwAtomStream::getline sub adump_i { my ($ctx) = @_; while (my $num = shift @{$ctx->{ids}}) { - my $smsg = eval { $ctx->{-inbox}->over->get_art($num) } or next; + my $smsg = eval { $ctx->{ibx}->over->get_art($num) } or next; return $smsg; } } diff --git a/lib/PublicInbox/Select.pm b/lib/PublicInbox/Select.pm new file mode 100644 index 00000000..face8edc --- /dev/null +++ b/lib/PublicInbox/Select.pm @@ -0,0 +1,43 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# This makes select(2) look like epoll to simplify the code in DS.pm. +# Unlike IO::Select, it does NOT hold references to IO handles. +# This is NOT meant to be an all encompassing emulation of epoll +# via select, but only to support cases we care about. +package PublicInbox::Select; +use v5.12; +use PublicInbox::Syscall qw(EPOLLONESHOT EPOLLIN EPOLLOUT); +use Errno; + +sub new { bless {}, __PACKAGE__ } # fd => events + +sub ep_wait { + my ($self, $msec, $events) = @_; + my ($rvec, $wvec) = ('', ''); # we don't use EPOLLERR + while (my ($fd, $ev) = each %$self) { + vec($rvec, $fd, 1) = 1 if $ev & EPOLLIN; + vec($wvec, $fd, 1) = 1 if $ev & EPOLLOUT; + } + @$events = (); + my $to = $msec < 0 ? undef : ($msec/1000); + my $n = select $rvec, $wvec, undef, $to or return; # timeout expired + return if $n < 0 && $! == Errno::EINTR; # caller recalculates timeout + die "select: $!" if $n < 0; + while (my ($fd, $ev) = each %$self) { + if (vec($rvec, $fd, 1) || vec($wvec, $fd, 1)) { + delete($self->{$fd}) if $ev & EPOLLONESHOT; + push @$events, $fd; + } + } + $n == scalar(@$events) or + warn "BUG? select() returned $n, but got ".scalar(@$events); +} + +sub ep_del { delete($_[0]->{fileno($_[1])}); 0 } +sub ep_add { $_[0]->{fileno($_[1])} = $_[2]; 0 } + +no warnings 'once'; +*ep_mod = \&ep_add; + +1; diff --git a/lib/PublicInbox/SharedKV.pm b/lib/PublicInbox/SharedKV.pm new file mode 100644 index 00000000..89ab3f74 --- /dev/null +++ b/lib/PublicInbox/SharedKV.pm @@ -0,0 +1,184 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# fork()-friendly key-value store. Will be used for making +# augmenting Maildirs and mboxes less expensive, maybe. +# We use flock(2) to avoid SQLite lock problems (busy timeouts, backoff) +package PublicInbox::SharedKV; +use strict; +use v5.10.1; +use parent qw(PublicInbox::Lock); +use File::Temp qw(tempdir); +use DBI qw(:sql_types); # SQL_BLOB +use PublicInbox::Spawn; +use File::Path qw(rmtree); + +sub dbh { + my ($self, $lock) = @_; + $self->{dbh} // do { + my $f = $self->{filename}; + $lock //= $self->lock_for_scope_fast; + my $dbh = DBI->connect("dbi:SQLite:dbname=$f", '', '', { + AutoCommit => 1, + RaiseError => 1, + PrintError => 0, + sqlite_use_immediate_transaction => 1, + # no sqlite_unicode here, this is for binary data + }); + my $opt = $self->{opt} // {}; + $dbh->do('PRAGMA synchronous = OFF') if !$opt->{fsync}; + $dbh->do('PRAGMA journal_mode = '. + ($opt->{journal_mode} // 'WAL')); + $dbh->do(<<''); +CREATE TABLE IF NOT EXISTS kv ( + k VARBINARY PRIMARY KEY NOT NULL, + v VARBINARY NOT NULL, + UNIQUE (k) +) + + $self->{dbh} = $dbh; + } +} + +sub new { + my ($cls, $dir, $base, $opt) = @_; + my $self = bless { opt => $opt }, $cls; + File::Path::mkpath($dir) if defined($dir); + $dir //= $self->{"tmp$$.$self"} = tempdir("skv.$$-XXXX", TMPDIR => 1); + $base //= ''; + my $f = $self->{filename} = "$dir/$base.sqlite3"; + $self->{lock_path} = $opt->{lock_path} // "$dir/$base.flock"; + unless (-s $f) { + require PublicInbox::Syscall; + PublicInbox::Syscall::nodatacow_dir($dir); # for journal/shm/wal + open my $fh, '+>>', $f or die "failed to open $f: $!"; + } + $self; +} + +sub set_maybe { + my ($self, $key, $val, $lock) = @_; + $lock //= $self->lock_for_scope_fast; + my $sth = $self->{dbh}->prepare_cached(<<''); +INSERT OR IGNORE INTO kv (k,v) VALUES (?, ?) + + $sth->bind_param(1, $key, SQL_BLOB); + $sth->bind_param(2, $val, SQL_BLOB); + my $e = $sth->execute; + $e == 0 ? undef : $e; +} + +# caller calls sth->fetchrow_array +sub each_kv_iter { + my ($self) = @_; + my $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT k,v FROM kv + + $sth->execute; + $sth +} + +sub keys { + my ($self, @pfx) = @_; + my $sql = 'SELECT k FROM kv'; + if (defined $pfx[0]) { + $sql .= ' WHERE k LIKE ? ESCAPE ?'; + my $anywhere = !!$pfx[1]; + $pfx[1] = '\\'; + $pfx[0] =~ s/([%_\\])/\\$1/g; # glob chars + $pfx[0] .= '%'; + substr($pfx[0], 0, 0, '%') if $anywhere; + } else { + @pfx = (); # [0] may've been undef + } + my $sth = $self->dbh->prepare($sql); + if (@pfx) { + $sth->bind_param(1, $pfx[0], SQL_BLOB); + $sth->bind_param(2, $pfx[1]); + } + $sth->execute; + map { $_->[0] } @{$sth->fetchall_arrayref}; +} + +sub set { + my ($self, $key, $val) = @_; + if (defined $val) { + my $sth = $self->{dbh}->prepare_cached(<<''); +INSERT OR REPLACE INTO kv (k,v) VALUES (?,?) + + $sth->bind_param(1, $key, SQL_BLOB); + $sth->bind_param(2, $val, SQL_BLOB); + my $e = $sth->execute; + $e == 0 ? undef : $e; + } else { + my $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM kv WHERE k = ? + + $sth->bind_param(1, $key, SQL_BLOB); + } +} + +sub get { + my ($self, $key) = @_; + my $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); +SELECT v FROM kv WHERE k = ? + + $sth->bind_param(1, $key, SQL_BLOB); + $sth->execute; + $sth->fetchrow_array; +} + +sub xchg { + my ($self, $key, $newval, $lock) = @_; + $lock //= $self->lock_for_scope_fast; + my $oldval = get($self, $key); + if (defined $newval) { + set($self, $key, $newval); + } else { + my $sth = $self->{dbh}->prepare_cached(<<''); +DELETE FROM kv WHERE k = ? + + $sth->bind_param(1, $key, SQL_BLOB); + $sth->execute; + } + $oldval; +} + +sub count { + my ($self) = @_; + my $sth = $self->{dbh}->prepare_cached(<<''); +SELECT COUNT(k) FROM kv + + $sth->execute; + $sth->fetchrow_array; +} + +# faster than ->count due to how SQLite works +sub has_entries { + my ($self) = @_; + my @n = $self->{dbh}->selectrow_array('SELECT k FROM kv LIMIT 1'); + scalar(@n) ? 1 : undef; +} + +sub dbh_release { + my ($self, $lock) = @_; + my $dbh = delete $self->{dbh} or return; + $lock //= $self->lock_for_scope_fast; # may be needed for WAL + %{$dbh->{CachedKids}} = (); # cleanup prepare_cached + $dbh->disconnect; +} + +sub DESTROY { + my ($self) = @_; + dbh_release($self); + my $dir = delete $self->{"tmp$$.$self"} or return; + my $tries = 0; + do { + $! = 0; + eval { rmtree($dir) }; + } while ($@ && $!{ENOENT} && $tries++ < 5); + warn "error removing $dir: $@" if $@; + warn "Took $tries tries to remove $dir\n" if $tries; +} + +1; diff --git a/lib/PublicInbox/Sigfd.pm b/lib/PublicInbox/Sigfd.pm index 5d61e630..b8a1ddfb 100644 --- a/lib/PublicInbox/Sigfd.pm +++ b/lib/PublicInbox/Sigfd.pm @@ -1,44 +1,35 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Wraps a signalfd (or similar) for PublicInbox::DS # fields: (sig: hashref similar to %SIG, but signal numbers as keys) package PublicInbox::Sigfd; -use strict; +use v5.12; use parent qw(PublicInbox::DS); -use PublicInbox::Syscall qw(signalfd EPOLLIN EPOLLET $SFD_NONBLOCK); -use POSIX qw(:signal_h); -use IO::Handle (); +use PublicInbox::Syscall qw(signalfd EPOLLIN EPOLLET %SIGNUM); +use POSIX (); # returns a coderef to unblock signals if neither signalfd or kqueue # are available. sub new { - my ($class, $sig, $flags) = @_; + my ($class, $sig) = @_; my %signo = map {; - my $cb = $sig->{$_}; - # SIGWINCH is 28 on FreeBSD, NetBSD, OpenBSD - my $num = ($_ eq 'WINCH' && $^O =~ /linux|bsd/i) ? 28 : do { - my $m = "SIG$_"; - POSIX->$m; - }; - $num => $cb; + # $num => [ $cb, $signame ]; + ($SIGNUM{$_} // POSIX->can("SIG$_")->()) => [ $sig->{$_}, $_ ] } keys %$sig; my $self = bless { sig => \%signo }, $class; my $io; - my $fd = signalfd(-1, [keys %signo], $flags); + my $fd = signalfd([keys %signo]); if (defined $fd && $fd >= 0) { - $io = IO::Handle->new_from_fd($fd, 'r+'); + open($io, '+<&=', $fd) or die "open: $!"; } elsif (eval { require PublicInbox::DSKQXS }) { - $io = PublicInbox::DSKQXS->signalfd([keys %signo], $flags); + $io = PublicInbox::DSKQXS->signalfd([keys %signo]); } else { return; # wake up every second to check for signals } - if ($flags & $SFD_NONBLOCK) { # it can go into the event loop - $self->SUPER::new($io, EPOLLIN | EPOLLET); - } else { # master main loop - $self->{sock} = $io; - $self; - } + $self->SUPER::new($io, EPOLLIN | EPOLLET); + $self->{is_kq} = 1 if tied(*$io); + $self; } # PublicInbox::Daemon in master main loop (blocking) @@ -51,8 +42,8 @@ sub wait_once ($) { for my $off (0..$nr) { # the first uint32_t of signalfd_siginfo: ssi_signo my $signo = unpack('L', substr($buf, 128 * $off, 4)); - my $cb = $self->{sig}->{$signo}; - $cb->($signo) if $cb ne 'IGNORE'; + my ($cb, $signame) = @{$self->{sig}->{$signo}}; + $cb->($signame) if $cb ne 'IGNORE'; } } $r; @@ -63,14 +54,4 @@ sub event_step { while (wait_once($_[0])) {} # non-blocking } -sub sig_setmask { sigprocmask(SIG_SETMASK, @_) or die "sigprocmask: $!" } - -sub block_signals () { - my $oldset = POSIX::SigSet->new; - my $newset = POSIX::SigSet->new; - $newset->fillset or die "fillset: $!"; - sig_setmask($newset, $oldset); - $oldset; -} - 1; diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm index 14086538..b132381b 100644 --- a/lib/PublicInbox/Smsg.pm +++ b/lib/PublicInbox/Smsg.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2015-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # A small/skeleton/slim representation of a message. @@ -9,13 +9,15 @@ # large threads in our WWW UI and the NNTP range responses. package PublicInbox::Smsg; use strict; -use warnings; -use base qw(Exporter); +use v5.10.1; +use parent qw(Exporter); our @EXPORT_OK = qw(subject_normalized); -use PublicInbox::MID qw(mids); +use PublicInbox::MID qw(mids references); use PublicInbox::Address; use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); +sub oidbin { pack('H*', $_[0]->{blob}) } + sub to_doc_data { my ($self) = @_; join("\n", @@ -57,19 +59,36 @@ sub load_from_data ($$) { sub psgi_cull ($) { my ($self) = @_; - # ghosts don't have ->{from} - my $from = delete($self->{from}) // ''; - my @n = PublicInbox::Address::names($from); - $self->{from_name} = join(', ', @n); - # drop NNTP-only fields which aren't relevant to PSGI results: # saves ~80K on a 200 item search result: # TODO: we may need to keep some of these for JMAP... - delete @$self{qw(tid to cc bytes lines)}; + my ($f) = delete @$self{qw(from tid to cc bytes lines)}; + # ghosts don't have ->{from} + $self->{from_name} = join(', ', PublicInbox::Address::names($f // '')); $self; } -# for Import and v1 non-SQLite WWW code paths +sub parse_references ($$$) { + my ($smsg, $hdr, $mids) = @_; + my $refs = references($hdr); + push(@$refs, @$mids) if scalar(@$mids) > 1; + return $refs if scalar(@$refs) == 0; + + # prevent circular references here: + my %seen = ( ($smsg->{mid} // '') => 1 ); + my @keep; + foreach my $ref (@$refs) { + if (length($ref) > PublicInbox::MID::MAX_MID_SIZE) { + warn "References: <$ref> too long, ignoring\n"; + next; + } + $seen{$ref} //= push(@keep, $ref); + } + $smsg->{references} = '<'.join('> <', @keep).'>' if @keep; + \@keep; +} + +# used for v2, Import and v1 non-SQLite WWW code paths sub populate { my ($self, $hdr, $sync) = @_; for my $f (qw(From To Cc Subject)) { @@ -80,9 +99,6 @@ sub populate { # to protect git and NNTP clients $val =~ tr/\0\t\n/ /; - # rare: in case headers have wide chars (not RFC2047-encoded) - utf8::decode($val); - # lower-case fields for read-only stuff $self->{lc($f)} = $val; @@ -96,13 +112,13 @@ sub populate { $self->{$f} = $val if $val ne ''; } $sync //= {}; - $self->{-ds} = [ my @ds = msg_datestamp($hdr, $sync->{autime}) ]; - $self->{-ts} = [ my @ts = msg_timestamp($hdr, $sync->{cotime}) ]; + my @ds = msg_datestamp($hdr, $sync->{autime} // $self->{ds}); + my @ts = msg_timestamp($hdr, $sync->{cotime} // $self->{ts}); + $self->{-ds} = \@ds; + $self->{-ts} = \@ts; $self->{ds} //= $ds[0]; # no zone $self->{ts} //= $ts[0]; - - # for v1 users w/o SQLite - $self->{mid} //= eval { mids($hdr)->[0] } // ''; + $self->{mid} //= mids($hdr)->[0]; } # no strftime, that is locale-dependent and not for RFC822 @@ -127,6 +143,8 @@ sub internaldate { # for IMAP our $REPLY_RE = qr/^re:\s+/i; +# TODO: see RFC 5256 sec 2.1 "Base Subject" and evaluate compatibility +# w/ existing indices... sub subject_normalized ($) { my ($subj) = @_; $subj =~ s/\A\s+//s; # no leading space @@ -137,4 +155,17 @@ sub subject_normalized ($) { $subj; } +# returns the number of bytes to add if given a non-CRLF arg +sub crlf_adjust ($) { + if (index($_[0], "\r\n") < 0) { + # common case is LF-only, every \n needs an \r; + # so favor a cheap tr// over an expensive m//g + $_[0] =~ tr/\n/\n/; + } else { # count number of '\n' w/o '\r', expensive: + scalar(my @n = ($_[0] =~ m/(?<!\r)\n/g)); + } +} + +sub set_bytes { $_[0]->{bytes} = $_[2] + crlf_adjust($_[1]) } + 1; diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm index 83f7a4ee..b5f6b96e 100644 --- a/lib/PublicInbox/SolverGit.pm +++ b/lib/PublicInbox/SolverGit.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # "Solve" blobs which don't exist in git code repositories by @@ -11,13 +11,16 @@ package PublicInbox::SolverGit; use strict; use v5.10.1; use File::Temp 0.19 (); # 0.19 for ->newdir +use autodie qw(mkdir); use Fcntl qw(SEEK_SET); -use PublicInbox::Git qw(git_unquote git_quote); +use PublicInbox::Git qw(git_unquote git_quote git_exe); +use PublicInbox::IO qw(write_file); use PublicInbox::MsgIter qw(msg_part_text); use PublicInbox::Qspawn; use PublicInbox::Tmpfile; use PublicInbox::GitAsyncCat; use PublicInbox::Eml; +use PublicInbox::Compat qw(uniqstr); use URI::Escape qw(uri_escape_utf8); # POSIX requires _POSIX_ARG_MAX >= 4096, and xargs is required to @@ -79,11 +82,13 @@ sub solve_existing ($$) { my $try = $want->{try_gits} //= [ @{$self->{gits}} ]; # array copy my $git = shift @$try or die 'BUG {try_gits} empty'; my $oid_b = $want->{oid_b}; + + # can't use async_check due to last_check_err :< my ($oid_full, $type, $size) = $git->check($oid_b); + $git->schedule_cleanup if $self->{psgi_env}->{'pi-httpd.async'}; - # other than {oid_b, try_gits, try_ibxs} - my $have_hints = scalar keys %$want > 3; - if (defined($type) && (!$have_hints || $type eq 'blob')) { + if ($oid_b eq ($oid_full // '') || (defined($type) && + (!$self->{have_hints} || $type eq 'blob'))) { delete $want->{try_gits}; return [ $git, $oid_full, $type, int($size) ]; # done, success } @@ -106,13 +111,16 @@ sub solve_existing ($$) { scalar(@$try); } +sub _tmp { + $_[0]->{tmp} //= + File::Temp->newdir("solver.$_[0]->{oid_want}-XXXX", TMPDIR => 1); +} + sub extract_diff ($$) { my ($p, $arg) = @_; my ($self, $want, $smsg) = @$arg; my ($part) = @$p; # ignore $depth and @idx; my $ct = $part->content_type || 'text/plain'; - my ($s, undef) = msg_part_text($part, $ct); - defined $s or return; my $post = $want->{oid_b}; my $pre = $want->{oid_a}; if (!defined($pre) || $pre !~ /\A[a-f0-9]+\z/) { @@ -122,11 +130,18 @@ sub extract_diff ($$) { # Email::MIME::Encodings forces QP to be CRLF upon decoding, # change it back to LF: my $cte = $part->header('Content-Transfer-Encoding') || ''; + my ($s, undef) = msg_part_text($part, $ct); + defined $s or return; + delete $part->{bdy}; if ($cte =~ /\bquoted-printable\b/i && $part->crlf eq "\n") { $s =~ s/\r\n/\n/sg; } - + # Quiet "Complex regular subexpression recursion limit" warning. + # Not much we can do about it, but it's no longer relevant to + # Perl 5.3x (the warning was removed in 5.37.1, and actual + # recursino sometime before then). + no warnings 'regexp'; $s =~ m!( # $1 start header lines we save for debugging: # everything before ^index is optional, but we don't @@ -169,6 +184,7 @@ sub extract_diff ($$) { # because git-apply(1) handles that case, too (?:^(?:[\@\+\x20\-\\][^\n]*|)$LF)+ )!smx or return; + undef $s; # free memory my $di = { hdr_lines => $1, @@ -193,10 +209,8 @@ sub extract_diff ($$) { my $path = ++$self->{tot}; $di->{n} = $path; - open(my $tmp, '>:utf8', $self->{tmp}->dirname . "/$path") or - die "open(tmp): $!"; - print $tmp $di->{hdr_lines}, $patch or die "print(tmp): $!"; - close $tmp or die "close(tmp): $!"; + my $f = _tmp($self)->dirname."/$path"; + write_file '>:utf8', $f, $di->{hdr_lines}, $patch; # for debugging/diagnostics: $di->{ibx} = $want->{cur_ibx}; @@ -216,7 +230,7 @@ sub filename_query ($) { sub find_smsgs ($$$) { my ($self, $ibx, $want) = @_; - my $srch = $ibx->search or return; + my $srch = $ibx->isrch or return; my $post = $want->{oid_b} or die 'BUG: no {oid_b}'; $post =~ /\A[a-f0-9]+\z/ or die "BUG: oid_b not hex: $post"; @@ -242,14 +256,18 @@ sub find_smsgs ($$$) { sub update_index_result ($$) { my ($bref, $self) = @_; - my ($qsp, $msg) = delete @$self{qw(-qsp -msg)}; - if (my $err = $qsp->{err}) { - ERR($self, "git update-index error: $err"); - } + my ($qsp_err, $msg) = delete @$self{qw(-qsp_err -msg)}; + ERR($self, "git update-index error:$qsp_err") if $qsp_err; dbg($self, $msg); next_step($self); # onto do_git_apply } +sub qsp_qx ($$$) { + my ($self, $qsp, $cb) = @_; + $qsp->{qsp_err} = \($self->{-qsp_err} = ''); + $qsp->psgi_qx($self->{psgi_env}, $self->{limiter}, $cb, $self); +} + sub prepare_index ($) { my ($self) = @_; my $patches = $self->{patches}; @@ -275,49 +293,42 @@ sub prepare_index ($) { dbg($self, 'preparing index'); my $rdr = { 0 => $in }; - my $cmd = [ qw(git update-index -z --index-info) ]; + my $cmd = [ git_exe, qw(update-index -z --index-info) ]; my $qsp = PublicInbox::Qspawn->new($cmd, $self->{git_env}, $rdr); $path_a = git_quote($path_a); - $self->{-qsp} = $qsp; $self->{-msg} = "index prepared:\n$mode_a $oid_full\t$path_a"; - $qsp->psgi_qx($self->{psgi_env}, undef, \&update_index_result, $self); + qsp_qx $self, $qsp, \&update_index_result; } # pure Perl "git init" sub do_git_init ($) { my ($self) = @_; - my $dir = $self->{tmp}->dirname; - my $git_dir = "$dir/git"; - - foreach ('', qw(objects refs objects/info refs/heads)) { - mkdir("$git_dir/$_") or die "mkdir $_: $!"; - } - open my $fh, '>', "$git_dir/config" or die "open git/config: $!"; - print $fh <<'EOF' or die "print git/config $!"; + my $git_dir = _tmp($self)->dirname.'/git'; + + mkdir("$git_dir/$_") for ('', qw(objects refs objects/info refs/heads)); + my $first = $self->{gits}->[0]; + my $fmt = $first->object_format; + my ($v, @ext) = defined($$fmt) ? (1, <<EOM) : (0); +[extensions] + objectformat = $$fmt +EOM + write_file '>', "$git_dir/config", <<EOF, @ext; [core] - repositoryFormatVersion = 0 + repositoryFormatVersion = $v filemode = true bare = false - fsyncObjectfiles = false logAllRefUpdates = false EOF - close $fh or die "close git/config: $!"; - - open $fh, '>', "$git_dir/HEAD" or die "open git/HEAD: $!"; - print $fh "ref: refs/heads/master\n" or die "print git/HEAD: $!"; - close $fh or die "close git/HEAD: $!"; - - my $f = 'objects/info/alternates'; - open $fh, '>', "$git_dir/$f" or die "open: $f: $!"; - foreach my $git (@{$self->{gits}}) { - print $fh $git->git_path('objects'),"\n" or die "print $f: $!"; - } - close $fh or die "close: $f: $!"; + write_file '>', "$git_dir/HEAD", "ref: refs/heads/master\n"; + write_file '>', "$git_dir/objects/info/alternates", map { + $_->git_path('objects')."\n" + } @{$self->{gits}}; my $tmp_git = $self->{tmp_git} = PublicInbox::Git->new($git_dir); $tmp_git->{-tmp} = $self->{tmp}; $self->{git_env} = { GIT_DIR => $git_dir, GIT_INDEX_FILE => "$git_dir/index", + GIT_TEST_FSYNC => 0, # undocumented git env }; prepare_index($self); } @@ -377,12 +388,9 @@ sub event_step ($) { } sub next_step ($) { - my ($self) = @_; # if outside of public-inbox-httpd, caller is expected to be # looping event_step, anyways - my $async = $self->{psgi_env}->{'pi-httpd.async'} or return; - # PublicInbox::HTTPD::Async->new - $async->(undef, undef, $self); + PublicInbox::DS::requeue($_[0]) if $_[0]->{psgi_env}->{'pi-httpd.async'} } sub mark_found ($$$) { @@ -398,21 +406,18 @@ sub mark_found ($$$) { sub parse_ls_files ($$) { my ($self, $bref) = @_; - my ($qsp, $di) = delete @$self{qw(-qsp -cur_di)}; - if (my $err = $qsp->{err}) { - die "git ls-files error: $err"; - } + my ($qsp_err, $di) = delete @$self{qw(-qsp_err -cur_di)}; + die "git ls-files -s -z error:$qsp_err" if $qsp_err; - my ($line, @extra) = split(/\0/, $$bref); + my @ls = split(/\0/, $$bref); + my ($line, @extra) = grep(/\t\Q$di->{path_b}\E\z/, @ls); scalar(@extra) and die "BUG: extra files in index: <", - join('> <', @extra), ">"; - + join('> <', $line, @extra), ">"; + $line // die "no \Q$di->{path_b}\E in <",join('> <', @ls), '>'; my ($info, $file) = split(/\t/, $line, 2); my ($mode_b, $oid_b_full, $stage) = split(/ /, $info); - if ($file ne $di->{path_b}) { - die + $file eq $di->{path_b} or die "BUG: index mismatch: file=$file != path_b=$di->{path_b}"; - } my $tmp_git = $self->{tmp_git} or die 'no git working tree'; my (undef, undef, $size) = $tmp_git->check($oid_b_full); @@ -447,50 +452,49 @@ sub skip_identical ($$$) { } } -sub apply_result ($$) { +sub apply_result ($$) { # qx_cb my ($bref, $self) = @_; - my ($qsp, $di) = delete @$self{qw(-qsp -cur_di)}; + my ($qsp_err, $di) = delete @$self{qw(-qsp_err -cur_di)}; dbg($self, $$bref); my $patches = $self->{patches}; - if (my $err = $qsp->{err}) { - my $msg = "git apply error: $err"; + if ($qsp_err) { + my $msg = "git apply error:$qsp_err"; my $nxt = $patches->[0]; if ($nxt && oids_same_ish($nxt->{oid_b}, $di->{oid_b})) { dbg($self, $msg); dbg($self, 'trying '.di_url($self, $nxt)); return do_git_apply($self); } else { - ERR($self, $msg); + $msg .= " (no patches left to try for $di->{oid_b})\n"; + dbg($self, $msg); + return done($self, undef); } } else { skip_identical($self, $patches, $di->{oid_b}); } - my @cmd = qw(git ls-files -s -z); - $qsp = PublicInbox::Qspawn->new(\@cmd, $self->{git_env}); + my @cmd = (git_exe, qw(ls-files -s -z)); + my $qsp = PublicInbox::Qspawn->new(\@cmd, $self->{git_env}); $self->{-cur_di} = $di; - $self->{-qsp} = $qsp; - $qsp->psgi_qx($self->{psgi_env}, undef, \&ls_files_result, $self); + qsp_qx $self, $qsp, \&ls_files_result; } sub do_git_apply ($) { my ($self) = @_; - my $dn = $self->{tmp}->dirname; my $patches = $self->{patches}; # we need --ignore-whitespace because some patches are CRLF - my @cmd = (qw(git apply --cached --ignore-whitespace + my @cmd = (git_exe, qw(apply --cached --ignore-whitespace --unidiff-zero --whitespace=warn --verbose)); my $len = length(join(' ', @cmd)); - my $total = $self->{tot}; my $di; # keep track of the last one for "git ls-files" my $prv_oid_b; do { my $i = ++$self->{nr}; $di = shift @$patches; - dbg($self, "\napplying [$i/$total] " . di_url($self, $di) . - "\n" . $di->{hdr_lines}); + dbg($self, "\napplying [$i/$self->{nr_p}] " . + di_url($self, $di) . "\n" . $di->{hdr_lines}); my $path = $di->{n}; $len += length($path) + 1; push @cmd, $path; @@ -498,11 +502,10 @@ sub do_git_apply ($) { } while (@$patches && $len < $ARG_SIZE_MAX && !oids_same_ish($patches->[0]->{oid_b}, $prv_oid_b)); - my $opt = { 2 => 1, -C => $dn, quiet => 1 }; + my $opt = { 2 => 1, -C => _tmp($self)->dirname, quiet => 1 }; my $qsp = PublicInbox::Qspawn->new(\@cmd, $self->{git_env}, $opt); $self->{-cur_di} = $di; - $self->{-qsp} = $qsp; - $qsp->psgi_qx($self->{psgi_env}, undef, \&apply_result, $self); + qsp_qx $self, $qsp, \&apply_result; } sub di_url ($$) { @@ -551,8 +554,9 @@ sub extract_diffs_done { my $diffs = delete $self->{tmp_diffs}; if (scalar @$diffs) { unshift @{$self->{patches}}, @$diffs; - dbg($self, "found $want->{oid_b} in " . join(" ||\n\t", - map { di_url($self, $_) } @$diffs)); + my @u = uniqstr(map { di_url($self, $_) } @$diffs); + dbg($self, "found $want->{oid_b} in " . join(" ||\n\t", @u)); + ++$self->{nr_p}; # good, we can find a path to the oid we $want, now # lets see if we need to apply more patches: @@ -593,8 +597,7 @@ sub resolve_patch ($$) { if (my $msgs = $want->{try_smsgs}) { my $smsg = shift @$msgs; if ($self->{psgi_env}->{'pi-httpd.async'}) { - return git_async_cat($want->{cur_ibx}->git, - $smsg->{blob}, + return ibx_async_cat($want->{cur_ibx}, $smsg->{blob}, \&extract_diff_async, [$self, $want, $smsg]); } else { @@ -635,7 +638,7 @@ sub resolve_patch ($$) { # scan through inboxes to look for emails which results in # the oid we want: - my $ibx = shift(@{$want->{try_ibxs}}) or die 'BUG: {try_ibxs} empty'; + my $ibx = shift(@{$want->{try_ibxs}}) or return done($self, undef); if (my $msgs = find_smsgs($self, $ibx, $want)) { $want->{try_smsgs} = $msgs; $want->{cur_ibx} = $ibx; @@ -649,15 +652,19 @@ sub resolve_patch ($$) { # so user_cb never references the SolverGit object sub new { my ($class, $ibx, $user_cb, $uarg) = @_; + my $gits = $ibx ? $ibx->{-repo_objs} : undef; + + # FIXME: cindex --join= is super-aggressive and may hit too many + $gits = [ @$gits[0..2] ] if $gits && @$gits > 3; - bless { - gits => $ibx->{-repo_objs}, + bless { # $ibx is undef if coderepo only (see WwwCoderepo) + gits => $gits, user_cb => $user_cb, uarg => $uarg, - # -cur_di, -qsp, -msg => temporary fields for Qspawn callbacks + # -cur_di, -qsp_err, -msg => temp fields for Qspawn callbacks # TODO: config option for searching related inboxes - inboxes => [ $ibx ], + inboxes => $ibx ? [ $ibx ] : [], }, $class; } @@ -676,17 +683,16 @@ sub solve ($$$$$) { $self->{oid_want} = $oid_want; $self->{out} = $out; $self->{seen_oid} = {}; - $self->{tot} = 0; + $self->{tot} = $self->{nr_p} = 0; $self->{psgi_env} = $env; + $self->{have_hints} = 1 if scalar keys %$hints; $self->{todo} = [ { %$hints, oid_b => $oid_want } ]; $self->{patches} = []; # [ $di, $di, ... ] $self->{found} = {}; # { abbr => [ ::Git, oid, type, size, $di ] } - $self->{tmp} = File::Temp->newdir("solver.$oid_want-XXXXXXXX", TMPDIR => 1); dbg($self, "solving $oid_want ..."); - if (my $async = $env->{'pi-httpd.async'}) { - # PublicInbox::HTTPD::Async->new - $async->(undef, undef, $self); + if ($env->{'pi-httpd.async'}) { + PublicInbox::DS::requeue($self); } else { event_step($self) while $self->{user_cb}; } diff --git a/lib/PublicInbox/Spamcheck.pm b/lib/PublicInbox/Spamcheck.pm index ffebb3cf..fbf9355d 100644 --- a/lib/PublicInbox/Spamcheck.pm +++ b/lib/PublicInbox/Spamcheck.pm @@ -1,21 +1,17 @@ -# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Spamchecking used by -watch and -mda tools package PublicInbox::Spamcheck; -use strict; -use warnings; +use v5.12; sub get { - my ($config, $key, $default) = @_; - my $spamcheck = $config->{$key}; - $spamcheck = $default unless $spamcheck; + my ($cfg, $key, $default) = @_; + my $spamcheck = $cfg->{$key} || $default; return if !$spamcheck || $spamcheck eq 'none'; - if ($spamcheck eq 'spamc') { - $spamcheck = 'PublicInbox::Spamcheck::Spamc'; - } + $spamcheck = 'PublicInbox::Spamcheck::Spamc' if $spamcheck eq 'spamc'; if ($spamcheck =~ /::/) { eval "require $spamcheck"; return $spamcheck->new; diff --git a/lib/PublicInbox/Spamcheck/Spamc.pm b/lib/PublicInbox/Spamcheck/Spamc.pm index 3ba2c3c9..b4f95e2b 100644 --- a/lib/PublicInbox/Spamcheck/Spamc.pm +++ b/lib/PublicInbox/Spamcheck/Spamc.pm @@ -1,18 +1,17 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Default spam filter class for wrapping spamc(1) package PublicInbox::Spamcheck::Spamc; -use strict; -use warnings; -use PublicInbox::Spawn qw(popen_rd spawn); +use v5.12; +use PublicInbox::Spawn qw(run_qx run_wait); use IO::Handle; use Fcntl qw(SEEK_SET); sub new { my ($class) = @_; bless { - checkcmd => [qw(spamc -E --headers)], + checkcmd => [qw(spamc -E)], hamcmd => [qw(spamc -L ham)], spamcmd => [qw(spamc -L spam)], }, $class; @@ -21,15 +20,9 @@ sub new { sub spamcheck { my ($self, $msg, $out) = @_; + $out = \(my $buf = '') unless ref($out); my $rdr = { 0 => _msg_to_fh($self, $msg) }; - my ($fh, $pid) = popen_rd($self->{checkcmd}, undef, $rdr); - unless (ref $out) { - my $buf = ''; - $out = \$buf; - } - $$out = do { local $/; <$fh> }; - close $fh or die "close failed: $!"; - waitpid($pid, 0); + $$out = run_qx($self->{checkcmd}, undef, $rdr); ($? || $$out eq '') ? 0 : 1; } @@ -49,9 +42,7 @@ sub _learn { $rdr->{0} = _msg_to_fh($self, $msg); $rdr->{1} ||= $self->_devnull; $rdr->{2} ||= $self->_devnull; - my $pid = spawn($self->{$field}, undef, $rdr); - waitpid($pid, 0); - !$?; + 0 == run_wait($self->{$field}, undef, $rdr); } sub _devnull { diff --git a/lib/PublicInbox/Spawn.pm b/lib/PublicInbox/Spawn.pm index cb16fcf6..e9e81e88 100644 --- a/lib/PublicInbox/Spawn.pm +++ b/lib/PublicInbox/Spawn.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # This allows vfork to be used for spawning subprocesses if @@ -6,35 +6,38 @@ # is explicitly defined in the environment (and writable). # Under Linux, vfork can make a big difference in spawning performance # as process size increases (fork still needs to mark pages for CoW use). -# Currently, we only use this for code intended for long running -# daemons (inside the PSGI code (-httpd) and -nntpd). The short-lived -# scripts (-mda, -index, -learn, -init) either use IPC::run or standard -# Perl routines. +# None of this is intended to be thread-safe since Perl5 maintainers +# officially discourage the use of threads. # # There'll probably be more OS-level C stuff here, down the line. # We don't want too many DSOs: https://udrepper.livejournal.com/8790.html package PublicInbox::Spawn; -use strict; +use v5.12; use parent qw(Exporter); -use Symbol qw(gensym); -use PublicInbox::ProcessPipe; -our @EXPORT_OK = qw/which spawn popen_rd nodatacow_dir/; -our @RLIMITS = qw(RLIMIT_CPU RLIMIT_CORE RLIMIT_DATA); +use PublicInbox::Lock; +use Fcntl qw(SEEK_SET); +use IO::Handle (); +use Carp qw(croak); +use PublicInbox::IO; +our @EXPORT_OK = qw(which spawn popen_rd popen_wr run_die run_wait run_qx); +our (@RLIMITS, %RLIMITS); +use autodie qw(close open pipe seek sysseek truncate); -my $vfork_spawn = <<'VFORK_SPAWN'; +BEGIN { + @RLIMITS = qw(RLIMIT_CPU RLIMIT_CORE RLIMIT_DATA); + my $all_libc = <<'ALL_LIBC'; # all *nix systems we support +#include <sys/resource.h> +#include <sys/socket.h> #include <sys/types.h> #include <sys/time.h> -#include <sys/resource.h> +#include <sys/uio.h> #include <unistd.h> #include <stdlib.h> #include <errno.h> - -/* some platforms need alloca.h, but some don't */ -#if defined(__GNUC__) && !defined(alloca) -# define alloca(sz) __builtin_alloca(sz) -#endif - +#include <time.h> +#include <stdio.h> +#include <string.h> #include <signal.h> #include <assert.h> @@ -46,11 +49,17 @@ my $vfork_spawn = <<'VFORK_SPAWN'; * This is unlike "sv_len", which returns what you would expect. */ #define AV2C_COPY(dst, src) do { \ + static size_t dst##__capa; \ I32 i; \ I32 top_index = av_len(src); \ I32 real_len = top_index + 1; \ I32 capa = real_len + 1; \ - dst = alloca(capa * sizeof(char *)); \ + if (capa > dst##__capa) { \ + dst##__capa = 0; /* in case Newx croaks */ \ + Safefree(dst); \ + Newx(dst, capa, char *); \ + dst##__capa = capa; \ + } \ for (i = 0; i < real_len; i++) { \ SV **sv = av_fetch(src, i, 0); \ dst[i] = SvPV_nolen(*sv); \ @@ -59,9 +68,10 @@ my $vfork_spawn = <<'VFORK_SPAWN'; } while (0) /* needs to be safe inside a vfork'ed process */ -static void exit_err(int *cerrnum) +static void exit_err(const char *fn, volatile int *cerrnum) { *cerrnum = errno; + write(2, fn, strlen(fn)); _exit(1); } @@ -71,49 +81,60 @@ static void exit_err(int *cerrnum) * Be sure to update PublicInbox::SpawnPP if this changes */ int pi_fork_exec(SV *redirref, SV *file, SV *cmdref, SV *envref, SV *rlimref, - const char *cd) + const char *cd, int pgid) { AV *redir = (AV *)SvRV(redirref); AV *cmd = (AV *)SvRV(cmdref); AV *env = (AV *)SvRV(envref); AV *rlim = (AV *)SvRV(rlimref); const char *filename = SvPV_nolen(file); - pid_t pid; - char **argv, **envp; - sigset_t set, old, cset; - int ret, perrnum, cerrnum = 0; + pid_t pid = -1; + static char **argv, **envp; + sigset_t set, old; + int ret, perrnum; + volatile int cerrnum = 0; /* shared due to vfork */ + int chld_is_member; /* needed due to shared memory w/ vfork */ + I32 max_fd = av_len(redir); AV2C_COPY(argv, cmd); AV2C_COPY(envp, env); - ret = sigfillset(&set); - assert(ret == 0 && "BUG calling sigfillset"); - ret = sigprocmask(SIG_SETMASK, &set, &old); - assert(ret == 0 && "BUG calling sigprocmask to block"); - ret = sigemptyset(&cset); - assert(ret == 0 && "BUG calling sigemptyset"); - ret = sigaddset(&cset, SIGCHLD); - assert(ret == 0 && "BUG calling sigaddset for SIGCHLD"); + if (sigfillset(&set)) goto out; + if (sigdelset(&set, SIGABRT)) goto out; + if (sigdelset(&set, SIGBUS)) goto out; + if (sigdelset(&set, SIGFPE)) goto out; + if (sigdelset(&set, SIGILL)) goto out; + if (sigdelset(&set, SIGSEGV)) goto out; + /* no XCPU/XFSZ here */ + if (sigprocmask(SIG_SETMASK, &set, &old)) goto out; + chld_is_member = sigismember(&old, SIGCHLD); + if (chld_is_member < 0) goto out; + if (chld_is_member > 0 && sigdelset(&old, SIGCHLD)) goto out; + pid = vfork(); if (pid == 0) { int sig; - I32 i, child_fd, max = av_len(redir); + I32 i, child_fd, max_rlim; - for (child_fd = 0; child_fd <= max; child_fd++) { + for (child_fd = 0; child_fd <= max_fd; child_fd++) { SV **parent = av_fetch(redir, child_fd, 0); int parent_fd = SvIV(*parent); if (parent_fd == child_fd) continue; if (dup2(parent_fd, child_fd) < 0) - exit_err(&cerrnum); + exit_err("dup2", &cerrnum); } + if (pgid >= 0 && setpgid(0, pgid) < 0) + exit_err("setpgid", &cerrnum); for (sig = 1; sig < NSIG; sig++) signal(sig, SIG_DFL); /* ignore errors on signals */ - if (*cd && chdir(cd) < 0) - exit_err(&cerrnum); + if (*cd && chdir(cd) < 0) { + write(2, "cd ", 3); + exit_err(cd, &cerrnum); + } - max = av_len(rlim); - for (i = 0; i < max; i += 3) { + max_rlim = av_len(rlim); + for (i = 0; i < max_rlim; i += 3) { struct rlimit rl; SV **res = av_fetch(rlim, i, 0); SV **soft = av_fetch(rlim, i + 1, 0); @@ -122,134 +143,207 @@ int pi_fork_exec(SV *redirref, SV *file, SV *cmdref, SV *envref, SV *rlimref, rl.rlim_cur = SvIV(*soft); rl.rlim_max = SvIV(*hard); if (setrlimit(SvIV(*res), &rl) < 0) - exit_err(&cerrnum); + exit_err("setrlimit", &cerrnum); } - /* - * don't bother unblocking other signals for now, just SIGCHLD. - * we don't want signals to the group taking out a subprocess - */ - (void)sigprocmask(SIG_UNBLOCK, &cset, NULL); + (void)sigprocmask(SIG_SETMASK, &old, NULL); execve(filename, argv, envp); - exit_err(&cerrnum); + exit_err("execve", &cerrnum); } perrnum = errno; + if (chld_is_member > 0) + sigaddset(&old, SIGCHLD); ret = sigprocmask(SIG_SETMASK, &old, NULL); assert(ret == 0 && "BUG calling sigprocmask to restore"); if (cerrnum) { + int err_fd = STDERR_FILENO; + if (err_fd <= max_fd) { + SV **parent = av_fetch(redir, err_fd, 0); + err_fd = SvIV(*parent); + } if (pid > 0) waitpid(pid, NULL, 0); pid = -1; + /* continue message started by exit_err in child */ + dprintf(err_fd, ": %s\n", strerror(cerrnum)); errno = cerrnum; } else if (perrnum) { errno = perrnum; } +out: + if (pid < 0) + croak("E: fork_exec %s: %s\n", filename, strerror(errno)); return (int)pid; } -VFORK_SPAWN -# btrfs on Linux is copy-on-write (COW) by default. As of Linux 5.7, -# this still leads to fragmentation for SQLite and Xapian files where -# random I/O happens, so we disable COW just for SQLite files and Xapian -# directories. Disabling COW disables checksumming, so we only do this -# for regeneratable files, and not canonical git storage (git doesn't -# checksum refs, only data under $GIT_DIR/objects). -my $set_nodatacow = $^O eq 'linux' ? <<'SET_NODATACOW' : ''; -#include <sys/ioctl.h> -#include <sys/vfs.h> -#include <linux/magic.h> -#include <linux/fs.h> -#include <dirent.h> -#include <errno.h> -#include <stdio.h> -#include <string.h> - -void nodatacow_fd(int fd) +static int sendmsg_retry(int *tries) { - struct statfs buf; - int val = 0; - - if (fstatfs(fd, &buf) < 0) { - fprintf(stderr, "fstatfs: %s\\n", strerror(errno)); - return; + const struct timespec req = { 0, 100000000 }; /* 100ms */ + int err = errno; + switch (err) { + case EINTR: PERL_ASYNC_CHECK(); return 1; + case ENOBUFS: case ENOMEM: case ETOOMANYREFS: + if (--*tries < 0) return 0; + fprintf(stderr, "# sleeping on sendmsg: %s (%d tries left)\n", + strerror(err), *tries); + nanosleep(&req, NULL); + PERL_ASYNC_CHECK(); + return 1; + default: return 0; } +} - /* only btrfs is known to have this problem, so skip for non-btrfs */ - if (buf.f_type != BTRFS_SUPER_MAGIC) - return; +#if defined(CMSG_SPACE) && defined(CMSG_LEN) +#define SEND_FD_CAPA 10 +#define SEND_FD_SPACE (SEND_FD_CAPA * sizeof(int)) +union my_cmsg { + struct cmsghdr hdr; + char pad[sizeof(struct cmsghdr) + 16 + SEND_FD_SPACE]; +}; - if (ioctl(fd, FS_IOC_GETFLAGS, &val) < 0) { - fprintf(stderr, "FS_IOC_GET_FLAGS: %s\\n", strerror(errno)); - return; +SV *send_cmd4_(PerlIO *s, SV *svfds, SV *data, int flags, int tries) +{ + struct msghdr msg = { 0 }; + union my_cmsg cmsg = { 0 }; + STRLEN dlen = 0; + struct iovec iov; + ssize_t sent; + AV *fds = (AV *)SvRV(svfds); + I32 i, nfds = av_len(fds) + 1; + int *fdp; + + if (SvOK(data)) { + iov.iov_base = SvPV(data, dlen); + iov.iov_len = dlen; + } + if (!dlen) { /* must be non-zero */ + iov.iov_base = &msg.msg_namelen; /* whatever */ + iov.iov_len = 1; } - val |= FS_NOCOW_FL; - if (ioctl(fd, FS_IOC_SETFLAGS, &val) < 0) - fprintf(stderr, "FS_IOC_SET_FLAGS: %s\\n", strerror(errno)); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + if (nfds) { + if (nfds > SEND_FD_CAPA) { + fprintf(stderr, "FIXME: bump SEND_FD_CAPA=%d\n", nfds); + nfds = SEND_FD_CAPA; + } + msg.msg_control = &cmsg.hdr; + msg.msg_controllen = CMSG_SPACE(nfds * sizeof(int)); + cmsg.hdr.cmsg_level = SOL_SOCKET; + cmsg.hdr.cmsg_type = SCM_RIGHTS; + cmsg.hdr.cmsg_len = CMSG_LEN(nfds * sizeof(int)); + fdp = (int *)CMSG_DATA(&cmsg.hdr); + for (i = 0; i < nfds; i++) { + SV **fd = av_fetch(fds, i, 0); + *fdp++ = SvIV(*fd); + } + } + do { + sent = sendmsg(PerlIO_fileno(s), &msg, flags); + } while (sent < 0 && sendmsg_retry(&tries)); + return sent >= 0 ? newSViv(sent) : &PL_sv_undef; } -void nodatacow_dir(const char *dir) +void recv_cmd4(PerlIO *s, SV *buf, STRLEN n) { - DIR *dh = opendir(dir); - int fd; + union my_cmsg cmsg = { 0 }; + struct msghdr msg = { 0 }; + struct iovec iov; + ssize_t i; + Inline_Stack_Vars; + Inline_Stack_Reset; - if (!dh) croak("opendir(%s): %s", dir, strerror(errno)); - fd = dirfd(dh); - if (fd >= 0) - nodatacow_fd(fd); - /* ENOTSUP probably won't happen under Linux... */ - closedir(dh); -} -SET_NODATACOW - -my $inline_dir = $ENV{PERL_INLINE_DIRECTORY} //= ( - $ENV{XDG_CACHE_HOME} // - ( ($ENV{HOME} // '/nonexistent').'/.cache' ) - ).'/public-inbox/inline-c'; + if (!SvOK(buf)) + sv_setpvn(buf, "", 0); + iov.iov_base = SvGROW(buf, n + 1); + iov.iov_len = n; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = &cmsg.hdr; + msg.msg_controllen = CMSG_SPACE(SEND_FD_SPACE); -$set_nodatacow = $vfork_spawn = undef unless -d $inline_dir && -w _; -if (defined $vfork_spawn) { - # Inline 0.64 or later has locking in multi-process env, - # but we support 0.5 on Debian wheezy - use Fcntl qw(:flock); - eval { - my $f = "$inline_dir/.public-inbox.lock"; - open my $fh, '>', $f or die "failed to open $f: $!\n"; - flock($fh, LOCK_EX) or die "LOCK_EX failed on $f: $!\n"; - eval 'use Inline C => $vfork_spawn . $set_nodatacow'; - my $err = $@; - my $ndc_err; - if ($err && $set_nodatacow) { # missing Linux kernel headers - $ndc_err = $err; - undef $set_nodatacow; - eval 'use Inline C => $vfork_spawn'; + for (;;) { + i = recvmsg(PerlIO_fileno(s), &msg, 0); + if (i >= 0 || errno != EINTR) break; + PERL_ASYNC_CHECK(); + } + if (i >= 0) { + SvCUR_set(buf, i); + if (cmsg.hdr.cmsg_level == SOL_SOCKET && + cmsg.hdr.cmsg_type == SCM_RIGHTS) { + size_t len = cmsg.hdr.cmsg_len; + int *fdp = (int *)CMSG_DATA(&cmsg.hdr); + for (i = 0; CMSG_LEN((i + 1) * sizeof(int)) <= len; i++) + Inline_Stack_Push(sv_2mortal(newSViv(*fdp++))); } - flock($fh, LOCK_UN) or die "LOCK_UN failed on $f: $!\n"; - die $err if $err; - warn $ndc_err if $ndc_err; - }; - if ($@) { - warn "Inline::C failed for vfork: $@\n"; - $set_nodatacow = $vfork_spawn = undef; + } else { + Inline_Stack_Push(&PL_sv_undef); + SvCUR_set(buf, 0); } + Inline_Stack_Done; } +#endif /* defined(CMSG_SPACE) && defined(CMSG_LEN) */ -unless (defined $vfork_spawn) { - require PublicInbox::SpawnPP; - *pi_fork_exec = \&PublicInbox::SpawnPP::pi_fork_exec -} -unless ($set_nodatacow) { - require PublicInbox::NDC_PP; - no warnings 'once'; - *nodatacow_fd = \&PublicInbox::NDC_PP::nodatacow_fd; - *nodatacow_dir = \&PublicInbox::NDC_PP::nodatacow_dir; -} -undef $set_nodatacow; -undef $vfork_spawn; +void rlimit_map() +{ + Inline_Stack_Vars; + Inline_Stack_Reset; +ALL_LIBC + my $inline_dir = $ENV{PERL_INLINE_DIRECTORY} // ( + $ENV{XDG_CACHE_HOME} // + ( ($ENV{HOME} // '/nonexistent').'/.cache' ) + ).'/public-inbox/inline-c'; + undef $all_libc unless -d $inline_dir; + if (defined $all_libc) { + for (@RLIMITS, 'RLIM_INFINITY') { + $all_libc .= <<EOM; + Inline_Stack_Push(sv_2mortal(newSVpvs("$_"))); + Inline_Stack_Push(sv_2mortal(newSViv($_))); +EOM + } + $all_libc .= <<EOM; + Inline_Stack_Done; +} // rlimit_map +EOM + local $ENV{PERL_INLINE_DIRECTORY} = $inline_dir; + # CentOS 7.x ships Inline 0.53, 0.64+ has built-in locking + my $lk = PublicInbox::Lock->new($inline_dir. + '/.public-inbox.lock'); + my $fh = $lk->lock_acquire; + open my $oldout, '>&', \*STDOUT; + open my $olderr, '>&', \*STDERR; + open STDOUT, '>&', $fh; + open STDERR, '>&', $fh; + STDERR->autoflush(1); + STDOUT->autoflush(1); + eval 'use Inline C => $all_libc, BUILD_NOISY => 1'; + my $err = $@; + open(STDERR, '>&', $olderr); + open(STDOUT, '>&', $oldout); + if ($err) { + seek($fh, 0, SEEK_SET); + my @msg = <$fh>; + truncate($fh, 0); + warn "Inline::C build failed:\n", $err, "\n", @msg; + $all_libc = undef; + } + } + if (defined $all_libc) { # set for Gcf2 + $ENV{PERL_INLINE_DIRECTORY} = $inline_dir; + %RLIMITS = rlimit_map(); + *send_cmd4 = sub ($$$$;$) { + send_cmd4_($_[0], $_[1], $_[2], $_[3], 50); + } + } else { + require PublicInbox::SpawnPP; + *pi_fork_exec = \&PublicInbox::SpawnPP::pi_fork_exec + } +} # /BEGIN sub which ($) { my ($file) = @_; return $file if index($file, '/') >= 0; - foreach my $p (split(':', $ENV{PATH})) { + for my $p (split(/:/, $ENV{PATH})) { $p .= "/$file"; return $p if -x $p; } @@ -257,53 +351,94 @@ sub which ($) { } sub spawn ($;$$) { - my ($cmd, $env, $opts) = @_; - my $f = which($cmd->[0]); - defined $f or die "$cmd->[0]: command not found\n"; - my @env; - $opts ||= {}; - - my %env = $env ? (%ENV, %$env) : %ENV; + my ($cmd, $env, $opt) = @_; + my $f = which($cmd->[0]) // die "$cmd->[0]: command not found\n"; + my (@env, @rdr); + my %env = (%ENV, $env ? %$env : ()); while (my ($k, $v) = each %env) { - push @env, "$k=$v"; + push @env, "$k=$v" if defined($v); } - my $redir = []; for my $child_fd (0..2) { - my $parent_fd = $opts->{$child_fd}; - if (defined($parent_fd) && $parent_fd !~ /\A[0-9]+\z/) { - defined(my $fd = fileno($parent_fd)) or - die "$parent_fd not an IO GLOB? $!"; - $parent_fd = $fd; + my $pfd = $opt->{$child_fd}; + if ('SCALAR' eq ref($pfd)) { + open my $fh, '+>', undef; + $opt->{"fh.$child_fd"} = $fh; # for read_out_err + if ($child_fd == 0) { + print $fh $$pfd; + $fh->flush or die "flush: $!"; + sysseek($fh, 0, SEEK_SET); + } + $pfd = fileno($fh); + } elsif (defined($pfd) && $pfd !~ /\A[0-9]+\z/) { + my $fd = fileno($pfd) // + croak "BUG: $pfd not an IO GLOB? $!"; + $pfd = $fd; } - $redir->[$child_fd] = $parent_fd // $child_fd; + $rdr[$child_fd] = $pfd // $child_fd; } my $rlim = []; - foreach my $l (@RLIMITS) { - defined(my $v = $opts->{$l}) or next; - my $r = eval "require BSD::Resource; BSD::Resource::$l();"; - unless (defined $r) { - warn "$l undefined by BSD::Resource: $@\n"; - next; - } + my $v = $opt->{$l} // next; + my $r = $RLIMITS{$l} // + eval "require BSD::Resource; BSD::Resource::$l();" // + do { + warn "$l undefined by BSD::Resource: $@\n"; + next; + }; push @$rlim, $r, @$v; } - my $cd = $opts->{'-C'} // ''; # undef => NULL mapping doesn't work? - my $pid = pi_fork_exec($redir, $f, $cmd, \@env, $rlim, $cd); - die "fork_exec @$cmd failed: $!\n" unless $pid > 0; - $pid; + my $cd = $opt->{'-C'} // ''; # undef => NULL mapping doesn't work? + my $pgid = $opt->{pgid} // -1; + pi_fork_exec(\@rdr, $f, $cmd, \@env, $rlim, $cd, $pgid); } sub popen_rd { - my ($cmd, $env, $opts) = @_; - pipe(my ($r, $w)) or die "pipe: $!\n"; - $opts ||= {}; - $opts->{1} = fileno($w); - my $pid = spawn($cmd, $env, $opts); - return ($r, $pid) if wantarray; - my $ret = gensym; - tie *$ret, 'PublicInbox::ProcessPipe', $pid, $r; - $ret; + my ($cmd, $env, $opt, @cb_arg) = @_; + pipe(my $r, local $opt->{1}); + PublicInbox::IO::attach_pid($r, spawn($cmd, $env, $opt), @cb_arg); +} + +sub popen_wr { + my ($cmd, $env, $opt, @cb_arg) = @_; + pipe(local $opt->{0}, my $w); + $w->autoflush(1); + PublicInbox::IO::attach_pid($w, spawn($cmd, $env, $opt), @cb_arg); +} + +sub read_out_err ($) { + my ($opt) = @_; + for my $fd (1, 2) { # read stdout/stderr + my $fh = delete($opt->{"fh.$fd"}) // next; + seek($fh, 0, SEEK_SET); + PublicInbox::IO::read_all $fh, undef, $opt->{$fd}; + } +} + +sub run_wait ($;$$) { + my ($cmd, $env, $opt) = @_; + waitpid(spawn($cmd, $env, $opt), 0); + read_out_err($opt); + $? +} + +sub run_die ($;$$) { + my ($cmd, $env, $rdr) = @_; + run_wait($cmd, $env, $rdr) and croak "E: @$cmd failed: \$?=$?"; +} + +sub run_qx { + my ($cmd, $env, $opt) = @_; + my $fh = popen_rd($cmd, $env, $opt); + my @ret; + if (wantarray) { + @ret = <$fh>; + } else { + local $/; + $ret[0] = <$fh>; + } + $fh->close; # caller should check $? + read_out_err($opt); + wantarray ? @ret : $ret[0]; } 1; diff --git a/lib/PublicInbox/SpawnPP.pm b/lib/PublicInbox/SpawnPP.pm index a72d5a2d..9ad4d0a1 100644 --- a/lib/PublicInbox/SpawnPP.pm +++ b/lib/PublicInbox/SpawnPP.pm @@ -1,59 +1,70 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Pure-Perl implementation of "spawn". This can't take advantage # of vfork, so no speedups under Linux for spawning from large processes. +# Do not require this directly, only use from PublicInbox::Spawn package PublicInbox::SpawnPP; -use strict; -use warnings; -use POSIX qw(dup2 :signal_h); +use v5.12; +use POSIX qw(dup2 _exit setpgid :signal_h); +use autodie qw(chdir close pipe); +use PublicInbox::OnDestroy; +# this is loaded by PublicInbox::Spawn, so we can't use/require it, here # Pure Perl implementation for folks that do not use Inline::C -sub pi_fork_exec ($$$$$$) { - my ($redir, $f, $cmd, $env, $rlim, $cd) = @_; +sub pi_fork_exec ($$$$$$$) { + my ($redir, $f, $cmd, $env, $rlim, $cd, $pgid) = @_; my $old = POSIX::SigSet->new(); my $set = POSIX::SigSet->new(); - $set->fillset or die "fillset failed: $!"; - sigprocmask(SIG_SETMASK, $set, $old) or die "can't block signals: $!"; - my $syserr; - my $pid = fork; - unless (defined $pid) { # compat with Inline::C version - $syserr = $!; - $pid = -1; + $set->fillset or die "sigfillset: $!"; + for (POSIX::SIGABRT, POSIX::SIGBUS, POSIX::SIGFPE, + POSIX::SIGILL, POSIX::SIGSEGV) { + $set->delset($_) or die "delset($_): $!"; } + sigprocmask(SIG_SETMASK, $set, $old) or die "SIG_SETMASK(set): $!"; + pipe(my $r, my $w); + my $pid = PublicInbox::OnDestroy::fork_tmp; if ($pid == 0) { - while (@$rlim) { - my ($r, $soft, $hard) = splice(@$rlim, 0, 3); - BSD::Resource::setrlimit($r, $soft, $hard) or - warn "failed to set $r=[$soft,$hard]\n"; - } + close $r; + $SIG{__DIE__} = sub { + warn(@_); + syswrite($w, my $num = $! + 0); + _exit(1); + }; for my $child_fd (0..$#$redir) { my $parent_fd = $redir->[$child_fd]; next if $parent_fd == $child_fd; dup2($parent_fd, $child_fd) or - die "dup2($parent_fd, $child_fd): $!\n"; + die "dup2($parent_fd, $child_fd): $!"; } - if ($cd ne '') { - chdir $cd or die "chdir $cd: $!"; + if ($pgid >= 0 && !defined(setpgid(0, $pgid))) { + die "setpgid(0, $pgid): $!"; } - $SIG{$_} = 'DEFAULT' for keys %SIG; - my $cset = POSIX::SigSet->new(); - $cset->addset(POSIX::SIGCHLD) or die "can't add SIGCHLD: $!"; - sigprocmask(SIG_UNBLOCK, $cset) or - die "can't unblock SIGCHLD: $!"; + $SIG{$_} = 'DEFAULT' for grep(!/\A__/, keys %SIG); + chdir($cd) if $cd ne ''; + while (@$rlim) { + my ($r, $soft, $hard) = splice(@$rlim, 0, 3); + BSD::Resource::setrlimit($r, $soft, $hard) or + die "setrlimit($r=[$soft,$hard]: $!)"; + } + $old->delset(POSIX::SIGCHLD) or die "sigdelset CHLD: $!"; + sigprocmask(SIG_SETMASK, $old) or die "SIG_SETMASK ~CHLD: $!"; + $cmd->[0] = $f; if ($ENV{MOD_PERL}) { - exec which('env'), '-i', @$env, @$cmd; - die "exec env -i ... $cmd->[0] failed: $!\n"; + $f = PublicInbox::Spawn::which('env'); + @$cmd = ('env', '-i', @$env, @$cmd); } else { - local %ENV = map { split(/=/, $_, 2) } @$env; - my @cmd = @$cmd; - $cmd[0] = $f; - exec @cmd; - die "exec $cmd->[0] failed: $!\n"; + %ENV = map { split(/=/, $_, 2) } @$env; } + exec { $f } @$cmd; + die "exec @$cmd failed: $!"; + } + close $w; + sigprocmask(SIG_SETMASK, $old) or die "SIG_SETMASK(old): $!"; + if (my $cerrnum = do { local $/, <$r> }) { + $! = $cerrnum; + die "forked child $@: $!"; } - sigprocmask(SIG_SETMASK, $old) or die "can't unblock signals: $!"; - $! = $syserr; $pid; } diff --git a/lib/PublicInbox/Syscall.pm b/lib/PublicInbox/Syscall.pm index e4f00a2a..4cbe9623 100644 --- a/lib/PublicInbox/Syscall.pm +++ b/lib/PublicInbox/Syscall.pm @@ -2,33 +2,34 @@ # specifically the Debian libsys-syscall-perl 0.25-6 version to # fix upstream regressions in 0.25. # +# See devel/sysdefs-list in the public-inbox source tree for maintenance +# <https://80x24.org/public-inbox.git>, and machines from the GCC Farm: +# <https://portal.cfarm.net/> +# # This license differs from the rest of public-inbox # # This module is Copyright (c) 2005 Six Apart, Ltd. -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # # All rights reserved. # # You may distribute under the terms of either the GNU General Public # License or the Artistic License, as specified in the Perl README file. package PublicInbox::Syscall; -use strict; +use v5.12; use parent qw(Exporter); -use POSIX qw(ENOSYS O_NONBLOCK); +use POSIX qw(ENOENT ENOSYS EINVAL O_NONBLOCK); +use Socket qw(SOL_SOCKET SCM_RIGHTS); use Config; +our %SIGNUM = (WINCH => 28); # most Linux, {Free,Net,Open}BSD, *Darwin +our ($INOTIFY, %PACK); # $VERSION = '0.25'; # Sys::Syscall version our @EXPORT_OK = qw(epoll_ctl epoll_create epoll_wait - EPOLLIN EPOLLOUT EPOLLET - EPOLL_CTL_ADD EPOLL_CTL_DEL EPOLL_CTL_MOD - EPOLLONESHOT EPOLLEXCLUSIVE - signalfd $SFD_NONBLOCK); -our %EXPORT_TAGS = (epoll => [qw(epoll_ctl epoll_create epoll_wait - EPOLLIN EPOLLOUT - EPOLL_CTL_ADD EPOLL_CTL_DEL EPOLL_CTL_MOD - EPOLLONESHOT EPOLLEXCLUSIVE)], - ); - + EPOLLIN EPOLLOUT EPOLLET + EPOLL_CTL_ADD EPOLL_CTL_DEL EPOLL_CTL_MOD + EPOLLONESHOT EPOLLEXCLUSIVE + signalfd rename_noreplace %SIGNUM $F_SETPIPE_SZ); use constant { EPOLLIN => 1, EPOLLOUT => 4, @@ -41,240 +42,493 @@ use constant { EPOLL_CTL_ADD => 1, EPOLL_CTL_DEL => 2, EPOLL_CTL_MOD => 3, + SIZEOF_int => $Config{intsize}, + SIZEOF_size_t => $Config{sizesize}, + SIZEOF_ptr => $Config{ptrsize}, + NUL => "\0", }; -our $loaded_syscall = 0; - -sub _load_syscall { - # props to Gaal for this! - return if $loaded_syscall++; - my $clean = sub { - delete @INC{qw<syscall.ph asm/unistd.ph bits/syscall.ph - _h2ph_pre.ph sys/syscall.ph>}; - }; - $clean->(); # don't trust modules before us - my $rv = eval { require 'syscall.ph'; 1 } || eval { require 'sys/syscall.ph'; 1 }; - $clean->(); # don't require modules after us trust us - return $rv; -} - +use constant TMPL_size_t => SIZEOF_size_t == 8 ? 'Q' : 'L'; -our ( - $SYS_epoll_create, - $SYS_epoll_ctl, - $SYS_epoll_wait, - $SYS_signalfd4, - ); +our ($SYS_epoll_create, + $SYS_epoll_ctl, + $SYS_epoll_wait, + $SYS_signalfd4, + $SYS_renameat2, + $F_SETPIPE_SZ, + $SYS_sendmsg, + $SYS_recvmsg); +my $SYS_fstatfs; # don't need fstatfs64, just statfs.f_type +my ($FS_IOC_GETFLAGS, $FS_IOC_SETFLAGS); my $SFD_CLOEXEC = 02000000; # Perl does not expose O_CLOEXEC -our $SFD_NONBLOCK = O_NONBLOCK; our $no_deprecated = 0; if ($^O eq "linux") { - my $machine = (POSIX::uname())[-1]; - # whether the machine requires 64-bit numbers to be on 8-byte - # boundaries. - my $u64_mod_8 = 0; - - # if we're running on an x86_64 kernel, but a 32-bit process, - # we need to use the x32 or i386 syscall numbers. - if ($machine eq "x86_64" && $Config{ptrsize} == 4) { - $machine = $Config{cppsymbols} =~ /\b__ILP32__=1\b/ ? 'x32' : 'i386'; - } + $F_SETPIPE_SZ = 1031; + my (undef, undef, $release, undef, $machine) = POSIX::uname(); + my ($maj, $min) = ($release =~ /\A([0-9]+)\.([0-9]+)/); + $SYS_renameat2 = 0 if "$maj.$min" < 3.15; + # whether the machine requires 64-bit numbers to be on 8-byte + # boundaries. + my $u64_mod_8 = 0; - # Similarly for mips64 vs mips - if ($machine eq "mips64" && $Config{ptrsize} == 4) { - $machine = "mips"; - } - - if ($machine =~ m/^i[3456]86$/) { - $SYS_epoll_create = 254; - $SYS_epoll_ctl = 255; - $SYS_epoll_wait = 256; - $SYS_signalfd4 = 327; - } elsif ($machine eq "x86_64") { - $SYS_epoll_create = 213; - $SYS_epoll_ctl = 233; - $SYS_epoll_wait = 232; - $SYS_signalfd4 = 289; - } elsif ($machine eq 'x32') { - $SYS_epoll_create = 1073742037; - $SYS_epoll_ctl = 1073742057; - $SYS_epoll_wait = 1073742056; - $SYS_signalfd4 = 1073742113; - } elsif ($machine eq 'sparc64') { - $SYS_epoll_create = 193; - $SYS_epoll_ctl = 194; - $SYS_epoll_wait = 195; - $u64_mod_8 = 1; - $SYS_signalfd4 = 317; - $SFD_CLOEXEC = 020000000; - } elsif ($machine =~ m/^parisc/) { - $SYS_epoll_create = 224; - $SYS_epoll_ctl = 225; - $SYS_epoll_wait = 226; - $u64_mod_8 = 1; - $SYS_signalfd4 = 309; - } elsif ($machine =~ m/^ppc64/) { - $SYS_epoll_create = 236; - $SYS_epoll_ctl = 237; - $SYS_epoll_wait = 238; - $u64_mod_8 = 1; - $SYS_signalfd4 = 313; - } elsif ($machine eq "ppc") { - $SYS_epoll_create = 236; - $SYS_epoll_ctl = 237; - $SYS_epoll_wait = 238; - $u64_mod_8 = 1; - $SYS_signalfd4 = 313; - } elsif ($machine =~ m/^s390/) { - $SYS_epoll_create = 249; - $SYS_epoll_ctl = 250; - $SYS_epoll_wait = 251; - $u64_mod_8 = 1; - $SYS_signalfd4 = 322; - } elsif ($machine eq "ia64") { - $SYS_epoll_create = 1243; - $SYS_epoll_ctl = 1244; - $SYS_epoll_wait = 1245; - $u64_mod_8 = 1; - $SYS_signalfd4 = 289; - } elsif ($machine eq "alpha") { - # natural alignment, ints are 32-bits - $SYS_epoll_create = 407; - $SYS_epoll_ctl = 408; - $SYS_epoll_wait = 409; - $u64_mod_8 = 1; - $SYS_signalfd4 = 484; - $SFD_CLOEXEC = 010000000; - } elsif ($machine eq "aarch64") { - $SYS_epoll_create = 20; # (sys_epoll_create1) - $SYS_epoll_ctl = 21; - $SYS_epoll_wait = 22; # (sys_epoll_pwait) - $u64_mod_8 = 1; - $no_deprecated = 1; - $SYS_signalfd4 = 74; - } elsif ($machine =~ m/arm(v\d+)?.*l/) { - # ARM OABI - $SYS_epoll_create = 250; - $SYS_epoll_ctl = 251; - $SYS_epoll_wait = 252; - $u64_mod_8 = 1; - $SYS_signalfd4 = 355; - } elsif ($machine =~ m/^mips64/) { - $SYS_epoll_create = 5207; - $SYS_epoll_ctl = 5208; - $SYS_epoll_wait = 5209; - $u64_mod_8 = 1; - $SYS_signalfd4 = 5283; - } elsif ($machine =~ m/^mips/) { - $SYS_epoll_create = 4248; - $SYS_epoll_ctl = 4249; - $SYS_epoll_wait = 4250; - $u64_mod_8 = 1; - $SYS_signalfd4 = 4324; - } else { - # as a last resort, try using the *.ph files which may not - # exist or may be wrong - _load_syscall(); - $SYS_epoll_create = eval { &SYS_epoll_create; } || 0; - $SYS_epoll_ctl = eval { &SYS_epoll_ctl; } || 0; - $SYS_epoll_wait = eval { &SYS_epoll_wait; } || 0; - - # Note: do NOT add new syscalls to depend on *.ph, here. - # Better to miss syscalls (so we can fallback to IO::Poll) - # than to use wrong ones, since the names are not stable - # (at least not on FreeBSD), if the actual numbers are. - } - - if ($u64_mod_8) { - *epoll_wait = \&epoll_wait_mod8; - *epoll_ctl = \&epoll_ctl_mod8; - } else { - *epoll_wait = \&epoll_wait_mod4; - *epoll_ctl = \&epoll_ctl_mod4; - } + if (SIZEOF_ptr == 4) { + # if we're running on an x86_64 kernel, but a 32-bit process, + # we need to use the x32 or i386 syscall numbers. + if ($machine eq 'x86_64') { + my $s = $Config{cppsymbols}; + $machine = ($s =~ /\b__ILP32__=1\b/ && + $s =~ /\b__x86_64__=1\b/) ? + 'x32' : 'i386' + } elsif ($machine eq 'mips64') { # similarly for mips64 vs mips + $machine = 'mips'; + } + } + if ($machine =~ m/^i[3456]86$/) { + $SYS_epoll_create = 254; + $SYS_epoll_ctl = 255; + $SYS_epoll_wait = 256; + $SYS_signalfd4 = 327; + $SYS_renameat2 //= 353; + $SYS_fstatfs = 100; + $SYS_sendmsg = 370; + $SYS_recvmsg = 372; + $INOTIFY = { # usage: `use constant $PublicInbox::Syscall::INOTIFY' + SYS_inotify_init1 => 332, + SYS_inotify_add_watch => 292, + SYS_inotify_rm_watch => 293, + }; + $FS_IOC_GETFLAGS = 0x80046601; + $FS_IOC_SETFLAGS = 0x40046602; + } elsif ($machine eq "x86_64") { + $SYS_epoll_create = 213; + $SYS_epoll_ctl = 233; + $SYS_epoll_wait = 232; + $SYS_signalfd4 = 289; + $SYS_renameat2 //= 316; + $SYS_fstatfs = 138; + $SYS_sendmsg = 46; + $SYS_recvmsg = 47; + $INOTIFY = { + SYS_inotify_init1 => 294, + SYS_inotify_add_watch => 254, + SYS_inotify_rm_watch => 255, + }; + $FS_IOC_GETFLAGS = 0x80086601; + $FS_IOC_SETFLAGS = 0x40086602; + } elsif ($machine eq 'x32') { + $SYS_epoll_create = 1073742037; + $SYS_epoll_ctl = 1073742057; + $SYS_epoll_wait = 1073742056; + $SYS_signalfd4 = 1073742113; + $SYS_renameat2 //= 0x40000000 + 316; + $SYS_fstatfs = 138; + $SYS_sendmsg = 0x40000206; + $SYS_recvmsg = 0x40000207; + $FS_IOC_GETFLAGS = 0x80046601; + $FS_IOC_SETFLAGS = 0x40046602; + $INOTIFY = { + SYS_inotify_init1 => 1073742118, + SYS_inotify_add_watch => 1073742078, + SYS_inotify_rm_watch => 1073742079, + }; + } elsif ($machine eq 'sparc64') { + $SYS_epoll_create = 193; + $SYS_epoll_ctl = 194; + $SYS_epoll_wait = 195; + $u64_mod_8 = 1; + $SYS_signalfd4 = 317; + $SYS_renameat2 //= 345; + $SFD_CLOEXEC = 020000000; + $SYS_fstatfs = 158; + $SYS_sendmsg = 114; + $SYS_recvmsg = 113; + $FS_IOC_GETFLAGS = 0x40086601; + $FS_IOC_SETFLAGS = 0x80086602; + } elsif ($machine =~ m/^parisc/) { # untested, no machine on cfarm + $SYS_epoll_create = 224; + $SYS_epoll_ctl = 225; + $SYS_epoll_wait = 226; + $u64_mod_8 = 1; + $SYS_signalfd4 = 309; + $SIGNUM{WINCH} = 23; + } elsif ($machine =~ m/^ppc64/) { + $SYS_epoll_create = 236; + $SYS_epoll_ctl = 237; + $SYS_epoll_wait = 238; + $u64_mod_8 = 1; + $SYS_signalfd4 = 313; + $SYS_renameat2 //= 357; + $SYS_fstatfs = 100; + $SYS_sendmsg = 341; + $SYS_recvmsg = 342; + $FS_IOC_GETFLAGS = 0x40086601; + $FS_IOC_SETFLAGS = 0x80086602; + $INOTIFY = { + SYS_inotify_init1 => 318, + SYS_inotify_add_watch => 276, + SYS_inotify_rm_watch => 277, + }; + } elsif ($machine eq "ppc") { # untested, no machine on cfarm + $SYS_epoll_create = 236; + $SYS_epoll_ctl = 237; + $SYS_epoll_wait = 238; + $u64_mod_8 = 1; + $SYS_signalfd4 = 313; + $SYS_renameat2 //= 357; + $SYS_fstatfs = 100; + $FS_IOC_GETFLAGS = 0x40086601; + $FS_IOC_SETFLAGS = 0x80086602; + } elsif ($machine =~ m/^s390/) { # untested, no machine on cfarm + $SYS_epoll_create = 249; + $SYS_epoll_ctl = 250; + $SYS_epoll_wait = 251; + $u64_mod_8 = 1; + $SYS_signalfd4 = 322; + $SYS_renameat2 //= 347; + $SYS_fstatfs = 100; + $SYS_sendmsg = 370; + $SYS_recvmsg = 372; + } elsif ($machine eq 'ia64') { # untested, no machine on cfarm + $SYS_epoll_create = 1243; + $SYS_epoll_ctl = 1244; + $SYS_epoll_wait = 1245; + $u64_mod_8 = 1; + $SYS_signalfd4 = 289; + } elsif ($machine eq "alpha") { # untested, no machine on cfarm + # natural alignment, ints are 32-bits + $SYS_epoll_create = 407; + $SYS_epoll_ctl = 408; + $SYS_epoll_wait = 409; + $u64_mod_8 = 1; + $SYS_signalfd4 = 484; + $SFD_CLOEXEC = 010000000; + } elsif ($machine =~ /\A(?:loong|a)arch64\z/ || $machine eq 'riscv64') { + $SYS_epoll_create = 20; # (sys_epoll_create1) + $SYS_epoll_ctl = 21; + $SYS_epoll_wait = 22; # (sys_epoll_pwait) + $u64_mod_8 = 1; + $no_deprecated = 1; + $SYS_signalfd4 = 74; + $SYS_renameat2 //= 276; + $SYS_fstatfs = 44; + $SYS_sendmsg = 211; + $SYS_recvmsg = 212; + $INOTIFY = { + SYS_inotify_init1 => 26, + SYS_inotify_add_watch => 27, + SYS_inotify_rm_watch => 28, + }; + $FS_IOC_GETFLAGS = 0x80086601; + $FS_IOC_SETFLAGS = 0x40086602; + } elsif ($machine =~ m/arm(v\d+)?.*l/) { # ARM OABI (untested on cfarm) + $SYS_epoll_create = 250; + $SYS_epoll_ctl = 251; + $SYS_epoll_wait = 252; + $u64_mod_8 = 1; + $SYS_signalfd4 = 355; + $SYS_renameat2 //= 382; + $SYS_fstatfs = 100; + $SYS_sendmsg = 296; + $SYS_recvmsg = 297; + } elsif ($machine =~ m/^mips64/) { # cfarm only has 32-bit userspace + $SYS_epoll_create = 5207; + $SYS_epoll_ctl = 5208; + $SYS_epoll_wait = 5209; + $u64_mod_8 = 1; + $SYS_signalfd4 = 5283; + $SYS_renameat2 //= 5311; + $SYS_fstatfs = 5135; + $SYS_sendmsg = 5045; + $SYS_recvmsg = 5046; + $FS_IOC_GETFLAGS = 0x40046601; + $FS_IOC_SETFLAGS = 0x80046602; + } elsif ($machine =~ m/^mips/) { # 32-bit, tested on mips64 cfarm host + $SYS_epoll_create = 4248; + $SYS_epoll_ctl = 4249; + $SYS_epoll_wait = 4250; + $u64_mod_8 = 1; + $SYS_signalfd4 = 4324; + $SYS_renameat2 //= 4351; + $SYS_fstatfs = 4100; + $SYS_sendmsg = 4179; + $SYS_recvmsg = 4177; + $FS_IOC_GETFLAGS = 0x40046601; + $FS_IOC_SETFLAGS = 0x80046602; + $SIGNUM{WINCH} = 20; + $INOTIFY = { + SYS_inotify_init1 => 4329, + SYS_inotify_add_watch => 4285, + SYS_inotify_rm_watch => 4286, + }; + } else { + warn <<EOM; +machine=$machine ptrsize=$Config{ptrsize} has no syscall definitions +git clone https://80x24.org/public-inbox.git and +Send the output of ./devel/sysdefs-list to meta\@public-inbox.org +EOM + } + if ($u64_mod_8) { + *epoll_wait = \&epoll_wait_mod8; + *epoll_ctl = \&epoll_ctl_mod8; + } else { + *epoll_wait = \&epoll_wait_mod4; + *epoll_ctl = \&epoll_ctl_mod4; + } +} elsif ($^O =~ /\A(?:freebsd|openbsd|netbsd|dragonfly)\z/) { +# don't use syscall.ph here, name => number mappings are not stable on *BSD +# but the actual numbers are. +# OpenBSD perl redirects syscall perlop to libc functions +# https://cvsweb.openbsd.org/src/gnu/usr.bin/perl/gen_syscall_emulator.pl +# https://www.netbsd.org/docs/internals/en/chap-processes.html#syscall_versioning +# https://wiki.freebsd.org/AddingSyscalls#Backward_compatibily +# (I'm assuming Dragonfly copies FreeBSD, here, too) + $SYS_recvmsg = 27; + $SYS_sendmsg = 28; } -elsif ($^O eq "freebsd") { - if ($ENV{FREEBSD_SENDFILE}) { - # this is still buggy and in development - } -} +BEGIN { + if ($^O eq 'linux') { + %PACK = ( + TMPL_cmsg_len => TMPL_size_t, + # cmsg_len, cmsg_level, cmsg_type + SIZEOF_cmsghdr => SIZEOF_int * 2 + SIZEOF_size_t, + CMSG_DATA_off => '', + TMPL_msghdr => 'PL' . # msg_name, msg_namelen + '@'.(2 * SIZEOF_ptr).'P'. # msg_iov + 'i'. # msg_iovlen + '@'.(4 * SIZEOF_ptr).'P'. # msg_control + 'L'. # msg_controllen (socklen_t) + 'i', # msg_flags + ); + } elsif ($^O =~ /\A(?:freebsd|openbsd|netbsd|dragonfly)\z/) { + %PACK = ( + TMPL_cmsg_len => 'L', # socklen_t + SIZEOF_cmsghdr => SIZEOF_int * 3, + CMSG_DATA_off => SIZEOF_ptr == 8 ? '@16' : '', + TMPL_msghdr => 'PL' . # msg_name, msg_namelen + '@'.(2 * SIZEOF_ptr).'P'. # msg_iov + TMPL_size_t. # msg_iovlen + '@'.(4 * SIZEOF_ptr).'P'. # msg_control + TMPL_size_t. # msg_controllen + 'i', # msg_flags -############################################################################ -# epoll functions -############################################################################ + ) + } + $PACK{CMSG_ALIGN_size} = SIZEOF_size_t; + $PACK{SIZEOF_cmsghdr} //= 0; + $PACK{TMPL_cmsg_len} //= undef; + $PACK{CMSG_DATA_off} //= undef; + $PACK{TMPL_msghdr} //= undef; +} -sub epoll_defined { return $SYS_epoll_create ? 1 : 0; } +# SFD_CLOEXEC is arch-dependent, so IN_CLOEXEC may be, too +$INOTIFY->{IN_CLOEXEC} //= 0x80000 if $INOTIFY; sub epoll_create { - syscall($SYS_epoll_create, $no_deprecated ? 0 : ($_[0]||100)+0); + syscall($SYS_epoll_create, $no_deprecated ? 0 : 100); } # epoll_ctl wrapper # ARGS: (epfd, op, fd, events_mask) sub epoll_ctl_mod4 { - syscall($SYS_epoll_ctl, $_[0]+0, $_[1]+0, $_[2]+0, pack("LLL", $_[3], $_[2], 0)); + syscall($SYS_epoll_ctl, $_[0]+0, $_[1]+0, $_[2]+0, + pack("LLL", $_[3], $_[2], 0)); } + sub epoll_ctl_mod8 { - syscall($SYS_epoll_ctl, $_[0]+0, $_[1]+0, $_[2]+0, pack("LLLL", $_[3], 0, $_[2], 0)); + syscall($SYS_epoll_ctl, $_[0]+0, $_[1]+0, $_[2]+0, + pack("LLLL", $_[3], 0, $_[2], 0)); } # epoll_wait wrapper # ARGS: (epfd, maxevents, timeout (milliseconds), arrayref) # arrayref: values modified to be [$fd, $event] -our $epoll_wait_events; +our $epoll_wait_events = ''; our $epoll_wait_size = 0; sub epoll_wait_mod4 { - # resize our static buffer if requested size is bigger than we've ever done - if ($_[1] > $epoll_wait_size) { - $epoll_wait_size = $_[1]; - $epoll_wait_events = "\0" x 12 x $epoll_wait_size; - } - my $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0); - for (0..$ct-1) { - @{$_[3]->[$_]}[1,0] = unpack("LL", substr($epoll_wait_events, 12*$_, 8)); - } - return $ct; + my ($epfd, $maxevents, $timeout_msec, $events) = @_; + # resize our static buffer if maxevents bigger than we've ever done + if ($maxevents > $epoll_wait_size) { + $epoll_wait_size = $maxevents; + vec($epoll_wait_events, $maxevents * 12 - 1, 8) = 0; + } + @$events = (); + my $ct = syscall($SYS_epoll_wait, $epfd, $epoll_wait_events, + $maxevents, $timeout_msec); + for (0..$ct - 1) { + # 12-byte struct epoll_event + # 4 bytes uint32_t events mask (skipped, useless to us) + # 8 bytes: epoll_data_t union (first 4 bytes are the fd) + # So we skip the first 4 bytes and take the middle 4: + $events->[$_] = unpack('L', substr($epoll_wait_events, + 12 * $_ + 4, 4)); + } } sub epoll_wait_mod8 { - # resize our static buffer if requested size is bigger than we've ever done - if ($_[1] > $epoll_wait_size) { - $epoll_wait_size = $_[1]; - $epoll_wait_events = "\0" x 16 x $epoll_wait_size; - } - my $ct; - if ($no_deprecated) { - $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0, undef); - } else { - $ct = syscall($SYS_epoll_wait, $_[0]+0, $epoll_wait_events, $_[1]+0, $_[2]+0); - } - for (0..$ct-1) { - # 16 byte epoll_event structs, with format: - # 4 byte mask [idx 1] - # 4 byte padding (we put it into idx 2, useless) - # 8 byte data (first 4 bytes are fd, into idx 0) - @{$_[3]->[$_]}[1,2,0] = unpack("LLL", substr($epoll_wait_events, 16*$_, 12)); - } - return $ct; + my ($epfd, $maxevents, $timeout_msec, $events) = @_; + + # resize our static buffer if maxevents bigger than we've ever done + if ($maxevents > $epoll_wait_size) { + $epoll_wait_size = $maxevents; + vec($epoll_wait_events, $maxevents * 16 - 1, 8) = 0; + } + @$events = (); + my $ct = syscall($SYS_epoll_wait, $epfd, $epoll_wait_events, + $maxevents, $timeout_msec, + $no_deprecated ? undef : ()); + for (0..$ct - 1) { + # 16-byte struct epoll_event + # 4 bytes uint32_t events mask (skipped, useless to us) + # 4 bytes padding (skipped, useless) + # 8 bytes epoll_data_t union (first 4 bytes are the fd) + # So skip the first 8 bytes, take 4, and ignore the last 4: + $events->[$_] = unpack('L', substr($epoll_wait_events, + 16 * $_ + 8, 4)); + } } -sub signalfd ($$$) { - my ($fd, $signos, $flags) = @_; +sub signalfd ($) { + my ($signos) = @_; if ($SYS_signalfd4) { my $set = POSIX::SigSet->new(@$signos); - syscall($SYS_signalfd4, $fd, "$$set", + syscall($SYS_signalfd4, -1, "$$set", # $Config{sig_count} is NSIG, so this is NSIG/8: int($Config{sig_count}/8), - $flags|$SFD_CLOEXEC); + # SFD_NONBLOCK == O_NONBLOCK for every architecture + O_NONBLOCK|$SFD_CLOEXEC); } else { $! = ENOSYS; undef; } } +sub _rename_noreplace_racy ($$) { + my ($old, $new) = @_; + if (link($old, $new)) { + warn "unlink $old: $!\n" if !unlink($old) && $! != ENOENT; + 1 + } else { + undef; + } +} + +# TODO: support FD args? +sub rename_noreplace ($$) { + my ($old, $new) = @_; + if ($SYS_renameat2) { # RENAME_NOREPLACE = 1, AT_FDCWD = -100 + my $ret = syscall($SYS_renameat2, -100, $old, -100, $new, 1); + if ($ret == 0) { + 1; # like rename() perlop + } elsif ($! == ENOSYS || $! == EINVAL) { + undef $SYS_renameat2; + _rename_noreplace_racy($old, $new); + } else { + undef + } + } else { + _rename_noreplace_racy($old, $new); + } +} + +sub nodatacow_fh ($) { + my ($fh) = @_; + my $buf = "\0" x 120; + syscall($SYS_fstatfs // return, fileno($fh), $buf) == 0 or + return warn("fstatfs: $!\n"); + my $f_type = unpack('l!', $buf); # statfs.f_type is a signed word + return if $f_type != 0x9123683E; # BTRFS_SUPER_MAGIC + + $FS_IOC_GETFLAGS // + return warn('FS_IOC_GETFLAGS undefined for platform'); + ioctl($fh, $FS_IOC_GETFLAGS, $buf) // + return warn("FS_IOC_GETFLAGS: $!\n"); + my $attr = unpack('l!', $buf); + return if ($attr & 0x00800000); # FS_NOCOW_FL; + ioctl($fh, $FS_IOC_SETFLAGS, pack('l', $attr | 0x00800000)) // + warn("FS_IOC_SETFLAGS: $!\n"); +} + +sub nodatacow_dir { + if (open my $fh, '<', $_[0]) { nodatacow_fh($fh) } +} + +use constant \%PACK; +sub CMSG_ALIGN ($) { ($_[0] + CMSG_ALIGN_size - 1) & ~(CMSG_ALIGN_size - 1) } +use constant CMSG_ALIGN_SIZEOF_cmsghdr => CMSG_ALIGN(SIZEOF_cmsghdr); +sub CMSG_SPACE ($) { CMSG_ALIGN($_[0]) + CMSG_ALIGN_SIZEOF_cmsghdr } +sub CMSG_LEN ($) { CMSG_ALIGN_SIZEOF_cmsghdr + $_[0] } +use constant msg_controllen_max => + CMSG_SPACE(10 * SIZEOF_int) + SIZEOF_cmsghdr; # space for 10 FDs + +if (defined($SYS_sendmsg) && defined($SYS_recvmsg)) { +no warnings 'once'; +require PublicInbox::CmdIPC4; + +*send_cmd4 = sub ($$$$;$) { + my ($sock, $fds, undef, $flags, $tries) = @_; + my $iov = pack('P'.TMPL_size_t, + $_[2] // NUL, length($_[2] // NUL) || 1); + my $fd_space = scalar(@$fds) * SIZEOF_int; + my $msg_controllen = CMSG_SPACE($fd_space); + my $cmsghdr = pack(TMPL_cmsg_len . + 'LL' . # cmsg_level, cmsg_type, + CMSG_DATA_off.('i' x scalar(@$fds)). # CMSG_DATA + '@'.($msg_controllen - 1).'x1', # pad to space, not len + CMSG_LEN($fd_space), # cmsg_len + SOL_SOCKET, SCM_RIGHTS, # cmsg_{level,type} + @$fds); # CMSG_DATA + my $mh = pack(TMPL_msghdr, + undef, 0, # msg_name, msg_namelen (unused) + $iov, 1, # msg_iov, msg_iovlen + $cmsghdr, # msg_control + $msg_controllen, + 0); # msg_flags + my $s; + $tries //= 50; + do { + $s = syscall($SYS_sendmsg, fileno($sock), $mh, $flags); + } while ($s < 0 && PublicInbox::CmdIPC4::sendmsg_retry($tries)); + $s >= 0 ? $s : undef; +}; + +*recv_cmd4 = sub ($$$) { + my ($sock, undef, $len) = @_; + vec($_[1] //= '', $len - 1, 8) = 0; + my $cmsghdr = "\0" x msg_controllen_max; # 10 * sizeof(int) + my $iov = pack('P'.TMPL_size_t, $_[1], $len); + my $mh = pack(TMPL_msghdr, + undef, 0, # msg_name, msg_namelen (unused) + $iov, 1, # msg_iov, msg_iovlen + $cmsghdr, # msg_control + msg_controllen_max, + 0); # msg_flags + my $r; + do { + $r = syscall($SYS_recvmsg, fileno($sock), $mh, 0); + } while ($r < 0 && $!{EINTR}); + if ($r < 0) { + $_[1] = ''; + return (undef); + } + substr($_[1], $r, length($_[1]), ''); + my @ret; + if ($r > 0) { + my ($len, $lvl, $type, @fds) = unpack(TMPL_cmsg_len. + 'LL'. # cmsg_level, cmsg_type + CMSG_DATA_off.'i*', # @fds + $cmsghdr); + if ($lvl == SOL_SOCKET && $type == SCM_RIGHTS) { + $len -= CMSG_ALIGN_SIZEOF_cmsghdr; + @ret = @fds[0..(($len / SIZEOF_int) - 1)]; + } + } + @ret; +}; +} + 1; =head1 WARRANTY diff --git a/lib/PublicInbox/TLS.pm b/lib/PublicInbox/TLS.pm index 0f838e25..3ce57f1b 100644 --- a/lib/PublicInbox/TLS.pm +++ b/lib/PublicInbox/TLS.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # IO::Socket::SSL support code @@ -6,6 +6,7 @@ package PublicInbox::TLS; use strict; use IO::Socket::SSL; use PublicInbox::Syscall qw(EPOLLIN EPOLLOUT); +use Carp qw(carp croak); sub err () { $SSL_ERROR } @@ -13,7 +14,32 @@ sub err () { $SSL_ERROR } sub epollbit () { return EPOLLIN if $SSL_ERROR == SSL_WANT_READ; return EPOLLOUT if $SSL_ERROR == SSL_WANT_WRITE; - die "unexpected SSL error: $SSL_ERROR"; + carp "unexpected SSL error: $SSL_ERROR"; + undef; +} + +sub _ctx_new ($) { + my ($tlsd) = @_; + my $ctx = IO::Socket::SSL::SSL_Context->new( + @{$tlsd->{ssl_ctx_opt}}, SSL_server => 1) or + croak "SSL_Context->new: $SSL_ERROR"; + + # save ~34K per idle connection (cf. SSL_CTX_set_mode(3ssl)) + # RSS goes from 346MB to 171MB with 10K idle NNTPS clients on amd64 + # cf. https://rt.cpan.org/Ticket/Display.html?id=129463 + my $mode = eval { Net::SSLeay::MODE_RELEASE_BUFFERS() }; + if ($mode && $ctx->{context}) { + eval { Net::SSLeay::CTX_set_mode($ctx->{context}, $mode) }; + warn "W: $@ (setting SSL_MODE_RELEASE_BUFFERS)\n" if $@; + } + $ctx; +} + +sub start { + my ($io, $tlsd) = @_; + IO::Socket::SSL->start_SSL($io, SSL_server => 1, + SSL_reuse_ctx => ($tlsd->{ssl_ctx} //= _ctx_new($tlsd)), + SSL_startHandshake => 0); } 1; diff --git a/lib/PublicInbox/TailNotify.pm b/lib/PublicInbox/TailNotify.pm new file mode 100644 index 00000000..84340a35 --- /dev/null +++ b/lib/PublicInbox/TailNotify.pm @@ -0,0 +1,97 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# only used for tests at the moment... +package PublicInbox::TailNotify; +use v5.12; +use parent qw(PublicInbox::DirIdle); # not optimal, maybe.. +use PublicInbox::DS qw(now); + +my ($TAIL_MOD, $ino_cls); +if ($^O eq 'linux' && eval { require PublicInbox::Inotify; 1 }) { + $TAIL_MOD = PublicInbox::Inotify::IN_MOVED_TO() | + PublicInbox::Inotify::IN_CREATE() | + PublicInbox::Inotify::IN_MODIFY(); + $ino_cls = 'PublicInbox::Inotify'; +} elsif (eval { require PublicInbox::KQNotify }) { + $TAIL_MOD = PublicInbox::KQNotify::MOVED_TO_OR_CREATE() | + IO::KQueue::NOTE_DELETE() | IO::KQueue::NOTE_RENAME(); + $ino_cls = 'PublicInbox::KQNotify'; +} else { + require PublicInbox::FakeInotify; + $TAIL_MOD = PublicInbox::FakeInotify::MOVED_TO_OR_CREATE() | + PublicInbox::FakeInotify::IN_MODIFY() | + PublicInbox::FakeInotify::IN_DELETE(); +} +require IO::Poll if $ino_cls; + +sub reopen_file ($) { + my ($self) = @_; + + open my $fh, '<', $self->{fn} or return undef; + my @st = stat $fh or die "fstat($self->{fn}): $!"; + $self->{ino_dev} = "@st[0, 1]"; + $self->{inot}->watch($self->{fn}, $TAIL_MOD); + $self->{watch_fh} = $fh; # return value +} + +sub new { + my ($cls, $fn) = @_; + my $self = bless { fn => $fn }, $cls; + if ($ino_cls) { + $self->{inot} = $ino_cls->new or die "E: $ino_cls->new: $!"; + $self->{inot}->blocking(0); + my ($dn) = ($fn =~ m!\A(.+)/+[^/]+\z!); + $self->{inot}->watch($dn // '.', $TAIL_MOD); + } else { + $self->{inot} = PublicInbox::FakeInotify->new; + } + reopen_file($self); + $self->{inot}->watch($fn, $TAIL_MOD); + $self; +} + +sub delete_self { + for (@_) { return 1 if $_->IN_DELETE_SELF } + undef; +} + +sub getlines { + my ($self, $timeo) = @_; + my ($fh, $buf, $rfds, @ret, @events); + my $end = defined($timeo) ? now + $timeo : undef; +again: + while (1) { + @events = $self->{inot}->read; # Linux::Inotify2::read + last if @events; + return () if defined($timeo) && (!$timeo || (now > $end)); + my $wait = 0.1; + if ($ino_cls) { + vec($rfds = '', $self->{inot}->fileno, 1) = 1; + if (defined $end) { + $wait = $end - now; + $wait = 0 if $wait < 0; + } else { + undef $wait; + } + } + select($rfds, undef, undef, $wait); + } + if ($fh = $self->{watch_fh}) { + sysread($fh, $buf, -s $fh) and + push @ret, split(/^/sm, $buf); + my @st = stat($self->{fn}); + if (!@st || "@st[0, 1]" ne $self->{ino_dev} || + delete_self(@events)) { + delete @$self{qw(ino_dev watch_fh)}; + } + } + if ($fh = $self->{watch_fh} // reopen_file($self)) { + sysread($fh, $buf, -s $fh) and + push @ret, split(/^/sm, $buf); + } + goto again if (!@ret && (!defined($end) || now < $end)); + @ret; +} + +1; diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm index 299b9c6a..3a67ab54 100644 --- a/lib/PublicInbox/TestCommon.pm +++ b/lib/PublicInbox/TestCommon.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # internal APIs used only for tests @@ -6,68 +6,197 @@ package PublicInbox::TestCommon; use strict; use parent qw(Exporter); use v5.10.1; -use Fcntl qw(FD_CLOEXEC F_SETFD F_GETFD :seek); +use Fcntl qw(F_SETFD F_GETFD FD_CLOEXEC :seek); use POSIX qw(dup2); use IO::Socket::INET; -our @EXPORT = qw(tmpdir tcp_server tcp_connect require_git require_mods - run_script start_script key2sub xsys xqx eml_load tick - have_xapian_compact); +use File::Spec; +use Scalar::Util qw(isvstring); +use Carp (); +our @EXPORT; +my $lei_loud = $ENV{TEST_LEI_ERR_LOUD}; +our $tail_cmd = $ENV{TAIL}; +our ($lei_opt, $lei_out, $lei_err); +use autodie qw(chdir close fcntl mkdir open opendir seek unlink); +$ENV{XDG_CACHE_HOME} //= "$ENV{HOME}/.cache"; # reuse C++ xap_helper builds + +$_ = File::Spec->rel2abs($_) for (grep(!m!^/!, @INC)); +our $CURRENT_DAEMON; +BEGIN { + @EXPORT = qw(tmpdir tcp_server tcp_connect require_git require_mods + run_script start_script key2sub xsys xsys_e xqx eml_load tick + have_xapian_compact json_utf8 setup_public_inboxes create_inbox + create_dir + create_coderepo require_bsd kernel_version check_broken_tmpfs + quit_waiter_pipe wait_for_eof require_git_http_backend + tcp_host_port test_lei lei lei_ok $lei_out $lei_err $lei_opt + test_httpd xbail require_cmd is_xdeeply tail_f + ignore_inline_c_missing no_pollerfd no_coredump cfg_new + strace strace_inject lsof_pid oct_is); + require Test::More; + my @methods = grep(!/\W/, @Test::More::EXPORT); + eval(join('', map { "*$_=\\&Test::More::$_;" } @methods)); + die $@ if $@; + push @EXPORT, @methods; +} + +sub kernel_version () { + state $version = do { + require POSIX; + my @u = POSIX::uname(); + if ($u[2] =~ /\A([0-9]+(?:\.[0-9]+)+)/) { + eval "v$1"; + } else { + local $" = "', `"; + diag "Unable to get kernel version from: `@u'"; + undef; + } + }; +} + +sub check_broken_tmpfs () { + return if $^O ne 'dragonfly' || kernel_version ge v6.5; + diag 'EVFILT_VNODE + tmpfs is broken on dragonfly <= 6.4 (have: '. + sprintf('%vd', kernel_version).')'; + 1; +} + +sub require_bsd (;$) { + state $ok = ($^O =~ m!\A(?:free|net|open)bsd\z! || + $^O eq 'dragonfly'); + return 1 if $ok; + return if defined(wantarray); + my $m = "$0 is BSD-only (\$^O=$^O)"; + @_ ? skip($m, 1) : plan(skip_all => $m); +} + +sub xbail (@) { BAIL_OUT join(' ', map { ref() ? (explain($_)) : ($_) } @_) } + +sub read_all ($;$$$) { + require PublicInbox::IO; + PublicInbox::IO::read_all($_[0], $_[1], $_[2], $_[3]) +} sub eml_load ($) { my ($path, $cb) = @_; - open(my $fh, '<', $path) or die "open $path: $!"; + open(my $fh, '<', $path); require PublicInbox::Eml; - PublicInbox::Eml->new(\(do { local $/; <$fh> })); + PublicInbox::Eml->new(\(scalar read_all $fh)); } sub tmpdir (;$) { my ($base) = @_; require File::Temp; - unless (defined $base) { - ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!); - } - my $tmpdir = File::Temp->newdir("pi-$base-$$-XXXXXX", TMPDIR => 1); - ($tmpdir->dirname, $tmpdir); + ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!) unless defined $base; + my $tmpdir = File::Temp->newdir("pi-$base-$$-XXXX", TMPDIR => 1); + wantarray ? ($tmpdir->dirname, $tmpdir) : $tmpdir; } sub tcp_server () { - IO::Socket::INET->new( - LocalAddr => '127.0.0.1', + my %opt = ( ReuseAddr => 1, Proto => 'tcp', Type => Socket::SOCK_STREAM(), Listen => 1024, Blocking => 0, - ) or Test::More::BAIL_OUT("failed to create TCP server: $!"); + ); + eval { + die 'IPv4-only' if $ENV{TEST_IPV4_ONLY}; + my $pkg; + for (qw(IO::Socket::IP IO::Socket::INET6)) { + eval "require $_" or next; + $pkg = $_ and last; + } + $pkg->new(%opt, LocalAddr => '[::1]'); + } || eval { + die 'IPv6-only' if $ENV{TEST_IPV6_ONLY}; + IO::Socket::INET->new(%opt, LocalAddr => '127.0.0.1') + } || BAIL_OUT "failed to create TCP server: $! ($@)"; +} + +sub tcp_host_port ($) { + my ($s) = @_; + my ($h, $p) = ($s->sockhost, $s->sockport); + my $ipv4 = $s->sockdomain == Socket::AF_INET(); + if (wantarray) { + $ipv4 ? ($h, $p) : ("[$h]", $p); + } else { + $ipv4 ? "$h:$p" : "[$h]:$p"; + } } sub tcp_connect { my ($dest, %opt) = @_; - my $addr = $dest->sockhost . ':' . $dest->sockport; - my $s = IO::Socket::INET->new( + my $addr = tcp_host_port($dest); + my $s = ref($dest)->new( Proto => 'tcp', Type => Socket::SOCK_STREAM(), PeerAddr => $addr, %opt, - ) or Test::More::BAIL_OUT("failed to connect to $addr: $!"); + ) or BAIL_OUT "failed to connect to $addr: $!"; $s->autoflush(1); $s; } +sub require_cmd ($;$) { + my ($cmd, $nr) = @_; + require PublicInbox::Spawn; + state %CACHE; + my $bin = $CACHE{$cmd} //= PublicInbox::Spawn::which($cmd); + return $bin if $bin; + return plan(skip_all => "$cmd missing from PATH for $0") if !$nr; + defined(wantarray) ? undef : skip("$cmd missing", $nr); +} + +sub have_xapian_compact (;$) { + require_cmd($ENV{XAPIAN_COMPACT} || 'xapian-compact', @_ ? $_[0] : ()); +} + sub require_git ($;$) { - my ($req, $maybe) = @_; - my ($req_maj, $req_min, $req_sub) = split(/\./, $req); - my ($cur_maj, $cur_min, $cur_sub) = (xqx([qw(git --version)]) - =~ /version (\d+)\.(\d+)(?:\.(\d+))?/); - - my $req_int = ($req_maj << 24) | ($req_min << 16) | ($req_sub // 0); - my $cur_int = ($cur_maj << 24) | ($cur_min << 16) | ($cur_sub // 0); - if ($cur_int < $req_int) { - return 0 if $maybe; - Test::More::plan(skip_all => - "git $req+ required, have $cur_maj.$cur_min.$cur_sub"); + my ($req, $nr) = @_; + require PublicInbox::Git; + state $cur_vstr = PublicInbox::Git::git_version(); + $req = eval("v$req") unless isvstring($req); + + return 1 if $cur_vstr ge $req; + state $cur_ver = sprintf('%vd', $cur_vstr); + return plan skip_all => "git $req+ required, have $cur_ver" if !$nr; + defined(wantarray) ? undef : + skip("git $req+ required (have $cur_ver)", $nr) +} + +sub require_git_http_backend (;$) { + my ($nr) = @_; + state $ok = do { + require PublicInbox::Git; + my $git = PublicInbox::Git::git_exe() or plan + skip_all => 'nothing in public-inbox works w/o git'; + my $rdr = { 1 => \my $out, 2 => \my $err }; + xsys([$git, qw(http-backend)], undef, $rdr); + $out =~ /^Status:/ism; + }; + if (!$ok) { + my $msg = "`git http-backend' not available"; + defined($nr) ? skip $msg, $nr : plan skip_all => $msg; } - 1; + $ok; +} + +my %IPv6_VERSION = ( + 'Net::NNTP' => 3.00, + 'Mail::IMAPClient' => 3.40, + 'HTTP::Tiny' => 0.042, + 'Net::POP3' => 2.32, +); + +sub need_accept_filter ($) { + my ($af) = @_; + return if $^O eq 'netbsd'; # since NetBSD 5.0, no kldstat needed + $^O =~ /\A(?:freebsd|dragonfly)\z/ or + skip 'SO_ACCEPTFILTER is FreeBSD/NetBSD/Dragonfly-only so far', + 1; + state $tried = {}; + ($tried->{$af} //= system("kldstat -m $af >/dev/null")) and + skip "$af not loaded: kldload $af", 1; } sub require_mods { @@ -75,16 +204,41 @@ sub require_mods { my $maybe = pop @mods if $mods[-1] =~ /\A[0-9]+\z/; my @need; while (my $mod = shift(@mods)) { - if ($mod eq 'Search::Xapian') { + if ($mod eq 'lei') { + require_git(2.6, $maybe ? $maybe : ()); + push @mods, qw(DBD::SQLite Xapian +SCM_RIGHTS); + $mod = 'json'; # fall-through + } + if ($mod eq 'json') { + $mod = 'Cpanel::JSON::XS||JSON::MaybeXS||JSON||JSON::PP' + } elsif ($mod eq '-httpd') { + push @mods, qw(Plack::Builder Plack::Util); + next; + } elsif ($mod eq '-imapd') { + push @mods, qw(Parse::RecDescent DBD::SQLite); + next; + } elsif ($mod eq '-nntpd' || $mod eq 'v2') { + push @mods, qw(DBD::SQLite); + next; + } + if ($mod eq 'Xapian') { if (eval { require PublicInbox::Search } && PublicInbox::Search::load_xapian()) { next; } - } elsif ($mod eq 'Search::Xapian::WritableDatabase') { - if (eval { require PublicInbox::SearchIdx } && - PublicInbox::SearchIdx::load_xapian_writable()){ - next; + } elsif ($mod eq '+SCM_RIGHTS') { + if (my $msg = need_scm_rights()) { + push @need, $msg; + next; } + } elsif ($mod eq ':fcntl_lock') { + next if $^O eq 'linux' || require_bsd; + diag "untested platform: $^O, ". + "requiring File::FcntlLock..."; + push @mods, 'File::FcntlLock'; + } elsif ($mod =~ /\A\+(accf_.*)\z/) { + need_accept_filter($1); + next } elsif (index($mod, '||') >= 0) { # "Foo||Bar" my $ok; for my $m (split(/\Q||\E/, $mod)) { @@ -98,6 +252,7 @@ sub require_mods { eval "require $mod"; } if ($@) { + diag "require $mod: $@" if $mod =~ /Gcf2/; push @need, $mod; } elsif ($mod eq 'IO::Socket::SSL' && # old versions of IO::Socket::SSL aren't supported @@ -106,11 +261,15 @@ sub require_mods { !eval{ IO::Socket::SSL->VERSION(2.007); 1 }) { push @need, $@; } + if (defined(my $v = $IPv6_VERSION{$mod})) { + $ENV{TEST_IPV4_ONLY} = 1 if !eval { $mod->VERSION($v) }; + } } return unless @need; my $m = join(', ', @need)." missing for $0"; - Test::More::skip($m, $maybe) if $maybe; - Test::More::plan(skip_all => $m) + $m =~ s/\bEmail::MIME\b/Email::MIME (development purposes only)/; + skip($m, $maybe) if $maybe; + plan(skip_all => $m) } sub key2script ($) { @@ -122,8 +281,8 @@ sub key2script ($) { 'blib/script/'.$key; } -my @io_mode = ([ *STDIN{IO}, '<&' ], [ *STDOUT{IO}, '>&' ], - [ *STDERR{IO}, '>&' ]); +my @io_mode = ([ *STDIN{IO}, '+<&' ], [ *STDOUT{IO}, '+>&' ], + [ *STDERR{IO}, '+>&' ]); sub _prepare_redirects ($) { my ($fhref) = @_; @@ -131,9 +290,9 @@ sub _prepare_redirects ($) { for (my $fd = 0; $fd <= $#io_mode; $fd++) { my $fh = $fhref->[$fd] or next; my ($oldfh, $mode) = @{$io_mode[$fd]}; - open my $orig, $mode, $oldfh or die "$$oldfh $mode stash: $!"; + open(my $orig, $mode, $oldfh); $orig_io->[$fd] = $orig; - open $oldfh, $mode, $fh or die "$$oldfh $mode redirect: $!"; + open $oldfh, $mode, $fh; } $orig_io; } @@ -143,7 +302,7 @@ sub _undo_redirects ($) { for (my $fd = 0; $fd <= $#io_mode; $fd++) { my $fh = $orig_io->[$fd] or next; my ($oldfh, $mode) = @{$io_mode[$fd]}; - open $oldfh, $mode, $fh or die "$$oldfh $mode redirect: $!"; + open $oldfh, $mode, $fh; } } @@ -164,13 +323,13 @@ sub run_script_exit { die RUN_SCRIPT_EXIT; } -my %cached_scripts; +our %cached_scripts; sub key2sub ($) { my ($key) = @_; $cached_scripts{$key} //= do { my $f = key2script($key); - open my $fh, '<', $f or die "open $f: $!"; - my $str = do { local $/; <$fh> }; + open my $fh, '<', $f; + my $str = read_all($fh); my $pkg = (split(m!/!, $f))[-1]; $pkg =~ s/([a-z])([a-z0-9]+)(\.t)?\z/\U$1\E$2/; $pkg .= "_T" if $3; @@ -185,7 +344,7 @@ use subs qw(exit); sub main { # the below "line" directive is a magic comment, see perlsyn(1) manpage # line 1 "$f" -$str +{ $str } 0; } 1; @@ -214,59 +373,107 @@ sub _run_sub ($$$) { } } +sub no_coredump (@) { + my @dirs = @_; + my $cwdfh; + opendir($cwdfh, '.') if @dirs; + my @found; + for (@dirs, '.') { + chdir $_; + my @cores = glob('core.* *.core'); + push @cores, 'core' if -d 'core'; + push(@found, "@cores found in $_") if @cores; + chdir $cwdfh if $cwdfh; + } + return if !@found; # keep it quiet. + is(scalar(@found), 0, 'no core dumps found'); + diag(join("\n", @found) . Carp::longmess()); + if (-t STDIN) { + diag 'press ENTER to continue, (q) to quit'; + chomp(my $line = <STDIN>); + xbail 'user quit' if $line =~ /\Aq/; + } +} + sub run_script ($;$$) { my ($cmd, $env, $opt) = @_; + no_coredump($opt->{-C} ? ($opt->{-C}) : ()); my ($key, @argv) = @$cmd; my $run_mode = $ENV{TEST_RUN_MODE} // $opt->{run_mode} // 1; my $sub = $run_mode == 0 ? undef : key2sub($key); my $fhref = []; my $spawn_opt = {}; + my @tail_paths; + local $tail_cmd = $tail_cmd; for my $fd (0..2) { my $redir = $opt->{$fd}; my $ref = ref($redir); if ($ref eq 'SCALAR') { - open my $fh, '+>', undef or die "open: $!"; + my $fh; + if ($ENV{TAIL_ALL} && $fd > 0) { + # tail -F is better, but not portable :< + $tail_cmd //= 'tail -f'; + require File::Temp; + $fh = File::Temp->new("fd.$fd-XXXX", TMPDIR=>1); + push @tail_paths, $fh->filename; + } else { + open $fh, '+>', undef; + } + $fh or xbail $!; $fhref->[$fd] = $fh; $spawn_opt->{$fd} = $fh; next if $fd > 0; $fh->autoflush(1); print $fh $$redir or die "print: $!"; - seek($fh, 0, SEEK_SET) or die "seek: $!"; + seek($fh, 0, SEEK_SET); } elsif ($ref eq 'GLOB') { $spawn_opt->{$fd} = $fhref->[$fd] = $redir; } elsif ($ref) { die "unable to deal with $ref $redir"; } } + my $tail = @tail_paths ? tail_f(@tail_paths, $opt) : undef; + if ($key =~ /-(index|cindex|extindex|convert|xcpdb)\z/) { + unshift @argv, '--no-fsync'; + } if ($run_mode == 0) { # spawn an independent new process, like real-world use cases: require PublicInbox::Spawn; my $cmd = [ key2script($key), @argv ]; - my $pid = PublicInbox::Spawn::spawn($cmd, $env, $spawn_opt); - if (defined $pid) { - my $r = waitpid($pid, 0); - defined($r) or die "waitpid: $!"; - $r == $pid or die "waitpid: expected $pid, got $r"; + if (my $d = $opt->{'-C'}) { + $cmd->[0] = File::Spec->rel2abs($cmd->[0]); + $spawn_opt->{'-C'} = $d; } + PublicInbox::Spawn::run_wait($cmd, $env, $spawn_opt); } else { # localize and run everything in the same process: # note: "local *STDIN = *STDIN;" and so forth did not work in # old versions of perl + my $umask = umask; local %ENV = $env ? (%ENV, %$env) : %ENV; - local %SIG = %SIG; + local @SIG{keys %SIG} = map { undef } values %SIG; + local $SIG{FPE} = 'IGNORE'; # Perl default local $0 = join(' ', @$cmd); my $orig_io = _prepare_redirects($fhref); + opendir(my $cwdfh, '.'); + chdir $opt->{-C} if defined $opt->{-C}; _run_sub($sub, $key, \@argv); + # n.b. all our uses of PublicInbox::DS should be fine + # with this and we can't Reset here. + chdir($cwdfh); _undo_redirects($orig_io); + select STDOUT; + umask($umask); } + { local $?; undef $tail }; # slurp the redirects back into user-supplied strings for my $fd (1..2) { my $fh = $fhref->[$fd] or next; - seek($fh, 0, SEEK_SET) or die "seek: $!"; - my $redir = $opt->{$fd}; - local $/; - $$redir = <$fh>; + next unless -f $fh; + seek($fh, 0, SEEK_SET); + ${$opt->{$fd}} = read_all($fh); } + no_coredump($opt->{-C} ? ($opt->{-C}) : ()); $? == 0; } @@ -276,17 +483,17 @@ sub tick (;$) { 1; } -sub wait_for_tail ($;$) { +sub wait_for_tail { my ($tail_pid, $want) = @_; - my $wait = 2; + my $wait = 2; # "tail -F" sleeps 1.0s at-a-time w/o inotify/kevent if ($^O eq 'linux') { # GNU tail may use inotify state $tail_has_inotify; - return tick if $want < 0 && $tail_has_inotify; - my $end = time + $wait; + return tick if !$want && $tail_has_inotify; # before TERM + my $end = time + $wait; # wait for startup: my @ino; do { @ino = grep { - readlink($_) =~ /\binotify\b/ + (readlink($_) // '') =~ /\binotify\b/ } glob("/proc/$tail_pid/fd/*"); } while (!@ino && time <= $end and tick); return if !@ino; @@ -294,7 +501,7 @@ sub wait_for_tail ($;$) { $ino[0] =~ s!/fd/!/fdinfo/!; my @info; do { - if (open my $fh, '<', $ino[0]) { + if (CORE::open(my $fh, '<', $ino[0])) { local $/ = "\n"; @info = grep(/^inotify wd:/, <$fh>); } @@ -318,6 +525,11 @@ sub xsys { $? >> 8 } +sub xsys_e { # like "/bin/sh -e" + xsys(@_) == 0 or + BAIL_OUT (ref $_[0] ? "@{$_[0]}" : "@_"). " failed \$?=$?" +} + # like `backtick` or qx{} op, but uses spawn() for env/rdr + vfork sub xqx { my ($cmd, $env, $rdr) = @_; @@ -326,13 +538,38 @@ sub xqx { wantarray ? split(/^/m, $out) : $out; } +sub tail_f (@) { + my @f = grep(defined, @_); + $tail_cmd or return; # "tail -F" or "tail -f" + my $opt = (ref($f[-1]) eq 'HASH') ? pop(@f) : {}; + my $clofork = $opt->{-CLOFORK} // []; + my @cfmap = map { + my $fl = fcntl($_, F_GETFD, 0); + fcntl($_, F_SETFD, $fl | FD_CLOEXEC) unless $fl & FD_CLOEXEC; + ($_, $fl); + } @$clofork; + for (@f) { open(my $fh, '>>', $_) }; + my $cmd = [ split(/ /, $tail_cmd), @f ]; + require PublicInbox::Spawn; + my $pid = PublicInbox::Spawn::spawn($cmd, undef, { 1 => 2 }); + while (my ($io, $fl) = splice(@cfmap, 0, 2)) { + fcntl($io, F_SETFD, $fl); + } + wait_for_tail($pid, scalar @f); + require PublicInbox::AutoReap; + PublicInbox::AutoReap->new($pid, \&wait_for_tail); +} + sub start_script { my ($cmd, $env, $opt) = @_; my ($key, @argv) = @$cmd; my $run_mode = $ENV{TEST_RUN_MODE} // $opt->{run_mode} // 2; my $sub = $run_mode == 0 ? undef : key2sub($key); - my $tail_pid; - if (my $tail_cmd = $ENV{TAIL}) { + my $tail; + my @xh = split(/\s+/, $ENV{TEST_DAEMON_XH} // ''); + @xh = () if $key !~ /-(?:imapd|netd|httpd|pop3d|nntpd)\z/; + push @argv, @xh; + if ($tail_cmd) { my @paths; for (@argv) { next unless /\A--std(?:err|out)=(.+)\z/; @@ -350,34 +587,27 @@ sub start_script { } } } - if (@paths) { - defined($tail_pid = fork) or die "fork: $!\n"; - if ($tail_pid == 0) { - # make sure files exist, first - open my $fh, '>>', $_ for @paths; - open(STDOUT, '>&STDERR') or die "1>&2: $!"; - exec(split(' ', $tail_cmd), @paths); - die "$tail_cmd failed: $!"; - } - wait_for_tail($tail_pid, scalar @paths); - } + $tail = tail_f(@paths, $opt); } - defined(my $pid = fork) or die "fork: $!\n"; + require PublicInbox::DS; + my $oset = PublicInbox::DS::block_signals(); + require PublicInbox::OnDestroy; + my $tmp_mask = PublicInbox::OnDestroy::all( + \&PublicInbox::DS::sig_setmask, $oset); + my $pid = PublicInbox::DS::fork_persist(); if ($pid == 0) { - eval { PublicInbox::DS->Reset }; + close($_) for (@{delete($opt->{-CLOFORK}) // []}); # pretend to be systemd (cf. sd_listen_fds(3)) # 3 == SD_LISTEN_FDS_START my $fd; - for ($fd = 0; 1; $fd++) { - my $s = $opt->{$fd}; - last if $fd >= 3 && !defined($s); - next unless $s; - my $fl = fcntl($s, F_GETFD, 0); - if (($fl & FD_CLOEXEC) != FD_CLOEXEC) { - warn "got FD:".fileno($s)." w/o CLOEXEC\n"; + for ($fd = 0; $fd < 3 || defined($opt->{$fd}); $fd++) { + my $io = $opt->{$fd} // next; + my $old = fileno($io); + if ($old == $fd) { + fcntl($io, F_SETFD, 0); + } else { + dup2($old, $fd) // die "dup2($old, $fd): $!"; } - fcntl($s, F_SETFD, $fl &= ~FD_CLOEXEC); - dup2(fileno($s), $fd) or die "dup2 failed: $!\n"; } %ENV = (%ENV, %$env) if $env; my $fds = $fd - 3; @@ -385,9 +615,12 @@ sub start_script { $ENV{LISTEN_PID} = $$; $ENV{LISTEN_FDS} = $fds; } - $0 = join(' ', @$cmd); + if ($opt->{-C}) { chdir($opt->{-C}) } + $0 = join(' ', @$cmd, @xh); + local @SIG{keys %SIG} = map { undef } values %SIG; + local $SIG{FPE} = 'IGNORE'; # Perl default + undef $tmp_mask; if ($sub) { - eval { PublicInbox::DS->Reset }; _run_sub($sub, $key, \@argv); POSIX::_exit($? >> 8); } else { @@ -395,48 +628,430 @@ sub start_script { die "FAIL: ",join(' ', $key, @argv), ": $!\n"; } } - PublicInboxTestProcess->new($pid, $tail_pid); + undef $tmp_mask; + require PublicInbox::AutoReap; + my $td = PublicInbox::AutoReap->new($pid); + $td->{-extra} = $tail; + $td; +} + +# favor lei() or lei_ok() over $lei for new code +sub lei (@) { + my ($cmd, $env, $xopt) = @_; + $lei_out = $lei_err = ''; + if (!ref($cmd)) { + ($env, $xopt) = grep { (!defined) || ref } @_; + $cmd = [ grep { defined && !ref } @_ ]; + } + my $res = run_script(['lei', @$cmd], $env, $xopt // $lei_opt); + if ($lei_err ne '') { + if ($lei_err =~ /Use of uninitialized/ || + $lei_err =~ m!\bArgument .*? isn't numeric in !) { + fail "lei_err=$lei_err"; + } else { + diag "lei_err=$lei_err" if $lei_loud; + } + } + $res; +}; + +sub lei_ok (@) { + state $PWD = $ENV{PWD} // Cwd::getcwd(); + my $msg = ref($_[-1]) eq 'SCALAR' ? pop(@_) : undef; + my $tmpdir = quotemeta(File::Spec->tmpdir); + # filter out anything that looks like a path name for consistent logs + my @msg = ref($_[0]) eq 'ARRAY' ? @{$_[0]} : @_; + if (!$lei_loud) { + for (@msg) { + s!(127\.0\.0\.1|\[::1\]):(?:\d+)!$1:\$PORT!g; + s!$tmpdir\b/(?:[^/]+/)?!\$TMPDIR/!g; + s!\Q$PWD\E\b!\$PWD!g; + } + } + ok(lei(@_), "lei @msg". ($msg ? " ($$msg)" : '')) or + diag "\$?=$? err=$lei_err"; +} + +sub json_utf8 () { + state $x = ref(PublicInbox::Config->json)->new->utf8->canonical; +} + +sub is_xdeeply ($$$) { + my ($x, $y, $desc) = @_; + my $ok = is_deeply($x, $y, $desc); + diag explain([$x, '!=', $y]) if !$ok; + $ok; +} + +sub ignore_inline_c_missing { + $_[0] = join('', grep(/\S/, grep(!/compilation aborted/, + grep(!/\bInline\b/, split(/^/m, $_[0]))))); +} + +sub need_scm_rights () { + state $ok = PublicInbox::Spawn->can('send_cmd4') || do { + require PublicInbox::Syscall; + PublicInbox::Syscall->can('send_cmd4'); # Linux only + } || eval { require Socket::MsgHdr; 1 }; + return if $ok; + 'need SCM_RIGHTS support: Inline::C unconfigured/missing '. + '(mkdir -p ~/.cache/public-inbox/inline-c) OR Socket::MsgHdr missing'; } -sub have_xapian_compact () { +# returns a pipe with FD_CLOEXEC disabled on the write-end +sub quit_waiter_pipe () { + pipe(my $r, my $w); + fcntl($w, F_SETFD, fcntl($w, F_GETFD, 0) & ~FD_CLOEXEC); + ($r, $w); +} + +sub wait_for_eof ($$;$) { + my ($io, $msg, $sec) = @_; + vec(my $rset = '', fileno($io), 1) = 1; + ok(select($rset, undef, undef, $sec // 9), "$msg (select)"); + is(my $line = <$io>, undef, "$msg EOF"); +} + +sub test_lei { +SKIP: { + my ($cb) = pop @_; + my $test_opt = shift // {}; + require_git(2.6, 1); + my $mods = $test_opt->{mods} // [ 'lei' ]; + require_mods(@$mods, 2); + + # set PERL_INLINE_DIRECTORY before clobbering XDG_CACHE_HOME require PublicInbox::Spawn; - # $ENV{XAPIAN_COMPACT} is used by PublicInbox/Xapcmd.pm, too - PublicInbox::Spawn::which($ENV{XAPIAN_COMPACT} || 'xapian-compact'); + require PublicInbox::Config; + require File::Path; + eval { # use XDG_CACHE_HOME, first: + require PublicInbox::XapHelperCxx; + PublicInbox::XapHelperCxx::check_build(); + }; + local %ENV = %ENV; + delete $ENV{XDG_DATA_HOME}; + delete $ENV{XDG_CONFIG_HOME}; + delete $ENV{XDG_CACHE_HOME}; + $ENV{GIT_COMMITTER_EMAIL} = 'lei@example.com'; + $ENV{GIT_COMMITTER_NAME} = 'lei user'; + $ENV{LANG} = $ENV{LC_ALL} = 'C'; + my (undef, $fn, $lineno) = caller(0); + my $t = "$fn:$lineno"; + $lei_opt = { 1 => \$lei_out, 2 => \$lei_err }; + my ($daemon_pid, $for_destroy, $daemon_xrd); + my $tmpdir = $test_opt->{tmpdir}; + File::Path::mkpath($tmpdir) if defined $tmpdir; + ($tmpdir, $for_destroy) = tmpdir unless $tmpdir; + my ($dead_r, $dead_w); + state $persist_xrd = $ENV{TEST_LEI_DAEMON_PERSIST_DIR}; + SKIP: { + $ENV{TEST_LEI_ONESHOT} and + xbail 'TEST_LEI_ONESHOT no longer supported'; + my $home = "$tmpdir/lei-daemon"; + mkdir($home, 0700); + local $ENV{HOME} = $home; + my $persist; + if ($persist_xrd && !$test_opt->{daemon_only}) { + $persist = $daemon_xrd = $persist_xrd; + } else { + $daemon_xrd = "$home/xdg_run"; + mkdir($daemon_xrd, 0700); + ($dead_r, $dead_w) = quit_waiter_pipe; + } + local $ENV{XDG_RUNTIME_DIR} = $daemon_xrd; + $cb->(); # likely shares $dead_w with lei-daemon + undef $dead_w; # so select() wakes up when daemon dies + if ($persist) { # remove before ~/.local gets removed + File::Path::rmtree([glob("$home/*")]); + File::Path::rmtree("$home/.config"); + } else { + no_coredump $tmpdir; + lei_ok(qw(daemon-pid), \"daemon-pid after $t"); + chomp($daemon_pid = $lei_out); + if (!$daemon_pid) { + fail("daemon not running after $t"); + skip 'daemon died unexpectedly', 2; + } + ok(kill(0, $daemon_pid), "daemon running after $t"); + lei_ok(qw(daemon-kill), \"daemon-kill after $t"); + } + }; # SKIP for lei_daemon + if ($daemon_pid) { + wait_for_eof($dead_r, 'daemon quit pipe'); + no_coredump $tmpdir; + my $f = "$daemon_xrd/lei/errors.log"; + open my $fh, '<', $f; + my @l = <$fh>; + is_xdeeply(\@l, [], + "$t daemon XDG_RUNTIME_DIR/lei/errors.log empty"); + } +}; # SKIP if missing git 2.6+ || Xapian || SQLite || json +} # /test_lei + +# returns the pathname to a ~/.public-inbox/config in scalar context, +# ($test_home, $pi_config_pathname) in list context +sub setup_public_inboxes () { + my $test_home = "t/home2"; + my $pi_config = "$test_home/.public-inbox/config"; + my $stamp = "$test_home/setup-stamp"; + my @ret = ($test_home, $pi_config); + return @ret if -f $stamp; + + require PublicInbox::Lock; + my $lk = PublicInbox::Lock->new("$test_home/setup.lock"); + my $end = $lk->lock_for_scope; + return @ret if -f $stamp; + + local $ENV{PI_CONFIG} = $pi_config; + for my $V (1, 2) { + run_script([qw(-init --skip-docdata), "-V$V", + '--newsgroup', "t.v$V", "t$V", + "$test_home/t$V", "http://example.com/t$V", + "t$V\@example.com" ]) or xbail "init v$V"; + unlink "$test_home/t$V/description"; + } + require PublicInbox::Config; + require PublicInbox::InboxWritable; + my $cfg = PublicInbox::Config->new; + my $seen = 0; + $cfg->each_inbox(sub { + my ($ibx) = @_; + $ibx->{-no_fsync} = 1; + my $im = PublicInbox::InboxWritable->new($ibx)->importer(0); + my $V = $ibx->version; + my @eml = (glob('t/*.eml'), 't/data/0001.patch'); + for (@eml) { + next if $_ eq 't/psgi_v2-old.eml'; # dup mid + $im->add(eml_load($_)) or BAIL_OUT "v$V add $_"; + $seen++; + } + $im->done; + }); + $seen or BAIL_OUT 'no imports'; + open my $fh, '>', $stamp; + @ret; } -package PublicInboxTestProcess; -use strict; +our %COMMIT_ENV = ( + GIT_AUTHOR_NAME => 'A U Thor', + GIT_COMMITTER_NAME => 'C O Mitter', + GIT_AUTHOR_EMAIL => 'a@example.com', + GIT_COMMITTER_EMAIL => 'c@example.com', +); -# prevent new threads from inheriting these objects -sub CLONE_SKIP { 1 } +# for memoizing based on coderefs and various create_* params +sub my_sum { + require PublicInbox::SHA; + require Data::Dumper; + my $d = Data::Dumper->new(\@_); + $d->$_(1) for qw(Deparse Sortkeys Terse); + my @l = split /\n/s, $d->Dump; + @l = grep !/\$\^H\{.+?[A-Z]+\(0x[0-9a-f]+\)/, @l; # autodie addresses + my @addr = grep /[A-Za-z]+\(0x[0-9a-f]+\)/, @l; + xbail 'undumpable addresses: ', \@addr if @addr; + substr PublicInbox::SHA::sha256_hex(join('', @l)), 0, 8; +} -sub new { - my ($klass, $pid, $tail_pid) = @_; - bless { pid => $pid, tail_pid => $tail_pid, owner => $$ }, $klass; +sub create_dir (@) { + my ($ident, $cb) = (shift, pop); + my %opt = @_; + require PublicInbox::Lock; + require PublicInbox::Import; + my $tmpdir = delete $opt{tmpdir}; + my ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!); + my $dir = "t/data-gen/$base.$ident-".my_sum($cb, \%opt); + require File::Path; + my $new = File::Path::make_path($dir); + my $lk = PublicInbox::Lock->new("$dir/creat.lock"); + my $scope = $lk->lock_for_scope; + if (!-f "$dir/creat.stamp") { + opendir(my $cwd, '.'); + chdir($dir); + local %ENV = (%ENV, %COMMIT_ENV); + $cb->($dir); + chdir($cwd); # some $cb chdir around + open my $s, '>', "$dir/creat.stamp"; + } + return $dir if !defined($tmpdir); + xsys_e([qw(/bin/cp -Rp), $dir, $tmpdir]); + $tmpdir; } -sub kill { - my ($self, $sig) = @_; - CORE::kill($sig // 'TERM', $self->{pid}); +sub create_coderepo (@) { + my $ident = shift; + require PublicInbox::Import; + my ($db) = (PublicInbox::Import::default_branch() =~ m!([^/]+)\z!); + create_dir "$ident-$db", @_; } -sub join { - my ($self, $sig) = @_; - my $pid = delete $self->{pid} or return; - CORE::kill($sig, $pid) if defined $sig; - my $ret = waitpid($pid, 0); - defined($ret) or die "waitpid($pid): $!"; - $ret == $pid or die "waitpid($pid) != $ret"; +sub create_inbox ($;@) { + my $ident = shift; + my $cb = pop; + my %opt = @_; + require PublicInbox::Lock; + require PublicInbox::InboxWritable; + require PublicInbox::Import; + my ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!); + my ($db) = (PublicInbox::Import::default_branch() =~ m!([^/]+)\z!); + my $tmpdir = delete $opt{tmpdir}; + my $dir = "t/data-gen/$base.$ident-".my_sum($db, $cb, \%opt); + require File::Path; + my $new = File::Path::make_path($dir); + my $lk = PublicInbox::Lock->new("$dir/creat.lock"); + $opt{inboxdir} = File::Spec->rel2abs($dir); + $opt{name} //= $ident; + my $scope = $lk->lock_for_scope; + my $pre_cb = delete $opt{pre_cb}; + $pre_cb->($dir) if $pre_cb && $new; + $opt{-no_fsync} = 1; + my $no_gc = delete $opt{-no_gc}; + my $addr = $opt{address} // []; + $opt{-primary_address} //= $addr->[0] // "$ident\@example.com"; + my $parallel = delete($opt{importer_parallel}) // 0; + my $creat_opt = { nproc => delete($opt{nproc}) // 1 }; + my $ibx = PublicInbox::InboxWritable->new({ %opt }, $creat_opt); + if (!-f "$dir/creat.stamp") { + my $im = $ibx->importer($parallel); + $cb->($im, $ibx); + $im->done if $im; + unless ($no_gc) { + my @to_gc = $ibx->version == 1 ? ($ibx->{inboxdir}) : + glob("$ibx->{inboxdir}/git/*.git"); + for my $dir (@to_gc) { + xsys_e([ qw(git gc -q) ], { GIT_DIR => $dir }); + } + } + open my $s, '>', "$dir/creat.stamp"; + } + if ($tmpdir) { + undef $ibx; + xsys([qw(/bin/cp -Rp), $dir, $tmpdir]) == 0 or + BAIL_OUT "cp $dir $tmpdir"; + $opt{inboxdir} = $tmpdir; + $ibx = PublicInbox::InboxWritable->new(\%opt); + } + $ibx; } -sub DESTROY { - my ($self) = @_; - return if $self->{owner} != $$; - if (my $tail_pid = delete $self->{tail_pid}) { - PublicInbox::TestCommon::wait_for_tail($tail_pid, -1); - CORE::kill('TERM', $tail_pid); +sub test_httpd ($$;$$) { + my ($env, $client, $skip, $cb) = @_; + my ($tmpdir, $for_destroy); + $env->{TMPDIR} //= do { + ($tmpdir, $for_destroy) = tmpdir(); + $tmpdir; + }; + for (qw(PI_CONFIG)) { $env->{$_} or BAIL_OUT "$_ unset" } + SKIP: { + require_mods(qw(Plack::Test::ExternalServer LWP::UserAgent), + $skip // 1); + my $sock = tcp_server() or die; + my ($out, $err) = map { "$env->{TMPDIR}/std$_.log" } qw(out err); + my $cmd = [ qw(-httpd -W0), "--stdout=$out", "--stderr=$err" ]; + my $td = start_script($cmd, $env, { 3 => $sock }); + my ($h, $p) = tcp_host_port($sock); + local $ENV{PLACK_TEST_EXTERNALSERVER_URI} = "http://$h:$p"; + my $ua = LWP::UserAgent->new; + $ua->max_redirect(0); + local $CURRENT_DAEMON = $td; + Plack::Test::ExternalServer::test_psgi(client => $client, + ua => $ua); + $cb->() if $cb; + $td->join('TERM'); + open my $fh, '<', $err; + my $e = read_all($fh); + if ($e =~ s/^Plack::Middleware::ReverseProxy missing,\n//gms) { + $e =~ s/^URL generation for redirects .*\n//gms; + } + is($e, '', 'no errors'); + } +}; + +# TODO: support fstat(1) on OpenBSD, lsof already works on FreeBSD + Linux +# don't use this for deleted file checks, we only check that on Linux atm +# and we can readlink /proc/PID/fd/* directly +sub lsof_pid ($;$) { + my ($pid, $rdr) = @_; + state $lsof = require_cmd('lsof', 1); + $lsof or skip 'lsof missing/broken', 1; + my @out = xqx([$lsof, '-p', $pid], undef, $rdr); + if ($?) { + undef $lsof; + skip "lsof -p PID broken \$?=$?", 1; + } + my @cols = split ' ', $out[0]; + if (($cols[7] // '') eq 'NODE') { # normal lsof + @out; + } else { # busybox lsof ignores -p, so we DIY it + grep /\b$pid\b/, @out; } - $self->join('TERM'); +} + +sub no_pollerfd ($) { + my ($pid) = @_; + my ($re, @cmd); + $^O eq 'linux' and + ($re, @cmd) = (qr/\Q[eventpoll]\E/, qw(lsof -p), $pid); + # n.b. *BSDs uses kqueue to emulate signalfd and/or inotify, + # and we can't distinguish which is which easily. + SKIP: { + (@cmd && $re) or + skip 'open poller test is Linux-only', 1; + my $bin = require_cmd($cmd[0], 1) or skip "$cmd[0] missing", 1; + $cmd[0] = $bin; + my @of = xqx(\@cmd, {}, {2 => \(my $e)}); + my $err = $?; + skip "$bin broken? (\$?=$err) ($e)", 1 if $err; + @of = grep /\b$pid\b/, @of; # busybox lsof ignores -p + is(grep(/$re/, @of), 0, "no $re FDs") or diag explain(\@of); + } +} + +sub cfg_new ($;@) { + my ($tmpdir, @body) = @_; + require PublicInbox::Config; + my $f = "$tmpdir/tmp_cfg"; + open my $fh, '>', $f; + print $fh @body; + close $fh; + PublicInbox::Config->new($f); +} + +our $strace_cmd; +sub strace (@) { + my ($for_daemon) = @_; + skip 'linux only test', 1 if $^O ne 'linux'; + if ($for_daemon) { + my $f = '/proc/sys/kernel/yama/ptrace_scope'; + # TODO: we could fiddle with prctl in the daemon to make + # things work, but I'm not sure it's worth it... + state $ps = do { + my $fh; + CORE::open($fh, '<', $f) ? readline($fh) : 0; + }; + chomp $ps; + skip "strace unusable on daemons\n$f is `$ps' (!= 0)", 1 if $ps; + } + require_cmd('strace', 1) or skip 'strace not available', 1; +} + +sub strace_inject (;$) { + my $cmd = strace(@_); + state $ver = do { + require PublicInbox::Spawn; + my $v = PublicInbox::Spawn::run_qx([$cmd, '-V']); + $v =~ m!version\s+([1-9]+\.[0-9]+)! or + xbail "no strace -V: $v"; + eval("v$1"); + }; + $ver ge v4.16 or skip "$cmd too old for syscall injection (". + sprintf('v%vd', $ver). ' < v4.16)', 1; + $cmd +} + +sub oct_is ($$$) { + my ($got, $exp, $msg) = @_; + @_ = (sprintf('0%03o', $got), sprintf('0%03o', $exp), $msg); + goto &is; # tail recursion to get lineno from callers on failure } package PublicInbox::TestCommon::InboxWakeup; diff --git a/lib/PublicInbox/Tmpfile.pm b/lib/PublicInbox/Tmpfile.pm index 25bb3a52..72dd9d24 100644 --- a/lib/PublicInbox/Tmpfile.pm +++ b/lib/PublicInbox/Tmpfile.pm @@ -1,9 +1,9 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::Tmpfile; -use strict; -use warnings; -use base qw(Exporter); +use v5.12; +use parent qw(Exporter); +use autodie qw(unlink); our @EXPORT = qw(tmpfile); use Fcntl qw(:DEFAULT); use Errno qw(EEXIST); @@ -13,12 +13,15 @@ use File::Spec; # unlinked filename which makes sense when viewed with lsof # (at least on Linux) # And if we ever stop caring to have debuggable filenames, O_TMPFILE :) +# +# This is also for Perl <5.32 which lacks: open(..., '+>>', undef) +# <https://rt.perl.org/Ticket/Display.html?id=134221> sub tmpfile ($;$$) { my ($id, $sock, $append) = @_; if (defined $sock) { # add the socket inode number so we can figure out which # socket it belongs to - my @st = stat($sock); + my @st = stat($sock) or die "stat($sock): $!"; $id .= '-ino:'.$st[1]; } $id =~ tr!/!^!; @@ -28,7 +31,7 @@ sub tmpfile ($;$$) { do { my $fn = File::Spec->tmpdir . "/$id-".time.'-'.rand; if (sysopen(my $fh, $fn, $fl, 0600)) { # likely - unlink($fn) or warn "unlink($fn): $!"; # FS broken + unlink($fn); return $fh; # success } } while ($! == EEXIST); diff --git a/lib/PublicInbox/URIimap.pm b/lib/PublicInbox/URIimap.pm index 56b6002a..41c2842a 100644 --- a/lib/PublicInbox/URIimap.pm +++ b/lib/PublicInbox/URIimap.pm @@ -1,20 +1,24 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # cf. RFC 5092, which the `URI' package doesn't support # # This depends only on the documented public API of the `URI' dist, # not on internal `_'-prefixed subclasses such as `URI::_server' # -# <https://metacpan.org/pod/URI::imap> exists, but it's not in -# common distros. +# <https://metacpan.org/pod/URI::imap> exists, but it appears +# unmaintained, isn't in common distros, nor does it support +# ';FOO=BAR' parameters such as UIDVALIDITY # # RFC 2192 also describes ";TYPE=<list_type>" package PublicInbox::URIimap; -use strict; +use v5.12; use URI::Split qw(uri_split uri_join); # part of URI -use URI::Escape qw(uri_unescape); +use URI::Escape qw(uri_unescape uri_escape); +use overload '""' => \&as_string; my %default_ports = (imap => 143, imaps => 993); +# for enc-auth-type and enc-user in RFC 5092 +my $achar = qr/[A-Za-z0-9%\-_\.\!\$'\(\)\+\,\&\=\*]+/; sub new { my ($class, $url) = @_; @@ -28,6 +32,9 @@ sub canonical { my ($scheme, $auth, $path, $query, $_frag) = uri_split($$self); $path =~ s!\A/+!/!; # excessive leading slash + # upper-case uidvalidity= and uid= parameter names + $path =~ s/;([^=]+)=([^;]*)/;\U$1\E=$2/g; + # lowercase the host portion $auth =~ s#\A(.*@)?(.*?)(?::([0-9]+))?\z# my $ret = ($1//'').lc($2); @@ -55,7 +62,7 @@ sub path { my ($self) = @_; my (undef, undef, $path) = uri_split($$self); $path =~ s!\A/+!!; - $path =~ s/;.*\z//; # ;UIDVALIDITY=nz-number + $path =~ s!/?;.*\z!!; # [;UIDVALIDITY=nz-number]/;UID=nz-number $path eq '' ? undef : $path; } @@ -65,7 +72,38 @@ sub mailbox { defined($path) ? uri_unescape($path) : undef; } -# TODO: UIDVALIDITY, search, and other params +sub uidvalidity { # read/write + my ($self, $val) = @_; + my ($scheme, $auth, $path, $query, $frag) = uri_split($$self); + if (defined $val) { + if ($path =~ s!;UIDVALIDITY=[^;/]*\b!;UIDVALIDITY=$val!i or + $path =~ s!/;!;UIDVALIDITY=$val/;!i) { + # s// already changed it + } else { # both s// failed, so just append + $path .= ";UIDVALIDITY=$val"; + } + $$self = uri_join($scheme, $auth, $path, $query, $frag); + } + $path =~ s!\A/+!!; + $path =~ m!\A[^;]+;UIDVALIDITY=([1-9][0-9]*)\b!i ? + ($1 + 0) : undef; +} + +sub uid { + my ($self, $val) = @_; + my ($scheme, $auth, $path, $query, $frag) = uri_split($$self); + if (scalar(@_) == 2) { + if (!defined $val) { + $path =~ s!/;UID=[^;/]*\b!!i; + } else { + $path =~ s!/;UID=[^;/]*\b!/;UID=$val!i or + $path .= "/;UID=$val"; + } + $$self = uri_join($scheme, $auth, $path, $query); + } + $path =~ m!\A/[^;]+(?:;UIDVALIDITY=[^;/]+)?/;UID=([1-9][0-9]*)\b!i ? + ($1 + 0) : undef; +} sub port { my ($self) = @_; @@ -80,12 +118,34 @@ sub authority { } sub user { - my ($self) = @_; - my (undef, $auth) = uri_split($$self); - $auth =~ s/@.*\z// or return undef; # drop host:port - $auth =~ s/;.*\z//; # drop ;AUTH=... - $auth =~ s/:.*\z//; # drop password - uri_unescape($auth); + my ($self, $val) = @_; + my ($scheme, $auth, $path, $query) = uri_split($$self); + my $at_host_port; + $auth =~ s/(@.*)\z// and $at_host_port = $1; # stash host:port for now + if (scalar(@_) == 2) { # set, this clobbers password, too + if (defined $val) { + my $uval = uri_escape($val); + if (defined($at_host_port)) { + $auth =~ s!\A.*?(;AUTH=$achar).*!$uval$1!ix + or $auth = $uval; + } else { + substr($auth, 0, 0) = "$uval@"; + } + } elsif (defined($at_host_port)) { # clobber + $auth =~ s!\A.*?(;AUTH=$achar).*!$1!i or $auth = ''; + if ($at_host_port && $auth eq '') { + $at_host_port =~ s/\A\@//; + } + } + $at_host_port //= ''; + $$self = uri_join($scheme, $auth.$at_host_port, $path, $query); + $val; + } else { # read-only + $at_host_port // return undef; # explicit undef for scalar + $auth =~ s/;.*\z//; # drop ;AUTH=... + $auth =~ s/:.*\z//; # drop password + $auth eq '' ? undef : uri_unescape($auth); + } } sub password { @@ -97,10 +157,32 @@ sub password { } sub auth { - my ($self) = @_; - my (undef, $auth) = uri_split($$self); - $auth =~ s/@.*\z//; # drop host:port - $auth =~ /;AUTH=(.+)\z/i ? uri_unescape($1) : undef; + my ($self, $val) = @_; + my ($scheme, $auth, $path, $query) = uri_split($$self); + my $at_host_port; + $auth =~ s/(@.*)\z// and $at_host_port = $1; # stash host:port for now + if (scalar(@_) == 2) { + if (defined $val) { + my $uval = uri_escape($val); + if ($auth =~ s!;AUTH=$achar!;AUTH=$uval!ix) { + # replaced existing + } elsif (defined($at_host_port)) { + $auth .= ";AUTH=$uval"; + } else { + substr($auth, 0, 0) = ";AUTH=$uval@"; + } + } else { # clobber + $auth =~ s!;AUTH=$achar!!i; + if ($at_host_port && $auth eq '') { + $at_host_port =~ s/\A\@//; + } + } + $at_host_port //= ''; + $$self = uri_join($scheme, $auth.$at_host_port, $path, $query); + $val; + } else { # read-only + $auth =~ /;AUTH=(.+)\z/i ? uri_unescape($1) : undef; + } } sub scheme { @@ -110,4 +192,6 @@ sub scheme { sub as_string { ${$_[0]} } +sub clone { ref($_[0])->new(as_string($_[0])) } + 1; diff --git a/lib/PublicInbox/URInntps.pm b/lib/PublicInbox/URInntps.pm new file mode 100644 index 00000000..88c8d641 --- /dev/null +++ b/lib/PublicInbox/URInntps.pm @@ -0,0 +1,18 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# deal with the lack of URI::nntps in upstream URI. +# nntps is IANA registered, snews is deprecated +# cf. https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=983419 +# Fixed in URI 5.08, we can drop this by 2035 when LTS distros all have it +package PublicInbox::URInntps; +use v5.12; +use parent qw(URI::snews); +use URI; + +sub new { + my ($class, $url) = @_; + $url =~ m!\Anntps://!i ? bless(\$url, $class) : URI->new($url); +} + +1; diff --git a/lib/PublicInbox/Umask.pm b/lib/PublicInbox/Umask.pm new file mode 100644 index 00000000..2c859e65 --- /dev/null +++ b/lib/PublicInbox/Umask.pm @@ -0,0 +1,70 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# base class to ensures Xapian||SQLite files respect core.sharedRepository +# of git repos +package PublicInbox::Umask; +use v5.12; +use PublicInbox::OnDestroy; + +use constant { + PERM_UMASK => 0, + OLD_PERM_GROUP => 1, + OLD_PERM_EVERYBODY => 2, + PERM_GROUP => 0660, + PERM_EVERYBODY => 0664, +}; + +sub _read_git_config_perm { + my ($self) = @_; + chomp(my $perm = $self->git->qx('config', 'core.sharedRepository')); + $perm; +} + +sub _git_config_perm { + my $self = shift; + my $perm = scalar @_ ? $_[0] : _read_git_config_perm($self); + $perm //= ''; + return PERM_UMASK if $perm eq '' || $perm eq 'umask'; + return PERM_GROUP if $perm eq 'group'; + return PERM_EVERYBODY if $perm =~ /\A(?:all|world|everybody)\z/; + return PERM_GROUP if ($perm =~ /\A(?:true|yes|on|1)\z/); + return PERM_UMASK if ($perm =~ /\A(?:false|no|off|0)\z/); + + my $i = oct($perm); + return PERM_UMASK if $i == PERM_UMASK; + return PERM_GROUP if $i == OLD_PERM_GROUP; + return PERM_EVERYBODY if $i == OLD_PERM_EVERYBODY; + + if (($i & 0600) != 0600) { + die "core.sharedRepository mode invalid: ". + sprintf('%.3o', $i) . "\nOwner must have permissions\n"; + } + ($i & 0666); +} + +sub _umask_for { + my ($perm) = @_; # _git_config_perm return value + my $rv = $perm; + return umask if $rv == 0; + + # set +x bit if +r or +w were set + $rv |= 0100 if ($rv & 0600); + $rv |= 0010 if ($rv & 0060); + $rv |= 0001 if ($rv & 0006); + (~$rv & 0777); +} + +sub with_umask { + my ($self, $cb, @arg) = @_; + my $old = umask($self->{umask} //= umask_prepare($self)); + my $restore = on_destroy \&CORE::umask, $old; + $cb ? $cb->(@arg) : $restore; +} + +sub umask_prepare { + my ($self) = @_; + _umask_for(_git_config_perm($self)); +} + +1; diff --git a/lib/PublicInbox/Unsubscribe.pm b/lib/PublicInbox/Unsubscribe.pm index 945e7ae7..ddbd7a2e 100644 --- a/lib/PublicInbox/Unsubscribe.pm +++ b/lib/PublicInbox/Unsubscribe.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Standalone PSGI app to handle HTTP(s) unsubscribe links generated @@ -12,7 +12,8 @@ use warnings; use Crypt::CBC; use Plack::Util; use MIME::Base64 qw(decode_base64url); -my $CODE_URL = 'https://public-inbox.org/public-inbox.git'; +my @CODE_URL = qw(http://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/public-inbox.git + https://public-inbox.org/public-inbox.git); my @CT_HTML = ('Content-Type', 'text/html; charset=UTF-8'); sub new { @@ -38,13 +39,15 @@ sub new { my $unsubscribe = $opt{unsubscribe} or die "`unsubscribe' callback not given\n"; + my $code_url = $opt{code_url} || \@CODE_URL; + $code_url = [ $code_url ] if ref($code_url) ne 'ARRAY'; bless { - pi_config => $opt{pi_config}, # PublicInbox::Config + pi_cfg => $opt{pi_config}, # PublicInbox::Config owner_email => $opt{owner_email}, cipher => $cipher, unsubscribe => $unsubscribe, contact => qq(<a\nhref="mailto:$e">$e</a>), - code_url => $opt{code_url} || $CODE_URL, + code_url => $code_url, confirm => $opt{confirm}, }, $class; } @@ -78,10 +81,7 @@ sub _user_list_addr { } my $user = eval { $self->{cipher}->decrypt(decode_base64url($u)) }; if (!defined $user || index($user, '@') < 1) { - my $err = quotemeta($@); - my $errors = $env->{'psgi.errors'}; - $errors->print("error decrypting: $u\n"); - $errors->print("$_\n") for split("\n", $err); + warn "error decrypting: $u: ", ($@ ? quotemeta($@) : ()); $u = Plack::Util::encode_html($u); return r($self, 400, 'Bad request', "Failed to decrypt: $u"); } @@ -138,7 +138,7 @@ sub r { "<html><head><title>$title</title></head><body><pre>". join("\n", "<b>$title</b>\n", @body) . '</pre><hr>'. "<pre>This page is available under AGPL-3.0+\n" . - "git clone $self->{code_url}\n" . + join('', map { "git clone $_\n" } @{$self->{code_url}}) . qq(Email $self->{contact} if you have any questions). '</pre></body></html>' ] ]; @@ -149,9 +149,9 @@ sub archive_info { my $archive_url = $self->{archive_urls}->{$list_addr}; unless ($archive_url) { - if (my $config = $self->{pi_config}) { + if (my $cfg = $self->{pi_cfg}) { # PublicInbox::Config::lookup - my $ibx = $config->lookup($list_addr); + my $ibx = $cfg->lookup($list_addr); # PublicInbox::Inbox::base_url $archive_url = $ibx->base_url if $ibx; } diff --git a/lib/PublicInbox/UserContent.pm b/lib/PublicInbox/UserContent.pm index 789da2f1..f28610f7 100644 --- a/lib/PublicInbox/UserContent.pm +++ b/lib/PublicInbox/UserContent.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Self-updating module containing a sample CSS for client-side @@ -17,13 +17,18 @@ sub CSS () { * It reduces eyestrain for me, and energy usage for all: * https://en.wikipedia.org/wiki/Light-on-dark_color_scheme */ - * { background:#000 !important; color:#ccc !important } + * { font-size: 100% !important; + font-family: monospace !important; + background:#000 !important; + color:#ccc !important } + pre { white-space: pre-wrap !important } /* * Underlined links add visual noise which make them hard-to-read. * Use colors to make them stand out, instead. */ - a:link { color:#69f !important; text-decoration:none !important } + a:link { color:#69f !important; + text-decoration:none !important } a:visited { color:#96f !important } /* quoted text in emails gets a different color */ @@ -101,6 +106,7 @@ if (scalar(@ARGV) == 1 && -r __FILE__) { open my $rw, '+<', __FILE__ or die $!; my $out = do { local $/; <$rw> } or die $!; + $css =~ s/; /;\n\t\t/g; $out =~ s/^sub CSS.*^_\n\}/sub CSS () {\n\t<<'_'\n${css}_\n}/sm; seek $rw, 0, 0; print $rw $out or die $!; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 5ff2af10..15a73158 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # This interface wraps and mimics PublicInbox::Import @@ -6,70 +6,39 @@ package PublicInbox::V2Writable; use strict; use v5.10.1; -use parent qw(PublicInbox::Lock); +use parent qw(PublicInbox::Lock PublicInbox::IPC); use PublicInbox::SearchIdxShard; +use PublicInbox::IPC qw(nproc_shards); use PublicInbox::Eml; use PublicInbox::Git; use PublicInbox::Import; +use PublicInbox::MultiGit; use PublicInbox::MID qw(mids references); -use PublicInbox::ContentHash qw(content_hash content_digest); +use PublicInbox::ContentHash qw(content_hash content_digest git_sha); use PublicInbox::InboxWritable; use PublicInbox::OverIdx; use PublicInbox::Msgmap; -use PublicInbox::Spawn qw(spawn popen_rd); -use PublicInbox::SearchIdx qw(log2stack crlf_adjust is_ancestor check_size); +use PublicInbox::Spawn qw(spawn popen_rd run_die); +use PublicInbox::Search; +use PublicInbox::SearchIdx qw(log2stack is_ancestor check_size is_bad_blob); use IO::Handle; # ->autoflush -use File::Temp (); +use POSIX (); my $OID = qr/[a-f0-9]{40,}/; # an estimate of the post-packed size to the raw uncompressed size -my $PACKING_FACTOR = 0.4; - -# SATA storage lags behind what CPUs are capable of, so relying on -# nproc(1) can be misleading and having extra Xapian shards is a -# waste of FDs and space. It can also lead to excessive IO latency -# and slow things down. Users on NVME or other fast storage can -# use the NPROC env or switches in our script/public-inbox-* programs -# to increase Xapian shards -our $NPROC_MAX_DEFAULT = 4; - -sub detect_nproc () { - # getconf(1) is POSIX, but *NPROCESSORS* vars are not - for (qw(_NPROCESSORS_ONLN NPROCESSORS_ONLN)) { - `getconf $_ 2>/dev/null` =~ /^(\d+)$/ and return $1; - } - for my $nproc (qw(nproc gnproc)) { # GNU coreutils nproc - `$nproc 2>/dev/null` =~ /^(\d+)$/ and return $1; - } - - # should we bother with `sysctl hw.ncpu`? Those only give - # us total processor count, not online processor count. - undef -} - -sub nproc_shards ($) { - my ($creat_opt) = @_; - my $n = $creat_opt->{nproc} if ref($creat_opt) eq 'HASH'; - $n //= $ENV{NPROC}; - if (!$n) { - # assume 2 cores if not detectable or zero - state $NPROC_DETECTED = detect_nproc() || 2; - $n = $NPROC_DETECTED; - $n = $NPROC_MAX_DEFAULT if $n > $NPROC_MAX_DEFAULT; - } - - # subtract for the main process and git-fast-import - $n -= 1; - $n < 1 ? 1 : $n; -} +our $PACKING_FACTOR = 0.4; sub count_shards ($) { my ($self) = @_; # always load existing shards in case core count changes: # Also, shard count may change while -watch is running - my $srch = $self->{ibx}->search or return 0; - delete $self->{ibx}->{search}; - $srch->{nshard} // 0 + if (my $ibx = $self->{ibx}) { + my $srch = $ibx->search or return 0; + delete $ibx->{search}; + $srch->{nshard} // 0 + } else { # ExtSearchIdx + $self->{nshard} = scalar($self->xdb_shards_flat); + } } sub new { @@ -79,18 +48,14 @@ sub new { $v2ibx = PublicInbox::InboxWritable->new($v2ibx); my $dir = $v2ibx->assert_usable_dir; unless (-d $dir) { - if ($creat) { - require File::Path; - File::Path::mkpath($dir); - } else { - die "$dir does not exist\n"; - } + die "$dir does not exist\n" if !$creat; + require File::Path; + File::Path::mkpath($dir); } - $v2ibx->umask_prepare; - my $xpfx = "$dir/xap" . PublicInbox::Search::SCHEMA_VERSION; my $self = { ibx => $v2ibx, + mg => PublicInbox::MultiGit->new($dir, 'all.git', 'git'), im => undef, # PublicInbox::Import parallel => 1, transact_bytes => 0, @@ -117,35 +82,34 @@ sub init_inbox { } $self->idx_init; $self->{mm}->skip_artnum($skip_artnum) if defined $skip_artnum; - my $epoch_max = -1; - git_dir_latest($self, \$epoch_max); - if (defined $skip_epoch && $epoch_max == -1) { - $epoch_max = $skip_epoch; - } - $self->git_init($epoch_max >= 0 ? $epoch_max : 0); + my $max = $self->{ibx}->max_git_epoch; + $max = $skip_epoch if (defined($skip_epoch) && !defined($max)); + $self->{mg}->add_epoch($max // 0); $self->done; } -# returns undef on duplicate or spam -# mimics Import::add and wraps it for v2 -sub add { - my ($self, $eml, $check_cb) = @_; - $self->{ibx}->with_umask(\&_add, $self, $eml, $check_cb); +sub idx_shard ($$) { + my ($self, $num) = @_; + $self->{idx_shards}->[$num % scalar(@{$self->{idx_shards}})]; } # indexes a message, returns true if checkpointing is needed -sub do_idx ($$$$) { - my ($self, $msgref, $mime, $smsg) = @_; - $smsg->{bytes} = $smsg->{raw_bytes} + crlf_adjust($$msgref); - $self->{oidx}->add_overview($mime, $smsg); - my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); - $idx->index_raw($msgref, $mime, $smsg); - my $n = $self->{transact_bytes} += $smsg->{raw_bytes}; +sub do_idx ($$$) { + my ($self, $eml, $smsg) = @_; + $self->{oidx}->add_overview($eml, $smsg); + if ($self->{-need_xapian}) { + my $idx = idx_shard($self, $smsg->{num}); + $idx->index_eml($eml, $smsg); + } + my $n = $self->{transact_bytes} += $smsg->{bytes}; $n >= $self->{batch_bytes}; } -sub _add { +# returns undef on duplicate or spam +# mimics Import::add and wraps it for v2 +sub add { my ($self, $mime, $check_cb) = @_; + my $restore = $self->{ibx}->with_umask; # spam check: if ($check_cb) { @@ -168,11 +132,9 @@ sub _add { $cmt = $im->get_mark($cmt); $self->{last_commit}->[$self->{epoch_max}] = $cmt; - my $msgref = delete $smsg->{-raw_email}; - if (do_idx($self, $msgref, $mime, $smsg)) { + if (do_idx($self, $mime, $smsg)) { $self->checkpoint; } - $cmt; } @@ -249,11 +211,6 @@ sub v2_num_for_harder { ($num, $mid0); } -sub idx_shard { - my ($self, $shard_i) = @_; - $self->{idx_shards}->[$shard_i]; -} - sub _idx_init { # with_umask callback my ($self, $opt) = @_; $self->lock_acquire unless $opt && $opt->{-skip_lock}; @@ -264,22 +221,31 @@ sub _idx_init { # with_umask callback $self->{shards} = $nshards if $nshards && $nshards != $self->{shards}; $self->{batch_bytes} = $opt->{batch_size} // $PublicInbox::SearchIdx::BATCH_BYTES; - $self->{batch_bytes} *= $self->{shards} if $self->{parallel}; # need to create all shards before initializing msgmap FD # idx_shards must be visible to all forked processes my $max = $self->{shards} - 1; my $idx = $self->{idx_shards} = []; push @$idx, PublicInbox::SearchIdxShard->new($self, $_) for (0..$max); + $self->{-need_xapian} = $idx->[0]->need_xapian; + + # SearchIdxShard may do their own flushing, so don't scale + # until after forking + $self->{batch_bytes} *= $self->{shards} if $self->{parallel}; + + my $ibx = $self->{ibx} or return; # ExtIdxSearch # Now that all subprocesses are up, we can open the FDs # for SQLite: - my $mm = $self->{mm} = PublicInbox::Msgmap->new_file( - "$self->{ibx}->{inboxdir}/msgmap.sqlite3", - $self->{ibx}->{-no_fsync} ? 2 : 1); + my $mm = $self->{mm} = PublicInbox::Msgmap->new_file($ibx, 1); $mm->{dbh}->begin_work; } +sub parallel_init ($$) { + my ($self, $indexlevel) = @_; + $self->{parallel} = 0 if ($indexlevel // 'full') eq 'basic'; +} + # idempotent sub idx_init { my ($self, $opt) = @_; @@ -292,17 +258,7 @@ sub idx_init { delete @$ibx{qw(mm search)}; $ibx->git->cleanup; - $self->{parallel} = 0 if ($ibx->{indexlevel}//'') eq 'basic'; - if ($self->{parallel}) { - pipe(my ($r, $w)) or die "pipe failed: $!"; - # pipe for barrier notifications doesn't need to be big, - # 1031: F_SETPIPE_SZ - fcntl($w, 1031, 4096) if $^O eq 'linux'; - $self->{bnote} = [ $r, $w ]; - $w->autoflush(1); - } - - $ibx->umask_prepare; + parallel_init($self, $ibx->{indexlevel}); $ibx->with_umask(\&_idx_init, $self, $opt); } @@ -312,14 +268,10 @@ sub idx_init { sub _replace_oids ($$$) { my ($self, $mime, $replace_map) = @_; $self->done; - my $pfx = "$self->{ibx}->{inboxdir}/git"; + my $ibx = $self->{ibx}; + my $pfx = "$ibx->{inboxdir}/git"; my $rewrites = []; # epoch => commit - my $max = $self->{epoch_max}; - - unless (defined($max)) { - defined(my $latest = git_dir_latest($self, \$max)) or return; - $self->{epoch_max} = $max; - } + my $max = $self->{epoch_max} //= $ibx->max_git_epoch // return; foreach my $i (0..$max) { my $git_dir = "$pfx/$i.git"; @@ -414,7 +366,7 @@ sub rewrite_internal ($$;$$$) { } else { # ->purge or ->remove $self->{mm}->num_delete($num); } - unindex_oid_remote($self, $oid, $mid); + unindex_oid_aux($self, $oid, $mid); } } @@ -433,17 +385,16 @@ sub rewrite_internal ($$;$$$) { # (retval[2]) is not part of the stable API shared with Import->remove sub remove { my ($self, $eml, $cmt_msg) = @_; - my $r = $self->{ibx}->with_umask(\&rewrite_internal, - $self, $eml, $cmt_msg); + my $restore = $self->{ibx}->with_umask; + my $r = rewrite_internal($self, $eml, $cmt_msg); defined($r) && defined($r->[0]) ? @$r: undef; } sub _replace ($$;$$) { my ($self, $old_eml, $new_eml, $sref) = @_; - my $arg = [ $self, $old_eml, undef, $new_eml, $sref ]; - my $rewritten = $self->{ibx}->with_umask(\&rewrite_internal, - $self, $old_eml, undef, $new_eml, $sref) or return; - + my $restore = $self->{ibx}->with_umask; + my $rewritten = rewrite_internal($self, $old_eml, undef, + $new_eml, $sref) or return; my $rewrites = $rewritten->{rewrites}; # ->done is called if there are rewrites since we gc+prune from git $self->idx_init if @$rewrites; @@ -462,23 +413,6 @@ sub purge { $rewritten->{rewrites} } -# returns the git object_id of $fh, does not write the object to FS -sub git_hash_raw ($$) { - my ($self, $raw) = @_; - # grab the expected OID we have to reindex: - pipe(my($in, $w)) or die "pipe: $!"; - my $git_dir = $self->{ibx}->git->{git_dir}; - my $cmd = ['git', "--git-dir=$git_dir", qw(hash-object --stdin)]; - my $r = popen_rd($cmd, undef, { 0 => $in }); - print $w $$raw or die "print \$w: $!"; - close $w or die "close \$w: $!"; - local $/ = "\n"; - chomp(my $oid = <$r>); - close $r or die "git hash-object failed: $?"; - $oid =~ /\A$OID\z/ or die "OID not expected: $oid"; - $oid; -} - sub _check_mids_match ($$$) { my ($old_list, $new_list, $hdrs) = @_; my %old_mids = map { $_ => 1 } @$old_list; @@ -513,7 +447,7 @@ sub replace ($$$) { PublicInbox::Import::drop_unwanted_headers($new_mime); my $raw = $new_mime->as_string; - my $expect_oid = git_hash_raw($self, \$raw); + my $expect_oid = git_sha(1, \$raw)->hexdigest; my $rewritten = _replace($self, $old_mime, $new_mime, \$raw) or return; my $need_reindex = $rewritten->{need_reindex}; @@ -531,23 +465,23 @@ W: $list } # make sure we really got the OID: - my ($blob, $type, $bytes) = $self->{ibx}->git->check($expect_oid); + my ($blob, $type, $bytes) = $self->git->check($expect_oid); $blob eq $expect_oid or die "BUG: $expect_oid not found after replace"; # don't leak FDs to Xapian: - $self->{ibx}->git->cleanup; + $self->git->cleanup; # reindex modified messages: for my $smsg (@$need_reindex) { my $new_smsg = bless { blob => $blob, - raw_bytes => $bytes, num => $smsg->{num}, mid => $smsg->{mid}, }, 'PublicInbox::Smsg'; my $sync = { autime => $smsg->{ds}, cotime => $smsg->{ts} }; $new_smsg->populate($new_mime, $sync); - do_idx($self, \$raw, $new_mime, $new_smsg); + $new_smsg->set_bytes($raw, $bytes); + do_idx($self, $new_mime, $new_smsg); } $rewritten->{rewrites}; } @@ -558,7 +492,7 @@ sub last_epoch_commit ($$;$) { $self->{mm}->last_commit_xap($v, $i, $cmt); } -sub set_last_commits ($) { +sub set_last_commits ($) { # this is NOT for ExtSearchIdx my ($self) = @_; defined(my $epoch_max = $self->{epoch_max}) or return; my $last_commit = $self->{last_commit}; @@ -569,65 +503,57 @@ sub set_last_commits ($) { } } -sub barrier_init { - my ($self, $n) = @_; - $self->{bnote} or return; - --$n; - my $barrier = { map { $_ => 1 } (0..$n) }; -} - -sub barrier_wait { - my ($self, $barrier) = @_; - my $bnote = $self->{bnote} or return; - my $r = $bnote->[0]; - while (scalar keys %$barrier) { - defined(my $l = readline($r)) or die "EOF on barrier_wait: $!"; - $l =~ /\Abarrier (\d+)/ or die "bad line on barrier_wait: $l"; - delete $barrier->{$1} or die "bad shard[$1] on barrier wait"; - } -} - # public sub checkpoint ($;$) { my ($self, $wait) = @_; - if (my $im = $self->{im}) { - if ($wait) { - $im->barrier; - } else { - $im->checkpoint; - } - } + $self->{im}->barrier if $self->{im}; my $shards = $self->{idx_shards}; if ($shards) { - my $dbh = $self->{mm}->{dbh}; + my $dbh = $self->{mm}->{dbh} if $self->{mm}; # SQLite msgmap data is second in importance - $dbh->commit; + $dbh->commit if $dbh; + eval { $dbh->do('PRAGMA optimize') }; # SQLite overview is third $self->{oidx}->commit_lazy; # Now deal with Xapian - if ($wait) { - my $barrier = $self->barrier_init(scalar @$shards); - # each shard needs to issue a barrier command - $_->shard_barrier for @$shards; + # start commit_txn_lazy asynchronously on all parallel shards + # (non-parallel waits here) + $_->ipc_do('commit_txn_lazy') for @$shards; + + # transactions started on parallel shards, + # wait for them by issuing an echo command (echo can only + # run after commit_txn_lazy is done) + if ($wait && $self->{parallel}) { + my $i = 0; + for my $shard (@$shards) { + my $echo = $shard->ipc_do('echo', $i); + $echo == $i or die <<""; +shard[$i] bad echo:$echo != $i waiting for txn commit + + ++$i; + } + } - # wait for each Xapian shard - $self->barrier_wait($barrier); - } else { - $_->shard_commit for @$shards; + my $midx = $self->{midx}; # misc index + if ($midx) { + $midx->commit_txn; + $PublicInbox::Search::X{CLOEXEC_UNSET} and + $self->git->cleanup; } # last_commit is special, don't commit these until - # remote shards are done: - $dbh->begin_work; + # Xapian shards are done: + $dbh->begin_work if $dbh; set_last_commits($self); - $dbh->commit; - - $dbh->begin_work; + if ($dbh) { + $dbh->commit; + $dbh->begin_work; + } } $self->{total_bytes} += $self->{transact_bytes}; $self->{transact_bytes} = 0; @@ -658,6 +584,11 @@ sub done { eval { $mm->{dbh}->$m }; $err .= "msgmap $m: $@\n" if $@; } + if ($self->{oidx} && $self->{oidx}->{dbh} && $err) { + eval { $self->{oidx}->rollback_lazy }; + $err .= "overview rollback: $@\n" if $@; + } + my $shards = delete $self->{idx_shards}; if ($shards) { for (@$shards) { @@ -667,91 +598,14 @@ sub done { } eval { $self->{oidx}->dbh_close }; $err .= "over close: $@\n" if $@; - delete $self->{bnote}; + delete $self->{midx}; my $nbytes = $self->{total_bytes}; $self->{total_bytes} = 0; $self->lock_release(!!$nbytes) if $shards; - $self->{ibx}->git->cleanup; + $self->git->cleanup; die $err if $err; } -sub fill_alternates ($$) { - my ($self, $epoch) = @_; - - my $pfx = "$self->{ibx}->{inboxdir}/git"; - my $all = "$self->{ibx}->{inboxdir}/all.git"; - PublicInbox::Import::init_bare($all) unless -d $all; - my $info_dir = "$all/objects/info"; - my $alt = "$info_dir/alternates"; - my (%alt, $new); - my $mode = 0644; - if (-e $alt) { - open(my $fh, '<', $alt) or die "open < $alt: $!\n"; - $mode = (stat($fh))[2] & 07777; - - # we assign a sort score to every alternate and favor - # the newest (highest numbered) one because loose objects - # require scanning epochs and only the latest epoch is - # expected to see loose objects - my $score; - my $other = 0; # in case admin adds non-epoch repos - %alt = map {; - if (m!\A\Q../../\E([0-9]+)\.git/objects\z!) { - $score = $1 + 0; - } else { - $score = --$other; - } - $_ => $score; - } split(/\n+/, do { local $/; <$fh> }); - } - - foreach my $i (0..$epoch) { - my $dir = "../../git/$i.git/objects"; - if (!exists($alt{$dir}) && -d "$pfx/$i.git") { - $alt{$dir} = $i; - $new = 1; - } - } - return unless $new; - - my $fh = File::Temp->new(TEMPLATE => 'alt-XXXXXXXX', DIR => $info_dir); - my $tmp = $fh->filename; - print $fh join("\n", sort { $alt{$b} <=> $alt{$a} } keys %alt), "\n" - or die "print $tmp: $!\n"; - chmod($mode, $fh) or die "fchmod $tmp: $!\n"; - close $fh or die "close $tmp $!\n"; - rename($tmp, $alt) or die "rename $tmp => $alt: $!\n"; - $fh->unlink_on_destroy(0); -} - -sub git_init { - my ($self, $epoch) = @_; - my $git_dir = "$self->{ibx}->{inboxdir}/git/$epoch.git"; - PublicInbox::Import::init_bare($git_dir); - my @cmd = (qw/git config/, "--file=$git_dir/config", - 'include.path', '../../all.git/config'); - PublicInbox::Import::run_die(\@cmd); - fill_alternates($self, $epoch); - $git_dir -} - -sub git_dir_latest { - my ($self, $max) = @_; - $$max = -1; - my $pfx = "$self->{ibx}->{inboxdir}/git"; - return unless -d $pfx; - my $latest; - opendir my $dh, $pfx or die "opendir $pfx: $!\n"; - while (defined(my $git_dir = readdir($dh))) { - $git_dir =~ m!\A([0-9]+)\.git\z! or next; - if ($1 > $$max) { - $$max = $1; - $latest = "$pfx/$git_dir"; - } - } - $latest; -} - sub importer { my ($self) = @_; my $im = $self->{im}; @@ -763,14 +617,14 @@ sub importer { $im->done; $im = undef; $self->checkpoint; - my $git_dir = $self->git_init(++$self->{epoch_max}); - my $git = PublicInbox::Git->new($git_dir); + my $dir = $self->{mg}->add_epoch(++$self->{epoch_max}); + my $git = PublicInbox::Git->new($dir); return $self->import_init($git, 0); } } my $epoch = 0; my $max; - my $latest = git_dir_latest($self, \$max); + my $latest = $self->{ibx}->git_dir_latest(\$max); if (defined $latest) { my $git = PublicInbox::Git->new($latest); my $packed_bytes = $git->packed_bytes; @@ -784,8 +638,8 @@ sub importer { } } $self->{epoch_max} = $epoch; - $latest = $self->git_init($epoch); - $self->import_init(PublicInbox::Git->new($latest), 0); + my $dir = $self->{mg}->add_epoch($epoch); + $self->import_init(PublicInbox::Git->new($dir), 0); } sub import_init { @@ -798,23 +652,6 @@ sub import_init { $im; } -# XXX experimental -sub diff ($$$) { - my ($mid, $cur, $new) = @_; - - my $ah = File::Temp->new(TEMPLATE => 'email-cur-XXXXXXXX', TMPDIR => 1); - print $ah $cur->as_string or die "print: $!"; - $ah->flush or die "flush: $!"; - PublicInbox::Import::drop_unwanted_headers($new); - my $bh = File::Temp->new(TEMPLATE => 'email-new-XXXXXXXX', TMPDIR => 1); - print $bh $new->as_string or die "print: $!"; - $bh->flush or die "flush: $!"; - my $cmd = [ qw(diff -u), $ah->filename, $bh->filename ]; - print STDERR "# MID conflict <$mid>\n"; - my $pid = spawn($cmd, undef, { 1 => 2 }); - waitpid($pid, 0) == $pid or die "diff did not finish"; -} - sub get_blob ($$) { my ($self, $smsg) = @_; if (my $im = $self->{im}) { @@ -838,52 +675,68 @@ sub content_exists ($$$) { } my $cur = PublicInbox::Eml->new($msg); return 1 if content_matches($chashes, $cur); - - # XXX DEBUG_DIFF is experimental and may be removed - diff($mid, $cur, $mime) if $ENV{DEBUG_DIFF}; } undef; } sub atfork_child { my ($self) = @_; - if (my $shards = $self->{idx_shards}) { - $_->atfork_child foreach @$shards; + if (my $older_siblings = $self->{idx_shards}) { + $_->ipc_sibling_atfork_child for @$older_siblings; } if (my $im = $self->{im}) { $im->atfork_child; } - die "unexpected mm" if $self->{mm}; - close $self->{bnote}->[0] or die "close bnote[0]: $!\n"; - $self->{bnote}->[1]; + die "BUG: unexpected mm" if $self->{mm}; } sub reindex_checkpoint ($$) { my ($self, $sync) = @_; - $self->{ibx}->git->cleanup; # *async_wait + $self->git->async_wait_all; + $self->update_last_commit($sync); ${$sync->{need_checkpoint}} = 0; my $mm_tmp = $sync->{mm_tmp}; $mm_tmp->atfork_prepare if $mm_tmp; - $self->done; # release lock + die 'BUG: {im} during reindex' if $self->{im}; + if ($self->{ibx_map} && !$sync->{checkpoint_unlocks}) { + checkpoint($self, 1); # no need to release lock on pure index + } else { + $self->done; # release lock + } - if (my $pr = $sync->{-opt}->{-progress}) { + if (my $pr = $sync->{-regen_fmt} ? $sync->{-opt}->{-progress} : undef) { $pr->(sprintf($sync->{-regen_fmt}, ${$sync->{nr}})); } # allow -watch or -mda to write... $self->idx_init($sync->{-opt}); # reacquire lock + if (my $intvl = $sync->{check_intvl}) { # eidx + $sync->{next_check} = PublicInbox::DS::now() + $intvl; + } $mm_tmp->atfork_parent if $mm_tmp; } +sub index_finalize ($$) { + my ($arg, $index) = @_; + ++$arg->{self}->{nidx}; + if (defined(my $cur = $arg->{cur_cmt})) { + ${$arg->{latest_cmt}} = $cur; + } elsif ($index) { + die 'BUG: {cur_cmt} missing'; + } # else { unindexing @leftovers doesn't set {cur_cmt} +} + sub index_oid { # cat_async callback my ($bref, $oid, $type, $size, $arg) = @_; - return if $size == 0; # purged + is_bad_blob($oid, $type, $size, $arg->{oid}) and + return index_finalize($arg, 1); # size == 0 purged returns here + my $self = $arg->{self}; + local $self->{current_info} = "$self->{current_info} $oid"; my ($num, $mid0); my $eml = PublicInbox::Eml->new($$bref); my $mids = mids($eml); my $chash = content_hash($eml); - my $self = $arg->{v2w}; if (scalar(@$mids) == 0) { warn "E: $oid has no Message-ID, skipping\n"; @@ -891,16 +744,20 @@ sub index_oid { # cat_async callback } # {unindexed} is unlikely - if ((my $unindexed = $arg->{unindexed}) && scalar(@$mids) == 1) { - $num = delete($unindexed->{$mids->[0]}); + if (my $unindexed = $arg->{unindexed}) { + my $oidbin = pack('H*', $oid); + my $u = $unindexed->{$oidbin}; + ($num, $mid0) = splice(@$u, 0, 2) if $u; if (defined $num) { - $mid0 = $mids->[0]; $self->{mm}->mid_set($num, $mid0); - delete($arg->{unindexed}) if !keys(%$unindexed); + if (scalar(@$u) == 0) { # done with current OID + delete $unindexed->{$oidbin}; + delete($arg->{unindexed}) if !keys(%$unindexed); + } } } + my $oidx = $self->{oidx}; if (!defined($num)) { # reuse if reindexing (or duplicates) - my $oidx = $self->{oidx}; for my $mid (@$mids) { ($num, $mid0) = $oidx->num_mid0_for_oid($oid, $mid); last if defined $num; @@ -908,6 +765,11 @@ sub index_oid { # cat_async callback } $mid0 //= do { # is this a number we got before? $num = $arg->{mm_tmp}->num_for($mids->[0]); + + # don't clobber existing if Message-ID is reused: + if (my $x = defined($num) ? $oidx->get_art($num) : undef) { + undef($num) if $x->{blob} ne $oid; + } defined($num) ? $mids->[0] : undef; }; if (!defined($num)) { @@ -941,45 +803,53 @@ sub index_oid { # cat_async callback } ++${$arg->{nr}}; my $smsg = bless { - raw_bytes => $size, num => $num, blob => $oid, mid => $mid0, }, 'PublicInbox::Smsg'; $smsg->populate($eml, $arg); - if (do_idx($self, $bref, $eml, $smsg)) { + $smsg->set_bytes($$bref, $size); + if (do_idx($self, $eml, $smsg)) { ${$arg->{need_checkpoint}} = 1; } + index_finalize($arg, 1); } # only update last_commit for $i on reindex iff newer than current -sub update_last_commit ($$$$) { - my ($self, $git, $i, $cmt) = @_; - my $last = last_epoch_commit($self, $i); - if (defined $last && is_ancestor($git, $last, $cmt)) { - my @cmd = (qw(rev-list --count), "$last..$cmt"); - chomp(my $n = $git->qx(@cmd)); +sub update_last_commit { + my ($self, $sync, $stk) = @_; + my $unit = $sync->{unit} // return; + my $latest_cmt = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}}; + defined($latest_cmt) or return; + my $last = last_epoch_commit($self, $unit->{epoch}); + if (defined $last && is_ancestor($self->git, $last, $latest_cmt)) { + my @cmd = (qw(rev-list --count), "$last..$latest_cmt"); + chomp(my $n = $unit->{git}->qx(@cmd)); return if $n ne '' && $n == 0; } - last_epoch_commit($self, $i, $cmt); -} + # don't rewind if --{since,until,before,after} are in use + return if (defined($last) && + grep(defined, @{$sync->{-opt}}{qw(since until)}) && + is_ancestor($self->git, $latest_cmt, $last)); -sub git_dir_n ($$) { "$_[0]->{ibx}->{inboxdir}/git/$_[1].git" } + last_epoch_commit($self, $unit->{epoch}, $latest_cmt); +} -sub last_commits ($$) { - my ($self, $epoch_max) = @_; +sub last_commits { + my ($self, $sync) = @_; my $heads = []; - for (my $i = $epoch_max; $i >= 0; $i--) { + for (my $i = $sync->{epoch_max}; $i >= 0; $i--) { $heads->[$i] = last_epoch_commit($self, $i); } $heads; } # returns a revision range for git-log(1) -sub log_range ($$$$$) { - my ($self, $sync, $git, $i, $tip) = @_; +sub log_range ($$$) { + my ($sync, $unit, $tip) = @_; my $opt = $sync->{-opt}; my $pr = $opt->{-progress} if (($opt->{verbose} || 0) > 1); + my $i = $unit->{epoch}; my $cur = $sync->{ranges}->[$i] or do { $pr->("$i.git indexing all of $tip\n") if $pr; return $tip; # all of it @@ -993,7 +863,8 @@ sub log_range ($$$$$) { my $range = "$cur..$tip"; $pr->("$i.git checking contiguity... ") if $pr; - if (is_ancestor($git, $cur, $tip)) { # common case + my $git = $unit->{git}; + if (is_ancestor($sync->{self}->git, $cur, $tip)) { # common case $pr->("OK\n") if $pr; my $n = $git->qx(qw(rev-list --count), $range); chomp($n); @@ -1018,62 +889,102 @@ Rewritten history? (in $git->{git_dir}) warn "discarding history at $cur\n"; } warn <<""; -reindexing $git->{git_dir} starting at -$range - - $sync->{unindex_range}->{$i} = "$base..$cur"; +reindexing $git->{git_dir} +starting at $range + + # $cur^0 may no longer exist if pruned by git + if ($git->qx(qw(rev-parse -q --verify), "$cur^0")) { + $unit->{unindex_range} = "$base..$cur"; + } elsif ($base && $git->qx(qw(rev-parse -q --verify), $base)) { + $unit->{unindex_range} = "$base.."; + } else { + warn "W: unable to unindex before $range\n"; + } } $range; } -sub sync_prepare ($$$) { - my ($self, $sync, $epoch_max) = @_; +# overridden by ExtSearchIdx +sub artnum_max { $_[0]->{mm}->num_highwater } + +sub sync_prepare ($$) { + my ($self, $sync) = @_; + $sync->{ranges} = sync_ranges($self, $sync); my $pr = $sync->{-opt}->{-progress}; my $regen_max = 0; - my $head = $self->{ibx}->{ref_head} || 'refs/heads/master'; - - # reindex stops at the current heads and we later rerun index_sync - # without {reindex} - my $reindex_heads = last_commits($self, $epoch_max) if $sync->{reindex}; - - for (my $i = $epoch_max; $i >= 0; $i--) { - my $git_dir = git_dir_n($self, $i); + my $head = $sync->{ibx}->{ref_head} || 'HEAD'; + my $pfx; + if ($pr) { + ($pfx) = ($sync->{ibx}->{inboxdir} =~ m!([^/]+)\z!g); + $pfx //= $sync->{ibx}->{inboxdir}; + } + + my $reindex_heads; + if ($self->{ibx_map}) { + # ExtSearchIdx won't index messages unless they're in + # over.sqlite3 for a given inbox, so don't read beyond + # what's in the per-inbox index. + $reindex_heads = []; + my $v = PublicInbox::Search::SCHEMA_VERSION; + my $mm = $sync->{ibx}->mm; + for my $i (0..$sync->{epoch_max}) { + $reindex_heads->[$i] = $mm->last_commit_xap($v, $i); + } + } elsif ($sync->{reindex}) { # V2 inbox + # reindex stops at the current heads and we later + # rerun index_sync without {reindex} + $reindex_heads = $self->last_commits($sync); + } + if ($sync->{max_size} = $sync->{-opt}->{max_size}) { + $sync->{index_oid} = $self->can('index_oid'); + } + my $git_pfx = "$sync->{ibx}->{inboxdir}/git"; + for (my $i = $sync->{epoch_max}; $i >= 0; $i--) { + my $git_dir = "$git_pfx/$i.git"; -d $git_dir or next; # missing epochs are fine my $git = PublicInbox::Git->new($git_dir); + my $unit = { git => $git, epoch => $i }; + my $tip; if ($reindex_heads) { - $head = $reindex_heads->[$i] or next; + $tip = $head = $reindex_heads->[$i] or next; + } else { + $tip = $git->qx(qw(rev-parse -q --verify), $head); + next if $?; # new repo + chomp $tip; } - chomp(my $tip = $git->qx(qw(rev-parse -q --verify), $head)); - - next if $?; # new repo - my $range = log_range($self, $sync, $git, $i, $tip) or next; + my $range = log_range($sync, $unit, $tip) or next; # can't use 'rev-list --count' if we use --diff-filter - $pr->("$i.git counting $range ... ") if $pr; + $pr->("$pfx $i.git counting $range ... ") if $pr; # Don't bump num_highwater on --reindex by using {D}. # We intentionally do NOT use {D} in the non-reindex case # because we want NNTP article number gaps from unindexed # messages to show up in mirrors, too. $sync->{D} //= $sync->{reindex} ? {} : undef; # OID_BIN => NR - my $stk = log2stack($sync, $git, $range, $self->{ibx}); + my $stk = log2stack($sync, $git, $range); + return 0 if $sync->{quit}; my $nr = $stk ? $stk->num_records : 0; $pr->("$nr\n") if $pr; - $sync->{stacks}->[$i] = $stk if $stk; + $unit->{stack} = $stk; # may be undef + unshift @{$sync->{todo}}, $unit; $regen_max += $nr; } + return 0 if $sync->{quit}; # XXX this should not happen unless somebody bypasses checks in # our code and blindly injects "d" file history into git repos if (my @leftovers = keys %{delete($sync->{D}) // {}}) { warn('W: unindexing '.scalar(@leftovers)." leftovers\n"); - my $arg = { v2w => $self }; - my $all = $self->{ibx}->git; + local $self->{current_info} = 'leftover '; + my $unindex_oid = $self->can('unindex_oid'); for my $oid (@leftovers) { + last if $sync->{quit}; $oid = unpack('H*', $oid); - $self->{current_info} = "leftover $oid"; - $all->cat_async($oid, \&unindex_oid, $arg); + my $req = { %$sync, oid => $oid }; + $self->git->cat_async($oid, $unindex_oid, $req); } - $all->cat_async_wait; + $self->git->async_wait_all; } + return 0 if $sync->{quit}; if (!$regen_max) { $sync->{-regen_fmt} = "%u/?\n"; return 0; @@ -1085,22 +996,25 @@ sub sync_prepare ($$$) { $sync->{-regen_fmt} = "% ${pad}u/$regen_max\n"; $sync->{nr} = \(my $nr = 0); return -1 if $sync->{reindex}; - $regen_max + $self->{mm}->num_highwater() || 0; + $regen_max + $self->artnum_max || 0; } -sub unindex_oid_remote ($$$) { +sub unindex_oid_aux ($$$) { my ($self, $oid, $mid) = @_; my @removed = $self->{oidx}->remove_oid($oid, $mid); + return unless $self->{-need_xapian}; for my $num (@removed) { - my $idx = idx_shard($self, $num % $self->{shards}); - $idx->shard_remove($oid, $num); + idx_shard($self, $num)->ipc_do('xdb_remove', $num); } } sub unindex_oid ($$;$) { # git->cat_async callback - my ($bref, $oid, $type, $size, $sync) = @_; - my $self = $sync->{v2w}; - my $unindexed = $sync->{in_unindex} ? $sync->{unindexed} : undef; + my ($bref, $oid, $type, $size, $arg) = @_; + is_bad_blob($oid, $type, $size, $arg->{oid}) and + return index_finalize($arg, 0); + my $self = $arg->{self}; + local $self->{current_info} = "$self->{current_info} $oid"; + my $unindexed = $arg->{in_unindex} ? $arg->{unindexed} : undef; my $mm = $self->{mm}; my $mids = mids(PublicInbox::Eml->new($bref)); undef $$bref; @@ -1116,50 +1030,55 @@ sub unindex_oid ($$;$) { # git->cat_async callback warn "BUG: multiple articles linked to $oid\n", join(',',sort keys %gone), "\n"; } - foreach my $num (keys %gone) { + # reuse (num => mid) mapping in ascending numeric order + for my $num (sort { $a <=> $b } keys %gone) { + $num += 0; if ($unindexed) { my $mid0 = $mm->mid_for($num); - $unindexed->{$mid0} = $num; + my $oidbin = pack('H*', $oid); + push @{$unindexed->{$oidbin}}, $num, $mid0; } $mm->num_delete($num); } - unindex_oid_remote($self, $oid, $mid); + unindex_oid_aux($self, $oid, $mid); } + index_finalize($arg, 0); } +sub git { $_[0]->{ibx}->git } + # this is rare, it only happens when we get discontiguous history in # a mirror because the source used -purge or -edit -sub unindex ($$$$) { - my ($self, $sync, $git, $unindex_range) = @_; - my $unindexed = $sync->{unindexed} //= {}; # $mid0 => $num +sub unindex_todo ($$$) { + my ($self, $sync, $unit) = @_; + my $unindex_range = delete($unit->{unindex_range}) // return; + my $unindexed = $sync->{unindexed} //= {}; # $oidbin => [$num, $mid0] my $before = scalar keys %$unindexed; # order does not matter, here: - my @cmd = qw(log --raw -r - --no-notes --no-color --no-abbrev --no-renames); - my $fh = $git->popen(@cmd, $unindex_range); - my $all = $self->{ibx}->git; + my $fh = $unit->{git}->popen(qw(log --raw -r --no-notes --no-color + --no-abbrev --no-renames), $unindex_range); local $sync->{in_unindex} = 1; + my $unindex_oid = $self->can('unindex_oid'); while (<$fh>) { /\A:\d{6} 100644 $OID ($OID) [AM]\tm$/o or next; - $all->cat_async($1, \&unindex_oid, $sync); + $self->git->cat_async($1, $unindex_oid, { %$sync, oid => $1 }); } - close $fh or die "git log failed: \$?=$?"; - $all->cat_async_wait; + $fh->close or die "git log failed: \$?=$?"; + $self->git->async_wait_all; return unless $sync->{-opt}->{prune}; my $after = scalar keys %$unindexed; return if $before == $after; # ensure any blob can not longer be accessed via dumb HTTP - PublicInbox::Import::run_die(['git', "--git-dir=$git->{git_dir}", - qw(-c gc.reflogExpire=now gc --prune=all --quiet)]); + run_die($unit->{git}->cmd(qw(-c gc.reflogExpire=now gc + --prune=all --quiet))); } -sub sync_ranges ($$$) { - my ($self, $sync, $epoch_max) = @_; +sub sync_ranges ($$) { + my ($self, $sync) = @_; my $reindex = $sync->{reindex}; - - return last_commits($self, $epoch_max) unless $reindex; + return $self->last_commits($sync) unless $reindex; return [] if ref($reindex) ne 'HASH'; my $ranges = $reindex->{from}; # arrayref; @@ -1171,11 +1090,10 @@ sub sync_ranges ($$$) { sub index_xap_only { # git->cat_async callback my ($bref, $oid, $type, $size, $smsg) = @_; - my $self = $smsg->{v2w}; - my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); - $smsg->{raw_bytes} = $size; - $idx->index_raw($bref, undef, $smsg); - $self->{transact_bytes} += $size; + my $self = delete $smsg->{self}; + my $idx = idx_shard($self, $smsg->{num}); + $idx->index_eml(PublicInbox::Eml->new($bref), $smsg); + $self->{transact_bytes} += $smsg->{bytes}; } sub index_xap_step ($$$;$) { @@ -1190,8 +1108,9 @@ sub index_xap_step ($$$;$) { "$beg..$end (% $step)\n"); } for (my $num = $beg; $num <= $end; $num += $step) { + last if $sync->{quit}; my $smsg = $ibx->over->get_art($num) or next; - $smsg->{v2w} = $self; + $smsg->{self} = $self; $ibx->git->cat_async($smsg->{blob}, \&index_xap_only, $smsg); if ($self->{transact_bytes} >= $self->{batch_bytes}) { ${$sync->{nr}} = $num; @@ -1200,42 +1119,59 @@ sub index_xap_step ($$$;$) { } } -sub index_epoch ($$$) { - my ($self, $sync, $i) = @_; - - my $git_dir = git_dir_n($self, $i); - -d $git_dir or return; # missing epochs are fine - my $git = PublicInbox::Git->new($git_dir); - if (my $unindex_range = delete $sync->{unindex_range}->{$i}) { # rare - unindex($self, $sync, $git, $unindex_range); - } - defined(my $stk = $sync->{stacks}->[$i]) or return; - $sync->{stacks}->[$i] = undef; - my $all = $self->{ibx}->git; - while (my ($f, $at, $ct, $oid) = $stk->pop_rec) { - $self->{current_info} = "$i.git $oid"; +sub index_todo ($$$) { + my ($self, $sync, $unit) = @_; + return if $sync->{quit}; + unindex_todo($self, $sync, $unit); + my $stk = delete($unit->{stack}) or return; + my $all = $self->git; + my $index_oid = $self->can('index_oid'); + my $unindex_oid = $self->can('unindex_oid'); + my $pfx; + if ($unit->{git}->{git_dir} =~ m!/([^/]+)/git/([0-9]+\.git)\z!) { + $pfx = "$1 $2"; # v2 + } else { # v1 + ($pfx) = ($unit->{git}->{git_dir} =~ m!/([^/]+)\z!g); + $pfx //= $unit->{git}->{git_dir}; + } + local $self->{current_info} = "$pfx "; + local $sync->{latest_cmt} = \(my $latest_cmt); + local $sync->{unit} = $unit; + while (my ($f, $at, $ct, $oid, $cmt) = $stk->pop_rec) { + if ($sync->{quit}) { + warn "waiting to quit...\n"; + $all->async_wait_all; + $self->update_last_commit($sync); + return; + } + my $req = { + %$sync, + autime => $at, + cotime => $ct, + oid => $oid, + cur_cmt => $cmt + }; if ($f eq 'm') { - my $arg = { %$sync, autime => $at, cotime => $ct }; if ($sync->{max_size}) { - $all->check_async($oid, \&check_size, $arg); + $req->{git} = $all; + $all->check_async($oid, \&check_size, $req); } else { - $all->cat_async($oid, \&index_oid, $arg); + $all->cat_async($oid, $index_oid, $req); } } elsif ($f eq 'd') { - $all->cat_async($oid, \&unindex_oid, $sync); + $all->cat_async($oid, $unindex_oid, $req); } if (${$sync->{need_checkpoint}}) { reindex_checkpoint($self, $sync); } } - $all->check_async_wait; - $all->cat_async_wait; - update_last_commit($self, $git, $i, $stk->{latest_cmt}); + $all->async_wait_all; + $self->update_last_commit($sync, $stk); } sub xapian_only { my ($self, $opt, $sync, $art_beg) = @_; - my $seq = $opt->{sequential_shard}; + my $seq = $opt->{'sequential-shard'}; $art_beg //= 0; local $self->{parallel} = 0 if $seq; $self->idx_init($opt); # acquire lock @@ -1243,7 +1179,7 @@ sub xapian_only { $sync //= { need_checkpoint => \(my $bool = 0), -opt => $opt, - v2w => $self, + self => $self, nr => \(my $nr = 0), -regen_fmt => "%u/?\n", }; @@ -1251,6 +1187,7 @@ sub xapian_only { if ($seq || !$self->{parallel}) { my $shard_end = $self->{shards} - 1; for my $i (0..$shard_end) { + last if $sync->{quit}; index_xap_step($self, $sync, $art_beg + $i); if ($i != $shard_end) { reindex_checkpoint($self, $sync); @@ -1260,7 +1197,8 @@ sub xapian_only { index_xap_step($self, $sync, $art_beg, 1); } } - $self->{ibx}->git->cat_async_wait; + $self->git->async_wait_all; + $self->{ibx}->cleanup; $self->done; } @@ -1270,28 +1208,41 @@ sub index_sync { $opt //= {}; return xapian_only($self, $opt) if $opt->{xapian_only}; - my $pr = $opt->{-progress}; my $epoch_max; - my $latest = git_dir_latest($self, \$epoch_max); - return unless defined $latest; + my $latest = $self->{ibx}->git_dir_latest(\$epoch_max) // return; + if ($opt->{'fast-noop'}) { # nanosecond (st_ctim) comparison + use Time::HiRes qw(stat); + if (my @mm = stat("$self->{ibx}->{inboxdir}/msgmap.sqlite3")) { + my $c = $mm[10]; # 10 = ctime (nsec NV) + my @hd = stat("$latest/refs/heads"); + my @pr = stat("$latest/packed-refs"); + return if $c > ($hd[10] // 0) && $c > ($pr[10] // 0); + } + } - my $seq = $opt->{sequential_shard}; + my $pr = $opt->{-progress}; + my $seq = $opt->{'sequential-shard'}; my $art_beg; # the NNTP article number we start xapian_only at my $idxlevel = $self->{ibx}->{indexlevel}; local $self->{ibx}->{indexlevel} = 'basic' if $seq; $self->idx_init($opt); # acquire lock - fill_alternates($self, $epoch_max); + $self->{mg}->fill_alternates; $self->{oidx}->rethread_prepare($opt); my $sync = { need_checkpoint => \(my $bool = 0), - unindex_range => {}, # EPOCH => oid_old..oid_new reindex => $opt->{reindex}, -opt => $opt, - v2w => $self, + self => $self, + ibx => $self->{ibx}, + epoch_max => $epoch_max, }; - $sync->{ranges} = sync_ranges($self, $sync, $epoch_max); - if (sync_prepare($self, $sync, $epoch_max)) { + my $quit = PublicInbox::SearchIdx::quit_cb($sync); + local $SIG{QUIT} = $quit; + local $SIG{INT} = $quit; + local $SIG{TERM} = $quit; + + if (sync_prepare($self, $sync)) { # tmp_clone seems to fail if inside a transaction, so # we rollback here (because we opened {mm} for reading) # Note: we do NOT rely on DBI transactions for atomicity; @@ -1303,16 +1254,13 @@ sub index_sync { # xapian_only works incrementally w/o --reindex if ($seq && !$opt->{reindex}) { - $art_beg = $sync->{mm_tmp}->max; - $art_beg++ if defined($art_beg); + $art_beg = $sync->{mm_tmp}->max || -1; + $art_beg++; } } - if ($sync->{max_size} = $opt->{max_size}) { - $sync->{index_oid} = \&index_oid; - } # work forwards through history - index_epoch($self, $sync, $_) for (0..$epoch_max); - $self->{oidx}->rethread_done($opt); + index_todo($self, $sync, $_) for @{delete($sync->{todo}) // []}; + $self->{oidx}->rethread_done($opt) unless $sync->{quit}; $self->done; if (my $nr = $sync->{nr}) { @@ -1320,14 +1268,21 @@ sub index_sync { $pr->('all.git '.sprintf($sync->{-regen_fmt}, $$nr)) if $pr; } + my $quit_warn; # deal with Xapian shards sequentially if ($seq && delete($sync->{mm_tmp})) { - $self->{ibx}->{indexlevel} = $idxlevel; - xapian_only($self, $opt, $sync, $art_beg); + if ($sync->{quit}) { + $quit_warn = 1; + } else { + $self->{ibx}->{indexlevel} = $idxlevel; + xapian_only($self, $opt, $sync, $art_beg); + $quit_warn = 1 if $sync->{quit}; + } } # --reindex on the command-line - if ($opt->{reindex} && !ref($opt->{reindex}) && $idxlevel ne 'basic') { + if (!$sync->{quit} && $opt->{reindex} && + !ref($opt->{reindex}) && $idxlevel ne 'basic') { $self->lock_acquire; my $s0 = PublicInbox::SearchIdx->new($self->{ibx}, 0, 0); if (my $xdb = $s0->idx_acquire) { @@ -1339,12 +1294,27 @@ sub index_sync { } # reindex does not pick up new changes, so we rerun w/o it: - if ($opt->{reindex}) { + if ($opt->{reindex} && !$sync->{quit} && + !grep(defined, @$opt{qw(since until)})) { my %again = %$opt; $sync = undef; delete @again{qw(rethread reindex -skip_lock)}; index_sync($self, \%again); + $opt->{quit} = $again{quit}; # propagate to caller + } + warn <<EOF if $quit_warn; +W: interrupted, --xapian-only --reindex required upon restart +EOF +} + +sub ipc_atfork_child { + my ($self) = @_; + if (my $lei = delete $self->{lei}) { + $lei->_lei_atfork_child; + my $pkt_op_p = delete $lei->{pkt_op_p}; + close($pkt_op_p->{op_p}); } + $self->SUPER::ipc_atfork_child; } 1; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 1d5119cd..44e1f2a8 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -1,13 +1,13 @@ -# Copyright (C) 2014-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Used for displaying the HTML web interface. # See Documentation/design_www.txt for this. package PublicInbox::View; use strict; -use warnings; -use bytes (); # only for bytes::length +use v5.10.1; use List::Util qw(max); +use Text::Wrap qw(wrap); # stdlib, we need Perl 5.6+ for $huge use PublicInbox::MsgTime qw(msg_datestamp); use PublicInbox::Hval qw(ascii_html obfuscate_addrs prurl mid_href ts2str fmt_ts); @@ -20,8 +20,10 @@ use PublicInbox::WwwStream qw(html_oneshot); use PublicInbox::Reply; use PublicInbox::ViewDiff qw(flush_diff); use PublicInbox::Eml; +use POSIX qw(strftime); use Time::Local qw(timegm); use PublicInbox::Smsg qw(subject_normalized); +use PublicInbox::ContentHash qw(content_hash); use constant COLS => 72; use constant INDENT => ' '; use constant TCHILD => '` '; @@ -31,15 +33,17 @@ sub msg_page_i { my ($ctx, $eml) = @_; if ($eml) { # called by WwwStream::async_eml or getline my $smsg = $ctx->{smsg}; - $ctx->{smsg} = $ctx->{over}->next_by_mid(@{$ctx->{next_arg}}); + my $over = $ctx->{ibx}->over; + $ctx->{smsg} = $over ? $over->next_by_mid(@{$ctx->{next_arg}}) + : $ctx->gone('over'); $ctx->{mhref} = ($ctx->{nr} || $ctx->{smsg}) ? "../${\mid_href($smsg->{mid})}/" : ''; - my $obuf = $ctx->{obuf} = _msg_page_prepare_obuf($eml, $ctx); - multipart_text_as_html($eml, $ctx); - delete $ctx->{obuf}; - $$obuf .= '</pre><hr>'; - $$obuf .= html_footer($ctx, $ctx->{first_hdr}) if !$ctx->{smsg}; - $$obuf; + if (_msg_page_prepare($eml, $ctx, $smsg->{ts})) { + $eml->each_part(\&add_text_body, $ctx, 1); + print { $ctx->{zfh} } '</pre><hr>'; + } + html_footer($ctx, $ctx->{first_hdr}) if !$ctx->{smsg}; + ''; # XXX TODO cleanup } else { # called by WwwStream::async_next or getline $ctx->{smsg}; # may be undef } @@ -48,25 +52,25 @@ sub msg_page_i { # /$INBOX/$MSGID/ for unindexed v1 inboxes sub no_over_html ($) { my ($ctx) = @_; - my $bref = $ctx->{-inbox}->msg_by_mid($ctx->{mid}) or return; # 404 + my $bref = $ctx->{ibx}->msg_by_mid($ctx->{mid}) or return; # 404 my $eml = PublicInbox::Eml->new($bref); $ctx->{mhref} = ''; PublicInbox::WwwStream::init($ctx); - my $obuf = $ctx->{obuf} = _msg_page_prepare_obuf($eml, $ctx); - multipart_text_as_html($eml, $ctx); - delete $ctx->{obuf}; - $$obuf .= '</pre><hr>'; - eval { $$obuf .= html_footer($ctx, $eml) }; - html_oneshot($ctx, 200, $obuf); + if (_msg_page_prepare($eml, $ctx)) { # sets {-title_html} + $eml->each_part(\&add_text_body, $ctx, 1); + print { $ctx->{zfh} } '</pre><hr>'; + } + html_footer($ctx, $eml); + $ctx->html_done; } # public functions: (unstable) sub msg_page { my ($ctx) = @_; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; $ctx->{-obfs_ibx} = $ibx->{obfuscate} ? $ibx : undef; - my $over = $ctx->{over} = $ibx->over or return no_over_html($ctx); + my $over = $ibx->over or return no_over_html($ctx); my ($id, $prev); my $next_arg = $ctx->{next_arg} = [ $ctx->{mid}, \$id, \$prev ]; @@ -76,7 +80,8 @@ sub msg_page { # allow user to easily browse the range around this message if # they have ->over $ctx->{-t_max} = $smsg->{ts}; - PublicInbox::WwwStream::aresponse($ctx, 200, \&msg_page_i); + $ctx->{-spfx} = '../' if $ibx->{-repo_objs}; + PublicInbox::WwwStream::aresponse($ctx, \&msg_page_i); } # /$INBOX/$MESSAGE_ID/#R @@ -88,7 +93,7 @@ sub msg_reply ($$) { 'https://en.wikipedia.org/wiki/Posting_style#Interleaved_style'; my $info = ''; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; if (my $url = $ibx->{infourl}) { $url = prurl($ctx->{env}, $url); $info = qq(\n List information: <a\nhref="$url">$url</a>\n); @@ -136,6 +141,9 @@ $info <a href="$se_url">$se_url</a> $link</pre> + + Be sure your reply has a <b>Subject:</b> header at the top and a blank line + before the message body. EOF } @@ -175,6 +183,59 @@ sub nr_to_s ($$$) { $nr == 1 ? "$nr $singular" : "$nr $plural"; } +sub addr2urlmap ($) { + my ($ctx) = @_; + # cache makes a huge difference with /[tT] and large threads + my $key = PublicInbox::Git::host_prefix_url($ctx->{env}, ''); + my $ent = $ctx->{www}->{pi_cfg}->{-addr2urlmap}->{$key} // do { + my $by_addr = $ctx->{www}->{pi_cfg}->{-by_addr}; + my (%addr2url, $url); + while (my ($addr, $ibx) = each %$by_addr) { + $url = $ibx->base_url // $ibx->base_url($ctx->{env}); + $addr2url{$addr} = ascii_html($url) if defined $url; + } + # don't allow attackers to randomly change Host: headers + # and OOM us if the server handles all hostnames: + my $tmp = $ctx->{www}->{pi_cfg}->{-addr2urlmap}; + my @k = keys %$tmp; # random order + delete @$tmp{@k[0..3]} if scalar(@k) > 7; + my $re = join('|', map { quotemeta } keys %addr2url); + $tmp->{$key} = [ qr/\b($re)\b/i, \%addr2url ]; + }; + @$ent; +} + +sub to_cc_html ($$$$) { + my ($ctx, $eml, $field, $t) = @_; + my @vals = $eml->header($field) or return ('', 0); + my (undef, $addr2url) = addr2urlmap($ctx); + my $pairs = PublicInbox::Address::pairs(join(', ', @vals)); + my ($len, $line_len, $html) = (0, 0, ''); + my ($pair, $url); + my ($cur_ibx, $env) = @$ctx{qw(ibx env)}; + # avoid excessive ascii_html calls (already hot in profiles): + my @html = split /\n/, ascii_html(join("\n", map { + $_->[0] // (split(/\@/, $_->[1]))[0]; # addr user if no name + } @$pairs)); + for my $n (@html) { + $pair = shift @$pairs; + if ($line_len) { # 9 = display width of ",\t": + if ($line_len + length($n) > COLS - 9) { + $html .= ",\n\t"; + $len += $line_len; + $line_len = 0; + } else { + $html .= ', '; + $line_len += 2; + } + } + $line_len += length($n); + $url = $addr2url->{lc($pair->[1] // '')}; + $html .= $url ? qq(<a\nhref="$url$t">$n</a>) : $n; + } + ($html, $len + $line_len); +} + # Displays the text of of the message for /$INBOX/$MSGID/[Tt]/ endpoint # this is already inside a <pre> sub eml_entry { @@ -199,7 +260,8 @@ sub eml_entry { my $ds = delete $smsg->{ds}; # for v1 non-Xapian/SQLite users # Deleting these fields saves about 400K as we iterate across 1K msgs - delete @$smsg{qw(ts blob)}; + my ($t, undef) = delete @$smsg{qw(ts blob)}; + $t = $t ? '?t='.ts2str($t) : ''; my $from = _hdr_names_html($eml, 'From'); obfuscate_addrs($obfs_ibx, $from) if $obfs_ibx; @@ -208,9 +270,8 @@ sub eml_entry { my $mhref = $upfx . mid_href($mid_raw) . '/'; $rv .= qq{ (<a\nhref="$mhref">permalink</a> / }; $rv .= qq{<a\nhref="${mhref}raw">raw</a>)\n}; - my $to = fold_addresses(_hdr_names_html($eml, 'To')); - my $cc = fold_addresses(_hdr_names_html($eml, 'Cc')); - my ($tlen, $clen) = (length($to), length($cc)); + my ($to, $tlen) = to_cc_html($ctx, $eml, 'To', $t); + my ($cc, $clen) = to_cc_html($ctx, $eml, 'Cc', $t); my $to_cc = ''; if (($tlen + $clen) > COLS) { $to_cc .= ' To: '.$to."\n" if $tlen; @@ -233,20 +294,22 @@ sub eml_entry { my $html = ascii_html($irt); $rv .= qq(In-Reply-To: <<a\nhref="$href">$html</a>>\n) } - $rv .= "\n"; + say { $ctx->zfh } $rv; # scan through all parts, looking for displayable text $ctx->{mhref} = $mhref; - $ctx->{obuf} = \$rv; - $eml->each_part(\&add_text_body, $ctx, 1); - delete $ctx->{obuf}; + $ctx->{changed_href} = "#e$id"; # for diffstat "files? changed," + $eml->each_part(\&add_text_body, $ctx, 1); # expensive # add the footer - $rv .= "\n<a\nhref=#$id_m\nid=e$id>^</a> ". + $rv = "\n<a\nhref=#$id_m\nid=e$id>^</a> ". "<a\nhref=\"$mhref\">permalink</a>" . " <a\nhref=\"${mhref}raw\">raw</a>" . " <a\nhref=\"${mhref}#R\">reply</a>"; + delete($ctx->{-qry}) and + $rv .= qq[ <a\nhref="${mhref}#related">related</a>]; + my $hr; if (defined(my $pct = $smsg->{pct})) { # used by SearchView.pm $rv .= "\t[relevance $pct%]"; @@ -254,7 +317,6 @@ sub eml_entry { } elsif ($mapping) { my $nested = 'nested'; my $flat = 'flat'; - my $end = ''; if ($ctx->{flat}) { $hr = 1; $flat = "<b>$flat</b>"; @@ -276,8 +338,15 @@ sub eml_entry { sub pad_link ($$;$) { my ($mid, $level, $s) = @_; $s ||= '...'; - my $id = id_compress($mid, 1); - (' 'x19).indent_for($level).th_pfx($level)."<a\nhref=#r$id>($s)</a>\n"; + my $href = defined($mid) ? + ("<a\nhref=#r".id_compress($mid, 1).">($s)</a>\n") : + "($s)\n"; + (' 'x19).indent_for($level).th_pfx($level).$href; +} + +sub _skel_hdr { + # my ($mapping, $mid) = @_; + ($_[0]->{$_[1] // \'bogus'} // [ "(?)\n" ])->[0]; } sub _th_index_lite { @@ -285,8 +354,7 @@ sub _th_index_lite { my $rv = ''; my $mapping = $ctx->{mapping} or return $rv; my $pad = ' '; - my $mid_map = $mapping->{$mid_raw}; - defined $mid_map or + my $mid_map = $mapping->{$mid_raw} // return 'public-inbox BUG: '.ascii_html($mid_raw).' not mapped'; my ($attr, $node, $idx, $level) = @$mid_map; my $children = $node->{children}; @@ -309,39 +377,40 @@ sub _th_index_lite { my $s = ($idx - 1). ' preceding siblings ...'; $rv .= pad_link($pmid, $level, $s); } elsif ($idx == 2) { - my $ppmid = $siblings->[0]->{mid}; - $rv .= $pad . $mapping->{$ppmid}->[0]; + $rv .= $pad . _skel_hdr($mapping, + $siblings->[0] ? + $siblings->[0]->{mid} : undef); } - $rv .= $pad . $mapping->{$pmid}->[0]; + $rv .= $pad . _skel_hdr($mapping, $pmid); } } my $s_s = nr_to_s($nr_s, 'sibling', 'siblings'); my $s_c = nr_to_s($nr_c, 'reply', 'replies'); - $attr =~ s!\n\z!</b>\n!s; + chop $attr; # remove "\n" $attr =~ s!<a\nhref.*</a> (?:" )?!!s; # no point in dup subject $attr =~ s!<a\nhref=[^>]+>([^<]+)</a>!$1!s; # no point linking to self - $rv .= "<b>@ $attr"; + $rv .= "<b>@ $attr</b>\n"; if ($nr_c) { - my $cmid = $children->[0]->{mid}; - $rv .= $pad . $mapping->{$cmid}->[0]; + my $cmid = $children->[0] ? $children->[0]->{mid} : undef; + $rv .= $pad . _skel_hdr($mapping, $cmid); if ($nr_c > 2) { my $s = ($nr_c - 1). ' more replies'; $rv .= pad_link($cmid, $level + 1, $s); } elsif (my $cn = $children->[1]) { - $rv .= $pad . $mapping->{$cn->{mid}}->[0]; + $rv .= $pad . _skel_hdr($mapping, $cn->{mid}); } } my $next = $siblings->[$idx+1] if $siblings && $idx >= 0; if ($next) { my $nmid = $next->{mid}; - $rv .= $pad . $mapping->{$nmid}->[0]; + $rv .= $pad . _skel_hdr($mapping, $nmid); my $nnext = $nr_s - $idx; if ($nnext > 2) { my $s = ($nnext - 1).' subsequent siblings'; $rv .= pad_link($nmid, $level, $s); } elsif (my $nn = $siblings->[$idx + 2]) { - $rv .= $pad . $mapping->{$nn->{mid}}->[0]; + $rv .= $pad . _skel_hdr($mapping, $nn->{mid}); } } $rv .= $pad ."<a\nhref=#r$id>$s_s, $s_c; $ctx->{s_nr}</a>\n"; @@ -370,7 +439,9 @@ sub pre_thread { # walk_thread callback sub thread_eml_entry { my ($ctx, $eml) = @_; my ($beg, $end) = thread_adj_level($ctx, $ctx->{level}); - $beg . '<pre>' . eml_entry($ctx, $eml) . '</pre>' . $end; + print { $ctx->zfh } $beg, '<pre>'; + print { $ctx->{zfh} } eml_entry($ctx, $eml), '</pre>'; + $end; } sub next_in_queue ($$) { @@ -397,15 +468,15 @@ sub stream_thread_i { # PublicInbox::WwwStream::getline callback if (!$ghost_ok) { # first non-ghost $ctx->{-title_html} = ascii_html($smsg->{subject}); - $ctx->zmore($ctx->html_top); + print { $ctx->zfh } $ctx->html_top; } return $smsg; } # buffer the ghost entry and loop - $ctx->zmore(ghost_index_entry($ctx, $lvl, $smsg)); + print { $ctx->zfh } ghost_index_entry($ctx, $lvl, $smsg) } else { # all done - $ctx->zmore(join('', thread_adj_level($ctx, 0))); - $ctx->zmore(${delete($ctx->{skel})}); + print { $ctx->zfh } thread_adj_level($ctx, 0), + ${delete($ctx->{skel})}; return; } } @@ -413,21 +484,23 @@ sub stream_thread_i { # PublicInbox::WwwStream::getline callback sub stream_thread ($$) { my ($rootset, $ctx) = @_; - $ctx->{-queue} = [ map { (0, $_) } @$rootset ]; - PublicInbox::WwwStream::aresponse($ctx, 200, \&stream_thread_i); + @{$ctx->{-queue}} = map { (0, $_) } @$rootset; + PublicInbox::WwwStream::aresponse($ctx, \&stream_thread_i); } # /$INBOX/$MSGID/t/ and /$INBOX/$MSGID/T/ sub thread_html { my ($ctx) = @_; + $ctx->{-upfx} = '../../'; my $mid = $ctx->{mid}; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my ($nr, $msgs) = $ibx->over->get_thread($mid); return missing_thread($ctx) if $nr == 0; + $ctx->{-spfx} = '../../' if $ibx->{-repo_objs}; # link $INBOX_DIR/description text to "index_topics" view around # the newest message in this thread - my $t = ts2str($ctx->{-t_max} = max(map { delete $_->{ts} } @$msgs)); + my $t = ts2str($ctx->{-t_max} = max(map { $_->{ts} } @$msgs)); my $t_fmt = fmt_ts($ctx->{-t_max}); my $skel = '<hr><pre>'; @@ -443,13 +516,12 @@ EOF $skel .= " (download: <a\nhref=\"../t.mbox.gz\">mbox.gz</a>"; $skel .= " / follow: <a\nhref=\"../t.atom\">Atom feed</a>)\n"; $skel .= "-- links below jump to the message on this page --\n"; - $ctx->{-upfx} = '../../'; $ctx->{cur_level} = 0; $ctx->{skel} = \$skel; $ctx->{prev_attr} = ''; $ctx->{prev_level} = 0; - $ctx->{root_anchor} = anchor_for($mid); - $ctx->{mapping} = {}; + $ctx->{root_anchor} = 'm' . id_compress($mid, 1); + $ctx->{mapping} = {}; # mid -> [ header_summary, node, idx, level ] $ctx->{s_nr} = ($nr > 1 ? "$nr+ messages" : 'only message') .' in thread'; @@ -465,7 +537,7 @@ EOF # flat display: lazy load the full message from smsg $ctx->{msgs} = $msgs; $ctx->{-html_tip} = '<pre>'; - PublicInbox::WwwStream::aresponse($ctx, 200, \&thread_html_i); + PublicInbox::WwwStream::aresponse($ctx, \&thread_html_i); } sub thread_html_i { # PublicInbox::WwwStream::getline callback @@ -474,7 +546,7 @@ sub thread_html_i { # PublicInbox::WwwStream::getline callback my $smsg = $ctx->{smsg}; if (exists $ctx->{-html_tip}) { $ctx->{-title_html} = ascii_html($smsg->{subject}); - $ctx->zmore($ctx->html_top); + print { $ctx->zfh } $ctx->html_top; } return eml_entry($ctx, $eml); } else { @@ -482,31 +554,19 @@ sub thread_html_i { # PublicInbox::WwwStream::getline callback return $smsg if exists($smsg->{blob}); } my $skel = delete($ctx->{skel}) or return; # all done - $ctx->zmore($$skel); + print { $ctx->zfh } $$skel; undef; } } -sub multipart_text_as_html { - # ($mime, $ctx) = @_; # each_part may do "$_[0] = undef" - - # scan through all parts, looking for displayable text - $_[0]->each_part(\&add_text_body, $_[1], 1); -} - sub submsg_hdr ($$) { my ($ctx, $eml) = @_; - my $obfs_ibx = $ctx->{-obfs_ibx}; - my $rv = $ctx->{obuf}; - $$rv .= "\n"; + my $s = "\n"; for my $h (qw(From To Cc Subject Date Message-ID X-Alt-Message-ID)) { - my @v = $eml->header($h); - for my $v (@v) { - obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; - $v = ascii_html($v); - $$rv .= "$h: $v\n"; - } + $s .= "$h: $_\n" for $eml->header($h); } + obfuscate_addrs($ctx->{-obfs_ibx}, $s) if $ctx->{-obfs_ibx}; + ascii_html($s); } sub attach_link ($$$$;$) { @@ -517,8 +577,8 @@ sub attach_link ($$$$;$) { # downloads for 0-byte multipart attachments return unless $part->{bdy}; - my $nl = $idx eq '1' ? '' : "\n"; # like join("\n", ...) - my $size = bytes::length($part->body); + my $size = length($part->body); + delete $part->{bdy}; # save memory # hide attributes normally, unless we want to aid users in # spotting MUA problems: @@ -532,46 +592,36 @@ sub attach_link ($$$$;$) { } else { $sfn = 'a.bin'; } - my $rv = $ctx->{obuf}; - $$rv .= qq($nl<a\nhref="$ctx->{mhref}$idx-$sfn">); - if ($err) { - $$rv .= <<EOF; + my $rv = $idx eq '1' ? '' : "\n"; # like join("\n", ...) + $rv .= qq(<a\nhref="$ctx->{mhref}$idx-$sfn">); + $rv .= <<EOF if $err; [-- Warning: decoded text below may be mangled, UTF-8 assumed --] EOF - } - $$rv .= "[-- Attachment #$idx: "; - my $ts = "Type: $ct, Size: $size bytes"; + $rv .= "[-- Attachment #$idx: "; my $desc = $part->header('Content-Description') // $fn // ''; - $desc = ascii_html($desc); - $$rv .= ($desc eq '') ? "$ts --]" : "$desc --]\n[-- $ts --]"; - $$rv .= "</a>\n"; - - submsg_hdr($ctx, $part) if $part->{is_submsg}; - - undef; + $rv .= ascii_html($desc)." --]\n[-- " if $desc ne ''; + $rv .= "Type: $ct, Size: $size bytes --]</a>\n"; + $rv .= submsg_hdr($ctx, $part) if $part->{is_submsg}; + $rv; } sub add_text_body { # callback for each_part my ($p, $ctx) = @_; my $upfx = $ctx->{mhref}; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new; # $p - from each_part: [ Email::MIME-like, depth, $idx ] my ($part, $depth, $idx) = @$p; my $ct = $part->content_type || 'text/plain'; my $fn = $part->filename; my ($s, $err) = msg_part_text($part, $ct); - return attach_link($ctx, $ct, $p, $fn) unless defined $s; - - my $rv = $ctx->{obuf}; - if ($part->{is_submsg}) { - submsg_hdr($ctx, $part); - $$rv .= "\n"; - } + my $zfh = $ctx->zfh; + $s // return print $zfh (attach_link($ctx, $ct, $p, $fn) // ''); + say $zfh submsg_hdr($ctx, $part) if $part->{is_submsg}; # makes no difference to browsers, and don't screw up filename # link generation in diffs with the extra '%0D' - $s =~ s/\r\n/\n/sg; + $s =~ s/\r+\n/\n/sg; # will be escaped to `•' in HTML obfuscate_addrs($ibx, $s, "\x{2022}") if $ibx->{obfuscate}; @@ -590,133 +640,129 @@ sub add_text_body { # callback for each_part $ctx->{-anchors} = {} if $s =~ /^diff --git /sm; $diff = 1; delete $ctx->{-long_path}; - my $spfx; - if ($ibx->{-repo_objs}) { - if (index($upfx, '//') >= 0) { # absolute URL (Atom feeds) - $spfx = $upfx; - $spfx =~ s!/([^/]*)/\z!/!; - } else { - my $n_slash = $upfx =~ tr!/!/!; - if ($n_slash == 0) { - $spfx = '../'; - } elsif ($n_slash == 1) { - $spfx = ''; - } else { # nslash == 2 - $spfx = '../../'; - } - } - } - $ctx->{-spfx} = $spfx; }; - # some editors don't put trailing newlines at the end: - $s .= "\n" unless $s =~ /\n\z/s; - # split off quoted and unquoted blocks: my @sections = PublicInbox::MsgIter::split_quotes($s); undef $s; # free memory if (defined($fn) || ($depth > 0 && !$part->{is_submsg}) || $err) { # badly-encoded message with $err? tell the world about it! - attach_link($ctx, $ct, $p, $fn, $err); - $$rv .= "\n"; + say $zfh attach_link($ctx, $ct, $p, $fn, $err); } - foreach my $cur (@sections) { + delete $part->{bdy}; # save memory + for my $cur (@sections) { # $cur may be huge if ($cur =~ /\A>/) { # we use a <span> here to allow users to specify # their own color for quoted text - $$rv .= qq(<span\nclass="q">); - $$rv .= $l->to_html($cur); - $$rv .= '</span>'; + print $zfh qq(<span\nclass="q">), + $l->to_html($cur), '</span>'; } elsif ($diff) { flush_diff($ctx, \$cur); - } else { - # regular lines, OK - $$rv .= $l->to_html($cur); + } else { # regular lines, OK + print $zfh $l->to_html($cur); } undef $cur; # free memory } } -sub _msg_page_prepare_obuf { - my ($eml, $ctx) = @_; - my $over = $ctx->{-inbox}->over; - my $obfs_ibx = $ctx->{-obfs_ibx}; - my $rv = ''; +sub _msg_page_prepare { + my ($eml, $ctx, $ts) = @_; + my $have_over = !!$ctx->{ibx}->over; my $mids = mids_for_index($eml); my $nr = $ctx->{nr}++; if ($nr) { # unlikely - $rv .= '<pre>'; + if ($ctx->{chash} eq content_hash($eml)) { + warn "W: BUG? @$mids not deduplicated properly\n"; + return; + } + $ctx->{-html_tip} = +qq[<pre>WARNING: multiple messages have this Message-ID (<a +href="d/">diff</a>)</pre><pre>]; } else { $ctx->{first_hdr} = $eml->header_obj; - if ($ctx->{smsg}) { - $rv .= -"<pre>WARNING: multiple messages have this Message-ID\n</pre>"; - } - $rv .= "<pre\nid=b>"; # anchor for body start + $ctx->{chash} = content_hash($eml) if $ctx->{smsg}; # reused MID + $ctx->{-html_tip} = "<pre\nid=b>"; # anchor for body start } - $ctx->{-upfx} = '../' if $over; + $ctx->{-upfx} = '../'; my @title; # (Subject[0], From[0]) + my $hbuf = ''; for my $v ($eml->header('From')) { my @n = PublicInbox::Address::names($v); - $v = ascii_html($v); - $title[1] //= ascii_html(join(', ', @n)); - if ($obfs_ibx) { - obfuscate_addrs($obfs_ibx, $v); - obfuscate_addrs($obfs_ibx, $title[1]); - } - $rv .= "From: $v\n" if $v ne ''; + $title[1] //= join(', ', @n); + $hbuf .= "From: $v\n" if $v ne ''; } - foreach my $h (qw(To Cc)) { + for my $h (qw(To Cc)) { for my $v ($eml->header($h)) { fold_addresses($v); - $v = ascii_html($v); - obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; - $rv .= "$h: $v\n" if $v ne ''; + $hbuf .= "$h: $v\n" if $v ne ''; } } my @subj = $eml->header('Subject'); - if (@subj) { - my $v = ascii_html(shift @subj); - obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; - $rv .= 'Subject: '; - $rv .= $over ? qq(<a\nhref="#r"\nid=t>$v</a>\n) : "$v\n"; - $title[0] = $v; - for $v (@subj) { # multi-Subject message :< - $v = ascii_html($v); - obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; - $rv .= "Subject: $v\n"; - } - } else { # dummy anchor for thread skeleton at bottom of page - $rv .= qq(<a\nhref="#r"\nid=t></a>) if $over; - $title[0] = '(no subject)'; - } - for my $v ($eml->header('Date')) { - $v = ascii_html($v); - obfuscate_addrs($obfs_ibx, $v) if $obfs_ibx; # possible :P - $rv .= "Date: $v\n"; + $hbuf .= "Subject: $_\n" for @subj; + $title[0] = $subj[0] // '(no subject)'; + $hbuf .= "Date: $_\n" for $eml->header('Date'); + $hbuf = ascii_html($hbuf); + my $t = $ts ? '?t='.ts2str($ts) : ''; + my ($re, $addr2url) = addr2urlmap($ctx); + $hbuf =~ s!$re!qq(<a\nhref=").$addr2url->{lc $1}.qq($t">$1</a>)!sge; + $ctx->{-title_html} = ascii_html(join(' - ', @title)); + if (my $obfs_ibx = $ctx->{-obfs_ibx}) { + obfuscate_addrs($obfs_ibx, $hbuf); + obfuscate_addrs($obfs_ibx, $ctx->{-title_html}); } - if (!$nr) { # first (and only) message, common case - $ctx->{-title_html} = join(' - ', @title); - $rv = $ctx->html_top . $rv; + + # [thread overview] link is typically added after Date, + # but added after Subject, or even nothing. + if ($have_over) { + chop $hbuf; # drop "\n", or noop if $rv eq '' + $hbuf .= qq{\t<a\nhref="#r">[thread overview]</a>\n}; + $hbuf =~ s!^Subject:\x20(.*?)(\n[A-Z]|\z) + !Subject: <a\nhref="#r"\nid=t>$1</a>$2!msx or + $hbuf .= qq(<a\nhref="#r\nid=t></a>); } if (scalar(@$mids) == 1) { # common case - my $mhtml = ascii_html($mids->[0]); - $rv .= "Message-ID: <$mhtml> "; - $rv .= "(<a\nhref=\"raw\">raw</a>)\n"; + my $x = ascii_html($mids->[0]); + $hbuf .= qq[Message-ID: <$x> (<a href="raw">raw</a>)\n]; + } + if (!$nr) { # first (and only) message, common case + print { $ctx->zfh } $ctx->html_top, $hbuf; } else { + delete $ctx->{-title_html}; + print { $ctx->zfh } $ctx->{-html_tip}, $hbuf; + } + $ctx->{-linkify} //= PublicInbox::Linkify->new; + $hbuf = ''; + if (scalar(@$mids) != 1) { # unlikely, but it happens :< # X-Alt-Message-ID can happen if a message is injected from # public-inbox-nntpd because of multiple Message-ID headers. - my $lnk = PublicInbox::Linkify->new; - my $s = ''; for my $h (qw(Message-ID X-Alt-Message-ID)) { - $s .= "$h: $_\n" for ($eml->header_raw($h)); + $hbuf .= "$h: $_\n" for ($eml->header_raw($h)); } - $lnk->linkify_mids('..', \$s, 1); - $rv .= $s; + $ctx->{-linkify}->linkify_mids('..', \$hbuf, 1); # escapes HTML + print { $ctx->{zfh} } $hbuf; + $hbuf = ''; } - $rv .= _parent_headers($eml, $over); - $rv .= "\n"; - \$rv; + my @irt = $eml->header_raw('In-Reply-To'); + my $refs; + if (@irt) { # ("so-and-so's message of $DATE") added by some MUAs + for (grep(/=\?/, @irt)) { + s/(=\?.*)\z/PublicInbox::Eml::mhdr_decode $1/se; + } + } else { + $refs = references($eml); + $irt[0] = pop(@$refs) if scalar @$refs; + } + $hbuf .= "In-Reply-To: $_\n" for @irt; + + # do not display References: if search is present, + # we show the thread skeleton at the bottom, instead. + if (!$have_over) { + $refs //= references($eml); + $hbuf .= 'References: <'.join(">\n\t<", @$refs).">\n" if @$refs; + } + $ctx->{-linkify}->linkify_mids('..', \$hbuf); # escapes HTML + say { $ctx->{zfh} } $hbuf; + 1; } sub SKEL_EXPAND () { @@ -729,7 +775,7 @@ sub SKEL_EXPAND () { sub thread_skel ($$$) { my ($skel, $ctx, $hdr) = @_; my $mid = mids($hdr)->[0]; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my ($nr, $msgs) = $ibx->over->get_thread($mid); my $parent = in_reply_to($hdr); $$skel .= "\n<b>Thread overview: </b>"; @@ -738,7 +784,8 @@ sub thread_skel ($$$) { $$skel .= SKEL_EXPAND."\n "; $$skel .= ghost_parent('../', $parent) . "\n"; } else { - $$skel .= '[no followups] '.SKEL_EXPAND."\n"; + $$skel .= "<a\nid=r>[no followups]</a> ". + SKEL_EXPAND."\n"; } $ctx->{next_msg} = undef; $ctx->{parent_msg} = $parent; @@ -752,7 +799,6 @@ sub thread_skel ($$$) { # when multiple Subject: headers are present, so we follow suit: my $subj = $hdr->header('Subject') // ''; $subj = '(no subject)' if $subj eq ''; - $ctx->{prev_subj} = [ split(/ /, subject_normalized($subj)) ]; $ctx->{cur} = $mid; $ctx->{prev_attr} = ''; $ctx->{prev_level} = 0; @@ -765,54 +811,47 @@ sub thread_skel ($$$) { $ctx->{parent_msg} = $parent; } -sub _parent_headers { - my ($hdr, $over) = @_; - my $rv = ''; - my @irt = $hdr->header_raw('In-Reply-To'); - my $refs; - if (@irt) { - my $lnk = PublicInbox::Linkify->new; - $rv .= "In-Reply-To: $_\n" for @irt; - $lnk->linkify_mids('..', \$rv); - } else { - $refs = references($hdr); - my $irt = pop @$refs; - if (defined $irt) { - my $html = ascii_html($irt); - my $href = mid_href($irt); - $rv .= "In-Reply-To: <"; - $rv .= "<a\nhref=\"../$href/\">$html</a>>\n"; - } - } - - # do not display References: if search is present, - # we show the thread skeleton at the bottom, instead. - return $rv if $over; - - $refs //= references($hdr); - if (@$refs) { - @$refs = map { linkify_ref_no_over($_) } @$refs; - $rv .= 'References: '. join("\n\t", @$refs) . "\n"; - } - $rv; -} - -# returns a string buffer +# writes to zbuf sub html_footer { my ($ctx, $hdr) = @_; - my $ibx = $ctx->{-inbox}; my $upfx = '../'; - my $skel; - my $rv = '<pre>'; - if ($ibx->over) { + my (@related, $skel); + my $foot = '<pre>'; + my $qry = delete $ctx->{-qry}; + if ($qry && $ctx->{ibx}->isrch) { + my $q = ''; # search for either ancestor or descendent patches + for (@{$qry->{dfpre}}, @{$qry->{dfpost}}) { + chop if length > 7; # include 1 abbrev "older" patches + $q .= "dfblob:$_ "; + } + chop $q; # omit trailing SP + local $Text::Wrap::columns = COLS; + local $Text::Wrap::huge = 'overflow'; + $q = wrap('', '', $q); + my $rows = ($q =~ tr/\n/\n/) + 1; + $q = ascii_html($q); + $related[0] = <<EOM; +<form id=related +action=$upfx +><pre>find likely ancestor, descendant, or conflicting patches for <a +href=#t>this message</a>: +<textarea name=q cols=${\COLS} rows=$rows>$q</textarea> +<input type=submit value=search +/>\t(<a href=${upfx}_/text/help/#search>help</a>)</pre></form> +EOM + # TODO: related codesearch + # my $csrchv = $ctx->{ibx}->{-csrch} // []; + # push @related, '<pre>'.ascii_html(Dumper($csrchv)).'</pre>'; + } + if ($ctx->{ibx}->over) { my $t = ts2str($ctx->{-t_max}); my $t_fmt = fmt_ts($ctx->{-t_max}); - $skel .= <<EOF; - other threads:[<a + my $fallback = @related ? "\t" : "<a id=related>\t</a>"; + $skel = <<EOF; +${fallback}other threads:[<a href="$upfx?t=$t">~$t_fmt UTC</a>|<a href="$upfx">newest</a>] EOF - thread_skel(\$skel, $ctx, $hdr); my ($next, $prev); my $parent = ' '; @@ -820,48 +859,32 @@ EOF if (my $n = $ctx->{next_msg}) { $n = mid_href($n); - $next = "<a\nhref=\"$upfx$n/\"\nrel=next>next</a>"; + $next = qq(<a\nhref="$upfx$n/"\nrel=next>next</a>); } - my $u; my $par = $ctx->{parent_msg}; - if ($par) { - $u = mid_href($par); - $u = "$upfx$u/"; - } + my $u = $par ? $upfx.mid_href($par).'/' : undef; if (my $p = $ctx->{prev_msg}) { $prev = mid_href($p); if ($p && $par && $p eq $par) { - $prev = "<a\nhref=\"$upfx$prev/\"\n" . + $prev = qq(<a\nhref="$upfx$prev/"\n) . 'rel=prev>prev parent</a>'; $parent = ''; } else { - $prev = "<a\nhref=\"$upfx$prev/\"\n" . + $prev = qq(<a\nhref="$upfx$prev/"\n) . 'rel=prev>prev</a>'; - $parent = " <a\nhref=\"$u\">parent</a>" if $u; + $parent = qq( <a\nhref="$u">parent</a>) if $u; } } elsif ($u) { # unlikely - $parent = " <a\nhref=\"$u\"\nrel=prev>parent</a>"; + $parent = qq( <a\nhref="$u"\nrel=prev>parent</a>); } - $rv .= "$next $prev$parent "; + $foot .= "$next $prev$parent "; } else { # unindexed inboxes w/o over $skel = qq( <a\nhref="$upfx">latest</a>); } - $rv .= qq(<a\nhref="#R">reply</a>); - $rv .= $skel; - $rv .= '</pre>'; - $rv .= msg_reply($ctx, $hdr); -} - -sub linkify_ref_no_over { - my ($mid) = @_; - my $href = mid_href($mid); - my $html = ascii_html($mid); - "<<a\nhref=\"../$href/\">$html</a>>"; -} - -sub anchor_for { - my ($msgid) = @_; - 'm' . id_compress($msgid, 1); + # $skel may be big for big threads, don't append it to $foot + print { $ctx->zfh } $foot, qq(<a\nhref="#R">reply</a>), + $skel, '</pre>', @related, + msg_reply($ctx, $hdr); } sub ghost_parent { @@ -922,8 +945,8 @@ sub thread_results { my $tip = splice(@$rootset, $idx, 1); @$rootset = reverse @$rootset; unshift @$rootset, $tip; - $ctx->{sl_note} = strict_loose_note($nr); } + $ctx->{sl_note} = strict_loose_note($nr); } $rootset } @@ -965,7 +988,7 @@ sub skel_dump { # walk_thread callback $$skel .= delete($ctx->{sl_note}) || ''; } - my $f = ascii_html($smsg->{from_name}); + my $f = ascii_html(delete $smsg->{from_name}); my $obfs_ibx = $ctx->{-obfs_ibx}; obfuscate_addrs($obfs_ibx, $f) if $obfs_ibx; @@ -1059,11 +1082,13 @@ sub _skel_ghost { 1; } +# note: we favor Date: here because git-send-email increments it +# to preserve [PATCH $N/$M] ordering in series (it can't control Received:) sub sort_ds { - [ sort { + @{$_[0]} = sort { (eval { $a->topmost->{ds} } || 0) <=> (eval { $b->topmost->{ds} } || 0) - } @{$_[0]} ]; + } @{$_[0]}; } # accumulate recent topics if search is supported @@ -1072,7 +1097,7 @@ sub acc_topic { # walk_thread callback my ($ctx, $level, $smsg) = @_; my $mid = $smsg->{mid}; my $has_blob = $smsg->{blob} // do { - if (my $by_mid = $ctx->{-inbox}->smsg_by_mid($mid)) { + if (my $by_mid = $ctx->{ibx}->smsg_by_mid($mid)) { %$smsg = (%$smsg, %$by_mid); 1; } @@ -1080,9 +1105,10 @@ sub acc_topic { # walk_thread callback if ($has_blob) { my $subj = subject_normalized($smsg->{subject}); $subj = '(no subject)' if $subj eq ''; + my $ts = $smsg->{ts}; my $ds = $smsg->{ds}; if ($level == 0) { # new, top-level topic - my $topic = [ $ds, 1, { $subj => $mid }, $subj ]; + my $topic = [ $ts, $ds, 1, { $subj => $mid }, $subj ]; $ctx->{-cur_topic} = $topic; push @{$ctx->{order}}, $topic; return 1; @@ -1090,10 +1116,11 @@ sub acc_topic { # walk_thread callback # continue existing topic my $topic = $ctx->{-cur_topic}; # should never be undef - $topic->[0] = $ds if $ds > $topic->[0]; - $topic->[1]++; # bump N+ message counter - my $seen = $topic->[2]; - if (scalar(@$topic) == 3) { # parent was a ghost + $topic->[0] = $ts if $ts > $topic->[0]; + $topic->[1] = $ds if $ds > $topic->[1]; + $topic->[2]++; # bump N+ message counter + my $seen = $topic->[3]; + if (scalar(@$topic) == 4) { # parent was a ghost push @$topic, $subj; } elsif (!defined($seen->{$subj})) { push @$topic, $level, $subj; # @extra messages @@ -1101,7 +1128,7 @@ sub acc_topic { # walk_thread callback $seen->{$subj} = $mid; # latest for subject } else { # ghost message return 1 if $level != 0; # ignore child ghosts - my $topic = $ctx->{-cur_topic} = [ -666, 0, {} ]; + my $topic = $ctx->{-cur_topic} = [ -666, -666, 0, {} ]; push @{$ctx->{order}}, $topic; } 1; @@ -1116,12 +1143,13 @@ sub dump_topics { } my @out; - my $ibx = $ctx->{-inbox}; - my $obfs_ibx = $ibx->{obfuscate} ? $ibx : undef; - + my $obfs_ibx = $ctx->{ibx}->{obfuscate} ? $ctx->{ibx} : undef; + if (my $note = delete $ctx->{t_note}) { + push @out, $note; # "messages from ... to ..." + } # sort by recency, this allows new posts to "bump" old topics... foreach my $topic (sort { $b->[0] <=> $a->[0] } @$order) { - my ($ds, $n, $seen, $top_subj, @extra) = @$topic; + my ($ts, $ds, $n, $seen, $top_subj, @extra) = @$topic; @$topic = (); next unless defined $top_subj; # ghost topic my $mid = delete $seen->{$top_subj}; @@ -1141,13 +1169,11 @@ sub dump_topics { $anchor = '#t'; # thread skeleton } - my $mbox = qq(<a\nhref="$href/t.mbox.gz">mbox.gz</a>); - my $atom = qq(<a\nhref="$href/t.atom">Atom</a>); my $s = "<a\nhref=\"$href/T/$anchor\">$top_subj</a>\n" . - " $ds UTC $n - $mbox / $atom\n"; - for (my $i = 0; $i < scalar(@extra); $i += 2) { - my $level = $extra[$i]; - my $subj = $extra[$i + 1]; # already normalized + " $ds UTC $n\n"; + while (@extra) { + my $level = shift @extra; + my $subj = shift @extra; # already normalized $mid = delete $seen->{$subj}; my @subj = split(/ /, $subj); my @next_prev = @subj; # full copy @@ -1179,7 +1205,11 @@ sub pagination_footer ($$) { $next = $next ? "$next | " : ' | '; $prev .= qq[ | <a\nhref="$latest">latest</a>]; } - "<hr><pre>page: $next$prev</pre>"; + my $rv = '<hr><pre id=nav>'; + $rv .= "page: $next$prev\n" if $next || $prev; + $rv .= q{- recent:[<b>subjects (threaded)</b>|<a +href="./topics_new.html">topics (new)</a>|<a +href="./topics_active.html">topics (active)</a>]</pre>}; } sub paginate_recent ($$) { @@ -1194,23 +1224,30 @@ sub paginate_recent ($$) { $t =~ s/\A([0-9]{8,14})-// and $after = str2ts($1); $t =~ /\A([0-9]{8,14})\z/ and $before = str2ts($1); - my $ibx = $ctx->{-inbox}; - my $msgs = $ibx->recent($opts, $after, $before); - my $nr = scalar @$msgs; - if ($nr < $lim && defined($after)) { + my $msgs = $ctx->{ibx}->over->recent($opts, $after, $before); + if (defined($after) && scalar(@$msgs) < $lim) { $after = $before = undef; - $msgs = $ibx->recent($opts); - $nr = scalar @$msgs; + $msgs = $ctx->{ibx}->over->recent($opts); } - my $more = $nr == $lim; + my $more = scalar(@$msgs) == $lim; my ($newest, $oldest); - if ($nr) { + if (@$msgs) { $newest = $msgs->[0]->{ts}; $oldest = $msgs->[-1]->{ts}; # if we only had $after, our SQL query in ->recent ordered if ($newest < $oldest) { ($oldest, $newest) = ($newest, $oldest); - $more = 0 if defined($after) && $after < $oldest; + $more = undef if defined($after) && $after < $oldest; + } + if (defined($after // $before)) { + my $n = strftime('%Y-%m-%d %H:%M:%S', gmtime($newest)); + my $o = strftime('%Y-%m-%d %H:%M:%S', gmtime($oldest)); + $ctx->{t_note} = <<EOM; + messages from $o to $n UTC [<a href="#nav">more...</a>] +EOM + my $s = ts2str($newest); + $ctx->{prev_page} = qq[<a\nhref="?t=$s-"\nrel=prev>] . + 'prev (newer)</a>'; } } if (defined($oldest) && $more) { @@ -1218,11 +1255,6 @@ sub paginate_recent ($$) { $ctx->{next_page} = qq[<a\nhref="?t=$s"\nrel=next>] . 'next (older)</a>'; } - if (defined($newest) && (defined($before) || defined($after))) { - my $s = ts2str($newest); - $ctx->{prev_page} = qq[<a\nhref="?t=$s-"\nrel=prev>] . - 'prev (newer)</a>'; - } $msgs; } @@ -1230,11 +1262,8 @@ sub paginate_recent ($$) { sub index_topics { my ($ctx) = @_; my $msgs = paginate_recent($ctx, 200); # 200 is our window - if (@$msgs) { - walk_thread(thread_results($ctx, $msgs), $ctx, \&acc_topic); - } - html_oneshot($ctx, dump_topics($ctx), \pagination_footer($ctx, '.')); - + walk_thread(thread_results($ctx, $msgs), $ctx, \&acc_topic) if @$msgs; + html_oneshot($ctx, dump_topics($ctx), pagination_footer($ctx, '.')); } sub thread_adj_level { @@ -1264,8 +1293,34 @@ sub thread_adj_level { sub ghost_index_entry { my ($ctx, $level, $node) = @_; my ($beg, $end) = thread_adj_level($ctx, $level); - $beg . '<pre>'. ghost_parent($ctx->{-upfx}, $node->{mid}) + $beg . '<pre>'. ghost_parent($ctx->{-upfx}, $node->{mid} // '?') . '</pre>' . $end; } +# /$INBOX/$MSGID/d/ endpoint +sub diff_msg { + my ($ctx) = @_; + require PublicInbox::MailDiff; + my $ibx = $ctx->{ibx}; + my $over = $ibx->over or return no_over_html($ctx); + my ($id, $prev); + my $md = bless { ctx => $ctx }, 'PublicInbox::MailDiff'; + my $next_arg = $md->{next_arg} = [ $ctx->{mid}, \$id, \$prev ]; + my $smsg = $md->{smsg} = $over->next_by_mid(@$next_arg) or + return; # undef == 404 + $ctx->{-t_max} = $smsg->{ts}; + $ctx->{-upfx} = '../../'; + $ctx->{-apfx} = '//'; # fail on to_attr() + $ctx->{-linkify} = PublicInbox::Linkify->new; + my $mid = ascii_html($smsg->{mid}); + $ctx->{-title_html} = "diff for duplicates of <$mid>"; + PublicInbox::WwwStream::html_init($ctx); + print { $ctx->{zfh} } '<pre>diff for duplicates of <<a href="../">', + $mid, "</a>>\n\n"; + sub { + $ctx->attach($_[0]->([200, delete $ctx->{-res_hdr}])); + $md->begin_mail_diff; + }; +} + 1; diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm index 7ec57d8d..d078c5f9 100644 --- a/lib/PublicInbox/ViewDiff.pm +++ b/lib/PublicInbox/ViewDiff.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # used by PublicInbox::View @@ -7,17 +7,13 @@ # (or reconstruct) blobs. package PublicInbox::ViewDiff; -use 5.010_001; -use strict; -use warnings; -use base qw(Exporter); -our @EXPORT_OK = qw(flush_diff); +use v5.12; +use parent qw(Exporter); +our @EXPORT_OK = qw(flush_diff uri_escape_path); use URI::Escape qw(uri_escape_utf8); -use PublicInbox::Hval qw(ascii_html to_attr); +use PublicInbox::Hval qw(ascii_html to_attr utf8_maybe); use PublicInbox::Git qw(git_unquote); -sub UNSAFE () { "^A-Za-z0-9\-\._~/" } - my $OID_NULL = '0{7,}'; my $OID_BLOB = '[a-f0-9]{7,}'; my $LF = qr!\n!; @@ -30,7 +26,7 @@ my $DIFFSTAT_COMMENT = my $NULL_TO_BLOB = qr/^(index $OID_NULL\.\.)($OID_BLOB)\b/ms; my $BLOB_TO_NULL = qr/^index ($OID_BLOB)(\.\.$OID_NULL)\b/ms; my $BLOB_TO_BLOB = qr/^index ($OID_BLOB)\.\.($OID_BLOB)/ms; -my $EXTRACT_DIFFS = qr/( +our $EXTRACT_DIFFS = qr/( (?: # begin header stuff, don't capture filenames, here, # but instead wait for the --- and +++ lines. (?:^diff\x20--git\x20$FN\x20$FN$LF) @@ -41,24 +37,26 @@ my $EXTRACT_DIFFS = qr/( ^index\x20($OID_BLOB)\.\.($OID_BLOB)$ANY*$LF ^---\x20($FN)$LF ^\+{3}\x20($FN)$LF)/msx; -my $IS_OID = qr/\A$OID_BLOB\z/s; +our $IS_OID = qr/\A$OID_BLOB\z/s; + +sub uri_escape_path { + # '/' + $URI::Escape::Unsafe{RFC3986} + uri_escape_utf8($_[0], "^A-Za-z0-9\-\._~/"); +} # link to line numbers in blobs -sub diff_hunk ($$$$) { - my ($dst, $dctx, $ca, $cb) = @_; +sub diff_hunk ($$$) { + my ($dctx, $ca, $cb) = @_; my ($oid_a, $oid_b, $spfx) = @$dctx{qw(oid_a oid_b spfx)}; if (defined($spfx) && defined($oid_a) && defined($oid_b)) { - my ($n) = ($ca =~ /^-([0-9]+)/); - $n = defined($n) ? "#n$n" : ''; + my $n = ($ca =~ /^-([0-9]+)/) ? "#n$1" : ''; + my $x = qq(@@ <a\nhref="$spfx$oid_a/s/$dctx->{Q}$n">$ca</a>); - $$dst .= qq(@@ <a\nhref="$spfx$oid_a/s/$dctx->{Q}$n">$ca</a>); - - ($n) = ($cb =~ /^\+([0-9]+)/); - $n = defined($n) ? "#n$n" : ''; - $$dst .= qq( <a\nhref="$spfx$oid_b/s/$dctx->{Q}$n">$cb</a> @@); + $n = ($cb =~ /^\+([0-9]+)/) ? "#n$1" : ''; + $x .= qq( <a\nhref="$spfx$oid_b/s/$dctx->{Q}$n">$cb</a> @@); } else { - $$dst .= "@@ $ca $cb @@"; + "@@ $ca $cb @@"; } } @@ -68,8 +66,8 @@ sub oid ($$$) { } # returns true if diffstat anchor written, false otherwise -sub anchor0 ($$$$) { - my ($dst, $ctx, $fn, $rest) = @_; +sub anchor0 ($$$) { + my ($ctx, $fn, $rest) = @_; my $orig = $fn; @@ -79,27 +77,24 @@ sub anchor0 ($$$$) { # which works well in practice. If projects put "=>", or trailing # spaces in filenames, oh well :P $fn =~ s/$DIFFSTAT_COMMENT//; - $fn =~ s/{(?:.+) => (.+)}/$1/ or $fn =~ s/.* => (.+)/$1/; + $fn =~ s/\{(?:.+) => (.+)\}/$1/ or $fn =~ s/.* => (.+)/$1/; $fn = git_unquote($fn); # long filenames will require us to check in anchor1() push(@{$ctx->{-long_path}}, $fn) if $fn =~ s!\A\.\.\./?!!; - if (my $attr = to_attr($ctx->{-apfx}.$fn)) { - $ctx->{-anchors}->{$attr} = 1; - my $spaces = ($orig =~ s/( +)\z//) ? $1 : ''; - $$dst .= " <a\nid=i$attr\nhref=#$attr>" . - ascii_html($orig) . '</a>' . $spaces . + my $attr = to_attr($ctx->{-apfx}.$fn) // return; + $ctx->{-anchors}->{$attr} = 1; + my $spaces = ($orig =~ s/( +)\z//) ? $1 : ''; + print { $ctx->{zfh} } " <a\nid=i$attr\nhref=#$attr>", + ascii_html($orig), '</a>', $spaces, $ctx->{-linkify}->to_html($rest); - return 1; - } - undef; } # returns "diff --git" anchor destination, undef otherwise sub anchor1 ($$) { my ($ctx, $pb) = @_; - my $attr = to_attr($ctx->{-apfx}.$pb) or return; + my $attr = to_attr($ctx->{-apfx}.$pb) // return; my $ok = delete $ctx->{-anchors}->{$attr}; @@ -107,10 +102,10 @@ sub anchor1 ($$) { # assume diffstat and diff output follow the same order, # and ignore different ordering (could be malicious input) unless ($ok) { - my $fn = shift(@{$ctx->{-long_path}}) or return; + my $fn = shift(@{$ctx->{-long_path}}) // return; $pb =~ /\Q$fn\E\z/s or return; - $attr = to_attr($ctx->{-apfx}.$fn) or return; - $ok = delete $ctx->{-anchors}->{$attr} or return; + $attr = to_attr($ctx->{-apfx}.$fn) // return; + $ok = delete $ctx->{-anchors}->{$attr} // return; } $ok ? "<a\nhref=#i$attr\nid=$attr>diff</a> --git" : undef } @@ -122,18 +117,14 @@ sub diff_header ($$$) { my $dctx = { spfx => $spfx }; # get rid of leading "a/" or "b/" (or whatever --{src,dst}-prefix are) - $pa = (split('/', git_unquote($pa), 2))[1] if $pa ne '/dev/null'; - $pb = (split('/', git_unquote($pb), 2))[1] if $pb ne '/dev/null'; + $pa = (split(m'/', git_unquote($pa), 2))[1] if $pa ne '/dev/null'; + $pb = (split(m'/', git_unquote($pb), 2))[1] if $pb ne '/dev/null'; if ($pa eq $pb && $pb ne '/dev/null') { - $dctx->{Q} = "?b=".uri_escape_utf8($pb, UNSAFE); + $dctx->{Q} = '?b='.uri_escape_path($pb); } else { my @q; - if ($pb ne '/dev/null') { - push @q, 'b='.uri_escape_utf8($pb, UNSAFE); - } - if ($pa ne '/dev/null') { - push @q, 'a='.uri_escape_utf8($pa, UNSAFE); - } + push @q, 'b='.uri_escape_path($pb) if $pb ne '/dev/null'; + push @q, 'a='.uri_escape_path($pa) if $pa ne '/dev/null'; $dctx->{Q} = '?'.join('&', @q); } @@ -143,44 +134,48 @@ sub diff_header ($$$) { # no need to capture oid_a and oid_b on add/delete, # we just linkify OIDs directly via s///e in conditional - if (($$x =~ s/$NULL_TO_BLOB/$1 . oid($dctx, $spfx, $2)/e) || - ($$x =~ s/$BLOB_TO_NULL/ - 'index ' . oid($dctx, $spfx, $1) . $2/e)) { + if ($$x =~ s/$NULL_TO_BLOB/$1 . oid($dctx, $spfx, $2)/e) { + push @{$ctx->{-qry}->{dfpost}}, $2; + } elsif ($$x =~ s/$BLOB_TO_NULL/'index '.oid($dctx, $spfx, $1).$2/e) { + push @{$ctx->{-qry}->{dfpre}}, $1; } elsif ($$x =~ $BLOB_TO_BLOB) { # modification-only, not add/delete: # linkify hunk headers later using oid_a and oid_b @$dctx{qw(oid_a oid_b)} = ($1, $2); + push @{$ctx->{-qry}->{dfpre}}, $1; + push @{$ctx->{-qry}->{dfpost}}, $2; } else { warn "BUG? <$$x> had no ^index line"; } $$x =~ s!^diff --git!anchor1($ctx, $pb) // 'diff --git'!ems; - my $dst = $ctx->{obuf}; - $$dst .= qq(<span\nclass="head">); - $$dst .= $$x; - $$dst .= '</span>'; + print { $ctx->{zfh} } qq(<span\nclass="head">), $$x, '</span>'; $dctx; } sub diff_before_or_after ($$) { my ($ctx, $x) = @_; - my $linkify = $ctx->{-linkify}; - my $dst = $ctx->{obuf}; - my $anchors = exists($ctx->{-anchors}) ? 1 : 0; - for my $y (split(/(^---\n)/sm, $$x)) { - if ($y =~ /\A---\n\z/s) { - $$dst .= "---\n"; # all HTML is "\r\n" => "\n" - $anchors |= 2; - } elsif ($anchors == 3 && $y =~ /^ [0-9]+ files? changed, /sm) { - # ok, looks like a diffstat, go line-by-line: - for my $l (split(/^/m, $y)) { - if ($l =~ /^ (.+)( +\| .*\z)/s) { - anchor0($dst, $ctx, $1, $2) and next; - } - $$dst .= $linkify->to_html($l); - } - } else { # commit message, notes, etc - $$dst .= $linkify->to_html($y); + if (exists $ctx->{-anchors} && $$x =~ # diffstat lines: + /((?:^\x20(?:[^\n]+?)(?:\x20+\|\x20[^\n]*\n))+) + (\x20[0-9]+\x20files?\x20)changed,/msx) { + my $pre = substr($$x, 0, $-[0]); # (likely) short prefix + substr($$x, 0, $+[0], ''); # sv_chop on $$x ($$x may be long) + my @x = ($2, $1); + my $lnk = $ctx->{-linkify}; + my $zfh = $ctx->{zfh}; + # uninteresting prefix + print $zfh $lnk->to_html($pre); + for my $l (split(/^/m, pop(@x))) { # $2 per-file stat lines + $l =~ /^ (.+)( +\| .*\z)/s and + anchor0($ctx, $1, $2) and next; + print $zfh $lnk->to_html($l); } + my $ch = $ctx->{changed_href} // '#related'; + print $zfh pop(@x), # $3 /^ \d+ files? / + qq(<a href="$ch">changed</a>,), + # insertions/deletions, notes, commit message, etc: + $lnk->to_html($$x); + } else { + print { $ctx->{zfh} } $ctx->{-linkify}->to_html($$x); } } @@ -191,9 +186,9 @@ sub flush_diff ($$) { my @top = split($EXTRACT_DIFFS, $$cur); undef $$cur; # free memory - my $linkify = $ctx->{-linkify}; - my $dst = $ctx->{obuf}; + my $lnk = $ctx->{-linkify}; my $dctx; # {}, keys: Q, oid_a, oid_b + my $zfh = $ctx->zfh; while (defined(my $x = shift @top)) { if (scalar(@top) >= 4 && @@ -201,7 +196,8 @@ sub flush_diff ($$) { $top[0] =~ $IS_OID) { $dctx = diff_header(\$x, $ctx, \@top); } elsif ($dctx) { - my $after = ''; + open(my $afh, '>>:utf8', \(my $after='')) or + die "open: $!"; # Quiet "Complex regular subexpression recursion limit" # warning. Perl will truncate matches upon hitting @@ -216,29 +212,33 @@ sub flush_diff ($$) { for my $s (split(/((?:(?:^\+[^\n]*\n)+)| (?:(?:^-[^\n]*\n)+)| (?:^@@ [^\n]+\n))/xsm, $x)) { + undef $x; if (!defined($dctx)) { - $after .= $s; + print $afh $s; } elsif ($s =~ s/\A@@ (\S+) (\S+) @@//) { - $$dst .= qq(<span\nclass="hunk">); - diff_hunk($dst, $dctx, $1, $2); - $$dst .= $linkify->to_html($s); - $$dst .= '</span>'; - } elsif ($s =~ /\A\+/) { - $$dst .= qq(<span\nclass="add">); - $$dst .= $linkify->to_html($s); - $$dst .= '</span>'; + print $zfh qq(<span\nclass="hunk">), + diff_hunk($dctx, $1, $2), + $lnk->to_html($s), + '</span>'; + } elsif ($s =~ /\A\+/) { # $s may be huge + print $zfh qq(<span\nclass="add">), + $lnk->to_html($s), + '</span>'; } elsif ($s =~ /\A-- $/sm) { # email sig starts $dctx = undef; - $after .= $s; - } elsif ($s =~ /\A-/) { - $$dst .= qq(<span\nclass="del">); - $$dst .= $linkify->to_html($s); - $$dst .= '</span>'; - } else { - $$dst .= $linkify->to_html($s); + print $afh $s; + } elsif ($s =~ /\A-/) { # $s may be huge + print $zfh qq(<span\nclass="del">), + $lnk->to_html($s), + '</span>'; + } else { # $s may be huge + print $zfh $lnk->to_html($s); } } - diff_before_or_after($ctx, \$after) unless $dctx; + if (!$dctx) { + utf8_maybe($after); + diff_before_or_after($ctx, \$after); + } } else { diff_before_or_after($ctx, \$x); } diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm index 87927d5e..83a83698 100644 --- a/lib/PublicInbox/ViewVCS.pm +++ b/lib/PublicInbox/ViewVCS.pm @@ -1,8 +1,7 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # show any VCS object, similar to "git show" -# FIXME: we only show blobs for now # # This can use a "solver" to reconstruct blobs based on git # patches (with abbreviated OIDs in the header). However, the @@ -15,13 +14,23 @@ package PublicInbox::ViewVCS; use strict; -use warnings; -use bytes (); # only for bytes::length +use v5.10.1; +use File::Temp 0.19 (); # newdir use PublicInbox::SolverGit; +use PublicInbox::Git; +use PublicInbox::GitAsyncCat; use PublicInbox::WwwStream qw(html_oneshot); use PublicInbox::Linkify; use PublicInbox::Tmpfile; -use PublicInbox::Hval qw(ascii_html to_filename); +use PublicInbox::ViewDiff qw(flush_diff uri_escape_path); +use PublicInbox::View; +use PublicInbox::Eml; +use PublicInbox::OnDestroy; +use Text::Wrap qw(wrap); +use PublicInbox::Hval qw(ascii_html to_filename prurl utf8_maybe); +use POSIX qw(strftime); +use autodie qw(open seek truncate); +use Fcntl qw(SEEK_SET); my $hl = eval { require PublicInbox::HlMod; PublicInbox::HlMod->new; @@ -30,139 +39,565 @@ my $hl = eval { my %QP_MAP = ( A => 'oid_a', a => 'path_a', b => 'path_b' ); our $MAX_SIZE = 1024 * 1024; # TODO: configurable my $BIN_DETECT = 8000; # same as git +my $SHOW_FMT = '--pretty=format:'.join('%n', '%P', '%p', '%H', '%T', '%s', '%f', + '%an <%ae> %ai', '%cn <%ce> %ci', '%b%x00'); -sub html_page ($$$) { - my ($ctx, $code, $strref) = @_; +my %GIT_MODE = ( + '100644' => ' ', # blob + '100755' => 'x', # executable blob + '040000' => 'd', # tree + '120000' => 'l', # symlink + '160000' => 'g', # commit (gitlink) +); + +# TODO: not fork safe, but we don't fork w/o exec in PublicInbox::WWW +my (@solver_q, $solver_lim); +my $solver_nr = 0; + +sub html_page ($$;@) { + my ($ctx, $code) = @_[0, 1]; my $wcb = delete $ctx->{-wcb}; - $ctx->{-upfx} = '../../'; # from "/$INBOX/$OID/s/" - my $res = html_oneshot($ctx, $code, $strref); + $ctx->{-upfx} //= '../../'; # from "/$INBOX/$OID/s/" + my $res = html_oneshot($ctx, $code, @_[2..$#_]); $wcb ? $wcb->($res) : $res; } +sub dbg_log ($) { + my ($ctx) = @_; + my $log = delete $ctx->{lh} // die 'BUG: already captured debug log'; + if (!CORE::seek($log, 0, SEEK_SET)) { + warn "seek(log): $!"; + return '<pre>debug log seek error</pre>'; + } + $log = eval { PublicInbox::IO::read_all $log } // do { + warn "read(log): $@"; + return '<pre>debug log read error</pre>'; + }; + return '' if $log eq ''; + $ctx->{-linkify} //= PublicInbox::Linkify->new; + "<hr><pre>debug log:\n\n". + $ctx->{-linkify}->to_html($log).'</pre>'; +} + sub stream_blob_parse_hdr { # {parse_hdr} for Qspawn my ($r, $bref, $ctx) = @_; - my ($res, $logref) = delete @$ctx{qw(-res -logref)}; - my ($git, $oid, $type, $size, $di) = @$res; + my ($git, $oid, $type, $size, $di) = @{$ctx->{-res}}; my @cl = ('Content-Length', $size); - if (!defined $r) { # error - html_page($ctx, 500, $logref); + if (!defined $r) { # sysread error + html_page($ctx, 500, dbg_log($ctx)); } elsif (index($$bref, "\0") >= 0) { [200, [qw(Content-Type application/octet-stream), @cl] ]; } else { - my $n = bytes::length($$bref); + my $n = length($$bref); if ($n >= $BIN_DETECT || $n == $size) { return [200, [ 'Content-Type', 'text/plain; charset=UTF-8', @cl ] ]; } if ($r == 0) { - warn "premature EOF on $oid $$logref\n"; - return html_page($ctx, 500, $logref); + my $log = dbg_log($ctx); + warn "premature EOF on $oid $log"; + return html_page($ctx, 500, $log); } - @$ctx{qw(-res -logref)} = ($res, $logref); undef; # bref keeps growing } } -sub stream_large_blob ($$$$) { - my ($ctx, $res, $logref, $fn) = @_; - $ctx->{-logref} = $logref; +sub stream_large_blob ($$) { + my ($ctx, $res) = @_; $ctx->{-res} = $res; my ($git, $oid, $type, $size, $di) = @$res; - my $cmd = ['git', "--git-dir=$git->{git_dir}", 'cat-file', $type, $oid]; + my $cmd = $git->cmd('cat-file', $type, $oid); my $qsp = PublicInbox::Qspawn->new($cmd); - my $env = $ctx->{env}; - $env->{'qspawn.wcb'} = delete $ctx->{-wcb}; - $qsp->psgi_return($env, undef, \&stream_blob_parse_hdr, $ctx); + $ctx->{env}->{'qspawn.wcb'} = $ctx->{-wcb}; + $qsp->psgi_yield($ctx->{env}, undef, \&stream_blob_parse_hdr, $ctx); } -sub show_other_result ($$) { +sub show_other_result ($$) { # future-proofing my ($bref, $ctx) = @_; - my ($qsp, $logref) = delete @$ctx{qw(-qsp -logref)}; - if (my $err = $qsp->{err}) { - utf8::decode($$err); - $$logref .= "git show error: $err"; - return html_page($ctx, 500, $logref); + if (my $qsp_err = delete $ctx->{-qsp_err}) { + return html_page($ctx, 500, dbg_log($ctx) . + "git show error:$qsp_err"); } my $l = PublicInbox::Linkify->new; - utf8::decode($$bref); - $$bref = '<pre>'. $l->to_html($$bref); - $$bref .= '</pre><hr>' . $$logref; - html_page($ctx, 200, $bref); + utf8_maybe($$bref); + html_page($ctx, 200, '<pre>', $l->to_html($$bref), '</pre><hr>', + dbg_log($ctx)); } -sub show_other ($$$$) { - my ($ctx, $res, $logref, $fn) = @_; - my ($git, $oid, $type, $size) = @$res; - if ($size > $MAX_SIZE) { - $$logref = "$oid is too big to show\n" . $$logref; - return html_page($ctx, 200, $logref); +sub cmt_title { # git->cat_async callback + my ($bref, $oid, $type, $size, $ctx_cb) = @_; + utf8_maybe($$bref); + my $title = $$bref =~ /\r?\n\r?\n([^\r\n]+)\r?\n?/ ? $1 : ''; + # $ctx_cb is [ $ctx, $cmt_fin ] + push @{$ctx_cb->[0]->{-cmt_pt}}, ascii_html($title); +} + +sub do_cat_async { + my ($arg, $cb, @req) = @_; + # favor git(1) over Gcf2 (libgit2) for SHA-256 support + my $ctx = ref $arg eq 'ARRAY' ? $arg->[0] : $arg; + $ctx->{git}->cat_async($_, $cb, $arg) for @req; + if ($ctx->{env}->{'pi-httpd.async'}) { + $ctx->{git}->watch_async; + } else { # synchronous, generic PSGI + $ctx->{git}->cat_async_wait; + } +} + +sub do_check_async { + my ($ctx, $cb, @req) = @_; + if ($ctx->{env}->{'pi-httpd.async'}) { + async_check($ctx, $_, $cb, $ctx) for @req; + } else { # synchronous, generic PSGI + $ctx->{git}->check_async($_, $cb, $ctx) for @req; + $ctx->{git}->check_async_wait; + } +} + +sub cmt_hdr_prep { # psgi_qx cb + my ($fh, $ctx, $cmt_fin) = @_; + return if $ctx->{-qsp_err_h}; # let cmt_fin handle it + seek $fh, 0, SEEK_SET; + my $buf = do { local $/ = "\0"; <$fh> } // die "readline: $!"; + chop($buf) eq "\0" or die 'no NUL in git show -z output'; + utf8_maybe($buf); # non-UTF-8 commits exist + chomp $buf; + (my $P, my $p, @{$ctx->{cmt_info}}) = split(/\n/, $buf, 9); + truncate $fh, 0; + return unless $P; + seek $fh, 0, SEEK_SET; + my $qsp_p = PublicInbox::Qspawn->new($ctx->{git}->cmd(qw(show + --encoding=UTF-8 --pretty=format:%n -M --stat -p), $ctx->{oid}), + undef, { 1 => $fh }); + $qsp_p->{qsp_err} = \($ctx->{-qsp_err_p} = ''); + $qsp_p->psgi_qx($ctx->{env}, undef, \&cmt_patch_prep, $ctx, $cmt_fin); + @{$ctx->{-cmt_P}} = split / /, $P; + @{$ctx->{-cmt_p}} = split / /, $p; # abbreviated + do_cat_async([$ctx, $cmt_fin], \&cmt_title, @{$ctx->{-cmt_P}}); +} + +sub read_patchid { # psgi_qx cb + my ($bref, $ctx, $cmt_fin) = @_; + my ($patchid) = split(/ /, $$bref); # ignore commit + $ctx->{-q_value_html} = "patchid:$patchid" if defined $patchid; +} + +sub cmt_patch_prep { # psgi_qx cb + my ($fh, $ctx, $cmt_fin) = @_; + return if $ctx->{-qsp_err_p}; # let cmt_fin handle error + return if -s $fh > $MAX_SIZE; # too big to show, too big to patch-id + seek $fh, 0, SEEK_SET; + my $qsp = PublicInbox::Qspawn->new( + $ctx->{git}->cmd(qw(patch-id --stable)), + undef, { 0 => $fh }); + $qsp->{qsp_err} = \$ctx->{-qsp_err_p}; + $qsp->psgi_qx($ctx->{env}, undef, \&read_patchid, $ctx, $cmt_fin); +} + +sub ibx_url_for { + my ($ctx) = @_; + $ctx->{ibx} and return; # fall back to $upfx + $ctx->{git} or die 'BUG: no {git}'; + if (my $ALL = $ctx->{www}->{pi_cfg}->ALL) { + if (defined(my $u = $ALL->base_url($ctx->{env}))) { + return wantarray ? ($u) : $u; + } + } + my @ret; + if (my $ibx_names = $ctx->{git}->{ibx_names}) { + my $by_name = $ctx->{www}->{pi_cfg}->{-by_name}; + for my $name (@$ibx_names) { + my $ibx = $by_name->{$name} // do { + warn "inbox `$name' no longer exists\n"; + next; + }; + $ibx->isrch // next; + my $u = defined($ibx->{url}) ? + prurl($ctx->{env}, $ibx->{url}) : $name; + $u .= '/' if substr($u, -1) ne '/'; + push @ret, $u; + } + } + wantarray ? (@ret) : $ret[0]; +} + +sub cmt_fin { # OnDestroy cb + my ($ctx) = @_; + my ($eh, $ep) = delete @$ctx{qw(-qsp_err_h -qsp_err_p)}; + if ($eh || $ep) { + my $e = join(' - ', grep defined, $eh, $ep); + return html_page($ctx, 500, dbg_log($ctx) . + "git show/patch-id error:$e"); } - my $cmd = ['git', "--git-dir=$git->{git_dir}", - qw(show --encoding=UTF-8 --no-color --no-abbrev), $oid ]; + $ctx->{-linkify} //= PublicInbox::Linkify->new; + my $upfx = $ctx->{-upfx} = '../../'; # from "/$INBOX/$OID/s/" + my ($H, $T, $s, $f, $au, $co, $bdy) = @{delete $ctx->{cmt_info}}; + # try to keep author and committer dates lined up + my $x = length($au) - length($co); + if ($x > 0) { + $x = ' ' x $x; + $co =~ s/>/>$x/; + } elsif ($x < 0) { + $x = ' ' x (-$x); + $au =~ s/>/>$x/; + } + $_ = ascii_html($_) for ($au, $co); + my $ibx_url = ibx_url_for($ctx) // $upfx; + $au =~ s!(> +)([0-9]{4,}-\S+ \S+)! + my ($gt, $t) = ($1, $2); + $t =~ tr/ :-//d; + qq($gt<a +href="$ibx_url?t=$t" +title="list contemporary emails">$2</a>) + !e; + + $ctx->{-title_html} = $s = $ctx->{-linkify}->to_html($s); + my ($P, $p, $pt) = delete @$ctx{qw(-cmt_P -cmt_p -cmt_pt)}; + $_ = qq(<a href="$upfx$_/s/">).shift(@$p).'</a> '.shift(@$pt) for @$P; + if (@$P == 1) { + $x = qq{ (<a +href="$f.patch">patch</a>)\n <a href=#parent>parent</a> $P->[0]}; + } elsif (@$P > 1) { + $x = qq(\n <a href=#parents>parents</a> $P->[0]\n); + shift @$P; + $x .= qq( $_\n) for @$P; + chop $x; + } else { + $x = ' (<a href=#root_commit>root commit</a>)'; + } + PublicInbox::WwwStream::html_init($ctx); + my $zfh = $ctx->zfh; + print $zfh <<EOM; +<pre> <a href=#commit>commit</a> $H$x + <a href=#tree>tree</a> <a href="$upfx$T/s/?b=">$T</a> + author $au +committer $co + +<b>$s</b> +EOM + print $zfh "\n", $ctx->{-linkify}->to_html($bdy) if length($bdy); + undef $bdy; # free memory + my $fh = delete $ctx->{patch_fh}; + if (-s $fh > $MAX_SIZE) { + print $zfh '</pre><hr><pre>patch is too large to show</pre>'; + } else { # prepare flush_diff: + seek $fh, 0, SEEK_SET; + PublicInbox::IO::read_all $fh, -s _, \$x; + utf8_maybe($x); + $ctx->{-apfx} = $ctx->{-spfx} = $upfx; + $x =~ s/\r?\n/\n/gs; + $ctx->{-anchors} = {} if $x =~ /^diff --git /sm; + flush_diff($ctx, \$x); # undefs $x + # TODO: should there be another textarea which attempts to + # search for the exact email which was applied to make this + # commit? + if (my $qry = delete $ctx->{-qry}) { + my $q = ''; + for (@{$qry->{dfpost}}, @{$qry->{dfpre}}) { + # keep blobs as short as reasonable, emails + # are going to be older than what's in git + substr($_, 7, 64, ''); + $q .= "dfblob:$_ "; + } + chop $q; # no trailing SP + local $Text::Wrap::columns = PublicInbox::View::COLS; + local $Text::Wrap::huge = 'overflow'; + $q = wrap('', '', $q); + my $rows = ($q =~ tr/\n/\n/) + 1; + $q = ascii_html($q); + my $ibx_url = ibx_url_for($ctx); + my $alt; + if (defined $ibx_url) { + $alt = " `$ibx_url'"; + $ibx_url =~ m!://! or + substr($ibx_url, 0, 0, '../../../'); + $ibx_url = ascii_html($ibx_url); + } else { + $ibx_url = $upfx; + $alt = ''; + } + print $zfh <<EOM; +</pre><hr><form action="$ibx_url" +id=related><pre>find related emails, including ancestors/descendants/conflicts +<textarea name=q cols=${\PublicInbox::View::COLS} rows=$rows>$q</textarea> +<input type=submit value="search$alt" +/>\t(<a href="${ibx_url}_/text/help/">help</a>)</pre></form> +EOM + } + } + chop($x = <<EOM); +<hr><pre>glossary +-------- +<dfn +id=commit>Commit</dfn> objects reference one tree, and zero or more parents. + +Single <dfn +id=parent>parent</dfn> commits can typically generate a patch in +unified diff format via `git format-patch'. + +Multiple <dfn id=parents>parents</dfn> means the commit is a merge. + +<dfn id=root_commit>Root commits</dfn> have no ancestor. Note that it is +possible to have multiple root commits when merging independent histories. + +Every commit references one top-level <dfn id=tree>tree</dfn> object.</pre> +EOM + delete($ctx->{-wcb})->($ctx->html_done($x)); +} + +sub stream_patch_parse_hdr { # {parse_hdr} for Qspawn + my ($r, $bref, $ctx) = @_; + if (!defined $r) { # sysread error + html_page($ctx, 500, dbg_log($ctx)); + } elsif (index($$bref, "\n\n") >= 0) { + my $eml = bless { hdr => $bref }, 'PublicInbox::Eml'; + my $fn = to_filename($eml->header('Subject') // ''); + $fn = substr($fn // 'PATCH-no-subject', 6); # drop "PATCH-" + return [ 200, [ 'Content-Type', 'text/plain; charset=UTF-8', + 'Content-Disposition', + qq(inline; filename=$fn.patch) ] ]; + } elsif ($r == 0) { + my $log = dbg_log($ctx); + warn "premature EOF on $ctx->{patch_oid} $log"; + return html_page($ctx, 500, $log); + } else { + undef; # bref keeps growing until "\n\n" + } +} + +sub show_patch ($$) { + my ($ctx, $res) = @_; + my ($git, $oid) = @$res; + my $cmd = $git->cmd(qw(format-patch -1 --stdout -C), + "--signature=git format-patch -1 --stdout -C $oid", $oid); my $qsp = PublicInbox::Qspawn->new($cmd); - my $env = $ctx->{env}; - $ctx->{-qsp} = $qsp; - $ctx->{-logref} = $logref; - $qsp->psgi_qx($env, undef, \&show_other_result, $ctx); + $ctx->{env}->{'qspawn.wcb'} = $ctx->{-wcb}; + $ctx->{patch_oid} = $oid; + $qsp->psgi_yield($ctx->{env}, undef, \&stream_patch_parse_hdr, $ctx); } -# user_cb for SolverGit, called as: user_cb->($result_or_error, $uarg) -sub solve_result { - my ($res, $ctx) = @_; - my ($log, $hints, $fn) = delete @$ctx{qw(log hints fn)}; +sub show_commit ($$) { + my ($ctx, $res) = @_; + return show_patch($ctx, $res) if ($ctx->{fn} // '') =~ /\.patch\z/; + my ($git, $oid) = @$res; + # patch-id needs two passes, and we use the initial show to ensure + # a patch embedded inside the commit message body doesn't get fed + # to patch-id: + open $ctx->{patch_fh}, '+>', "$ctx->{-tmp}/show"; + my $qsp_h = PublicInbox::Qspawn->new($git->cmd('show', $SHOW_FMT, + qw(--encoding=UTF-8 -z --no-notes --no-patch), $oid), + undef, { 1 => $ctx->{patch_fh} }); + $qsp_h->{qsp_err} = \($ctx->{-qsp_err_h} = ''); + my $cmt_fin = on_destroy \&cmt_fin, $ctx; + $ctx->{git} = $git; + $ctx->{oid} = $oid; + $qsp_h->psgi_qx($ctx->{env}, undef, \&cmt_hdr_prep, $ctx, $cmt_fin); +} - unless (seek($log, 0, 0)) { - $ctx->{env}->{'psgi.errors'}->print("seek(log): $!\n"); - return html_page($ctx, 500, \'seek error'); +sub show_other ($$) { # just in case... + my ($ctx, $res) = @_; + my ($git, $oid, $type, $size) = @$res; + $size > $MAX_SIZE and return html_page($ctx, 200, + ascii_html($type)." $oid is too big to show\n". dbg_log($ctx)); + my $cmd = $git->cmd(qw(show --encoding=UTF-8 + --no-color --no-abbrev), $oid); + my $qsp = PublicInbox::Qspawn->new($cmd); + $qsp->{qsp_err} = \($ctx->{-qsp_err} = ''); + $qsp->psgi_qx($ctx->{env}, undef, \&show_other_result, $ctx); +} + +sub show_tree_result ($$) { + my ($bref, $ctx) = @_; + if (my $qsp_err = delete $ctx->{-qsp_err}) { + return html_page($ctx, 500, dbg_log($ctx) . + "git ls-tree -z error:$qsp_err"); } - $log = do { local $/; <$log> }; + my @ent = split(/\0/, $$bref); + my $qp = delete $ctx->{qp}; + my $l = $ctx->{-linkify} //= PublicInbox::Linkify->new; + my $pfx = $ctx->{-path} // $qp->{b}; # {-path} is from RepoTree + $$bref = "<pre><a href=#tree>tree</a> $ctx->{tree_oid}"; + # $REPO/tree/$path already sets {-upfx} + my $upfx = $ctx->{-upfx} //= '../../'; + if (defined $pfx) { + $pfx =~ s!/+\z!!s; + if (my $t = $ctx->{-obj}) { + my $t = ascii_html($t); + $$bref .= <<EOM +\n\$ git ls-tree -l $t # shows similar output on the CLI +EOM + } elsif ($pfx eq '') { + $$bref .= " (root)\n"; + } else { + my $x = ascii_html($pfx); + $pfx .= '/'; + $$bref .= qq( <a href=#path>path</a>: $x</a>\n); + } + } else { + $pfx = ''; + $$bref .= qq[ (<a href=#path>path</a> unknown)\n]; + } + my ($x, $m, $t, $oid, $sz, $f, $n, $gitlink); + $$bref .= "\n size name"; + for (@ent) { + ($x, $f) = split(/\t/, $_, 2); + undef $_; + ($m, $t, $oid, $sz) = split(/ +/, $x, 4); + $m = $GIT_MODE{$m} // '?'; + utf8_maybe($f); + $n = ascii_html($f); + if ($m eq 'g') { # gitlink submodule commit + $$bref .= "\ng\t\t$n @ <a\nhref=#g>commit</a>$oid"; + $gitlink = 1; + next; + } + my $q = 'b='.ascii_html(uri_escape_path($pfx.$f)); + if ($m eq 'd') { $n .= '/' } + elsif ($m eq 'x') { $n = "<b>$n</b>" } + elsif ($m eq 'l') { $n = "<i>$n</i>" } + $$bref .= qq(\n$m\t$sz\t<a\nhref="$upfx$oid/s/?$q">$n</a>); + } + $$bref .= dbg_log($ctx); + $$bref .= <<EOM; +<hr><pre>glossary +-------- +<dfn +id=tree>Tree</dfn> objects belong to commits or other tree objects. Trees may +reference blobs, sub-trees, or (rarely) commits of submodules. + +<dfn +id=path>Path</dfn> names are stored in tree objects, but trees do not know +their own path name. A tree's path name comes from their parent tree, +or it is the root tree referenced by a commit object. Thus, this web UI +relies on the `b=' URI parameter as a hint to display the path name. +EOM + + $$bref .= <<EOM if $gitlink; - my $ref = ref($res); +<dfn title="submodule commit" +id=g>Commit</dfn> objects may be stored in trees to reference submodules.</pre> +EOM + chop $$bref; + html_page($ctx, 200, $$bref); +} + +sub show_tree ($$) { # also used by RepoTree + my ($ctx, $res) = @_; + my ($git, $oid, undef, $size) = @$res; + $size > $MAX_SIZE and return html_page($ctx, 200, + "tree $oid is too big to show\n". dbg_log($ctx)); + my $cmd = $git->cmd(qw(ls-tree -z -l --no-abbrev), $oid); + my $qsp = PublicInbox::Qspawn->new($cmd); + $ctx->{tree_oid} = $oid; + $qsp->{qsp_err} = \($ctx->{-qsp_err} = ''); + $qsp->psgi_qx($ctx->{env}, undef, \&show_tree_result, $ctx); +} + +# returns seconds offset from git TZ offset +sub tz_adj ($) { + my ($tz) = @_; # e.g "-0700" + $tz = int($tz); + my $mm = $tz < 0 ? -$tz : $tz; + $mm = int($mm / 100) * 60 + ($mm % 100); + $mm = $tz < 0 ? -$mm : $mm; + ($mm * 60); +} + +sub show_tag_result { # git->cat_async callback + my ($bref, $oid, $type, $size, $ctx) = @_; + utf8_maybe($$bref); my $l = PublicInbox::Linkify->new; - $log = '<pre>debug log:</pre><hr /><pre>' . - $l->to_html($log) . '</pre>'; + $$bref = $l->to_html($$bref); + $$bref =~ s!^object ([a-f0-9]+)!object <a +href=../../$1/s/>$1</a>!; + + $$bref =~ s/^(tagger .*> )([0-9]+) ([\-+]?[0-9]+)/$1.strftime( + '%Y-%m-%d %H:%M:%S', gmtime($2 + tz_adj($3)))." $3"/sme; + # TODO: download link + html_page($ctx, 200, '<pre>', $$bref, '</pre>', dbg_log($ctx)); +} - $res or return html_page($ctx, 404, \$log); - $ref eq 'ARRAY' or return html_page($ctx, 500, \$log); +sub show_tag ($$) { + my ($ctx, $res) = @_; + my ($git, $oid) = @$res; + $ctx->{git} = $git; + do_cat_async($ctx, \&show_tag_result, $oid); +} + +# user_cb for SolverGit, called as: user_cb->($result_or_error, $uarg) +sub solve_result { + my ($res, $ctx) = @_; + my $hints = delete $ctx->{hints}; + $res or return html_page($ctx, 404, 'Not found', dbg_log($ctx)); + ref($res) eq 'ARRAY' or + return html_page($ctx, 500, 'Internal error', dbg_log($ctx)); my ($git, $oid, $type, $size, $di) = @$res; - return show_other($ctx, $res, \$log, $fn) if $type ne 'blob'; - my $path = to_filename($di->{path_b} // $hints->{path_b} // 'blob'); - my $raw_link = "(<a\nhref=$path>raw</a>)"; + return show_commit($ctx, $res) if $type eq 'commit'; + return show_tree($ctx, $res) if $type eq 'tree'; + return show_tag($ctx, $res) if $type eq 'tag'; + return show_other($ctx, $res) if $type ne 'blob'; + my $fn = $di->{path_b} // $hints->{path_b}; + my $paths = $ctx->{-paths} //= do { + my $path = to_filename($fn // 'blob') // 'blob'; + my $raw_more = qq[(<a\nhref="$path">raw</a>)]; + my @def; + + # XXX not sure if this is the correct wording + if (defined($fn)) { + $raw_more .= qq( +name: ${\ascii_html($fn)} \t # note: path name is non-authoritative<a +href="#pathdef" id=top>(*)</a>); + $def[0] = "<hr><pre\nid=pathdef>" . +'(*) Git path names are given by the tree(s) the blob belongs to. + Blobs themselves have no identifier aside from the hash of its contents.'. +qq(<a\nhref="#top">^</a></pre>); + } + [ $path, $raw_more, @def ]; + }; + $ctx->{-q_value_html} //= do { + my $s = defined($fn) ? 'dfn:'.ascii_html($fn).' ' : ''; + $s.'dfpost:'.substr($oid, 0, 7); + }; + if ($size > $MAX_SIZE) { - return stream_large_blob($ctx, $res, \$log, $fn) if defined $fn; - $log = "<pre><b>Too big to show, download available</b>\n" . - "$oid $type $size bytes $raw_link</pre>" . $log; - return html_page($ctx, 200, \$log); + return stream_large_blob($ctx, $res) if defined $ctx->{fn}; + return html_page($ctx, 200, <<EOM . dbg_log($ctx)); +<pre><b>Too big to show, download available</b> +blob $oid $size bytes $paths->[1]</pre> +EOM } + bless $ctx, 'PublicInbox::WwwStream'; # for DESTROY + $ctx->{git} = $git; + do_cat_async($ctx, \&show_blob, $oid); +} - my $blob = $git->cat_file($oid); - if (!$blob) { # WTF? +sub show_blob { # git->cat_async callback + my ($blob, $oid, $type, $size, $ctx) = @_; + if (!$blob) { my $e = "Failed to retrieve generated blob ($oid)"; - $ctx->{env}->{'psgi.errors'}->print("$e ($git->{git_dir})\n"); - $log = "<pre><b>$e</b></pre>" . $log; - return html_page($ctx, 500, \$log); + warn "$e ($ctx->{git}->{git_dir}) type=$type"; + return html_page($ctx, 500, "<pre><b>$e</b></pre>".dbg_log($ctx)) } my $bin = index(substr($$blob, 0, $BIN_DETECT), "\0") >= 0; - if (defined $fn) { + if (defined $ctx->{fn}) { my $h = [ 'Content-Length', $size, 'Content-Type' ]; push(@$h, ($bin ? 'application/octet-stream' : 'text/plain')); return delete($ctx->{-wcb})->([200, $h, [ $$blob ]]); } - if ($bin) { - $log = "<pre>$oid $type $size bytes (binary)" . - " $raw_link</pre>" . $log; - return html_page($ctx, 200, \$log); - } + my ($path, $raw_more, @def) = @{delete $ctx->{-paths}}; + $bin and return html_page($ctx, 200, + "<pre>blob $oid $size bytes (binary)" . + " $raw_more</pre>".dbg_log($ctx)); # TODO: detect + convert to ensure validity - utf8::decode($$blob); + utf8_maybe($$blob); my $nl = ($$blob =~ s/\r?\n/\n/sg); my $pad = length($nl); - $l->linkify_1($$blob); + ($ctx->{-linkify} //= PublicInbox::Linkify->new)->linkify_1($$blob); my $ok = $hl->do_hl($blob, $path) if $hl; if ($ok) { $blob = $ok; @@ -171,38 +606,63 @@ sub solve_result { } # using some of the same CSS class names and ids as cgit - $log = "<pre>$oid $type $size bytes $raw_link</pre>" . + my $x = "<pre>blob $oid $size bytes $raw_more</pre>" . "<hr /><table\nclass=blob>". - "<tr><td\nclass=linenumbers><pre>" . join('', map { - sprintf("<a id=n$_ href=#n$_>% ${pad}u</a>\n", $_) - } (1..$nl)) . '</pre></td>' . - '<td><pre> </pre></td>'. # pad for non-CSS users - "<td\nclass=lines><pre\nstyle='white-space:pre'><code>" . - $l->linkify_2($$blob) . - '</code></pre></td></tr></table>' . $log; + "<tr><td\nclass=linenumbers><pre>"; + # scratchpad in this loop is faster here than `printf $zfh': + $x .= sprintf("<a id=n$_ href=#n$_>% ${pad}u</a>\n", $_) for (1..$nl); + $x .= '</pre></td><td><pre> </pre></td>'. # pad for non-CSS users + "<td\nclass=lines><pre\nstyle='white-space:pre'><code>"; + html_page($ctx, 200, $x, $ctx->{-linkify}->linkify_2($$blob), + '</code></pre></td></tr></table>'.dbg_log($ctx), @def); +} - html_page($ctx, 200, \$log); +sub start_solver ($) { + my ($ctx) = @_; + while (my ($from, $to) = each %QP_MAP) { + my $v = $ctx->{qp}->{$from} // next; + $ctx->{hints}->{$to} = $v if $v ne ''; + } + $ctx->{-next_solver} = on_destroy \&next_solver; + ++$solver_nr; + $ctx->{-tmp} = File::Temp->newdir("solver.$ctx->{oid_b}-XXXX", + TMPDIR => 1); + $ctx->{lh} or open $ctx->{lh}, '+>>', "$ctx->{-tmp}/solve.log"; + my $solver = PublicInbox::SolverGit->new($ctx->{ibx}, + \&solve_result, $ctx); + $solver->{limiter} = $solver_lim; + $solver->{gits} //= [ $ctx->{git} ]; + $solver->{tmp} = $ctx->{-tmp}; # share tmpdir + # PSGI server will call this immediately and give us a callback (-wcb) + $solver->solve(@$ctx{qw(env lh oid_b hints)}); +} + +# run the next solver job when done and DESTROY-ed +sub next_solver { + --$solver_nr; + # XXX FIXME: client may've disconnected if it waited a long while + start_solver(shift(@solver_q) // return); +} + +sub may_start_solver ($) { + my ($ctx) = @_; + $solver_lim //= $ctx->{www}->{pi_cfg}->limiter('codeblob'); + if ($solver_nr >= $solver_lim->{max}) { + @solver_q > 128 ? html_page($ctx, 503, 'too busy') + : push(@solver_q, $ctx); + } else { + start_solver($ctx); + } } # GET /$INBOX/$GIT_OBJECT_ID/s/ # GET /$INBOX/$GIT_OBJECT_ID/s/$FILENAME sub show ($$;$) { my ($ctx, $oid_b, $fn) = @_; - my $qp = $ctx->{qp}; - my $hints = $ctx->{hints} = {}; - while (my ($from, $to) = each %QP_MAP) { - defined(my $v = $qp->{$from}) or next; - $hints->{$to} = $v if $v ne ''; - } - - $ctx->{'log'} = tmpfile("solve.$oid_b"); - $ctx->{fn} = $fn; - my $solver = PublicInbox::SolverGit->new($ctx->{-inbox}, - \&solve_result, $ctx); - # PSGI server will call this immediately and give us a callback (-wcb) + @$ctx{qw(oid_b fn)} = ($oid_b, $fn); sub { $ctx->{-wcb} = $_[0]; # HTTP write callback - $solver->solve($ctx->{env}, $ctx->{log}, $oid_b, $hints); + may_start_solver $ctx; }; } diff --git a/lib/PublicInbox/WQBlocked.pm b/lib/PublicInbox/WQBlocked.pm new file mode 100644 index 00000000..8d931fa9 --- /dev/null +++ b/lib/PublicInbox/WQBlocked.pm @@ -0,0 +1,48 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# non-blocking workqueues, currently used by LeiNoteEvent to track renames +package PublicInbox::WQBlocked; +use v5.12; +use parent qw(PublicInbox::DS); +use PublicInbox::Syscall qw(EPOLLOUT EPOLLONESHOT); +use PublicInbox::IPC; +use Carp (); + +sub new { + my ($cls, $wq, $buf) = @_; + my $self = bless { msgq => [$buf], }, $cls; + $wq->{wqb} = $self->SUPER::new($wq->{-wq_s1}, EPOLLOUT|EPOLLONESHOT); +} + +sub flush_send { + my ($self) = @_; + push(@{$self->{msgq}}, $_[1]) if defined($_[1]); + while (defined(my $buf = shift @{$self->{msgq}})) { + if (ref($buf) eq 'CODE') { + $buf->($self); # could be \&PublicInbox::DS::close + } else { + my $wq_s1 = $self->{sock}; + my $n = $PublicInbox::IPC::send_cmd->($wq_s1, [], $buf, + 0); + next if defined($n); + Carp::croak("sendmsg: $!") unless $!{EAGAIN}; + PublicInbox::DS::epwait($wq_s1, EPOLLOUT|EPOLLONESHOT); + unshift @{$self->{msgq}}, $buf; + last; # wait for ->event_step + } + } +} + +sub enq_close { flush_send($_[0], $_[0]->can('close')) } + +sub event_step { # called on EPOLLOUT wakeup + my ($self) = @_; + eval { flush_send($self) } if $self->{sock}; + if ($@) { + warn $@; + $self->close; + } +} + +1; diff --git a/lib/PublicInbox/WQWorker.pm b/lib/PublicInbox/WQWorker.pm new file mode 100644 index 00000000..950bd170 --- /dev/null +++ b/lib/PublicInbox/WQWorker.pm @@ -0,0 +1,33 @@ +# Copyright (C) 2021 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# for PublicInbox::IPC wq_* (work queue) workers +package PublicInbox::WQWorker; +use strict; +use v5.10.1; +use parent qw(PublicInbox::DS); +use PublicInbox::Syscall qw(EPOLLIN EPOLLEXCLUSIVE); +use Errno qw(EAGAIN ECONNRESET); +use IO::Handle (); # blocking + +sub new { + my ($cls, $wq, $sock) = @_; + $sock->blocking(0); + my $self = bless { sock => $sock, wq => $wq }, $cls; + $self->SUPER::new($sock, EPOLLEXCLUSIVE|EPOLLIN); + $self; +} + +sub event_step { + my ($self) = @_; + my $n = $self->{wq}->recv_and_run($self->{sock}) and return; + unless (defined $n) { + return if $! == EAGAIN; + warn "recvmsg: $!" if $! != ECONNRESET; + } + $self->{sock} == $self->{wq}->{-wq_s2} and + $self->{wq}->wq_atexit_child; + $self->close; # PublicInbox::DS::close +} + +1; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index e3b589cb..289599b8 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Main web interface for mailing list archives @@ -11,11 +11,10 @@ # - Must not rely on static content # - UTF-8 is only for user-content, 7-bit US-ASCII for us package PublicInbox::WWW; -use 5.010_001; use strict; -use warnings; -use bytes (); # only for bytes::length +use v5.10.1; use PublicInbox::Config; +use PublicInbox::Git; use PublicInbox::Hval; use URI::Escape qw(uri_unescape); use PublicInbox::MID qw(mid_escape); @@ -25,16 +24,15 @@ use PublicInbox::WwwStatic qw(r path_info_raw); use PublicInbox::Eml; # TODO: consider a routing tree now that we have more endpoints: -our $INBOX_RE = qr!\A/([\w\-][\w\.\-]*)!; +our $INBOX_RE = qr!\A/([\w\-][\w\.\-\+]*)!; our $MID_RE = qr!([^/]+)!; -our $END_RE = qr!(T/|t/|t\.mbox(?:\.gz)?|t\.atom|raw|)!; +our $END_RE = qr!(T/|t/|d/|t\.mbox(?:\.gz)?|t\.atom|raw|)!; our $ATTACH_RE = qr!([0-9][0-9\.]*)-($PublicInbox::Hval::FN)!; our $OID_RE = qr![a-f0-9]{7,}!; sub new { - my ($class, $pi_config) = @_; - $pi_config ||= PublicInbox::Config->new; - bless { pi_config => $pi_config }, $class; + my ($class, $pi_cfg) = @_; + bless { pi_cfg => $pi_cfg // PublicInbox::Config->new }, $class; } # backwards compatibility, do not use @@ -48,15 +46,21 @@ sub call { my $ctx = { env => $env, www => $self }; # we don't care about multi-value - %{$ctx->{qp}} = map { - utf8::decode($_); - tr/+/ /; - my ($k, $v) = split('=', $_, 2); - $v = uri_unescape($v // ''); - # none of the keys we care about will need escaping - $k => $v; - } split(/[&;]+/, $env->{QUERY_STRING}); - + # '0' isn't a QUERY_STRING we care about + if (my $qs = $env->{QUERY_STRING}) { + utf8::decode($qs); + $qs =~ tr/+/ /; + %{$ctx->{qp}} = map { + # we only use single-char query param keys + if (s/\A([A-Za-z])=//) { + $1 => uri_unescape($_) + } elsif (/\A[a-z]\z/) { # some boolean options + $_ => '' + } else { + () # ignored + } + } split(/[&;]+/, $qs); + } my $path_info = path_info_raw($env); my $method = $env->{REQUEST_METHOD}; @@ -68,7 +72,13 @@ sub call { serve_git($ctx, $epoch, $path); } elsif ($path_info =~ m!$INBOX_RE/(\w+)\.sql\.gz\z!o) { return get_altid_dump($ctx, $1, $2); - } elsif ($path_info =~ m!$INBOX_RE/!o) { + } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/$ATTACH_RE\z!o) { + my ($idx, $fn) = ($3, $4); + return invalid_inbox_mid($ctx, $1, $2) || + get_attach($ctx, $idx, $fn); + } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/\z!o) { + return invalid_inbox_mid($ctx, $1, $2) || mbox_results($ctx); + } elsif ($path_info =~ m!$INBOX_RE/\z!o) { return invalid_inbox($ctx, $1) || mbox_results($ctx); } } @@ -91,6 +101,9 @@ sub call { invalid_inbox($ctx, $1) || get_atom($ctx); } elsif ($path_info =~ m!$INBOX_RE/new\.html\z!o) { invalid_inbox($ctx, $1) || get_new($ctx); + } elsif ($path_info =~ + m!$INBOX_RE/topics_(new|active)\.(atom|html)\z!o) { + get_topics($ctx, $1, $2, $3); } elsif ($path_info =~ m!$INBOX_RE/description\z!o) { get_description($ctx, $1); } elsif ($path_info =~ m!$INBOX_RE/(?:(?:git/)?([0-9]+)(?:\.git)?/)? @@ -135,7 +148,8 @@ sub call { # convenience redirects order matters } elsif ($path_info =~ m!$INBOX_RE/([^/]{2,})\z!o) { r301($ctx, $1, $2); - + } elsif ($path_info =~ m!\A/\+/([a-zA-Z0-9_\-\.]+)\.css\z!) { + get_css($ctx, undef, $1); # for WwwListing } else { legacy_redirects($ctx, $path_info); } @@ -169,25 +183,18 @@ sub preload { eval "require PublicInbox::$_;"; } if (ref($self)) { - my $pi_config = $self->{pi_config}; - if (defined($pi_config->{'publicinbox.cgitrc'})) { - $pi_config->limiter('-cgit'); + my $pi_cfg = $self->{pi_cfg}; + if (defined($pi_cfg->{'publicinbox.cgitrc'})) { + $pi_cfg->limiter('-cgit'); } + $pi_cfg->ALL and require PublicInbox::Isearch; $self->cgit; + $self->coderepo; $self->stylesheets_prepare($_) for ('', '../', '../../'); $self->news_www; - $pi_config->each_inbox(\&preload_inbox); } } -sub preload_inbox { - my $ibx = shift; - $ibx->altid_map; - $ibx->cloneurl; - $ibx->description; - $ibx->base_url; -} - # private functions below sub r404 { @@ -201,18 +208,29 @@ sub r404 { sub news_cgit_fallback ($) { my ($ctx) = @_; - my $www = $ctx->{www}; - my $env = $ctx->{env}; - my $res = $www->news_www->call($env); - $res->[0] == 404 ? $www->cgit->call($env) : $res; + my $res = $ctx->{www}->news_www->call($ctx->{env}); + + $res->[0] == 404 and ($ctx->{www}->{cgit_fallback} //= do { + my $c = $ctx->{www}->{pi_cfg}->{'publicinbox.cgit'} // 'first'; + $c ne 'first' # `fallback' and `rewrite' => true + } // 0) and $res = $ctx->{www}->coderepo->srv($ctx); + + ref($res) eq 'ARRAY' && $res->[0] == 404 and + $res = $ctx->{www}->cgit->call($ctx->{env}, $ctx); + + ref($res) eq 'ARRAY' && $res->[0] == 404 && + !$ctx->{www}->{cgit_fallback} and + $res = $ctx->{www}->coderepo->srv($ctx); + $res; } # returns undef if valid, array ref response if invalid sub invalid_inbox ($$) { my ($ctx, $inbox) = @_; - my $ibx = $ctx->{www}->{pi_config}->lookup_name($inbox); + my $ibx = $ctx->{www}->{pi_cfg}->lookup_name($inbox) // + $ctx->{www}->{pi_cfg}->lookup_ei($inbox); if (defined $ibx) { - $ctx->{-inbox} = $ibx; + $ctx->{ibx} = $ibx; return; } @@ -230,11 +248,11 @@ sub invalid_inbox_mid { return $ret if $ret; my $mid = $ctx->{mid} = uri_unescape($mid_ue); - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; if ($mid =~ m!\A([a-f0-9]{2})([a-f0-9]{38})\z!) { my ($x2, $x38) = ($1, $2); # this is horrifically wasteful for legacy URLs: - my $str = $ctx->{-inbox}->msg_by_path("$x2/$x38") or return; + my $str = $ctx->{ibx}->msg_by_path("$x2/$x38") or return; my $s = PublicInbox::Eml->new($str); $mid = PublicInbox::MID::mid_clean($s->header_raw('Message-ID')); return r301($ctx, $inbox, mid_escape($mid)); @@ -256,6 +274,13 @@ sub get_new { PublicInbox::Feed::new_html($ctx); } +# /$INBOX/topics_(new|active).(html|atom) +sub get_topics { + my ($ctx, $ibx_name, $category, $type) = @_; + require PublicInbox::WwwTopics; + PublicInbox::WwwTopics::response($ctx, $ibx_name, $category, $type); +} + # /$INBOX/?r=$GIT_COMMIT -> HTML only sub get_index { my ($ctx) = @_; @@ -272,7 +297,7 @@ sub get_index { sub get_mid_txt { my ($ctx) = @_; require PublicInbox::Mbox; - PublicInbox::Mbox::emit_raw($ctx) || r404($ctx); + PublicInbox::Mbox::emit_raw($ctx) || r(404); } # /$INBOX/$MESSAGE_ID/ -> HTML content (short quotes) @@ -285,7 +310,7 @@ sub get_mid_html { # /$INBOX/$MESSAGE_ID/t/ sub get_thread { my ($ctx, $flat) = @_; - $ctx->{-inbox}->over or return need($ctx, 'Overview'); + $ctx->{ibx}->over or return need($ctx, 'Overview'); $ctx->{flat} = $flat; require PublicInbox::View; PublicInbox::View::thread_html($ctx); @@ -310,6 +335,7 @@ sub get_vcs_object ($$$;$) { my ($ctx, $inbox, $oid, $filename) = @_; my $r404 = invalid_inbox($ctx, $inbox); return $r404 if $r404; + return r(404) if !$ctx->{www}->{pi_cfg}->repo_objs($ctx->{ibx}); require PublicInbox::ViewVCS; PublicInbox::ViewVCS::show($ctx, $oid, $filename); } @@ -323,11 +349,12 @@ sub get_altid_dump { } sub need { - my ($ctx, $extra) = @_; + my ($ctx, $extra, $upref) = @_; require PublicInbox::WwwStream; - PublicInbox::WwwStream::html_oneshot($ctx, 501, \<<EOF); + $upref //= '../'; + PublicInbox::WwwStream::html_oneshot($ctx, 501, <<EOF); <pre>$extra is not available for this public-inbox -<a\nhref="../">Return to index</a></pre> +<a\nhref="$upref">Return to index</a></pre> EOF } @@ -338,7 +365,7 @@ EOF # especially on older systems. Stick to zlib since that's what git uses. sub get_thread_mbox { my ($ctx, $sfx) = @_; - my $over = $ctx->{-inbox}->over or return need($ctx, 'Overview'); + my $over = $ctx->{ibx}->over or return need($ctx, 'Overview'); require PublicInbox::Mbox; PublicInbox::Mbox::thread_mbox($ctx, $over, $sfx); } @@ -347,7 +374,7 @@ sub get_thread_mbox { # /$INBOX/$MESSAGE_ID/t.atom -> thread as Atom feed sub get_thread_atom { my ($ctx) = @_; - $ctx->{-inbox}->over or return need($ctx, 'Overview'); + $ctx->{ibx}->over or return need($ctx, 'Overview'); require PublicInbox::Feed; PublicInbox::Feed::generate_thread_atom($ctx); } @@ -412,11 +439,11 @@ sub legacy_redirects { sub r301 { my ($ctx, $inbox, $mid_ue, $suffix) = @_; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; unless ($ibx) { my $r404 = invalid_inbox($ctx, $inbox); return $r404 if $r404; - $ibx = $ctx->{-inbox}; + $ibx = $ctx->{ibx}; } my $url = $ibx->base_url($ctx->{env}); my $qs = $ctx->{env}->{QUERY_STRING}; @@ -447,13 +474,17 @@ sub msg_page { # legacy, but no redirect for compatibility: 'f/' eq $e and return get_mid_html($ctx); + if ($e eq 'd/') { + require PublicInbox::View; + return PublicInbox::View::diff_msg($ctx); + } r404($ctx); } sub serve_git { my ($ctx, $epoch, $path) = @_; my $env = $ctx->{env}; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $git = defined $epoch ? $ibx->git_epoch($epoch) : $ibx->git; $git ? PublicInbox::GitHTTPBackend::serve($env, $git, $path) : r404(); } @@ -461,7 +492,7 @@ sub serve_git { sub mbox_results { my ($ctx) = @_; if ($ctx->{env}->{QUERY_STRING} =~ /(?:\A|[&;])q=/) { - $ctx->{-inbox}->search or return need($ctx, 'search'); + $ctx->{ibx}->isrch or return need($ctx, 'search'); require PublicInbox::SearchView; return PublicInbox::SearchView::mbox_results($ctx); } @@ -478,24 +509,29 @@ sub serve_mbox_range { sub news_www { my ($self) = @_; - $self->{news_www} ||= do { + $self->{news_www} //= do { require PublicInbox::NewsWWW; - PublicInbox::NewsWWW->new($self->{pi_config}); + PublicInbox::NewsWWW->new($self->{pi_cfg}); } } sub cgit { my ($self) = @_; - $self->{cgit} ||= do { - my $pi_config = $self->{pi_config}; - - if (defined($pi_config->{'publicinbox.cgitrc'})) { + $self->{cgit} //= + (defined($self->{pi_cfg}->{'publicinbox.cgitrc'}) ? do { require PublicInbox::Cgit; - PublicInbox::Cgit->new($pi_config); - } else { + PublicInbox::Cgit->new($self->{pi_cfg}); + } : undef) // do { require Plack::Util; Plack::Util::inline_object(call => sub { r404() }); - } + }; +} + +sub coderepo { + my ($self) = @_; + $self->{coderepo} //= do { + require PublicInbox::WwwCoderepo; + PublicInbox::WwwCoderepo->new($self->{pi_cfg}); } } @@ -505,7 +541,7 @@ sub get_inbox_manifest ($$$) { my $r404 = invalid_inbox($ctx, $inbox); return $r404 if $r404; require PublicInbox::ManifestJsGz; - PublicInbox::ManifestJsGz->response($ctx); + PublicInbox::ManifestJsGz::per_inbox($ctx); } sub get_attach { @@ -537,7 +573,7 @@ sub stylesheets_prepare ($$) { } || sub { $_[0] }; my $css_map = {}; - my $stylesheets = $self->{pi_config}->{css} || []; + my $stylesheets = $self->{pi_cfg}->{css} || []; my $links = []; my $inline_ok = 1; @@ -564,9 +600,9 @@ sub stylesheets_prepare ($$) { next; }; my $ctime = 0; - my $local = do { local $/; <$fh> }; + my $local = PublicInbox::IO::read_all $fh; # sets _ if ($local =~ /\S/) { - $ctime = sprintf('%x',(stat($fh))[10]); + $ctime = sprintf('%x',(stat(_))[10]); $local = $mini->($local); } @@ -628,24 +664,25 @@ sub style { }; } -# /$INBOX/$KEY.css endpoint +# /$INBOX/$KEY.css and /+/$KEY.css endpoints # CSS is configured globally for all inboxes, but we access them on # a per-inbox basis. This allows administrators to setup per-inbox # static routes to intercept the request before it hits PSGI +# inbox == undef => top-level WwwListing sub get_css ($$$) { my ($ctx, $inbox, $key) = @_; - my $r404 = invalid_inbox($ctx, $inbox); + my $r404 = defined($inbox) ? invalid_inbox($ctx, $inbox) : undef; return $r404 if $r404; my $self = $ctx->{www}; - my $css_map = $self->{-css_map} || stylesheets_prepare($self, ''); + my $css_map = $self->{-css_map} || + stylesheets_prepare($self, defined($inbox) ? '' : '+/'); my $css = $css_map->{$key}; - if (!defined($css) && $key eq 'userContent') { + if (!defined($css) && defined($inbox) && $key eq 'userContent') { my $env = $ctx->{env}; - $css = PublicInbox::UserContent::sample($ctx->{-inbox}, $env); + $css = PublicInbox::UserContent::sample($ctx->{ibx}, $env); } defined $css or return r404(); - my $h = [ 'Content-Length', bytes::length($css), - 'Content-Type', 'text/css' ]; + my $h = [ 'Content-Length', length($css), 'Content-Type', 'text/css' ]; PublicInbox::GitHTTPBackend::cache_one_year($h); [ 200, $h, [ $css ] ]; } @@ -653,10 +690,20 @@ sub get_css ($$$) { sub get_description { my ($ctx, $inbox) = @_; invalid_inbox($ctx, $inbox) || do { - my $d = $ctx->{-inbox}->description . "\n"; - [ 200, [ 'Content-Length', bytes::length($d), + my $d = $ctx->{ibx}->description . "\n"; + utf8::encode($d); + [ 200, [ 'Content-Length', length($d), 'Content-Type', 'text/plain' ], [ $d ] ]; }; } +sub event_step { # called via requeue + my ($self) = @_; + # gzf = PublicInbox::GzipFilter == $ctx + my $gzf = shift(@{$self->{-low_prio_q}}) // return; + PublicInbox::DS::requeue($self) if scalar(@{$self->{-low_prio_q}}); + my $http = $gzf->{env}->{'psgix.io'}; # PublicInbox::HTTP + $http->next_step($gzf->can('async_next')); +} + 1; diff --git a/lib/PublicInbox/WWW.pod b/lib/PublicInbox/WWW.pod index 30fe602d..b55f010e 100644 --- a/lib/PublicInbox/WWW.pod +++ b/lib/PublicInbox/WWW.pod @@ -20,13 +20,14 @@ The PSGI web interface for public-inbox. Using this directly is not needed unless you wish to customize your public-inbox PSGI deployment or are using a PSGI server -other than L<public-inbox-httpd(1)>. +other than L<public-inbox-netd(1)> (C<-netd>) / +L<public-inbox-httpd(1)> (C<-httpd>) -While this PSGI application works with all PSGI/Plack web +While this PSGI application should work with all PSGI/Plack web servers such as L<starman(1)>, L<starlet(1)> or L<twiggy(1)>; -PublicInbox::WWW takes advantage of currently-undocumented APIs -of L<public-inbox-httpd(1)> to improve fairness when serving -large responses for thread views and git clones. +PublicInbox::WWW takes advantage of internal APIs of C<-netd> +and C<-httpd> to improve fairness when serving large responses +for thread views, mbox downloads, and git clones. =head1 ENVIRONMENT @@ -43,14 +44,15 @@ Used to override the default "~/.public-inbox/config" value. Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> The mail archives are hosted at L<https://public-inbox.org/meta/> -and L<http://hjrcffqmbrq6wope.onion/meta/> +and L<http://4uok3hntl7oi7b4uf4rtfwefqeexfzil2w6kgk2jn5z2f764irre7byd.onion/meta/> =head1 COPYRIGHT -Copyright (C) 2016-2020 all contributors L<mailto:meta@public-inbox.org> +Copyright (C) all contributors L<mailto:meta@public-inbox.org> License: AGPL-3.0+ L<http://www.gnu.org/licenses/agpl-3.0.txt> =head1 SEE ALSO -L<http://plackperl.org/>, L<Plack>, L<public-inbox-httpd(1)> +L<http://plackperl.org/>, L<Plack>, L<public-inbox-netd(1)>, +L<public-inbox-httpd(1)> diff --git a/lib/PublicInbox/Watch.pm b/lib/PublicInbox/Watch.pm index 8bbce929..eb90d353 100644 --- a/lib/PublicInbox/Watch.pm +++ b/lib/PublicInbox/Watch.pm @@ -1,21 +1,22 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # ref: https://cr.yp.to/proto/maildir.html -# http://wiki2.dovecot.org/MailboxFormat/Maildir +# https://wiki2.dovecot.org/MailboxFormat/Maildir package PublicInbox::Watch; use strict; use v5.10.1; use PublicInbox::Eml; -use PublicInbox::InboxWritable qw(eml_from_path warn_ignore_cb); +use PublicInbox::InboxWritable qw(eml_from_path); +use PublicInbox::MdirReader; +use PublicInbox::NetReader; use PublicInbox::Filter::Base qw(REJECT); use PublicInbox::Spamcheck; -use PublicInbox::Sigfd; -use PublicInbox::DS qw(now); +use PublicInbox::DS qw(now add_timer awaitpid); use PublicInbox::MID qw(mids); use PublicInbox::ContentHash qw(content_hash); -use PublicInbox::EOFpipe; use POSIX qw(_exit WNOHANG); +use constant { D_MAILDIR => 1, D_MH => 2 }; sub compile_watchheaders ($) { my ($ibx) = @_; @@ -40,82 +41,118 @@ sub compile_watchheaders ($) { $ibx->{-watchheaders} = $watch_hdrs if scalar @$watch_hdrs; } +sub d_type_set ($$$) { + my ($d_type, $dir, $is) = @_; + my $isnt = D_MAILDIR; + if ($is == D_MAILDIR) { + $isnt = D_MH; + $d_type->{"$dir/cur"} |= $is; + $d_type->{"$dir/new"} |= $is; + } + warn <<EOM if ($d_type->{$dir} |= $is) & $isnt; +W: `$dir' is both Maildir and MH (non-fatal) +EOM +} + sub new { - my ($class, $config) = @_; - my (%mdmap, $spamc); + my ($class, $cfg) = @_; + my (%d_map, %d_type); my (%imap, %nntp); # url => [inbox objects] or 'watchspam' + my (@imap, @nntp); + PublicInbox::Import::load_config($cfg); # "publicinboxwatch" is the documented namespace # "publicinboxlearn" is legacy but may be supported # indefinitely... foreach my $pfx (qw(publicinboxwatch publicinboxlearn)) { my $k = "$pfx.watchspam"; - defined(my $dirs = $config->{$k}) or next; - $dirs = PublicInbox::Config::_array($dirs); + my $dirs = $cfg->get_all($k) // next; for my $dir (@$dirs) { - my $url; + my $uri; if (is_maildir($dir)) { # skip "new", no MUA has seen it, yet. - $mdmap{"$dir/cur"} = 'watchspam'; - } elsif ($url = imap_url($dir)) { - $imap{$url} = 'watchspam'; - } elsif ($url = nntp_url($dir)) { - $nntp{$url} = 'watchspam'; + $d_map{"$dir/cur"} = 'watchspam'; + d_type_set \%d_type, $dir, D_MAILDIR; + } elsif (is_mh($dir)) { + $d_map{$dir} = 'watchspam'; + d_type_set \%d_type, $dir, D_MH; + } elsif ($uri = imap_uri($dir)) { + $imap{$$uri} = 'watchspam'; + push @imap, $uri; + } elsif ($uri = nntp_uri($dir)) { + $nntp{$$uri} = 'watchspam'; + push @nntp, $uri; } else { warn "unsupported $k=$dir\n"; } } } - my $k = 'publicinboxwatch.spamcheck'; my $default = undef; - my $spamcheck = PublicInbox::Spamcheck::get($config, $k, $default); + my $spamcheck = PublicInbox::Spamcheck::get($cfg, $k, $default); $spamcheck = _spamcheck_cb($spamcheck) if $spamcheck; - $config->each_inbox(sub { + $cfg->each_inbox(sub { # need to make all inboxes writable for spam removal: my $ibx = $_[0] = PublicInbox::InboxWritable->new($_[0]); my $watches = $ibx->{watch} or return; + + $ibx->{indexlevel} //= $ibx->detect_indexlevel; $watches = PublicInbox::Config::_array($watches); for my $watch (@$watches) { - my $url; - if (is_maildir($watch)) { + my $uri; + my $bool = $cfg->git_bool($watch); + if (defined $bool && !$bool) { + $ibx->{-watch_disabled} = 1; + } elsif (is_maildir($watch)) { compile_watchheaders($ibx); my ($new, $cur) = ("$watch/new", "$watch/cur"); - my $cur_dst = $mdmap{$cur} //= []; + my $cur_dst = $d_map{$cur} //= []; return if is_watchspam($cur, $cur_dst, $ibx); - push @{$mdmap{$new} //= []}, $ibx; + push @{$d_map{$new} //= []}, $ibx; push @$cur_dst, $ibx; - } elsif ($url = imap_url($watch)) { - return if is_watchspam($url, $imap{$url}, $ibx); + d_type_set \%d_type, $watch, D_MAILDIR; + } elsif (is_mh($watch)) { + my $cur_dst = $d_map{$watch} //= []; + return if is_watchspam($watch, $cur_dst, $ibx); compile_watchheaders($ibx); - push @{$imap{$url} ||= []}, $ibx; - } elsif ($url = nntp_url($watch)) { - return if is_watchspam($url, $nntp{$url}, $ibx); + push(@$cur_dst, $ibx); + d_type_set \%d_type, $watch, D_MH; + } elsif ($uri = imap_uri($watch)) { + my $cur_dst = $imap{$$uri} //= []; + return if is_watchspam($uri, $cur_dst, $ibx); compile_watchheaders($ibx); - push @{$nntp{$url} ||= []}, $ibx; + push(@imap, $uri) if 1 == push(@$cur_dst, $ibx); + } elsif ($uri = nntp_uri($watch)) { + my $cur_dst = $nntp{$$uri} //= []; + return if is_watchspam($uri, $cur_dst, $ibx); + compile_watchheaders($ibx); + push(@nntp, $uri) if 1 == push(@$cur_dst, $ibx); } else { warn "watch unsupported: $k=$watch\n"; } } }); - my $mdre; - if (scalar keys %mdmap) { - $mdre = join('|', map { quotemeta($_) } keys %mdmap); - $mdre = qr!\A($mdre)/!; + my $d_re; + if (scalar keys %d_map) { + $d_re = join('|', map quotemeta, keys %d_map); + $d_re = qr!\A($d_re)/!; } - return unless $mdre || scalar(keys %imap) || scalar(keys %nntp); + return unless $d_re || scalar(keys %imap) || scalar(keys %nntp); bless { max_batch => 10, # avoid hogging locks for too long spamcheck => $spamcheck, - mdmap => \%mdmap, - mdre => $mdre, - config => $config, + d_map => \%d_map, + d_re => $d_re, + d_type => \%d_type, + pi_cfg => $cfg, imap => scalar keys %imap ? \%imap : undef, nntp => scalar keys %nntp? \%nntp : undef, + imap_order => scalar(@imap) ? \@imap : undef, + nntp_order => scalar(@nntp) ? \@nntp: undef, importers => {}, opendirs => {}, # dirname => dirhandle (in progress scans) ops => [], # 'quit', 'full' @@ -134,6 +171,7 @@ sub _done_for_now { sub remove_eml_i { # each_inbox callback my ($ibx, $self, $eml, $loc) = @_; + return if $ibx->{-watch_disabled}; eval { # try to avoid taking a lock or unnecessary spawning @@ -174,8 +212,8 @@ sub _remove_spam { # path must be marked as (S)een $path =~ /:2,[A-R]*S[T-Za-z]*\z/ or return; my $eml = eml_from_path($path) or return; - local $SIG{__WARN__} = warn_ignore_cb(); - $self->{config}->each_inbox(\&remove_eml_i, $self, $eml, $path); + local $SIG{__WARN__} = PublicInbox::Eml::warn_ignore_cb(); + $self->{pi_cfg}->each_inbox(\&remove_eml_i, $self, $eml, $path); } sub import_eml ($$$) { @@ -207,17 +245,24 @@ sub import_eml ($$$) { sub _try_path { my ($self, $path) = @_; - return unless PublicInbox::InboxWritable::is_maildir_path($path); - if ($path !~ $self->{mdre}) { - warn "unrecognized path: $path\n"; - return; - } - my $inboxes = $self->{mdmap}->{$1}; - unless ($inboxes) { - warn "unmappable dir: $1\n"; - return; - } - my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; + $path =~ $self->{d_re} or + return warn("BUG? unrecognized path: $path\n"); + my $dir = $1; + my $inboxes = $self->{d_map}->{$dir} // + return warn("W: unmappable dir: $dir\n"); + my ($md_fl, $mh_seq); + if ($self->{d_type}->{$dir} & D_MH) { + $path =~ m!/([0-9]+)\z! ? ($mh_seq = $1) : return; + } + $self->{d_type}->{$dir} & D_MAILDIR and + $md_fl = PublicInbox::MdirReader::maildir_path_flags($path); + $md_fl // $mh_seq // return; + return if ($md_fl // '') =~ /[DT]/; # no Drafts or Trash + # n.b. none of the MH keywords are relevant for public mail, + # mh_seq is only used to validate we're reading an email + # and not treating .mh_sequences as an email + + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; local $SIG{__WARN__} = sub { my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : ''; $warn_cb->($pfx, "path: $path\n", @_); @@ -236,25 +281,24 @@ sub quit_done ($) { return unless $self->{quit}; # don't have reliable wakeups, keep signalling - my $done = 1; - for (qw(idle_pids poll_pids)) { - my $pids = $self->{$_} or next; - for (keys %$pids) { - $done = undef if kill('QUIT', $_); - } - } - $done; + my $live = grep { kill('QUIT', $_) } keys %{$self->{pids}}; + add_timer(0.01, \&quit_done, $self) if $live; + $live == 0; } -sub quit { +sub quit { # may be called in IMAP/NNTP children my ($self) = @_; $self->{quit} = 1; %{$self->{opendirs}} = (); _done_for_now($self); quit_done($self); - if (my $idle_mic = $self->{idle_mic}) { + if (my $dir_idle = delete $self->{dir_idle}) { + $dir_idle->close if $dir_idle; + } + if (my $idle_mic = delete $self->{idle_mic}) { # IMAP child + return unless $idle_mic->IsConnected && $idle_mic->Socket; eval { $idle_mic->done }; - if ($@) { + if ($@ && $idle_mic->IsConnected && $idle_mic->Socket) { warn "IDLE DONE error: $@\n"; eval { $idle_mic->disconnect }; warn "IDLE LOGOUT error: $@\n" if $@; @@ -274,266 +318,58 @@ sub watch_fs_init ($) { }; require PublicInbox::DirIdle; # inotify_create + EPOLL_CTL_ADD - PublicInbox::DirIdle->new([keys %{$self->{mdmap}}], $cb); -} - -# avoid exposing deprecated "snews" to users. -my %SCHEME_MAP = ('snews' => 'nntps'); - -sub uri_scheme ($) { - my ($uri) = @_; - my $scheme = $uri->scheme; - $SCHEME_MAP{$scheme} // $scheme; -} - -# returns the git config section name, e.g [imap "imaps://user@example.com"] -# without the mailbox, so we can share connections between different inboxes -sub uri_section ($) { - my ($uri) = @_; - uri_scheme($uri) . '://' . $uri->authority; -} - -sub cfg_intvl ($$$) { - my ($cfg, $key, $url) = @_; - my $v = $cfg->urlmatch($key, $url) // return; - $v =~ /\A[0-9]+(?:\.[0-9]+)?\z/s and return $v + 0; - if (ref($v) eq 'ARRAY') { - $v = join(', ', @$v); - warn "W: $key has multiple values: $v\nW: $key ignored\n"; - } else { - warn "W: $key=$v is not a numeric value in seconds\n"; - } -} - -sub cfg_bool ($$$) { - my ($cfg, $key, $url) = @_; - my $orig = $cfg->urlmatch($key, $url) // return; - my $bool = $cfg->git_bool($orig); - warn "W: $key=$orig for $url is not boolean\n" unless defined($bool); - $bool; -} - -# flesh out common IMAP-specific data structures -sub imap_common_init ($) { - my ($self) = @_; - my $cfg = $self->{config}; - my $mic_args = {}; # scheme://authority => Mail:IMAPClient arg - for my $url (sort keys %{$self->{imap}}) { - my $uri = PublicInbox::URIimap->new($url); - my $sec = uri_section($uri); - for my $k (qw(Starttls Debug Compress)) { - my $bool = cfg_bool($cfg, "imap.$k", $url) // next; - $mic_args->{$sec}->{$k} = $bool; - } - my $to = cfg_intvl($cfg, 'imap.timeout', $url); - $mic_args->{$sec}->{Timeout} = $to if $to; - for my $k (qw(pollInterval idleInterval)) { - $to = cfg_intvl($cfg, "imap.$k", $url) // next; - $self->{imap_opt}->{$sec}->{$k} = $to; - } - my $k = 'imap.fetchBatchSize'; - my $bs = $cfg->urlmatch($k, $url) // next; - if ($bs =~ /\A([0-9]+)\z/) { - $self->{imap_opt}->{$sec}->{batch_size} = $bs; - } else { - warn "$k=$bs is not an integer\n"; - } - } - $mic_args; -} - -sub auth_anon_cb { '' }; # for Mail::IMAPClient::Authcallback - -sub mic_for ($$$) { # mic = Mail::IMAPClient - my ($self, $url, $mic_args) = @_; - my $uri = PublicInbox::URIimap->new($url); - require PublicInbox::GitCredential; - my $cred = bless { - url => $url, - protocol => $uri->scheme, - host => $uri->host, - username => $uri->user, - password => $uri->password, - }, 'PublicInbox::GitCredential'; - my $common = $mic_args->{uri_section($uri)} // {}; - # IMAPClient and Net::Netrc both mishandles `0', so we pass `127.0.0.1' - my $host = $cred->{host}; - $host = '127.0.0.1' if $host eq '0'; - my $mic_arg = { - Port => $uri->port, - Server => $host, - Ssl => $uri->scheme eq 'imaps', - Keepalive => 1, # SO_KEEPALIVE - %$common, # may set Starttls, Compress, Debug .... - }; - my $mic = PublicInbox::IMAPClient->new(%$mic_arg) or - die "E: <$url> new: $@\n"; - - # default to using STARTTLS if it's available, but allow - # it to be disabled since I usually connect to localhost - if (!$mic_arg->{Ssl} && !defined($mic_arg->{Starttls}) && - $mic->has_capability('STARTTLS') && - $mic->can('starttls')) { - $mic->starttls or die "E: <$url> STARTTLS: $@\n"; - } - - # do we even need credentials? - if (!defined($cred->{username}) && - $mic->has_capability('AUTH=ANONYMOUS')) { - $cred = undef; - } - if ($cred) { - $cred->check_netrc unless defined $cred->{password}; - $cred->fill; # may prompt user here - $mic->User($mic_arg->{User} = $cred->{username}); - $mic->Password($mic_arg->{Password} = $cred->{password}); - } else { # AUTH=ANONYMOUS - $mic->Authmechanism($mic_arg->{Authmechanism} = 'ANONYMOUS'); - $mic->Authcallback($mic_arg->{Authcallback} = \&auth_anon_cb); - } - if ($mic->login && $mic->IsAuthenticated) { - # success! keep IMAPClient->new arg in case we get disconnected - $self->{mic_arg}->{uri_section($uri)} = $mic_arg; - } else { - warn "E: <$url> LOGIN: $@\n"; - $mic = undef; - } - $cred->run($mic ? 'approve' : 'reject') if $cred; - $mic; + my $dir_idle = $self->{dir_idle} = PublicInbox::DirIdle->new($cb); + $dir_idle->add_watches([keys %{$self->{d_map}}]); } -sub imap_import_msg ($$$$$) { - my ($self, $url, $uid, $raw, $flags) = @_; - # our target audience expects LF-only, save storage - $$raw =~ s/\r\n/\n/sg; - - my $inboxes = $self->{imap}->{$url}; +sub net_cb { # NetReader::(nntp|imap)_each callback + my ($uri, $art, $kw, $eml, $self, $inboxes) = @_; + return if grep(/\Adraft\z/, @$kw); + local $self->{cur_uid} = $art; # IMAP UID or NNTP article if (ref($inboxes)) { - for my $ibx (@$inboxes) { - my $eml = PublicInbox::Eml->new($$raw); - import_eml($self, $ibx, $eml); + my @ibx = @$inboxes; + my $last = pop @ibx; + for my $ibx (@ibx) { + my $tmp = PublicInbox::Eml->new(\($eml->as_string)); + import_eml($self, $ibx, $tmp); } + import_eml($self, $last, $eml); } elsif ($inboxes eq 'watchspam') { - # we don't remove unseen messages - if ($flags =~ /\\Seen\b/) { - local $SIG{__WARN__} = warn_ignore_cb(); - my $eml = PublicInbox::Eml->new($raw); - $self->{config}->each_inbox(\&remove_eml_i, - $self, $eml, "$url UID:$uid"); + if ($uri->scheme =~ /\Aimaps?\z/ && !grep(/\Aseen\z/, @$kw)) { + return; } + $self->{pi_cfg}->each_inbox(\&remove_eml_i, + $self, $eml, "$uri #$art"); } else { die "BUG: destination unknown $inboxes"; } } -sub imap_fetch_all ($$$) { - my ($self, $mic, $url) = @_; - my $uri = PublicInbox::URIimap->new($url); - my $sec = uri_section($uri); - my $mbx = $uri->mailbox; - $mic->Clear(1); # trim results history - $mic->examine($mbx) or return "E: EXAMINE $mbx ($sec) failed: $!"; - my ($r_uidval, $r_uidnext); - for ($mic->Results) { - /^\* OK \[UIDVALIDITY ([0-9]+)\].*/ and $r_uidval = $1; - /^\* OK \[UIDNEXT ([0-9]+)\].*/ and $r_uidnext = $1; - last if $r_uidval && $r_uidnext; - } - $r_uidval //= $mic->uidvalidity($mbx) // - return "E: $url cannot get UIDVALIDITY"; - $r_uidnext //= $mic->uidnext($mbx) // - return "E: $url cannot get UIDNEXT"; - my $itrk = PublicInbox::IMAPTracker->new($url); - my ($l_uidval, $l_uid) = $itrk->get_last; - $l_uidval //= $r_uidval; # first time - $l_uid //= 1; - if ($l_uidval != $r_uidval) { - return "E: $url UIDVALIDITY mismatch\n". - "E: local=$l_uidval != remote=$r_uidval"; - } - my $r_uid = $r_uidnext - 1; - if ($l_uid != 1 && $l_uid > $r_uid) { - return "E: $url local UID exceeds remote ($l_uid > $r_uid)\n". - "E: $url strangely, UIDVALIDLITY matches ($l_uidval)\n"; - } - return if $l_uid >= $r_uid; # nothing to do - - warn "I: $url fetching UID $l_uid:$r_uid\n"; - $mic->Uid(1); # the default, we hope - my $bs = $self->{imap_opt}->{$sec}->{batch_size} // 1; - my $req = $mic->imap4rev1 ? 'BODY.PEEK[]' : 'RFC822.PEEK'; - - # TODO: FLAGS may be useful for personal use - my $key = $req; - $key =~ s/\.PEEK//; - my ($uids, $batch); - my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; +sub imap_fetch_all ($$) { + my ($self, $uri) = @_; + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; + $self->{incremental} = 1; + $self->{on_commit} = [ \&_done_for_now, $self ]; + local $self->{cur_uid}; local $SIG{__WARN__} = sub { - my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : ''; - $batch //= '?'; - $warn_cb->("$pfx$url UID:$batch\n", @_); + my $pfx = ($_[0] // '') =~ /^([A-Z]: |# )/g ? $1 : ''; + my $uid = $self->{cur_uid}; + $warn_cb->("$pfx$uri", $uid ? (" UID:$uid") : (), "\n", @_); }; - my $err; - do { - # I wish "UID FETCH $START:*" could work, but: - # 1) servers do not need to return results in any order - # 2) Mail::IMAPClient doesn't offer a streaming API - $uids = $mic->search("UID $l_uid:*") or - return "E: $url UID SEARCH $l_uid:* error: $!"; - return if scalar(@$uids) == 0; - - # RFC 3501 doesn't seem to indicate order of UID SEARCH - # responses, so sort it ourselves. Order matters so - # IMAPTracker can store the newest UID. - @$uids = sort { $a <=> $b } @$uids; - - # Did we actually get new messages? - return if $uids->[0] < $l_uid; - - $l_uid = $uids->[-1] + 1; # for next search - my $last_uid; - my $n = $self->{max_batch}; - - while (scalar @$uids) { - if (--$n < 0) { - _done_for_now($self); - $itrk->update_last($r_uidval, $last_uid); - $n = $self->{max_batch}; - } - my @batch = splice(@$uids, 0, $bs); - $batch = join(',', @batch); - local $0 = "UID:$batch $mbx $sec"; - my $r = $mic->fetch_hash($batch, $req, 'FLAGS'); - unless ($r) { # network error? - $err = "E: $url UID FETCH $batch error: $!"; - last; - } - for my $uid (@batch) { - # messages get deleted, so holes appear - my $per_uid = delete $r->{$uid} // next; - my $raw = delete($per_uid->{$key}) // next; - my $fl = $per_uid->{FLAGS} // ''; - imap_import_msg($self, $url, $uid, \$raw, $fl); - $last_uid = $uid; - last if $self->{quit}; - } - last if $self->{quit}; - } - _done_for_now($self); - $itrk->update_last($r_uidval, $last_uid); - } until ($err || $self->{quit}); - $err; + PublicInbox::NetReader::imap_each($self, $uri, \&net_cb, $self, + $self->{imap}->{$$uri}); } sub imap_idle_once ($$$$) { - my ($self, $mic, $intvl, $url) = @_; + my ($self, $mic, $intvl, $uri) = @_; my $i = $intvl //= (29 * 60); my $end = now() + $intvl; - warn "I: $url idling for ${intvl}s\n"; + warn "# $uri idling for ${intvl}s\n"; local $0 = "IDLE $0"; + return if $self->{quit}; unless ($mic->idle) { return if $self->{quit}; - return "E: IDLE failed on $url: $!"; + return "E: IDLE failed on $uri: $!"; } $self->{idle_mic} = $mic; # for ->quit my @res; @@ -544,27 +380,30 @@ sub imap_idle_once ($$$$) { } delete $self->{idle_mic}; unless ($self->{quit}) { - $mic->IsConnected or return "E: IDLE disconnected on $url"; - $mic->done or return "E: IDLE DONE failed on $url: $!"; + $mic->IsConnected or return "E: IDLE disconnected on $uri"; + $mic->done or return "E: IDLE DONE failed on $uri: $!"; } undef; } # idles on a single URI sub watch_imap_idle_1 ($$$) { - my ($self, $url, $intvl) = @_; - my $uri = PublicInbox::URIimap->new($url); + my ($self, $uri, $intvl) = @_; my $sec = uri_section($uri); - my $mic_arg = $self->{mic_arg}->{$sec} or + my $mic_arg = $self->{net_arg}->{$sec} or die "BUG: no Mail::IMAPClient->new arg for $sec"; my $mic; local $0 = $uri->mailbox." $sec"; until ($self->{quit}) { - $mic //= PublicInbox::IMAPClient->new(%$mic_arg); + $mic //= PublicInbox::NetReader::mic_new( + $self, $mic_arg, $sec, $uri); my $err; if ($mic && $mic->IsConnected) { - $err = imap_fetch_all($self, $mic, $url); - $err //= imap_idle_once($self, $mic, $intvl, $url); + local $self->{mics_cached}->{$sec} = $mic; + my $m = imap_fetch_all($self, $uri); + $m == $mic or die "BUG: wrong mic"; + $mic->IsConnected and + $err = imap_idle_once($self, $mic, $intvl, $uri) } else { $err = "E: not connected: $!"; } @@ -578,63 +417,42 @@ sub watch_imap_idle_1 ($$$) { sub watch_atfork_child ($) { my ($self) = @_; - delete $self->{idle_pids}; - delete $self->{poll_pids}; - delete $self->{opendirs}; - PublicInbox::DS->Reset; - %SIG = (%SIG, %{$self->{sig}}, CHLD => 'DEFAULT'); - PublicInbox::Sigfd::sig_setmask($self->{oldset}); + delete @$self{qw(dir_idle pids opendirs)}; + my $sig = delete $self->{sig}; + $sig->{CHLD} = $sig->{HUP} = $sig->{USR1} = 'DEFAULT'; + # TERM/QUIT/INT call ->quit, which works in both parent+child + @SIG{keys %$sig} = values %$sig; + PublicInbox::DS::sig_setmask(PublicInbox::DS::allowset($sig)); } -sub watch_atfork_parent ($) { - my ($self) = @_; - _done_for_now($self); - PublicInbox::Sigfd::block_signals(); -} +sub watch_atfork_parent ($) { _done_for_now($_[0]) } -sub imap_idle_requeue ($) { # DS::add_timer callback - my ($self, $url_intvl) = @{$_[0]}; +sub imap_idle_requeue { # DS::add_timer callback + my ($self, $uri, $intvl) = @_; return if $self->{quit}; - push @{$self->{idle_todo}}, $url_intvl; + push @{$self->{idle_todo}}, $uri, $intvl; event_step($self); } -sub imap_idle_reap { # PublicInbox::DS::dwaitpid callback - my ($self, $pid) = @_; - my $url_intvl = delete $self->{idle_pids}->{$pid} or - die "BUG: PID=$pid (unknown) reaped: \$?=$?\n"; - - my ($url, $intvl) = @$url_intvl; +sub imap_idle_reap { # awaitpid callback + my ($pid, $self, $uri, $intvl) = @_; + delete $self->{pids}->{$pid}; return if $self->{quit}; - warn "W: PID=$pid on $url died: \$?=$?\n" if $?; - PublicInbox::DS::add_timer(60, - \&imap_idle_requeue, [ $self, $url_intvl ]); + warn "W: PID=$pid on $uri died: \$?=$?\n" if $?; + add_timer(60, \&imap_idle_requeue, $self, $uri, $intvl); } -sub reap { # callback for EOFpipe - my ($pid, $cb, $self) = @{$_[0]}; - my $ret = waitpid($pid, 0); - if ($ret == $pid) { - $cb->($self, $pid); # poll_fetch_reap || imap_idle_reap - } else { - warn "W: waitpid($pid) => ", $ret // "($!)", "\n"; - } -} - -sub imap_idle_fork ($$) { - my ($self, $url_intvl) = @_; - my ($url, $intvl) = @$url_intvl; - pipe(my ($r, $w)) or die "pipe: $!"; - defined(my $pid = fork) or die "fork: $!"; +sub imap_idle_fork { + my ($self, $uri, $intvl) = @_; + return if $self->{quit}; + my $pid = PublicInbox::DS::fork_persist; if ($pid == 0) { - close $r; watch_atfork_child($self); - watch_imap_idle_1($self, $url, $intvl); - close $w; + watch_imap_idle_1($self, $uri, $intvl); _exit(0); } - $self->{idle_pids}->{$pid} = $url_intvl; - PublicInbox::EOFpipe->new($r, \&reap, [$pid, \&imap_idle_reap, $self]); + $self->{pids}->{$pid} = undef; + awaitpid($pid, \&imap_idle_reap, $self, $uri, $intvl); } sub event_step { @@ -642,128 +460,87 @@ sub event_step { return if $self->{quit}; my $idle_todo = $self->{idle_todo}; if ($idle_todo && @$idle_todo) { - my $oldset = watch_atfork_parent($self); + watch_atfork_parent($self); eval { - while (my $url_intvl = shift(@$idle_todo)) { - imap_idle_fork($self, $url_intvl); + while (my ($uri, $intvl) = splice(@$idle_todo, 0, 2)) { + imap_idle_fork($self, $uri, $intvl); } }; - PublicInbox::Sigfd::sig_setmask($oldset); die $@ if $@; } - fs_scan_step($self) if $self->{mdre}; + fs_scan_step($self) if $self->{d_re}; } sub watch_imap_fetch_all ($$) { - my ($self, $urls) = @_; - for my $url (@$urls) { - my $uri = PublicInbox::URIimap->new($url); - my $sec = uri_section($uri); - my $mic_arg = $self->{mic_arg}->{$sec} or - die "BUG: no Mail::IMAPClient->new arg for $sec"; - my $mic = PublicInbox::IMAPClient->new(%$mic_arg) or next; - my $err = imap_fetch_all($self, $mic, $url); + my ($self, $uris) = @_; + for my $uri (@$uris) { + imap_fetch_all($self, $uri); last if $self->{quit}; - warn $err, "\n" if $err; } } sub watch_nntp_fetch_all ($$) { - my ($self, $urls) = @_; - for my $url (@$urls) { - my $uri = uri_new($url); - my $sec = uri_section($uri); - my $nn_arg = $self->{nn_arg}->{$sec} or - die "BUG: no Net::NNTP->new arg for $sec"; - my $nntp_opt = $self->{nntp_opt}->{$sec}; - my $nn = nn_new($nn_arg, $nntp_opt, $url); - unless ($nn) { - warn "E: $url: \$!=$!\n"; - next; - } - last if $self->{quit}; - if (my $postconn = $nntp_opt->{-postconn}) { - for my $m_arg (@$postconn) { - my ($method, @args) = @$m_arg; - $nn->$method(@args) and next; - warn "E: <$url> $method failed\n"; - $nn = undef; - last; - } - } + my ($self, $uris) = @_; + $self->{incremental} = 1; + $self->{on_commit} = [ \&_done_for_now, $self ]; + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; + local $self->{cur_uid}; + my $uri = ''; + local $SIG{__WARN__} = sub { + my $pfx = ($_[0] // '') =~ /^([A-Z]: |# )/g ? $1 : ''; + my $art = $self->{cur_uid}; + $warn_cb->("$pfx$uri", $art ? (" ARTICLE $art") : (), "\n", @_); + }; + for $uri (@$uris) { + PublicInbox::NetReader::nntp_each($self, $uri, \&net_cb, $self, + $self->{nntp}->{$$uri}); last if $self->{quit}; - if ($nn) { - my $err = nntp_fetch_all($self, $nn, $url); - warn $err, "\n" if $err; - } } } -sub poll_fetch_fork ($) { # DS::add_timer callback - my ($self, $intvl, $urls) = @{$_[0]}; +sub poll_fetch_fork { # DS::add_timer callback + my ($self, $intvl, $uris) = @_; return if $self->{quit}; - pipe(my ($r, $w)) or die "pipe: $!"; - my $oldset = watch_atfork_parent($self); - my $pid = fork; - if (defined($pid) && $pid == 0) { - close $r; + watch_atfork_parent($self); + my @nntp; + my @imap = grep { # push() always returns > 0 + $_->scheme =~ m!\Aimaps?!i ? 1 : (push(@nntp, $_) < 0) + } @$uris; + my $pid = PublicInbox::DS::fork_persist; + if ($pid == 0) { watch_atfork_child($self); - if ($urls->[0] =~ m!\Aimaps?://!i) { - watch_imap_fetch_all($self, $urls); - } else { - watch_nntp_fetch_all($self, $urls); - } - close $w; + watch_imap_fetch_all($self, \@imap) if @imap; + watch_nntp_fetch_all($self, \@nntp) if @nntp; _exit(0); } - PublicInbox::Sigfd::sig_setmask($oldset); - die "fork: $!" unless defined $pid; - $self->{poll_pids}->{$pid} = [ $intvl, $urls ]; - PublicInbox::EOFpipe->new($r, \&reap, [$pid, \&poll_fetch_reap, $self]); + $self->{pids}->{$pid} = undef; + awaitpid($pid, \&poll_fetch_reap, $self, $intvl, $uris); } -sub poll_fetch_reap { - my ($self, $pid) = @_; - my $intvl_urls = delete $self->{poll_pids}->{$pid} or - die "BUG: PID=$pid (unknown) reaped: \$?=$?\n"; +sub poll_fetch_reap { # awaitpid callback + my ($pid, $self, $intvl, $uris) = @_; + delete $self->{pids}->{$pid}; return if $self->{quit}; - my ($intvl, $urls) = @$intvl_urls; if ($?) { - warn "W: PID=$pid died: \$?=$?\n", map { "$_\n" } @$urls; + warn "W: PID=$pid died: \$?=$?\n", map { "$_\n" } @$uris; } - warn("I: will check $_ in ${intvl}s\n") for @$urls; - PublicInbox::DS::add_timer($intvl, \&poll_fetch_fork, - [$self, $intvl, $urls]); + warn("# will check $_ in ${intvl}s\n") for @$uris; + add_timer($intvl, \&poll_fetch_fork, $self, $intvl, $uris); } sub watch_imap_init ($$) { my ($self, $poll) = @_; - eval { require PublicInbox::IMAPClient } or - die "Mail::IMAPClient is required for IMAP:\n$@\n"; - eval { require PublicInbox::IMAPTracker } or - die "DBD::SQLite is required for IMAP\n:$@\n"; - - my $mic_args = imap_common_init($self); # read args from config - - # make sure we can connect and cache the credentials in memory - $self->{mic_arg} = {}; # schema://authority => IMAPClient->new args - my $mics = {}; # schema://authority => IMAPClient obj - for my $url (sort keys %{$self->{imap}}) { - my $uri = PublicInbox::URIimap->new($url); - $mics->{uri_section($uri)} //= mic_for($self, $url, $mic_args); - } - - my $idle = []; # [ [ url1, intvl1 ], [url2, intvl2] ] - for my $url (keys %{$self->{imap}}) { - my $uri = PublicInbox::URIimap->new($url); + my $mics = PublicInbox::NetReader::imap_common_init($self) or return; + my $idle = []; # [ uri1, intvl1, uri2, intvl2 ] + for my $uri (@{$self->{imap_order}}) { my $sec = uri_section($uri); my $mic = $mics->{$sec}; - my $intvl = $self->{imap_opt}->{$sec}->{pollInterval}; + my $intvl = $self->{cfg_opt}->{$sec}->{pollInterval}; if ($mic->has_capability('IDLE') && !$intvl) { - $intvl = $self->{imap_opt}->{$sec}->{idleInterval}; - push @$idle, [ $url, $intvl // () ]; + $intvl = $self->{cfg_opt}->{$sec}->{idleInterval}; + push @$idle, $uri, $intvl; } else { - push @{$poll->{$intvl || 120}}, $url; + push @{$poll->{$intvl || 120}}, $uri; } } if (scalar @$idle) { @@ -772,249 +549,32 @@ sub watch_imap_init ($$) { } } -# flesh out common NNTP-specific data structures -sub nntp_common_init ($) { - my ($self) = @_; - my $cfg = $self->{config}; - my $nn_args = {}; # scheme://authority => Net::NNTP->new arg - for my $url (sort keys %{$self->{nntp}}) { - my $sec = uri_section(uri_new($url)); - - # Debug and Timeout are passed to Net::NNTP->new - my $v = cfg_bool($cfg, 'nntp.Debug', $url); - $nn_args->{$sec}->{Debug} = $v if defined $v; - my $to = cfg_intvl($cfg, 'nntp.Timeout', $url); - $nn_args->{$sec}->{Timeout} = $to if $to; - - # Net::NNTP post-connect commands - for my $k (qw(starttls compress)) { - $v = cfg_bool($cfg, "nntp.$k", $url) // next; - $self->{nntp_opt}->{$sec}->{$k} = $v; - } - - # internal option - for my $k (qw(pollInterval)) { - $to = cfg_intvl($cfg, "nntp.$k", $url) // next; - $self->{nntp_opt}->{$sec}->{$k} = $to; - } - } - $nn_args; -} - -# Net::NNTP doesn't support CAPABILITIES, yet -sub try_starttls ($) { - my ($host) = @_; - return if $host =~ /\.onion\z/s; - return if $host =~ /\A127\.[0-9]+\.[0-9]+\.[0-9]+\z/s; - return if $host eq '::1'; - 1; -} - -sub nn_new ($$$) { - my ($nn_arg, $nntp_opt, $url) = @_; - my $nn = Net::NNTP->new(%$nn_arg) or die "E: <$url> new: $!\n"; - - # default to using STARTTLS if it's available, but allow - # it to be disabled for localhost/VPN users - if (!$nn_arg->{SSL} && $nn->can('starttls')) { - if (!defined($nntp_opt->{starttls}) && - try_starttls($nn_arg->{Host})) { - # soft fail by default - $nn->starttls or warn <<""; -W: <$url> STARTTLS tried and failed (not requested) - - } elsif ($nntp_opt->{starttls}) { - # hard fail if explicitly configured - $nn->starttls or die <<""; -E: <$url> STARTTLS requested and failed - - } - } elsif ($nntp_opt->{starttls}) { - $nn->can('starttls') or - die "E: <$url> Net::NNTP too old for STARTTLS\n"; - $nn->starttls or die <<""; -E: <$url> STARTTLS requested and failed - - } - $nn; -} - -sub nn_for ($$$) { # nn = Net::NNTP - my ($self, $url, $nn_args) = @_; - my $uri = uri_new($url); - my $sec = uri_section($uri); - my $nntp_opt = $self->{nntp_opt}->{$sec} //= {}; - my $host = $uri->host; - # Net::NNTP and Net::Netrc both mishandle `0', so we pass `127.0.0.1' - $host = '127.0.0.1' if $host eq '0'; - my $cred; - my ($u, $p); - if (defined(my $ui = $uri->userinfo)) { - require PublicInbox::GitCredential; - $cred = bless { - url => $sec, - protocol => uri_scheme($uri), - host => $host, - }, 'PublicInbox::GitCredential'; - ($u, $p) = split(/:/, $ui, 2); - ($cred->{username}, $cred->{password}) = ($u, $p); - $cred->check_netrc unless defined $p; - } - my $common = $nn_args->{$sec} // {}; - my $nn_arg = { - Port => $uri->port, - Host => $host, - SSL => $uri->secure, # snews == nntps - %$common, # may Debug .... - }; - my $nn = nn_new($nn_arg, $nntp_opt, $url); - - if ($cred) { - $cred->fill; # may prompt user here - if ($nn->authinfo($u, $p)) { - push @{$nntp_opt->{-postconn}}, [ 'authinfo', $u, $p ]; - } else { - warn "E: <$url> AUTHINFO $u XXXX failed\n"; - $nn = undef; - } - } - - if ($nntp_opt->{compress}) { - # https://rt.cpan.org/Ticket/Display.html?id=129967 - if ($nn->can('compress')) { - if ($nn->compress) { - push @{$nntp_opt->{-postconn}}, [ 'compress' ]; - } else { - warn "W: <$url> COMPRESS failed\n"; - } - } else { - delete $nntp_opt->{compress}; - warn <<""; -W: <$url> COMPRESS not supported by Net::NNTP -W: see https://rt.cpan.org/Ticket/Display.html?id=129967 for updates - - } - } - - $self->{nn_arg}->{$sec} = $nn_arg; - $cred->run($nn ? 'approve' : 'reject') if $cred; - $nn; -} - -sub nntp_fetch_all ($$$) { - my ($self, $nn, $url) = @_; - my $uri = uri_new($url); - my ($group, $num_a, $num_b) = $uri->group; - my $sec = uri_section($uri); - my ($nr, $beg, $end) = $nn->group($group); - unless (defined($nr)) { - chomp(my $msg = $nn->message); - return "E: GROUP $group <$sec> $msg"; - } - - # IMAPTracker is also used for tracking NNTP, UID == article number - # LIST.ACTIVE can get the equivalent of UIDVALIDITY, but that's - # expensive. So we assume newsgroups don't change: - my $itrk = PublicInbox::IMAPTracker->new($url); - my (undef, $l_art) = $itrk->get_last; - $l_art //= $beg; # initial import - - # allow users to specify articles to refetch - # cf. https://tools.ietf.org/id/draft-gilman-news-url-01.txt - # nntp://example.com/inbox.foo/$num_a-$num_b - $l_art = $num_a if defined($num_a) && $num_a < $l_art; - $end = $num_b if defined($num_b) && $num_b < $end; - - return if $l_art >= $end; # nothing to do - $beg = $l_art + 1; - - warn "I: $url fetching ARTICLE $beg..$end\n"; - my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; - my ($err, $art); - local $SIG{__WARN__} = sub { - my $pfx = ($_[0] // '') =~ /^([A-Z]: )/g ? $1 : ''; - $warn_cb->("$pfx$url ", $art ? ("ARTICLE $art") : (), "\n", @_); - }; - my $inboxes = $self->{nntp}->{$url}; - my $last_art; - my $n = $self->{max_batch}; - for ($beg..$end) { - last if $self->{quit}; - $art = $_; - if (--$n < 0) { - _done_for_now($self); - $itrk->update_last(0, $last_art); - $n = $self->{max_batch}; - } - my $raw = $nn->article($art); - unless (defined($raw)) { - my $msg = $nn->message; - if ($nn->code == 421) { # pseudo response from Net::Cmd - $err = "E: $msg"; - last; - } else { # probably just a deleted message (spam) - warn "W: $msg"; - next; - } - } - s/\r\n/\n/ for @$raw; - $raw = join('', @$raw); - if (ref($inboxes)) { - for my $ibx (@$inboxes) { - my $eml = PublicInbox::Eml->new($raw); - import_eml($self, $ibx, $eml); - } - } elsif ($inboxes eq 'watchspam') { - my $eml = PublicInbox::Eml->new(\$raw); - $self->{config}->each_inbox(\&remove_eml_i, - $self, $eml, "$url ARTICLE $art"); - } else { - die "BUG: destination unknown $inboxes"; - } - $last_art = $art; - } - _done_for_now($self); - $itrk->update_last(0, $last_art); - $err; -} - sub watch_nntp_init ($$) { my ($self, $poll) = @_; - eval { require Net::NNTP } or - die "Net::NNTP is required for NNTP:\n$@\n"; - eval { require PublicInbox::IMAPTracker } or - die "DBD::SQLite is required for NNTP\n:$@\n"; - - my $nn_args = nntp_common_init($self); # read args from config - - # make sure we can connect and cache the credentials in memory - $self->{nn_arg} = {}; # schema://authority => Net::NNTP->new args - for my $url (sort keys %{$self->{nntp}}) { - nn_for($self, $url, $nn_args); - } - for my $url (keys %{$self->{nntp}}) { - my $uri = uri_new($url); + PublicInbox::NetReader::nntp_common_init($self); + for my $uri (@{$self->{nntp_order}}) { my $sec = uri_section($uri); - my $intvl = $self->{nntp_opt}->{$sec}->{pollInterval}; - push @{$poll->{$intvl || 120}}, $url; + my $intvl = $self->{cfg_opt}->{$sec}->{pollInterval}; + push @{$poll->{$intvl || 120}}, $uri; } } +sub quit_inprogress { !$_[0]->quit_done } # post_loop_do CB + sub watch { # main entry point - my ($self, $sig, $oldset) = @_; - $self->{oldset} = $oldset; - $self->{sig} = $sig; - my $poll = {}; # intvl_seconds => [ url1, url2 ] + my ($self, $sig) = @_; + my $first_sig; + $self->{sig} //= ($first_sig = $sig); + my $poll = {}; # intvl_seconds => [ uri1, uri2 ] watch_imap_init($self, $poll) if $self->{imap}; watch_nntp_init($self, $poll) if $self->{nntp}; - while (my ($intvl, $urls) = each %$poll) { - # poll all URLs for a given interval sequentially - PublicInbox::DS::add_timer(0, \&poll_fetch_fork, - [$self, $intvl, $urls]); + while (my ($intvl, $uris) = each %$poll) { + # poll all URIs for a given interval sequentially + add_timer(0, \&poll_fetch_fork, $self, $intvl, $uris); } - watch_fs_init($self) if $self->{mdre}; - PublicInbox::DS->SetPostLoopCallback(sub { !$self->quit_done }); - PublicInbox::DS->EventLoop; # calls ->event_step + watch_fs_init($self) if $self->{d_re}; + local @PublicInbox::DS::post_loop_do = (\&quit_inprogress, $self); + PublicInbox::DS::event_loop($first_sig); # calls ->event_step _done_for_now($self); } @@ -1043,7 +603,7 @@ sub fs_scan_step { $opendirs->{$dir} = $dh if $n < 0; } if ($op && $op eq 'full') { - foreach my $dir (keys %{$self->{mdmap}}) { + foreach my $dir (keys %{$self->{d_map}}) { next if $opendirs->{$dir}; # already in progress my $ok = opendir(my $dh, $dir); unless ($ok) { @@ -1118,6 +678,13 @@ sub is_maildir { $_[0]; } +sub is_mh { + $_[0] =~ s!\Amh:!!i or return; + $_[0] =~ tr!/!/!s; + $_[0] =~ s!/\z!!; + $_[0]; +} + sub is_watchspam { my ($cur, $ws, $ibx) = @_; if ($ws && !ref($ws) && $ws eq 'watchspam') { @@ -1129,31 +696,6 @@ EOF undef; } -sub uri_new { - my ($url) = @_; - - # URI::snews exists, URI::nntps does not, so use URI::snews - $url =~ s!\Anntps://!snews://!i; - URI->new($url); -} - -sub imap_url { - my ($url) = @_; - require PublicInbox::URIimap; - my $uri = PublicInbox::URIimap->new($url); - $uri ? $uri->canonical->as_string : undef; -} - -my %IS_NNTP = (news => 1, snews => 1, nntp => 1); -sub nntp_url { - my ($url) = @_; - require URI; - my $uri = uri_new($url); - return unless $uri && $IS_NNTP{$uri->scheme} && $uri->group; - $url = $uri->canonical->as_string; - # nntps is IANA registered, snews is deprecated - $url =~ s!\Asnews://!nntps://!; - $url; -} +sub folder_select { 'select' } # for PublicInbox::NetReader 1; diff --git a/lib/PublicInbox/WwwAltId.pm b/lib/PublicInbox/WwwAltId.pm index 2818400e..31d9b607 100644 --- a/lib/PublicInbox/WwwAltId.pm +++ b/lib/PublicInbox/WwwAltId.pm @@ -1,9 +1,9 @@ -# Copyright (C) 2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # dumps using the ".dump" command of sqlite3(1) package PublicInbox::WwwAltId; -use strict; +use v5.12; use PublicInbox::Qspawn; use PublicInbox::WwwStream qw(html_oneshot); use PublicInbox::AltId; @@ -15,8 +15,7 @@ sub check_output { my ($r, $bref, $ctx) = @_; return html_oneshot($ctx, 500) if !defined($r); if ($r == 0) { - my $err = eval { $ctx->{env}->{'psgi.errors'} } // \*STDERR; - $err->print("unexpected EOF from sqlite3\n"); + warn 'unexpected EOF from sqlite3'; return html_oneshot($ctx, 501); } [200, [ qw(Content-Type application/gzip), 'Content-Disposition', @@ -30,49 +29,44 @@ sub check_output { sub sqldump ($$) { my ($ctx, $altid_pfx) = @_; my $env = $ctx->{env}; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $altid_map = $ibx->altid_map; my $fn = $altid_map->{$altid_pfx}; unless (defined $fn) { - return html_oneshot($ctx, 404, \<<EOF); + return html_oneshot($ctx, 404, <<EOF); <pre>`$altid_pfx' is not a valid altid for this inbox</pre> EOF } if ($env->{REQUEST_METHOD} ne 'POST') { my $url = $ibx->base_url($ctx->{env}) . "$altid_pfx.sql.gz"; - return html_oneshot($ctx, 405, \<<EOF); -<pre>A POST request required to retrieve $altid_pfx.sql.gz + return html_oneshot($ctx, 405, <<EOF); +<pre>A POST request is required to retrieve $altid_pfx.sql.gz - curl -XPOST -O $url + curl -d '' -O $url or - curl -XPOST $url | \\ + curl -d '' $url | \\ gzip -dc | \\ sqlite3 /path/to/$altid_pfx.sqlite3 </pre> EOF } - $sqlite3 //= which('sqlite3') // return html_oneshot($ctx, 501, \<<EOF); + $sqlite3 //= which('sqlite3') // return html_oneshot($ctx, 501, <<EOF); <pre>sqlite3 not available The administrator needs to install the sqlite3(1) binary to support gzipped sqlite3 dumps.</pre> EOF - # setup stdin, POSIX requires writes <= 512 bytes to succeed so - # we can close the pipe right away. - pipe(my ($r, $w)) or die "pipe: $!"; - syswrite($w, ".dump\n") == 6 or die "write: $!"; - close($w) or die "close: $!"; - # TODO: use -readonly if available with newer sqlite3(1) - my $qsp = PublicInbox::Qspawn->new([$sqlite3, $fn], undef, { 0 => $r }); + my $qsp = PublicInbox::Qspawn->new([$sqlite3, $fn], undef, + { 0 => \".dump\n" }); $ctx->{altid_pfx} = $altid_pfx; $env->{'qspawn.filter'} = PublicInbox::GzipFilter->new; - $qsp->psgi_return($env, undef, \&check_output, $ctx); + $qsp->psgi_yield($env, undef, \&check_output, $ctx); } 1; diff --git a/lib/PublicInbox/WwwAtomStream.pm b/lib/PublicInbox/WwwAtomStream.pm index 388def12..26b366f5 100644 --- a/lib/PublicInbox/WwwAtomStream.pm +++ b/lib/PublicInbox/WwwAtomStream.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Atom body stream for HTTP responses @@ -8,14 +8,15 @@ use strict; use parent 'PublicInbox::GzipFilter'; use POSIX qw(strftime); -use Digest::SHA qw(sha1_hex); +use PublicInbox::SHA qw(sha1_hex); use PublicInbox::Address; use PublicInbox::Hval qw(ascii_html mid_href); use PublicInbox::MsgTime qw(msg_timestamp); sub new { my ($class, $ctx, $cb) = @_; - $ctx->{feed_base_url} = $ctx->{-inbox}->base_url($ctx->{env}); + $ctx->{feed_base_url} = $ctx->{ibx}->base_url($ctx->{env}); + $ctx->{-spfx} = $ctx->{feed_base_url} if $ctx->{ibx}->{coderepo}; $ctx->{cb} = $cb || \&PublicInbox::GzipFilter::close; $ctx->{emit_header} = 1; bless $ctx, $class; @@ -28,7 +29,7 @@ sub async_next ($) { if (my $smsg = $ctx->{smsg} = $ctx->{cb}->($ctx)) { $ctx->smsg_blob($smsg); } else { - $ctx->{http_out}->write($ctx->translate('</feed>')); + $ctx->write('</feed>'); $ctx->close; } }; @@ -38,14 +39,15 @@ sub async_next ($) { sub async_eml { # for async_blob_cb my ($ctx, $eml) = @_; my $smsg = delete $ctx->{smsg}; - $ctx->{http_out}->write($ctx->translate(feed_entry($ctx, $smsg, $eml))) + $smsg->{mid} // $smsg->populate($eml); + $ctx->write(feed_entry($ctx, $smsg, $eml)); } sub response { - my ($class, $ctx, $code, $cb) = @_; + my ($class, $ctx, $cb) = @_; my $res_hdr = [ 'Content-Type' => 'application/atom+xml' ]; $class->new($ctx, $cb); - $ctx->psgi_response($code, $res_hdr); + $ctx->psgi_response(200, $res_hdr); } # called once for each message by PSGI server @@ -53,7 +55,7 @@ sub getline { my ($self) = @_; my $cb = $self->{cb} or return; while (my $smsg = $cb->($self)) { - my $eml = $self->{-inbox}->smsg_eml($smsg) or next; + my $eml = $self->{ibx}->smsg_eml($smsg) or next; return $self->translate(feed_entry($self, $smsg, $eml)); } delete $self->{cb}; @@ -82,7 +84,7 @@ sub to_uuid ($) { sub atom_header { my ($ctx, $title) = @_; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $base_url = $ctx->{feed_base_url}; my $search_q = $ctx->{search_query}; my $self_url = $base_url; @@ -97,11 +99,16 @@ sub atom_header { $base_url .= '?' . $search_q->qs_html(x => undef); $self_url .= '?' . $search_q->qs_html; $page_id = to_uuid("q\n".$query); + } elsif (defined(my $cat = $ctx->{topic_category})) { + $title = title_tag("$cat topics - ".$ibx->description); + $self_url .= "topics_$cat.atom"; } else { $title = title_tag($ibx->description); $self_url .= 'new.atom'; - $page_id = "mailto:$ibx->{-primary_address}"; + my $addr = $ibx->{-primary_address}; + $page_id = "mailto:$addr" if defined $addr; } + $page_id //= to_uuid($self_url); qq(<?xml version="1.0" encoding="us-ascii"?>\n) . qq(<feed\nxmlns="http://www.w3.org/2005/Atom"\n) . qq(xmlns:thr="http://purl.org/syndication/thread/1.0">) . @@ -136,28 +143,29 @@ sub feed_entry { $title = title_tag($title); my $from = $eml->header('From') // $eml->header('Sender') // - $ctx->{-inbox}->{-primary_address}; + $ctx->{ibx}->{-primary_address}; my ($email) = PublicInbox::Address::emails($from); my $name = ascii_html(join(', ', PublicInbox::Address::names($from))); - $email = ascii_html($email // $ctx->{-inbox}->{-primary_address}); + $email = ascii_html($email // $ctx->{ibx}->{-primary_address}); - my $s = delete($ctx->{emit_header}) ? atom_header($ctx, $title) : ''; - $s .= "<entry><author><name>$name</name><email>$email</email>" . + print { $ctx->zfh } + (delete($ctx->{emit_header}) ? atom_header($ctx, $title) : ''), + "<entry><author><name>$name</name><email>$email</email>" . "</author>$title$updated" . - qq(<link\nhref="$href"/>). + qq(<link\nhref="$href"/>) . "<id>$uuid</id>$irt" . qq{<content\ntype="xhtml">} . qq{<div\nxmlns="http://www.w3.org/1999/xhtml">} . qq(<pre\nstyle="white-space:pre-wrap">); - $ctx->{obuf} = \$s; $ctx->{mhref} = $href; - PublicInbox::View::multipart_text_as_html($eml, $ctx); - delete $ctx->{obuf}; - $s .= '</pre></div></content></entry>'; + $ctx->{changed_href} = "${href}#related"; + $eml->each_part(\&PublicInbox::View::add_text_body, $ctx, 1); + '</pre></div></content></entry>'; } sub feed_updated { - '<updated>' . strftime('%Y-%m-%dT%H:%M:%SZ', gmtime(@_)) . '</updated>'; + my ($t) = @_; + '<updated>' . strftime('%Y-%m-%dT%H:%M:%SZ', gmtime($t)) . '</updated>'; } 1; diff --git a/lib/PublicInbox/WwwAttach.pm b/lib/PublicInbox/WwwAttach.pm index 0b2cda90..87844bf3 100644 --- a/lib/PublicInbox/WwwAttach.pm +++ b/lib/PublicInbox/WwwAttach.pm @@ -1,23 +1,37 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # For retrieving attachments from messages in the WWW interface package PublicInbox::WwwAttach; # internal package use strict; +use v5.10.1; use parent qw(PublicInbox::GzipFilter); -use bytes (); # only for bytes::length -use PublicInbox::EmlContentFoo qw(parse_content_type); use PublicInbox::Eml; +sub referer_match ($) { + my ($ctx) = @_; + my $env = $ctx->{env}; + return 1 if $env->{REQUEST_METHOD} eq 'POST'; + my $referer = lc($env->{HTTP_REFERER} // ''); + return 1 if $referer eq ''; # no referer is always OK for wget/curl + + # prevent deep-linking from other domains on some browsers (Firefox) + # n.b.: $ctx->{ibx}->base_url($env) with INBOX_URL won't work + # with dillo, we can only match "$url_scheme://$HTTP_HOST/" without + # path components + my $base_url = lc($env->{'psgi.url_scheme'} . '://' . + ($env->{HTTP_HOST} // + "$env->{SERVER_NAME}:$env->{SERVER_PORT}") . '/'); + index($referer, $base_url) == 0; +} + sub get_attach_i { # ->each_part callback my ($part, $depth, $idx) = @{$_[0]}; my $ctx = $_[1]; return if $idx ne $ctx->{idx}; # [0-9]+(?:\.[0-9]+)+ my $res = $ctx->{res}; $res->[0] = 200; - my $ct = $part->content_type; - $ct = parse_content_type($ct) if $ct; - + my $ct = $part->ct; if ($ct && (($ct->{type} || '') eq 'text')) { # display all text as text/plain: my $cset = $ct->{attributes}->{charset}; @@ -28,10 +42,21 @@ sub get_attach_i { # ->each_part callback $ctx->{env}); $part = $ctx->zflush($part->body); } else { # TODO: allow user to configure safe types - $res->[1]->[1] = 'application/octet-stream'; - $part = $part->body; + if (referer_match($ctx)) { + $res->[1]->[1] = 'application/octet-stream'; + $part = $part->body; + } else { + $res->[0] = 403; + $res->[1]->[1] = 'text/html'; + $part = <<""; +<html><head><title>download +attachment</title><body><pre>Deep-linking prevented</pre><form +method=post\naction=""><input type=submit value="Download attachment" +/></form></body></html> + + } } - push @{$res->[1]}, 'Content-Length', bytes::length($part); + push @{$res->[1]}, 'Content-Length', length($part); $res->[2]->[0] = $part; } @@ -66,15 +91,15 @@ sub get_attach ($$$) { $ctx->{idx} = $idx; bless $ctx, __PACKAGE__; my $eml; - if ($ctx->{smsg} = $ctx->{-inbox}->smsg_by_mid($ctx->{mid})) { + if ($ctx->{smsg} = $ctx->{ibx}->smsg_by_mid($ctx->{mid})) { return sub { # public-inbox-httpd-only $ctx->{wcb} = $_[0]; scan_attach($ctx); } if $ctx->{env}->{'pi-httpd.async'}; # generic PSGI: - $eml = $ctx->{-inbox}->smsg_eml($ctx->{smsg}); - } elsif (!$ctx->{-inbox}->over) { - if (my $bref = $ctx->{-inbox}->msg_by_mid($ctx->{mid})) { + $eml = $ctx->{ibx}->smsg_eml($ctx->{smsg}); + } elsif (!$ctx->{ibx}->over) { + if (my $bref = $ctx->{ibx}->msg_by_mid($ctx->{mid})) { $eml = PublicInbox::Eml->new($bref); } } diff --git a/lib/PublicInbox/WwwCoderepo.pm b/lib/PublicInbox/WwwCoderepo.pm new file mode 100644 index 00000000..a5e2dc4a --- /dev/null +++ b/lib/PublicInbox/WwwCoderepo.pm @@ -0,0 +1,377 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Standalone code repository viewer for users w/o cgit. +# This isn't intended to replicate all of cgit, but merely to be a +# "good enough" viewer with search support and some UI hints to encourage +# cloning + command-line usage. +package PublicInbox::WwwCoderepo; +use v5.12; +use parent qw(PublicInbox::WwwStream); +use File::Temp 0.19 (); # newdir +use POSIX qw(O_RDWR F_GETFL); +use PublicInbox::ViewVCS; +use PublicInbox::WwwStatic qw(r); +use PublicInbox::GitHTTPBackend; +use PublicInbox::WwwStream; +use PublicInbox::Hval qw(prurl ascii_html utf8_maybe); +use PublicInbox::ViewDiff qw(uri_escape_path); +use PublicInbox::RepoSnapshot; +use PublicInbox::RepoAtom; +use PublicInbox::RepoTree; +use PublicInbox::RepoList; +use PublicInbox::OnDestroy; +use URI::Escape qw(uri_escape_utf8); +use File::Spec; +use autodie qw(fcntl open); + +my @EACH_REF = (qw(git for-each-ref --sort=-creatordate), + "--format=%(HEAD)%00".join('%00', map { "%($_)" } + qw(objectname refname:short subject creatordate:short))); +my $HEADS_CMD = <<''; +# heads (aka `branches'): +$ git for-each-ref --sort=-creatordate refs/heads \ + --format='%(HEAD) %(refname:short) %(subject) (%(creatordate:short))' + +my $TAGS_CMD = <<''; +# tags: +$ git for-each-ref --sort=-creatordate refs/tags \ + --format='%(refname:short) %(subject) (%(creatordate:short))' + +my $NO_HEADS = "# no heads (branches), yet...\n"; +my $NO_TAGS = "# no tags, yet...\n"; + +# shared with PublicInbox::Cgit +sub prepare_coderepos { + my ($self) = @_; + my $pi_cfg = $self->{pi_cfg}; + + # TODO: support gitweb and other repository viewers? + $pi_cfg->parse_cgitrc(undef, 0); + + my $coderepos = $pi_cfg->{-coderepos}; + for my $k (grep(/\Acoderepo\.(?:.+)\.dir\z/, keys %$pi_cfg)) { + $k = substr($k, length('coderepo.'), -length('.dir')); + $coderepos->{$k} //= $pi_cfg->fill_coderepo($k); + } + + # associate inboxes and extindices with coderepos for search: + for my $k (grep(/\Apublicinbox\.(?:.+)\.coderepo\z/, keys %$pi_cfg)) { + $k = substr($k, length('publicinbox.'), -length('.coderepo')); + my $ibx = $pi_cfg->lookup_name($k) // next; + $pi_cfg->repo_objs($ibx); + } + for my $k (grep(/\Aextindex\.(?:.+)\.coderepo\z/, keys %$pi_cfg)) { + $k = substr($k, length('extindex.'), -length('.coderepo')); + my $eidx = $pi_cfg->lookup_ei($k) // next; + $pi_cfg->repo_objs($eidx); + } + $pi_cfg->each_cindex('load_coderepos', $pi_cfg); +} + +sub new { + my ($cls, $pi_cfg) = @_; + my $self = bless { pi_cfg => $pi_cfg }, $cls; + prepare_coderepos($self); + $self->{snapshots} = do { + my $s = $pi_cfg->{'coderepo.snapshots'} // ''; + $s eq 'all' ? \%PublicInbox::RepoSnapshot::FMT_TYPES : + +{ map { $_ => 1 } split(/\s+/, $s) }; + }; + $self->{$_} = 10 for qw(summary_branches summary_tags); + $self->{$_} = 10 for qw(summary_log); + + # try reuse STDIN if it's already /dev/null + open $self->{log_fh}, '+>', '/dev/null'; + my @l = stat($self->{log_fh}) or die "stat: $!"; + my @s = stat(STDIN) or die "stat(STDIN): $!"; + if ("@l[0, 1]" eq "@s[0, 1]") { + my $f = fcntl(STDIN, F_GETFL, 0); + $self->{log_fh} = *STDIN{IO} if $f & O_RDWR; + } + $self; +} + +sub _snapshot_link_prep { + my ($ctx) = @_; + my @s = sort keys %{$ctx->{wcr}->{snapshots}} or return (); + my $n = $ctx->{git}->local_nick // die "BUG: $ctx->{git_dir} nick"; + $n =~ s!\.git/*\z!!; + ($n) = ($n =~ m!([^/]+)/*\z!); + (ascii_html($n).'-', @s); +} + +sub _refs_heads_link { + my ($line, $upfx) = @_; + my ($pfx, $oid, $ref, $s, $cd) = split(/\0/, $line); + my $align = length($ref) < 12 ? ' ' x (12 - length($ref)) : ''; + ("$pfx <a\nhref=$upfx$oid/s/>", ascii_html($ref), + "</a>$align ", ascii_html($s), " ($cd)\n") +} + +sub _refs_tags_link { + my ($line, $upfx, $snap_pfx, @snap_fmt) = @_; + my (undef, $oid, $ref, $s, $cd) = split(/\0/, $line); + my $align = length($ref) < 12 ? ' ' x (12 - length($ref)) : ''; + if (@snap_fmt) { + my $v = $ref; + $v =~ s/\A[vV]//; + @snap_fmt = map { + qq{ <a href="${upfx}snapshot/$snap_pfx$v.$_">$_</a>} + } @snap_fmt; + } + ("<a\nhref=$upfx$oid/s/>", ascii_html($ref), + "</a>$align ", ascii_html($s), " ($cd)", @snap_fmt, "\n"); +} + +sub emit_joined_inboxes ($) { + my ($ctx) = @_; + my $names = $ctx->{git}->{ibx_names}; # coderepo directives in config + my $score = $ctx->{git}->{ibx_score}; # generated w/ cindex --join + ($names || $score) or return; + my $pi_cfg = $ctx->{wcr}->{pi_cfg}; + my ($u, $h); + my $zfh = $ctx->zfh; + print $zfh "\n# associated public inboxes:", + "\n# (number on the left is used for dev purposes)"; + my @ns = map { [ 0, $_ ] } @$names; + my $env = $ctx->{env}; + for (@ns, @$score) { + my ($nr, $name) = @$_; + my $ibx = $pi_cfg->lookup_name($name) // do { + warn "W: inbox `$name' gone for $ctx->{git}->{git_dir}"; + say $zfh '# ', ascii_html($name), ' (missing inbox?)'; + next; + }; + if (scalar(@{$ibx->{url} // []})) { + $u = $h = ascii_html(prurl($env, $ibx->{url})); + } else { + $h = ascii_html(prurl($env, uri_escape_utf8($name))); + $h .= '/'; + $u = ascii_html($name); + } + if ($nr) { + printf $zfh "\n% 11u", $nr; + } else { + print $zfh "\n", ' 'x11; + } + print $zfh qq{ <a\nhref="$h">$u</a>}; + } +} + +sub summary_END { # called via OnDestroy + my ($ctx) = @_; + my $wcb = delete($ctx->{-wcb}) or return; # already done + PublicInbox::WwwStream::html_init($ctx); + my $zfh = $ctx->zfh; + + my @r = split(/\n/s, delete($ctx->{qx_res}->{'log'}) // ''); + my $last = scalar(@r) > $ctx->{wcr}->{summary_log} ? pop(@r) : undef; + my $tip_html = ''; + my $tip = $ctx->{qp}->{h}; + $tip_html .= ' '.ascii_html($tip).' --' if defined $tip; + print $zfh <<EOM; +<pre><a id=log>\$</a> git log --pretty=format:'%h %s (%cs)%d'$tip_html +EOM + for (@r) { + my $d; # decorations + s/^ \(([^\)]+)\)// and $d = $1; + substr($_, 0, 1, ''); + my ($H, $h, $cs, $s) = split(/ /, $_, 4); + print $zfh "<a\nhref=./$H/s/>$h</a> ", ascii_html($s), + " (", $cs, ")\n"; + print $zfh "\t(", ascii_html($d), ")\n" if $d; + } + print $zfh '# no commits in `', ($tip//'HEAD'),"', yet\n\n" if !@r; + print $zfh "...\n" if $last; + + # README + my ($bref, $oid, $ref_path) = @{delete $ctx->{qx_res}->{readme}}; + if ($bref) { + my $l = PublicInbox::Linkify->new; + $$bref =~ s/\s*\z//sm; + my (undef, $path) = split(/:/, $ref_path, 2); # HEAD:README + print $zfh "\n<a id=readme>\$</a> " . + qq(git cat-file blob <a href="./$oid/s/?b=) . + ascii_html(uri_escape_path($path)) . q(">). + ascii_html($ref_path), "</a>\n", + $l->to_html($$bref), '</pre><hr><pre>'; + } + + # refs/heads + print $zfh '<a id=heads>', $HEADS_CMD , '</a>'; + @r = split(/^/sm, delete($ctx->{qx_res}->{heads}) // ''); + $last = scalar(@r) > $ctx->{wcr}->{summary_branches} ? pop(@r) : undef; + chomp(@r); + for (@r) { print $zfh _refs_heads_link($_, './') } + print $zfh $NO_HEADS if !@r; + print $zfh qq(<a href="refs/heads/">...</a>\n) if $last; + print $zfh "\n<a id=tags>", $TAGS_CMD, '</a>'; + @r = split(/^/sm, delete($ctx->{qx_res}->{tags}) // ''); + $last = scalar(@r) > $ctx->{wcr}->{summary_tags} ? pop(@r) : undef; + my ($snap_pfx, @snap_fmt) = _snapshot_link_prep($ctx); + chomp @r; + for (@r) { print $zfh _refs_tags_link($_, './', $snap_pfx, @snap_fmt) } + print $zfh $NO_TAGS if !@r; + print $zfh qq(<a href="refs/tags/">...</a>\n) if $last; + emit_joined_inboxes $ctx; + $wcb->($ctx->html_done('</pre>')); +} + +sub capture { # psgi_qx callback to capture git-for-each-ref + my ($bref, $ctx, $key) = @_; # $_[3] = OnDestroy(summary_END) + $ctx->{qx_res}->{$key} = $$bref; + # summary_END may be called via OnDestroy $arg->[2] +} + +sub set_readme { # git->cat_async callback + my ($bref, $oid, $type, $size, $ctx) = @_; + my $ref_path = shift @{$ctx->{-readme_tries}}; # e.g. HEAD:README + if ($type eq 'blob' && !$ctx->{qx_res}->{readme}) { + $ctx->{qx_res}->{readme} = [ $bref, $oid, $ref_path ]; + } elsif (scalar @{$ctx->{-readme_tries}} == 0) { + $ctx->{qx_res}->{readme} //= []; # nothing left to try + } # or try another README... + # summary_END may be called via OnDestroy ($ctx->{-END}) +} + +sub summary ($$) { + my ($ctx, $wcb) = @_; + $ctx->{-wcb} = $wcb; # PublicInbox::HTTP::{Identity,Chunked} + my $tip = $ctx->{qp}->{h}; # same as cgit + if (defined $tip && $tip eq '') { + delete $ctx->{qp}->{h}; + undef($tip); + } + my ($nb, $nt, $nl) = map { $_ + 1 } @{$ctx->{wcr}}{qw( + summary_branches summary_tags summary_log)}; + $ctx->{qx_res} = {}; + my $qsp_err = \($ctx->{-qsp_err} = ''); + my %opt = (quiet => 1, 2 => $ctx->{wcr}->{log_fh}); + my %env = (GIT_DIR => $ctx->{git}->{git_dir}); + my @log = (qw(git log), "-$nl", '--pretty=format:%d %H %h %cs %s'); + push(@log, $tip) if defined $tip; + + # limit scope for MockHTTP test (t/solver_git.t) + my $END = on_destroy \&summary_END, $ctx; + for (['log', \@log], + [ 'heads', [@EACH_REF, "--count=$nb", 'refs/heads'] ], + [ 'tags', [@EACH_REF, "--count=$nt", 'refs/tags'] ]) { + my ($k, $cmd) = @$_; + my $qsp = PublicInbox::Qspawn->new($cmd, \%env, \%opt); + $qsp->{qsp_err} = $qsp_err; + $qsp->psgi_qx($ctx->{env}, undef, \&capture, $ctx, $k, $END); + } + $tip //= 'HEAD'; + my @try = ("$tip:README", "$tip:README.md"); # TODO: configurable + my %ctx = (%$ctx, -END => $END, -readme_tries => [ @try ]); + PublicInbox::ViewVCS::do_cat_async(\%ctx, \&set_readme, @try); +} + +# called by GzipFilter->close after translate +sub zflush { $_[0]->SUPER::zflush('</pre>', $_[0]->_html_end) } + +# called by GzipFilter->write or GetlineResponse->getline +sub translate { + my $ctx = shift; + $_[0] // return zflush($ctx); # getline caller + my @out; + my $fbuf = delete($ctx->{fbuf}) // shift; + $fbuf .= shift while @_; + if ($ctx->{-heads}) { + while ($fbuf =~ s/\A([^\n]+)\n//s) { + utf8_maybe(my $x = $1); + push @out, _refs_heads_link($x, '../../'); + } + } else { + my ($snap_pfx, @snap_fmt) = _snapshot_link_prep($ctx); + while ($fbuf =~ s/\A([^\n]+)\n//s) { + utf8_maybe(my $x = $1); + push @out, _refs_tags_link($x, '../../', + $snap_pfx, @snap_fmt); + } + } + $ctx->{fbuf} = $fbuf; # may be incomplete + @out ? $ctx->SUPER::translate(@out) : ''; # not EOF, yet +} + +sub _refs_parse_hdr { # {parse_hdr} for Qspawn + my ($r, $bref, $ctx) = @_; + my ($code, $top); + if ($r == 0) { + $code = 404; + $top = $ctx->{-heads} ? $NO_HEADS : $NO_TAGS; + } else { + $code = 200; + $top = $ctx->{-heads} ? $HEADS_CMD : $TAGS_CMD; + } + PublicInbox::WwwStream::html_init($ctx); + bless $ctx, __PACKAGE__; # re-bless for ->translate + print { $ctx->{zfh} } '<pre>', $top; + [ $code, delete($ctx->{-res_hdr}), $ctx ]; # [2] is qspawn.filter +} + +sub refs_foo { # /$REPO/refs/{heads,tags} endpoints + my ($self, $ctx, $pfx) = @_; + $ctx->{wcr} = $self; + $ctx->{-upfx} = '../../'; + $ctx->{-heads} = 1 if $pfx eq 'refs/heads'; + my $qsp = PublicInbox::Qspawn->new([@EACH_REF, $pfx ], + { GIT_DIR => $ctx->{git}->{git_dir} }); + $qsp->psgi_yield($ctx->{env}, undef, \&_refs_parse_hdr, $ctx); +} + +sub srv { # endpoint called by PublicInbox::WWW + my ($self, $ctx) = @_; + my $path_info = $ctx->{env}->{PATH_INFO}; + my $git; + # handle clone requests + my $pi_cfg = $self->{pi_cfg}; + if ($path_info =~ m!\A/(.+?)/($PublicInbox::GitHTTPBackend::ANY)\z!x and + ($git = $pi_cfg->get_coderepo($1))) { + PublicInbox::GitHTTPBackend::serve($ctx->{env},$git,$2); + } elsif ($path_info =~ m!\A/(.+?)/\z! and + ($ctx->{git} = $pi_cfg->get_coderepo($1))) { + $ctx->{wcr} = $self; + sub { summary($ctx, $_[0]) }; # $_[0] = wcb + } elsif ($path_info =~ m!\A/(.+?)/([a-f0-9]+)/s/([^/]+)?\z! and + ($ctx->{git} = $pi_cfg->get_coderepo($1))) { + $ctx->{lh} = $self->{log_fh}; + PublicInbox::ViewVCS::show($ctx, $2, $3); + } elsif ($path_info =~ m!\A/(.+?)/tree/(.*)\z! and + ($ctx->{git} = $pi_cfg->get_coderepo($1))) { + $ctx->{lh} = $self->{log_fh}; + PublicInbox::RepoTree::srv_tree($ctx, $2) // r(404); + } elsif ($path_info =~ m!\A/(.+?)/snapshot/([^/]+)\z! and + ($ctx->{git} = $pi_cfg->get_coderepo($1))) { + $ctx->{wcr} = $self; + PublicInbox::RepoSnapshot::srv($ctx, $2) // r(404); + } elsif ($path_info =~ m!\A/(.+?)/atom/(.*)\z! and + ($ctx->{git} = $pi_cfg->get_coderepo($1))) { + $ctx->{lh} = $self->{log_fh}; + PublicInbox::RepoAtom::srv_atom($ctx, $2) // r(404); + } elsif ($path_info =~ m!\A/(.+?)/tags\.atom\z! and + ($ctx->{git} = $pi_cfg->get_coderepo($1))) { + PublicInbox::RepoAtom::srv_tags_atom($ctx); + } elsif ($path_info =~ m!\A/(.+?)/(refs/(?:heads|tags))/\z! and + ($ctx->{git} = $pi_cfg->get_coderepo($1))) { + refs_foo($self, $ctx, $2); + } elsif ($path_info =~ m!\A/(.*?\*.*?)/*\z!) { + my $re = PublicInbox::Config::glob2re($1); + PublicInbox::RepoList::html($self, $ctx, qr!$re\z!) // r(404); + } elsif ($path_info =~ m!\A/(.+?)/\z!) { + my $re = qr!\A\Q$1\E/!; + PublicInbox::RepoList::html($self, $ctx, $re) // r(404); + } elsif ($path_info =~ m!\A/(.+?)\z! and + ($git = $pi_cfg->get_coderepo($1))) { + my $qs = $ctx->{env}->{QUERY_STRING}; + my $url = $git->base_url($ctx->{env}); + $url .= "?$qs" if $qs ne ''; + [ 301, [ Location => $url, 'Content-Type' => 'text/plain' ], + [ "Redirecting to $url\n" ] ]; + } else { + r(404); + } +} + +1; diff --git a/lib/PublicInbox/WwwHighlight.pm b/lib/PublicInbox/WwwHighlight.pm index 170bfcaa..75338806 100644 --- a/lib/PublicInbox/WwwHighlight.pm +++ b/lib/PublicInbox/WwwHighlight.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Standalone PSGI app to provide syntax highlighting as-a-service @@ -20,8 +20,7 @@ package PublicInbox::WwwHighlight; use strict; -use warnings; -use bytes (); # only for bytes::length +use v5.10.1; use parent qw(PublicInbox::HlMod); use PublicInbox::Linkify qw(); use PublicInbox::Hval qw(ascii_html); @@ -47,7 +46,7 @@ sub read_in_full ($) { return \$buf if $r == 0; $off += $r; } - $env->{'psgi.errors'}->print("input read error: $!\n"); + warn "input read error: $!"; undef; } @@ -69,7 +68,7 @@ sub call { $l->linkify_2($$bref); my $h = [ 'Content-Type', 'text/html; charset=UTF-8' ]; - push @$h, 'Content-Length', bytes::length($$bref); + push @$h, 'Content-Length', length($$bref); [ 200, $h, [ $$bref ] ] } diff --git a/lib/PublicInbox/WwwListing.pm b/lib/PublicInbox/WwwListing.pm index bda2761c..2d6c74da 100644 --- a/lib/PublicInbox/WwwListing.pm +++ b/lib/PublicInbox/WwwListing.pm @@ -1,29 +1,39 @@ -# Copyright (C) 2019-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # Provide an HTTP-accessible listing of inboxes. # Used by PublicInbox::WWW package PublicInbox::WwwListing; use strict; -use PublicInbox::Hval qw(prurl fmt_ts); -use PublicInbox::Linkify; +use v5.10.1; +use PublicInbox::Hval qw(prurl fmt_ts ascii_html); use PublicInbox::GzipFilter qw(gzf_maybe); use PublicInbox::ConfigIter; -use bytes (); # bytes::length +use PublicInbox::WwwStream; +use URI::Escape qw(uri_escape_utf8); +use PublicInbox::MID qw(mid_escape); sub ibx_entry { - my ($ctx, $ibx) = @_; - my $mtime = $ibx->modified; - my $ts = fmt_ts($mtime); - my $url = prurl($ctx->{env}, $ibx->{url}); - my $tmp = <<""; -* $ts - $url - ${\$ibx->description} - - if (defined(my $info_url = $ibx->{infourl})) { - $tmp .= ' ' . prurl($ctx->{env}, $info_url) . "\n"; + my ($ctx, $ibx, $ce) = @_; + my $desc = ascii_html($ce->{description} //= $ibx->description); + my $ts = fmt_ts($ce->{-modified} //= $ibx->modified); + my ($url, $href); + if (scalar(@{$ibx->{url} // []})) { + $url = $href = ascii_html(prurl($ctx->{env}, $ibx->{url})); + } else { + $href = ascii_html(uri_escape_utf8($ibx->{name})) . '/'; + $url = ascii_html($ibx->{name}); } - push @{$ctx->{-list}}, [ $mtime, $tmp ]; + my $tmp = <<EOM; +* $ts - <a\nhref="$href">$url</a> + $desc +EOM + if (defined($url = $ibx->{infourl})) { + $url = ascii_html(prurl($ctx->{env}, $url)); + $tmp .= qq( <a\nhref="$url">$url</a>\n); + } + push(@{$ctx->{-list}}, (scalar(@_) == 3 ? # $misc in use, already sorted + $tmp : [ $ce->{-modified}, $tmp ] )); } sub list_match_i { # ConfigIter callback @@ -31,29 +41,30 @@ sub list_match_i { # ConfigIter callback if (defined($section)) { return if $section !~ m!\Apublicinbox\.([^/]+)\z!; my $ibx = $cfg->lookup_name($1) or return; - if (!$ibx->{-hide}->{$ctx->hide_key} && - grep(/$re/, @{$ibx->{url}})) { - $ctx->ibx_entry($ibx); - } + $ctx->ibx_entry($ibx) unless $ctx->hide_inbox($ibx, $re); } else { # undef == "EOF" $ctx->{-wcb}->($ctx->psgi_triple); } } -sub url_regexp { +sub url_filter { my ($ctx, $key, $default) = @_; $key //= 'publicInbox.wwwListing'; $default //= '404'; - my $v = $ctx->{www}->{pi_config}->{lc $key} // $default; + my $cfg = $ctx->{www}->{pi_cfg}; + my $v = $cfg->{lc $key} // $default; again: if ($v eq 'match=domain') { my $h = $ctx->{env}->{HTTP_HOST} // $ctx->{env}->{SERVER_NAME}; $h =~ s/:[0-9]+\z//; - qr!\A(?:https?:)?//\Q$h\E(?::[0-9]+)?/!i; + (qr!\A(?:https?:)?//\Q$h\E(?::[0-9]+)?/!i, "url:$h"); } elsif ($v eq 'all') { - qr/./; + my $niu = $cfg->{lc 'publicinbox.nameIsUrl'}; + defined($niu) && $cfg->git_bool($niu) and + $ctx->{-name_is_url} = [ '.' ]; + (qr/./, undef); } elsif ($v eq '404') { - undef; + (undef, undef); } else { warn <<EOF; `$v' is not a valid value for `$key' @@ -66,42 +77,170 @@ EOF sub hide_key { 'www' } +sub hide_inbox { + my ($ctx, $ibx, $re) = @_; + $ibx->{'-hide_'.$ctx->hide_key} || + !grep(/$re/, @{$ibx->{url} // $ctx->{-name_is_url} // []}) +} + +sub add_misc_ibx { # MiscSearch->retry_reopen callback + my ($misc, $ctx, $re, $qs) = @_; + require PublicInbox::SearchQuery; + my $q = $ctx->{-sq} = PublicInbox::SearchQuery->new($ctx->{qp}); + my $o = $q->{o}; + my ($asc, $min, $max); + if ($o < 0) { + $asc = 1; + $o = -($o + 1); # so [-1] is the last element, like Perl lists + } + my $r = $q->{r}; + my $opt = { + offset => $o, + asc => $asc, + relevance => $r, + limit => $q->{l} + }; + $qs .= ' type:inbox'; + + delete $ctx->{-list}; # reset if retried + my $pi_cfg = $ctx->{www}->{pi_cfg}; + my $user_query = $q->{'q'} // ''; + if ($user_query =~ /\S/) { + $qs = "( $qs ) AND ( $user_query )"; + } else { # special case for ALL + $ctx->ibx_entry($pi_cfg->ALL // die('BUG: ->ALL expected'), {}); + } + my $mset = $misc->mset($qs, $opt); # sorts by $MODIFIED (mtime) + + for my $mi ($mset->items) { + my $doc = $mi->get_document; + my ($eidx_key) = PublicInbox::Search::xap_terms('Q', $doc); + $eidx_key // next; + my $ibx = $pi_cfg->lookup_eidx_key($eidx_key) // next; + next if $ctx->hide_inbox($ibx, $re); + $ctx->ibx_entry($ibx, $misc->doc2ibx_cache_ent($doc)); + if ($r) { # for descriptions in search_nav_bot + my $pct = PublicInbox::Search::get_pct($mi); + # only when sorting by relevance, ->items is always + # ordered descending: + $max //= $pct; + $min = $pct; + } + } + if ($r) { # for descriptions in search_nav_bot + $q->{-min_pct} = $min; + $q->{-max_pct} = $max; + } + $ctx->{-mset} = $mset; + psgi_triple($ctx); +} + sub response { my ($class, $ctx) = @_; bless $ctx, $class; - my $re = $ctx->url_regexp or return $ctx->psgi_triple; - my $iter = PublicInbox::ConfigIter->new($ctx->{www}->{pi_config}, + my ($re, $qs) = $ctx->url_filter; + $re // return $ctx->psgi_triple; + if (my $ALL = $ctx->{www}->{pi_cfg}->ALL) { # fast path + if ($ctx->{qp}->{a} && # "search all inboxes" + $ctx->{qp}->{'q'}) { + my $u = 'all/?q='.mid_escape($ctx->{qp}->{'q'}); + return [ 302, [ 'Location' => $u, + qw(Content-Type text/plain) ], + [ "Redirecting to $u\n" ] ]; + } + # FIXME: test this in t/ + $ALL->misc->reopen->retry_reopen(\&add_misc_ibx, + $ctx, $re, $qs); + } else { # slow path, no [extindex "all"] configured + my $iter = PublicInbox::ConfigIter->new($ctx->{www}->{pi_cfg}, \&list_match_i, $re, $ctx); - sub { - $ctx->{-wcb} = $_[0]; # HTTP server callback - $ctx->{env}->{'pi-httpd.async'} ? - $iter->event_step : $iter->each_section; + sub { + $ctx->{-wcb} = $_[0]; # HTTP server callback + $ctx->{env}->{'pi-httpd.async'} ? + $iter->event_step : $iter->each_section; + } } } +sub mset_footer ($$) { + my ($ctx, $mset) = @_; + # no footer if too few matches + return '' if $mset->get_matches_estimated == $mset->size; + require PublicInbox::SearchView; + PublicInbox::SearchView::search_nav_bot($ctx, $mset, $ctx->{-sq}); +} + +sub mset_nav_top { + my ($ctx, $mset) = @_; + my $q = $ctx->{-sq}; + my $qh = $q->{'q'} // ''; + if ($qh ne '') { + utf8::decode($qh); + $qh = qq[\nvalue="].ascii_html($qh).'"'; + } + chop(my $rv = <<EOM); +<form action="./"><pre><input name=q type=text$qh/><input +type=submit value="locate inbox"/><input type=submit name=a +value="search all inboxes"/></pre></form><pre> +EOM + if (defined($q->{'q'})) { + my $initial_q = $ctx->{-uxs_retried}; + if (defined $initial_q) { + my $rewritten = $q->{'q'}; + utf8::decode($initial_q); + utf8::decode($rewritten); + $initial_q = ascii_html($initial_q); + $rewritten = ascii_html($rewritten); + $rv .= " Warning: Initial query:\n <b>$initial_q</b>\n"; + $rv .= " returned no results, used:\n"; + $rv .= " <b>$rewritten</b>\n instead\n\n"; + } + $rv .= 'Search results ordered by ['; + if ($q->{r}) { + my $d = $q->qs_html(r => 0); + $rv .= qq{<a\nhref="?$d">updated</a>|<b>relevance</b>}; + } else { + my $d = $q->qs_html(r => 1); + $rv .= qq{<b>updated</b>|<a\nhref="?$d">relevance</a>}; + } + $rv .= ']'; + } + $rv .= qq{</pre>}; +} + sub psgi_triple { my ($ctx) = @_; my $h = [ 'Content-Type', 'text/html; charset=UTF-8', 'Content-Length', undef ]; my $gzf = gzf_maybe($h, $ctx->{env}); - $gzf->zmore('<html><head><title>' . - 'public-inbox listing</title>' . - '</head><body><pre>'); + my $zfh = $gzf->zfh; + print $zfh '<html><head><title>public-inbox listing</title>', + $ctx->{www}->style('+/'), + '</head><body>'; my $code = 404; - if (my $list = $ctx->{-list}) { + if (my $list = delete $ctx->{-list}) { + my $mset = delete $ctx->{-mset}; $code = 200; - # sort by ->modified - @$list = map { $_->[1] } sort { $b->[0] <=> $a->[0] } @$list; - $list = join("\n", @$list); - my $l = PublicInbox::Linkify->new; - $gzf->zmore($l->to_html($list)); + if ($mset) { # already sorted, so search bar: + print $zfh mset_nav_top($ctx, $mset); + } else { # sort config dump by ->modified + @$list = map { $_->[1] } + sort { $b->[0] <=> $a->[0] } @$list; + } + print $zfh '<pre>', join("\n", @$list); # big + print $zfh mset_footer($ctx, $mset) if $mset; + } elsif (my $mset = delete $ctx->{-mset}) { + print $zfh mset_nav_top($ctx, $mset), + '<pre>no matching inboxes', + mset_footer($ctx, $mset); } else { - $gzf->zmore('no inboxes, yet'); + print $zfh '<pre>no inboxes, yet'; } my $out = $gzf->zflush('</pre><hr><pre>'. - PublicInbox::WwwStream::code_footer($ctx->{env}) . +qq(This is a listing of public inboxes, see the `mirror' link of each inbox +for instructions on how to mirror all the data and code on this site.) . '</pre></body></html>'); - $h->[3] = bytes::length($out); + $h->[3] = length($out); [ $code, $h, [ $out ] ]; } diff --git a/lib/PublicInbox/WwwStatic.pm b/lib/PublicInbox/WwwStatic.pm index 051d2e03..d8902193 100644 --- a/lib/PublicInbox/WwwStatic.pm +++ b/lib/PublicInbox/WwwStatic.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # This package can either be a PSGI response body for a static file @@ -9,16 +9,15 @@ # functionality of nginx. package PublicInbox::WwwStatic; use strict; +use v5.10.1; use parent qw(Exporter); -use bytes (); use Fcntl qw(SEEK_SET O_RDONLY O_NONBLOCK); -use POSIX qw(strftime); use HTTP::Date qw(time2str); use HTTP::Status qw(status_message); use Errno qw(EACCES ENOTDIR ENOENT); use URI::Escape qw(uri_escape_utf8); use PublicInbox::GzipFilter qw(gzf_maybe); -use PublicInbox::Hval qw(ascii_html); +use PublicInbox::Hval qw(ascii_html fmt_ts); use Plack::MIME; our @EXPORT_OK = qw(@NO_CACHE r path_info_raw); @@ -218,7 +217,7 @@ my %path_re_cache; sub path_info_raw ($) { my ($env) = @_; my $sn = $env->{SCRIPT_NAME}; - my $re = $path_re_cache{$sn} ||= do { + my $re = $path_re_cache{$sn} //= do { $sn = '/'.$sn unless index($sn, '/') == 0; $sn =~ s!/\z!!; qr!\A(?:https?://[^/]+)?\Q$sn\E(/[^\?\#]+)!; @@ -275,12 +274,11 @@ sub dir_response ($$$) { my $path_info = $env->{PATH_INFO}; push @entries, '..' if $path_info ne '/'; for my $base (@entries) { + my @st = stat($fs_path . $base) or next; # unlikely my $href = ascii_html(uri_escape_utf8($base)); my $name = ascii_html($base); - my @st = stat($fs_path . $base) or next; # unlikely - my ($gzipped, $uncompressed, $hsize); - my $entry = ''; my $mtime = $st[9]; + my ($entry, $hsize); if (-d _) { $href .= '/'; $name .= '/'; @@ -296,12 +294,12 @@ sub dir_response ($$$) { next; } # 54 = 80 - (SP length(strftime(%Y-%m-%d %k:%M)) SP human_size) - $hsize = sprintf('% 8s', $hsize); my $pad = 54 - length($name); $pad = 1 if $pad <= 0; - $entry .= qq(<a\nhref="$href">$name</a>) . (' ' x $pad); - $mtime = strftime('%Y-%m-%d %k:%M', gmtime($mtime)); - $entry .= $mtime . $hsize; + $entry = qq(\n<a\nhref="$href">$name</a>) . + (' ' x $pad) . + fmt_ts($mtime) . + sprintf('% 8s', $hsize); } # filter out '.gz' files as long as the mtime matches the @@ -309,17 +307,16 @@ sub dir_response ($$$) { delete(@other{keys %want_gz}); @entries = ((map { ${$dirs{$_}} } sort keys %dirs), (map { ${$other{$_}} } sort keys %other)); - my $path_info_html = ascii_html($path_info); - my $h = [qw(Content-Type text/html Content-Length), undef]; - my $gzf = gzf_maybe($h, $env); - $gzf->zmore("<html><head><title>Index of $path_info_html</title>" . - ${$self->{style}} . - "</head><body><pre>Index of $path_info_html</pre><hr><pre>\n"); - $gzf->zmore(join("\n", @entries)); - my $out = $gzf->zflush("</pre><hr></body></html>\n"); - $h->[3] = bytes::length($out); - [ 200, $h, [ $out ] ] + my @h = qw(Content-Type text/html); + my $gzf = gzf_maybe(\@h, $env); + print { $gzf->zfh } '<html><head><title>Index of ', $path_info_html, + '</title>', ${$self->{style}}, '</head><body><pre>Index of ', + $path_info_html, '</pre><hr><pre>', @entries, + '</pre><hr></body></html>'; + my $out = $gzf->zflush; + push @h, 'Content-Length', length($out); + [ 200, \@h, [ $out ] ] } sub call { # PSGI app endpoint diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm index 638f4e27..8d32074f 100644 --- a/lib/PublicInbox/WwwStream.pm +++ b/lib/PublicInbox/WwwStream.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # HTML body stream for which yields getline+close methods for @@ -7,16 +7,19 @@ # See PublicInbox::GzipFilter parent class for more info. package PublicInbox::WwwStream; use strict; +use v5.10.1; use parent qw(Exporter PublicInbox::GzipFilter); our @EXPORT_OK = qw(html_oneshot); -use bytes (); # length use PublicInbox::Hval qw(ascii_html prurl ts2str); -our $TOR_URL = 'https://www.torproject.org/'; -our $CODE_URL = 'https://public-inbox.org/public-inbox.git'; + +our $CODE_URL = [ qw( +http://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/public-inbox.git +https://public-inbox.org/public-inbox.git) ]; sub base_url ($) { - my $ctx = shift; - my $base_url = $ctx->{-inbox}->base_url($ctx->{env}); + my ($ctx) = @_; + my $thing = $ctx->{ibx} // $ctx->{git} // return; + my $base_url = $thing->base_url($ctx->{env}); chop $base_url; # no trailing slash for clone $base_url; } @@ -25,22 +28,74 @@ sub init { my ($ctx, $cb) = @_; $ctx->{cb} = $cb; $ctx->{base_url} = base_url($ctx); + $ctx->{-res_hdr} = [ 'Content-Type' => 'text/html; charset=UTF-8' ]; + $ctx->{gz} = PublicInbox::GzipFilter::gz_or_noop($ctx->{-res_hdr}, + $ctx->{env}); bless $ctx, __PACKAGE__; } sub async_eml { # for async_blob_cb my ($ctx, $eml) = @_; - $ctx->{http_out}->write($ctx->translate($ctx->{cb}->($ctx, $eml))); + $ctx->write($ctx->{cb}->($ctx, $eml)); +} + +sub html_repo_top ($) { + my ($ctx) = @_; + my $git = $ctx->{git} // return $ctx->html_top_fallback; + my $desc = ascii_html($git->description); + my $title = delete($ctx->{-title_html}) // $desc; + my $upfx = $ctx->{-upfx} // ''; + my $atom = $ctx->{-atom} // (substr($upfx, -1) eq '/' ? + "${upfx}atom/" : "$upfx/atom/"); + my $top = ascii_html($git->{nick}); + $top = qq(<a\nhref="$upfx">$top</a>) if length($upfx); + $top .= <<EOM; + <a href='$upfx#readme'>about</a> / <a +href='$upfx#heads'>heads</a> / <a +href='$upfx#tags'>tags</a> +<b>$desc</b> +EOM + my @url = PublicInbox::ViewVCS::ibx_url_for($ctx); + if (@url) { + $ctx->{-has_srch} = 1; + my $base_url = base_url($ctx); + my ($pfx, $sfx) = ($base_url =~ m!\A(https?://[^/]+/)(.*)\z!i); + my $iupfx = '../' x (($sfx =~ tr!/!/!) + 1); + $pfx = ascii_html($pfx); + $pfx = qr/\A\Q$pfx\E/i; + my $tmp = $top; + $top = ''; + my ($s, $u); + my $q_val = delete($ctx->{-q_value_html}) // ''; + $q_val = qq(\nvalue="$q_val") if $q_val ne ''; + for (@url) { + $u = $s = ascii_html($_); + substr($u, 0, 0, $iupfx) if $u !~ m!://!; + $s =~ s!$pfx!!; + $s =~ s!/\z!!; + $top .= qq{<form\naction="$u"><pre>$tmp} . + qq{<input\nname=q type=text$q_val />} . + qq{<input type=submit\n} . + qq{value="search mail in `$s'"/>} . + q{</pre></form>}; + $tmp = ''; + } + } else { + $top = "<pre>$top</pre>"; + } + "<html><head><title>$title</title>" . + qq(<link\nrel=alternate\ntitle="Atom feed"\n). + qq(href="$atom"\ntype="application/atom+xml"/>) . + $ctx->{www}->style($upfx) . + '</head><body>'.$top; } sub html_top ($) { my ($ctx) = @_; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx} // return html_repo_top($ctx); my $desc = ascii_html($ibx->description); my $title = delete($ctx->{-title_html}) // $desc; my $upfx = $ctx->{-upfx} || ''; - my $help = $upfx.'_/text/help'; - my $color = $upfx.'_/text/color'; my $atom = $ctx->{-atom} || $upfx.'new.atom'; my $top = "<b>$desc</b>"; if (my $t_max = $ctx->{-t_max}) { @@ -49,12 +104,18 @@ sub html_top ($) { # we had some kind of query, link to /$INBOX/?t=YYYYMMDDhhmmss } elsif ($ctx->{qp}->{t}) { $top = qq(<a\nhref="./">$top</a>); + } elsif (length($upfx)) { + $top = qq(<a\nhref="$upfx">$top</a>); } - my $links = qq(<a\nhref="$help">help</a> / ). - qq(<a\nhref="$color">color</a> / ). - qq(<a\nhref=#mirror>mirror</a> / ). + my $code = $ibx->{coderepo} ? qq( / <a\nhref=#code>code</a>) : ''; + # id=mirror must exist for legacy bookmarks + my $links = qq(<a\nhref="${upfx}_/text/help/">help</a> / ). + qq(<a\nhref="${upfx}_/text/color/">color</a> / ). + qq(<a\nid=mirror) . + qq(\nhref="${upfx}_/text/mirror/">mirror</a>$code / ). qq(<a\nhref="$atom">Atom feed</a>); - if ($ibx->search) { + $links .= delete($ctx->{-html_more_links}) if $ctx->{-html_more_links}; + if ($ibx->isrch) { my $q_val = delete($ctx->{-q_value_html}) // ''; $q_val = qq(\nvalue="$q_val") if $q_val ne ''; # XXX gross, for SearchView.pm @@ -76,106 +137,71 @@ sub html_top ($) { '</head><body>'. $top . (delete($ctx->{-html_tip}) // ''); } +sub inboxes { () } # TODO + sub coderepos ($) { my ($ctx) = @_; - my $ibx = $ctx->{-inbox}; - my @ret; - if (defined(my $cr = $ibx->{coderepo})) { - my $cfg = $ctx->{www}->{pi_config}; - my $env = $ctx->{env}; - for my $cr_name (@$cr) { - my $urls = $cfg->{"coderepo.$cr_name.cgiturl"}; - if ($urls) { - $ret[0] //= <<EOF; -code repositories for the project(s) associated with this inbox: -EOF - $ret[0] .= "\n\t".prurl($env, $_) for @$urls; - } + $ctx->{ibx} // return inboxes($ctx); + my $cr = $ctx->{ibx}->{coderepo} // return (); + my $upfx = ($ctx->{-upfx} // ''). '../'; + my $pfx = $ctx->{base_url} //= $ctx->base_url; + my $up = $upfx =~ tr!/!/!; + $pfx =~ s!/[^/]+\z!/! for (1..$up); + $pfx .= '/' if substr($pfx, -1, 1) ne '/'; + my $buf = '<a id=code>' . + 'Code repositories for project(s) associated with this '. + $ctx->{ibx}->thing_type . "\n"; + for my $git (@{$ctx->{www}->{pi_cfg}->repo_objs($ctx->{ibx})}) { + for ($git->pub_urls($ctx->{env})) { + my $u = m!\A(?:[a-z\+]+:)?//!i ? $_ : $pfx.$_; + $u = ascii_html(prurl($ctx->{env}, $u)); + $buf .= qq(\n\t<a\nhref="$u">$u</a>); } } - @ret; # may be empty -} - -sub code_footer ($) { - my ($env) = @_; - my $u = prurl($env, $CODE_URL); - qq(AGPL code for this site: git clone <a\nhref="$u">$u</a>) + ($buf); } sub _html_end { my ($ctx) = @_; - my $urls = <<EOF; -<a -id=mirror>This inbox may be cloned and mirrored by anyone:</a> + my $upfx = $ctx->{-upfx} || ''; + my $m = "${upfx}_/text/mirror/"; + my $x = ''; + if ($ctx->{ibx} && $ctx->{ibx}->can('cloneurl')) { + $x = <<EOF; +This is a public inbox, see <a +href="$m">mirroring instructions</a> +for how to clone and mirror all data and code used for this inbox EOF - - my $ibx = $ctx->{-inbox}; - my $desc = ascii_html($ibx->description); - - my @urls; - my $http = $ctx->{base_url}; - my $max = $ibx->max_git_epoch; - my $dir = (split(m!/!, $http))[-1]; - my %seen = ($http => 1); - if (defined($max)) { # v2 - for my $i (0..$max) { - # old epochs my be deleted: - -d "$ibx->{inboxdir}/git/$i.git" or next; - my $url = "$http/$i"; - $seen{$url} = 1; - push @urls, "$url $dir/git/$i.git"; - } - my $nr = scalar(@urls); - if ($nr > 1) { - $urls .= "\n\t# this inbox consists of $nr epochs:"; - $urls[0] .= "\t# oldest"; - $urls[-1] .= "\t# newest"; + my $has_nntp = @{$ctx->{ibx}->nntp_url($ctx)}; + my $has_imap = @{$ctx->{ibx}->imap_url($ctx)}; + if ($has_nntp || $has_imap) { + substr($x, -1, 1) = ";\n"; # s/\n/;\n + if ($has_nntp && $has_imap) { + $x .= <<EOM; +as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s). +EOM + } elsif ($has_nntp) { + $x .= <<EOM; +as well as URLs for NNTP newsgroup(s). +EOM + } else { + $x .= <<EOM; +as well as URLs for IMAP folder(s). +EOM + } } - } else { # v1 - push @urls, $http; - } - - # FIXME: epoch splits can be different in other repositories, - # use the "cloneurl" file as-is for now: - foreach my $u (@{$ibx->cloneurl}) { - next if $seen{$u}++; - push @urls, $u =~ /\Ahttps?:/ ? qq(<a\nhref="$u">$u</a>) : $u; - } - - $urls .= "\n" . join('', map { "\tgit clone --mirror $_\n" } @urls); - my $addrs = $ibx->{address}; - $addrs = join(' ', @$addrs) if ref($addrs) eq 'ARRAY'; - my $v = defined $max ? '-V2' : '-V1'; - $urls .= <<EOF; - - # If you have public-inbox 1.1+ installed, you may - # initialize and index your mirror using the following commands: - public-inbox-init $v $ibx->{name} $dir/ $http \\ - $addrs - public-inbox-index $dir -EOF - my $cfg_link = ($ctx->{-upfx} // '').'_/text/config/raw'; - $urls .= <<EOF; - -Example <a -href="$cfg_link">config snippet</a> for mirrors. + } elsif ($ctx->{ibx}) { # extindex + $x = <<EOF; +This is an external index of several public inboxes, +see <a href="$m">mirroring instructions</a> on how to clone and mirror +all data and code used by this external index. EOF - my @nntp = map { qq(<a\nhref="$_">$_</a>) } @{$ibx->nntp_url}; - if (@nntp) { - $urls .= @nntp == 1 ? 'Newsgroup' : 'Newsgroups are'; - $urls .= ' available over NNTP:'; - $urls .= "\n\t" . join("\n\t", @nntp) . "\n"; + } elsif ($ctx->{git}) { # coderepo + $x = join('', map { "git clone $_\n" } + @{$ctx->{git}->cloneurl($ctx->{env})}); } - if ($urls =~ m!\b[^:]+://\w+\.onion/!) { - $urls .= " note: .onion URLs require Tor: "; - $urls .= qq[<a\nhref="$TOR_URL">$TOR_URL</a>]; - } - '<hr><pre>'.join("\n\n", - $desc, - $urls, - coderepos($ctx), - code_footer($ctx->{env}) - ).'</pre></body></html>'; + chomp $x; + '<hr><pre>'.join("\n\n", coderepos($ctx), $x).'</pre></body></html>' } # callback for HTTP.pm (and any other PSGI servers) @@ -184,7 +210,7 @@ sub getline { my $cb = $ctx->{cb} or return; while (defined(my $x = $cb->($ctx))) { # x = smsg or scalar non-ref if (ref($x)) { # smsg - my $eml = $ctx->{-inbox}->smsg_eml($x) or next; + my $eml = $ctx->{ibx}->smsg_eml($x) or next; $ctx->{smsg} = $x; return $ctx->translate($cb->($ctx, $eml)); } else { # scalar @@ -195,19 +221,27 @@ sub getline { $ctx->zflush(_html_end($ctx)); } -sub html_oneshot ($$;$) { - my ($ctx, $code, $sref) = @_; +sub html_done ($;@) { + my $ctx = $_[0]; + my $bdy = $ctx->zflush(@_[1..$#_], _html_end($ctx)); + my $res_hdr = delete $ctx->{-res_hdr}; + push @$res_hdr, 'Content-Length', length($bdy); + [ 200, $res_hdr, [ $bdy ] ] +} + +sub html_oneshot ($$;@) { + my ($ctx, $code) = @_[0, 1]; my $res_hdr = [ 'Content-Type' => 'text/html; charset=UTF-8', 'Content-Length' => undef ]; bless $ctx, __PACKAGE__; $ctx->{gz} = PublicInbox::GzipFilter::gz_or_noop($res_hdr, $ctx->{env}); - $ctx->{base_url} //= do { - $ctx->zmore(html_top($ctx)); - base_url($ctx); + my @top; + $ctx->{base_url} // do { + @top = html_top($ctx); + $ctx->{base_url} = base_url($ctx); }; - $ctx->zmore($$sref) if $sref; - my $bdy = $ctx->zflush(_html_end($ctx)); - $res_hdr->[3] = bytes::length($bdy); + my $bdy = $ctx->zflush(@top, @_[2..$#_], _html_end($ctx)); + $res_hdr->[3] = length($bdy); [ $code, $res_hdr, [ $bdy ] ] } @@ -218,8 +252,7 @@ sub async_next ($) { if (my $smsg = $ctx->{smsg} = $ctx->{cb}->($ctx)) { $ctx->smsg_blob($smsg); } else { - $ctx->{http_out}->write( - $ctx->translate(_html_end($ctx))); + $ctx->write(_html_end($ctx)); $ctx->close; # GzipFilter->close } }; @@ -227,10 +260,23 @@ sub async_next ($) { } sub aresponse { - my ($ctx, $code, $cb) = @_; - my $res_hdr = [ 'Content-Type' => 'text/html; charset=UTF-8' ]; + my ($ctx, $cb) = @_; init($ctx, $cb); - $ctx->psgi_response($code, $res_hdr); + $ctx->psgi_response(200, delete $ctx->{-res_hdr}); +} + +sub html_init { + my $ctx = $_[-1]; + $ctx->{base_url} = base_url($ctx); + my $h = $ctx->{-res_hdr} = ['Content-Type', 'text/html; charset=UTF-8']; + $ctx->{gz} = PublicInbox::GzipFilter::gz_or_noop($h, $ctx->{env}); + bless $ctx, @_ > 1 ? $_[0] : __PACKAGE__; + print { $ctx->zfh } html_top($ctx); +} + +sub DESTROY { + my ($ctx) = @_; + $ctx->{git}->cleanup if $ctx->{git} && $ctx->{git}->{-tmp}; } 1; diff --git a/lib/PublicInbox/WwwText.pm b/lib/PublicInbox/WwwText.pm index 04c9b1c4..5e23005e 100644 --- a/lib/PublicInbox/WwwText.pm +++ b/lib/PublicInbox/WwwText.pm @@ -1,14 +1,14 @@ -# Copyright (C) 2016-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # used for displaying help texts and other non-mail content package PublicInbox::WwwText; use strict; -use warnings; -use bytes (); # only for bytes::length +use v5.10.1; use PublicInbox::Linkify; use PublicInbox::WwwStream; -use PublicInbox::Hval qw(ascii_html); +use PublicInbox::Hval qw(ascii_html prurl fmt_ts); +use HTTP::Date qw(time2str); use URI::Escape qw(uri_escape_utf8); use PublicInbox::GzipFilter qw(gzf_maybe); our $QP_URL = 'https://xapian.org/docs/queryparser.html'; @@ -24,32 +24,29 @@ sub get_text { my ($ctx, $key) = @_; my $code = 200; - $key = 'help' if !defined $key; # this 302s to _/text/help/ + $key //= 'help'; # this 302s to _/text/help/ # get the raw text the same way we get mboxrds my $raw = ($key =~ s!/raw\z!!); my $have_tslash = ($key =~ s!/\z!!) if !$raw; my $txt = ''; - my $hdr = [ 'Content-Type', 'text/plain', 'Content-Length', undef ]; - if (!_default_text($ctx, $key, $hdr, \$txt)) { + if (!_default_text($ctx, $key, \$txt)) { $code = 404; $txt = "404 Not Found ($key)\n"; } my $env = $ctx->{env}; if ($raw) { - if ($code == 200) { - my $gzf = gzf_maybe($hdr, $env); - $txt = $gzf->translate($txt); - $txt .= $gzf->zflush; - } - $hdr->[3] = bytes::length($txt); - return [ $code, $hdr, [ $txt ] ] + my $h = delete $ctx->{-res_hdr}; + $txt = gzf_maybe($h, $env)->zflush($txt) if $code == 200; + push @$h, 'Content-Type', 'text/plain', + 'Content-Length', length($txt); + return [ $code, $h, [ $txt ] ] } # enforce trailing slash for "wget -r" compatibility if (!$have_tslash && $code == 200) { - my $url = $ctx->{-inbox}->base_url($env); + my $url = $ctx->{ibx}->base_url($env); $url .= "_/text/$key/"; return [ 302, [ 'Content-Type', 'text/plain', @@ -71,14 +68,18 @@ sub get_text { $txt = ascii_html($txt); } $txt = '<pre>' . $l->linkify_2($txt) . '</pre>'; - PublicInbox::WwwStream::html_oneshot($ctx, $code, \$txt); + $txt =~ s!^search$!<a\nid=search>search</a>!sm; + $txt =~ s!\bPOP3\b!<a\nid=pop3>POP3</a>!; + $txt =~ s!\b(Newsgroups?)\b!<a\nid=nntp>$1</a>!; + $txt =~ s!\bIMAP\b!<a\nid=imap>IMAP</a>!; + PublicInbox::WwwStream::html_oneshot($ctx, $code, $txt); } sub _srch_prefix ($$) { - my ($srch, $txt) = @_; + my ($ibx, $txt) = @_; my $pad = 0; my $htxt = ''; - my $help = $srch->help; + my $help = $ibx->isrch->help; my $i; for ($i = 0; $i < @$help; $i += 2) { my $pfx = $help->[$i]; @@ -89,10 +90,9 @@ sub _srch_prefix ($$) { $htxt .= "\f\n"; } $pad += 2; - my $padding = ' ' x ($pad + 8); + my $padding = ' ' x ($pad + 4); $htxt =~ s/^/$padding/gms; - $htxt =~ s/^$padding(\S+)\0/" $1". - (' ' x ($pad - length($1)))/egms; + $htxt =~ s/^$padding(\S+)\0/" $1".(' ' x ($pad - length($1)))/egms; $htxt =~ s/\f\n/\n/gs; $$txt .= $htxt; 1; @@ -100,7 +100,7 @@ sub _srch_prefix ($$) { sub _colors_help ($$) { my ($ctx, $txt) = @_; - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $env = $ctx->{env}; my $base_url = $ibx->base_url($env); $$txt .= "color customization for $base_url\n"; @@ -113,7 +113,7 @@ Users of browsers such as dillo, Firefox, or some browser extensions may start by downloading the following sample CSS file to control the colors they see: - ${base_url}userContent.css + ${base_url}userContent.css CSS sample ---------- @@ -130,13 +130,51 @@ sub dq_escape ($) { $name; } -sub URI_PATH () { '^A-Za-z0-9\-\._~/' } +sub _coderepo_config ($$) { + my ($ctx, $txt) = @_; + my $cr = $ctx->{ibx}->{coderepo} // return; + # note: this doesn't preserve cgitrc layout, since we parse cgitrc + # and drop the original structure + $$txt .= "\tcoderepo = $_\n" for @$cr; + $$txt .= <<'EOF'; + +; `coderepo' entries allows blob reconstruction via patch emails if +; the inbox is indexed with Xapian. `@@ <from-range> <to-range> @@' +; line number ranges in `[PATCH]' emails link to /$INBOX_NAME/$OID/s/, +; an HTTP endpoint which reconstructs git blobs via git-apply(1). +EOF + my $pi_cfg = $ctx->{www}->{pi_cfg}; + for my $cr_name (@$cr) { + my $urls = $pi_cfg->get_all("coderepo.$cr_name.cgiturl"); + my $path = "/path/to/$cr_name"; + $cr_name = dq_escape($cr_name); + + $$txt .= qq([coderepo "$cr_name"]\n); + if ($urls && scalar(@$urls)) { + $$txt .= "\t; "; + $$txt .= join(" ||\n\t;\t", map {; + my $dst = $path; + if ($path !~ m![a-z0-9_/\.\-]!i) { + $dst = '"'.dq_escape($dst).'"'; + } + qq(git clone $_ $dst); + } @$urls); + $$txt .= "\n"; + } + $$txt .= "\tdir = $path\n"; + $$txt .= "\tcgiturl = https://example.com/"; + $$txt .= uri_escape_utf8($cr_name, '^A-Za-z0-9\-\._~/')."\n"; + } +} # n.b. this is a perfect candidate for memoization -sub inbox_config ($$$) { - my ($ctx, $hdr, $txt) = @_; - my $ibx = $ctx->{-inbox}; - push @$hdr, 'Content-Disposition', 'inline; filename=inbox.config'; +sub inbox_config ($$) { + my ($ctx, $txt) = @_; + my $ibx = $ctx->{ibx}; + push @{$ctx->{-res_hdr}}, + 'Content-Disposition', 'inline; filename=inbox.config'; + my $t = eval { $ibx->mm->created_at }; + push(@{$ctx->{-res_hdr}}, 'Last-Modified', time2str($t)) if $t; my $name = dq_escape($ibx->{name}); my $inboxdir = '/path/to/top-level-inbox'; my $base_url = $ibx->base_url($ctx->{env}); @@ -165,7 +203,7 @@ EOS ; gzip(1), and sqlite3(1) as documented: EOF for (sort keys %$altid_map) { - $$txt .= "\t;\tcurl -XPOST $base_url$_.sql.gz | \\\n" . + $$txt .= "\t;\tcurl -d '' $base_url$_.sql.gz | \\\n" . "\t;\tgzip -dc | \\\n" . "\t;\tsqlite3 $inboxdir/$_.sqlite3\n"; $$txt .= "\taltid = serial:$_:file=$_.sqlite3\n"; @@ -176,170 +214,328 @@ EOF defined(my $v = $ibx->{$k}) or next; $$txt .= "\t$k = $v\n"; } - $$txt .= "\tnntpmirror = $_\n" for (@{$ibx->nntp_url}); + $$txt .= "\timapmirror = $_\n" for (@{$ibx->imap_url($ctx)}); + $$txt .= "\tnntpmirror = $_\n" for (@{$ibx->nntp_url($ctx)}); + _coderepo_config($ctx, $txt); + 1; +} - # note: this doesn't preserve cgitrc layout, since we parse cgitrc - # and drop the original structure - if (defined(my $cr = $ibx->{coderepo})) { - $$txt .= "\tcoderepo = $_\n" for @$cr; - $$txt .= <<'EOF'; +# n.b. this is a perfect candidate for memoization +sub extindex_config ($$) { + my ($ctx, $txt) = @_; + my $ibx = $ctx->{ibx}; + push @{$ctx->{-res_hdr}}, + 'Content-Disposition', 'inline; filename=extindex.config'; + my $name = dq_escape($ibx->{name}); + my $base_url = $ibx->base_url($ctx->{env}); + $$txt .= <<EOS; +; Example public-inbox config snippet for the external index (extindex) at: +; $base_url +; See public-inbox-config(5)manpage for more details: +; https://public-inbox.org/public-inbox-config.html +[extindex "$name"] + topdir = /path/to/extindex-topdir + url = https://example.com/$name/ + url = http://example.onion/$name/ +EOS + for my $k (qw(infourl)) { + defined(my $v = $ibx->{$k}) or next; + $$txt .= "\t$k = $v\n"; + } + _coderepo_config($ctx, $txt); + 1; +} -; `coderepo' entries allows blob reconstruction via patch emails if -; the inbox is indexed with Xapian. `@@ <from-range> <to-range> @@' -; line number ranges in `[PATCH]' emails link to /$INBOX_NAME/$OID/s/, -; an HTTP endpoint which reconstructs git blobs via git-apply(1). -EOF - my $pi_config = $ctx->{www}->{pi_config}; - for my $cr_name (@$cr) { - my $urls = $pi_config->{"coderepo.$cr_name.cgiturl"}; - my $path = "/path/to/$cr_name"; - $cr_name = dq_escape($cr_name); - - $$txt .= qq([coderepo "$cr_name"]\n); - if ($urls && scalar(@$urls)) { - $$txt .= "\t; "; - $$txt .= join(" ||\n\t;\t", map {; - my $dst = $path; - if ($path !~ m![a-z0-9_/\.\-]!i) { - $dst = '"'.dq_escape($dst).'"'; - } - qq(git clone $_ $dst); - } @$urls); - $$txt .= "\n"; +sub coderepos_raw ($$) { + my ($ctx, $top_url) = @_; + my $cfg = $ctx->{www}->{pi_cfg}; + my $cr = $cfg->repo_objs($ctx->{ibx}) or return (); + my $buf = 'Code repositories for project(s) associated with this '. + $ctx->{ibx}->thing_type . ":\n"; + my @recs = PublicInbox::CodeSearch::repos_sorted($cfg, @$cr); + my $cr_score = $ctx->{ibx}->{-cr_score}; + my $env = $ctx->{env}; + for (@recs) { + my ($t, $git) = @$_; + for ($git->pub_urls($env)) { + my $u = m!\A(?:[a-z\+]+:)?//!i ? $_ : $top_url.$_; + my $nr = $cr_score->{$git->{nick}}; + $buf .= "\n"; + $buf .= $nr ? sprintf('% 9u', $nr) : (' 'x9); + $buf .= ' '.fmt_ts($t).' '.prurl($env, $u); + } + } + ($buf); +} + +sub _add_non_http_urls ($$) { + my ($ctx, $txt) = @_; + $ctx->{ibx}->can('nntp_url') or return; # TODO extindex can have IMAP + my $urls = $ctx->{ibx}->imap_url($ctx); + if (@$urls) { + $urls = join("\n ", @$urls); + $urls =~ s!://([^/@]+)/!://;AUTH=ANONYMOUS\@$1/!sg; + $$txt .= <<EOM + +IMAP subfolder(s) are available under: + $urls + # each subfolder (starting with `0') holds 50K messages at most +EOM + } + $urls = $ctx->{ibx}->nntp_url($ctx); + if (@$urls) { + $$txt .= @$urls == 1 ? "\nNewsgroup" : "\nNewsgroups are"; + $$txt .= ' available over NNTP:'; + $$txt .= "\n " . join("\n ", @$urls) . "\n"; + } + $urls = $ctx->{ibx}->pop3_url($ctx); + if (@$urls) { + $urls = join("\n ", @$urls); + $$txt .= <<EOM; + +POP3 access is available: + $urls + +The POP3 password is: anonymous +The POP3 username is: \$(uuidgen)\@$ctx->{ibx}->{newsgroup} +where \$(uuidgen) in the output of the `uuidgen' command on your system. +The UUID in the username functions as a private cookie (don't share it). +By default, only 1000 messages are retrieved. You may download more +by appending `?limit=NUM' (without quotes) to the username, where +`NUM' is an integer between 1 and 50000. +Idle accounts will expire periodically. +EOM + } +} + +sub _add_onion_note ($) { + my ($txt) = @_; + $$txt =~ m!\b[^:]+://\w+\.onion/!i and $$txt .= <<EOM + +note: .onion URLs require Tor: https://www.torproject.org/ + +EOM +} + +sub _mirror_help ($$) { + my ($ctx, $txt) = @_; + my $ibx = $ctx->{ibx}; + my $base_url = $ibx->base_url($ctx->{env}); + chop $base_url; # no trailing slash for "git clone" + my $dir = (split(m!/!, $base_url))[-1]; + my %seen = ($base_url => 1); + my $top_url = $base_url; + $top_url =~ s!/[^/]+\z!/!; + $$txt .= "public-inbox mirroring instructions\n\n"; + if ($ibx->can('cloneurl')) { # PublicInbox::Inbox + $$txt .= + "This public inbox may be cloned and mirrored by anyone:\n"; + my @urls; + my $max = $ibx->max_git_epoch; + # TODO: some of these URLs may be too long and we may need to + # do something like code_footer() above, but these are local + # admin-defined + if (defined($max)) { # v2 + for my $i (0..$max) { + # old epochs my be deleted: + -d "$ibx->{inboxdir}/git/$i.git" or next; + my $url = "$base_url/$i"; + $seen{$url} = 1; + push @urls, "$url $dir/git/$i.git"; + } + my $nr = scalar(@urls); + if ($nr > 1) { + chomp($$txt .= <<EOM); + + # this inbox consists of $nr epochs: (no need to clone all of them) +EOM + $urls[0] .= " # oldest"; + $urls[-1] .= " # newest"; } - $$txt .= "\tdir = $path\n"; - $$txt .= "\tcgiturl = https://example.com/"; - $$txt .= uri_escape_utf8($cr_name, URI_PATH)."\n"; + } else { # v1 + push @urls, $base_url; + } + # FIXME: epoch splits can be different in other repositories, + # use the "cloneurl" file as-is for now: + for my $u (@{$ibx->cloneurl}) { + next if $seen{$u}++; + push @urls, $u; + } + $$txt .= "\n"; + $$txt .= join('', map { " git clone --mirror $_\n" } @urls); + my $addrs = $ibx->{address} // 'inbox@example.com'; + my $ng = $ibx->{newsgroup} // ''; + substr($ng, 0, 0, ' --ng ') if $ng; + $addrs = join(' ', @$addrs) if ref($addrs) eq 'ARRAY'; + my $v = defined $max ? '-V2' : '-V1'; + $$txt .= <<EOF; + + # If you have public-inbox 1.1+ installed, you may + # initialize and index your mirror using the following commands: + public-inbox-init $v$ng \\ + $ibx->{name} ./$dir $base_url \\ + $addrs + public-inbox-index ./$dir +EOF + } else { # PublicInbox::ExtSearch + $$txt .= <<EOM; +This is an external index which is an amalgamation of several public inboxes. +Each public inbox needs to be mirrored individually. +EOM + my $v = $ctx->{www}->{pi_cfg}->{lc('publicInbox.wwwListing')}; + if (($v // '') =~ /\A(?:all|match=domain)\z/) { + $$txt .= <<EOM; +A list of them is available at $top_url +EOM } } + my $cfg_link = "$base_url/_/text/config/raw"; + $$txt .= <<EOF; + +Example config snippet for mirrors: $cfg_link +EOF + _add_non_http_urls($ctx, $txt); + _add_onion_note($txt); + + my $code_url = prurl($ctx->{env}, $PublicInbox::WwwStream::CODE_URL); + $$txt .= join("\n\n", + coderepos_raw($ctx, $top_url), # may be empty + "AGPL code for this site:\n git clone $code_url"); 1; } -sub _default_text ($$$$) { - my ($ctx, $key, $hdr, $txt) = @_; - return _colors_help($ctx, $txt) if $key eq 'color'; - return inbox_config($ctx, $hdr, $txt) if $key eq 'config'; +sub _default_text ($$$) { + my ($ctx, $key, $txt) = @_; + if ($key eq 'mirror') { + return _mirror_help($ctx, $txt); + } elsif ($key eq 'color') { + return _colors_help($ctx, $txt); + } elsif ($key eq 'config') { + return $ctx->{ibx}->can('cloneurl') ? + inbox_config($ctx, $txt) : + extindex_config($ctx, $txt); + } return if $key ne 'help'; # TODO more keys? - my $ibx = $ctx->{-inbox}; + my $ibx = $ctx->{ibx}; my $base_url = $ibx->base_url($ctx->{env}); - $$txt .= "public-inbox help for $base_url\n"; $$txt .= <<EOF; +public-inbox help for $base_url overview -------- - public-inbox uses Message-ID identifiers in URLs. - One may look up messages by substituting Message-IDs - (without the leading '<' or trailing '>') into the URL. - Forward slash ('/') characters in the Message-IDs - need to be escaped as "%2F" (without quotes). + public-inbox uses Message-ID identifiers in URLs. + One may look up messages by substituting Message-IDs + (without the leading '<' or trailing '>') into the URL. + Forward slash ('/') characters in the Message-IDs + need to be escaped as "%2F" (without quotes). - Thus, it is possible to retrieve any message by its - Message-ID by going to: + Thus, it is possible to retrieve any message by its + Message-ID by going to: - $base_url<Message-ID>/ + $base_url<Message-ID>/ + (without the '<' or '>') - (without the '<' or '>') + Message-IDs are described at: - Message-IDs are described at: - - $WIKI_URL/Message-ID + $WIKI_URL/Message-ID EOF # n.b. we use the Xapian DB for any regeneratable, # order-of-arrival-independent data. - my $srch = $ibx->search; - if ($srch) { + if ($ibx->isrch) { $$txt .= <<EOF; search ------ - This public-inbox has search functionality provided by Xapian. + This public-inbox has search functionality provided by Xapian. - It supports typical AND, OR, NOT, '+', '-' queries present - in other search engines. + It supports typical AND, OR, NOT, '+', '-' queries present + in other search engines. - We also support search prefixes to limit the scope of the - search to certain fields. + We also support search prefixes to limit the scope of the + search to certain fields. - Prefixes supported in this installation include: + Prefixes supported in this installation include: EOF - _srch_prefix($srch, $txt); - + _srch_prefix($ibx, $txt); $$txt .= <<EOF; - Most prefixes are probabilistic, meaning they support stemming - and wildcards ('*'). Ranges (such as 'd:') and boolean prefixes - do not support stemming or wildcards. - The upstream Xapian query parser documentation fully explains - the query syntax: + Most prefixes are probabilistic, meaning they support stemming + and wildcards ('*'). Ranges (such as 'd:') and boolean prefixes + do not support stemming or wildcards. + The upstream Xapian query parser documentation fully explains + the query syntax: - $QP_URL + $QP_URL EOF } # $srch - my $over = $ibx->over; - if ($over) { + if ($ibx->over) { $$txt .= <<EOF; message threading ----------------- - Message threading is enabled for this public-inbox, - additional endpoints for message threads are available: + Message threading is enabled for this public-inbox, + additional endpoints for message threads are available: - * $base_url<Message-ID>/T/#u + * $base_url<Message-ID>/T/#u - Loads the thread belonging to the given <Message-ID> - in flat chronological order. The "#u" anchor - focuses the browser on the given <Message-ID>. + Loads the thread belonging to the given <Message-ID> + in flat chronological order. The "#u" anchor + focuses the browser on the given <Message-ID>. - * $base_url<Message-ID>/t/#u + * $base_url<Message-ID>/t/#u - Loads the thread belonging to the given <Message-ID> - in threaded order with nesting. For deep threads, - this requires a wide display or horizontal scrolling. + Loads the thread belonging to the given <Message-ID> + in threaded order with nesting. For deep threads, + this requires a wide display or horizontal scrolling. - Both of these HTML endpoints are suitable for offline reading - using the thread overview at the bottom of each page. + Both of these HTML endpoints are suitable for offline reading + using the thread overview at the bottom of each page. - Users of feed readers may follow a particular thread using: + The gzipped mbox for a thread is available for downloading and + importing into your favorite mail client: - * $base_url<Message-ID>/t.atom + * $base_url<Message-ID>/t.mbox.gz - Which loads the thread in Atom Syndication Standard - described at Wikipedia and RFC4287: + We use the mboxrd variant of the mbox format described at: - $WIKI_URL/Atom_(standard) - https://tools.ietf.org/html/rfc4287 + $WIKI_URL/Mbox - Atom Threading Extensions (RFC4685) is supported: + Users of feed readers may follow a particular thread using: - https://tools.ietf.org/html/rfc4685 + * $base_url<Message-ID>/t.atom - Finally, the gzipped mbox for a thread is available for - downloading and importing into your favorite mail client: + Which loads the thread in Atom Syndication Standard + described at Wikipedia and RFC4287: - * $base_url<Message-ID>/t.mbox.gz + $WIKI_URL/Atom_(standard) + https://tools.ietf.org/html/rfc4287 - We use the mboxrd variant of the mbox format described - at: + Atom Threading Extensions (RFC4685) are supported: - $WIKI_URL/Mbox + https://tools.ietf.org/html/rfc4685 EOF } # $over + _add_non_http_urls($ctx, \(my $note = '')); + $note and $note =~ s/^/ /gms and $$txt .= <<EOF; +additional protocols +-------------------- +$note +EOF $$txt .= <<EOF; contact ------- - This help text is maintained by public-inbox developers - reachable via plain-text email at: meta\@public-inbox.org - Their inbox is archived at: https://public-inbox.org/meta/ - + This help text is maintained by public-inbox developers + reachable via plain-text email at: meta\@public-inbox.org + Their inbox is archived at: https://public-inbox.org/meta/ EOF # TODO: support admin contact info in ~/.public-inbox/config 1; diff --git a/lib/PublicInbox/WwwTopics.pm b/lib/PublicInbox/WwwTopics.pm new file mode 100644 index 00000000..9d270732 --- /dev/null +++ b/lib/PublicInbox/WwwTopics.pm @@ -0,0 +1,85 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +package PublicInbox::WwwTopics; +use v5.12; +use PublicInbox::Hval qw(ascii_html mid_href fmt_ts); + +sub add_topic_html ($$) { + my (undef, $smsg) = @_; + my $s = ascii_html($smsg->{subject}); + $s = '(no subject)' if $s eq ''; + $_[0] .= "\n".fmt_ts($smsg->{ds}) . + qq{ <a\nhref="}.mid_href($smsg->{mid}).qq{/#r">$s</a>}; + my $nr = $smsg->{'COUNT(num)'}; + $_[0] .= " $nr+ messages" if $nr > 1; +} + +# n.b. the `SELECT DISTINCT(tid)' subquery is critical for performance +# with giant inboxes and extindices +sub topics_new ($) { + $_[0]->do_get(<<EOS); +SELECT ds,ddd,COUNT(num) FROM over WHERE tid IN +(SELECT DISTINCT(tid) FROM over WHERE tid > 0 ORDER BY ts DESC LIMIT 200) +AND +num > 0 +GROUP BY tid +ORDER BY ds ASC +EOS +} + +sub topics_active ($) { + $_[0]->do_get(<<EOS); +SELECT ddd,MAX(ds) as ds,COUNT(num) FROM over WHERE tid IN +(SELECT DISTINCT(tid) FROM over WHERE tid > 0 ORDER BY ts DESC LIMIT 200) +AND +num > 0 +GROUP BY tid +ORDER BY ds ASC +EOS +} + +sub topics_i { pop @{$_[0]->{msgs}} } + +sub topics_atom { # GET /$INBOX_NAME/topics_(new|active).atom + my ($ctx) = @_; + require PublicInbox::WwwAtomStream; + my ($hdr, $smsg, $val); + PublicInbox::WwwAtomStream->response($ctx, \&topics_i); +} + +sub topics_html { # GET /$INBOX_NAME/topics_(new|active).html + my ($ctx) = @_; + require PublicInbox::WwwStream; + my $buf = '<pre>'; + $ctx->{-html_more_links} = qq{\n- recent:[<a +href="./">subjects (threaded)</a>|}; + + if ($ctx->{topic_category} eq 'new') { + $ctx->{-html_more_links} .= qq{<b>topics (new)</b>|<a +href="./topics_active.html">topics (active)</a>]}; + } else { # topic_category eq "active" - topics with recent replies + $ctx->{-html_more_links} .= qq{<a +href="./topics_new.html">topics (new)</a>|<b>topics (active)</b>]}; + } + # can't use SQL to filter references since our schema wasn't designed + # for it, but our SQL sorts by ascending time to favor top-level + # messages while our final result (post-references filter) favors + # recent messages + my $msgs = delete $ctx->{msgs}; + add_topic_html($buf, pop @$msgs) while scalar(@$msgs); + $buf .= '</pre>'; + PublicInbox::WwwStream::html_oneshot($ctx, 200, $buf); +} + +sub response { + my ($ctx, $ibx_name, $category, $type) = @_; + my ($ret, $over); + $ret = PublicInbox::WWW::invalid_inbox($ctx, $ibx_name) and return $ret; + $over = $ctx->{ibx}->over or + return PublicInbox::WWW::need($ctx, 'Overview', './'); + $ctx->{msgs} = $category eq 'new' ? topics_new($over) : + topics_active($over); + $ctx->{topic_category} = $category; + $type eq 'atom' ? topics_atom($ctx) : topics_html($ctx); +} + +1; diff --git a/lib/PublicInbox/XapClient.pm b/lib/PublicInbox/XapClient.pm new file mode 100644 index 00000000..24b3f45e --- /dev/null +++ b/lib/PublicInbox/XapClient.pm @@ -0,0 +1,50 @@ +#!perl -w +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# This talks to (XapHelperCxx.pm + xap_helper.h) or XapHelper.pm +# and will eventually allow users with neither XS nor SWIG Perl +# bindings to use Xapian as long as they have Xapian development +# headers/libs and a C++ compiler +package PublicInbox::XapClient; +use v5.12; +use PublicInbox::Spawn qw(spawn); +use Socket qw(AF_UNIX SOCK_SEQPACKET); +use PublicInbox::IPC; +use autodie qw(pipe socketpair); +our $tries = 50; + +sub mkreq { + my ($self, $ios, @arg) = @_; + my ($r, $n); + pipe($r, $ios->[0]) if !defined($ios->[0]); + my @fds = map fileno($_), @$ios; + my $buf = join("\0", @arg, ''); + $n = $PublicInbox::IPC::send_cmd->($self->{io}, \@fds, $buf, 0, $tries) + // die "send_cmd: $!"; + $n == length($buf) or die "send_cmd: $n != ".length($buf); + $r; +} + +sub start_helper (@) { + $PublicInbox::IPC::send_cmd or return; # can't work w/o SCM_RIGHTS + my @argv = @_; + socketpair(my $sock, my $in, AF_UNIX, SOCK_SEQPACKET, 0); + my $cls = 'PublicInbox::XapHelperCxx'; + my $env; + my $cmd = eval "require $cls; ${cls}::cmd()"; + if ($@) { # fall back to Perl + XS|SWIG + $cls = 'PublicInbox::XapHelper'; + # ensure the child process has the same @INC we do: + $env = { PERL5LIB => join(':', @INC) }; + $cmd = [$^X, ($^W ? ('-w') : ()), "-M$cls", '-e', + $cls.'::start(@ARGV)', '--' ]; + } + push @$cmd, @argv; + my $pid = spawn($cmd, $env, { 0 => $in }); + my $self = bless { io => $sock, impl => $cls }, __PACKAGE__; + PublicInbox::IO::attach_pid($sock, $pid); + $self; +} + +1; diff --git a/lib/PublicInbox/XapHelper.pm b/lib/PublicInbox/XapHelper.pm new file mode 100644 index 00000000..c9957f64 --- /dev/null +++ b/lib/PublicInbox/XapHelper.pm @@ -0,0 +1,340 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Perl + SWIG||XS implementation if XapHelperCxx / xap_helper.h isn't usable. +package PublicInbox::XapHelper; +use v5.12; +use Getopt::Long (); # good API even if we only use short options +our $GLP = Getopt::Long::Parser->new; +$GLP->configure(qw(require_order bundling no_ignore_case no_auto_abbrev)); +use PublicInbox::Search qw(xap_terms); +use PublicInbox::CodeSearch; +use PublicInbox::IPC; +use PublicInbox::IO qw(read_all); +use Socket qw(SOL_SOCKET SO_TYPE SOCK_SEQPACKET AF_UNIX); +use PublicInbox::DS qw(awaitpid); +use autodie qw(open getsockopt); +use POSIX qw(:signal_h); +use Fcntl qw(LOCK_UN LOCK_EX); +use Carp qw(croak); +my $X = \%PublicInbox::Search::X; +our (%SRCH, %WORKERS, $nworker, $workerset, $in); +our $stderr = \*STDERR; + +sub cmd_test_inspect { + my ($req) = @_; + print { $req->{0} } "pid=$$ has_threadid=", + ($req->{srch}->has_threadid ? 1 : 0) +} + +sub cmd_test_sleep { select(undef, undef, undef, 0.01) while 1 } + +sub iter_retry_check ($) { + if (ref($@) =~ /\bDatabaseModifiedError\b/) { + $_[0]->{srch}->reopen; + undef; # retries + } elsif (ref($@) =~ /\bDocNotFoundError\b/) { + warn "doc not found: $@"; + 0; # continue to next doc + } else { + die; + } +} + +sub term_length_extract ($) { + my ($req) = @_; + @{$req->{A_len}} = map { + my $len = s/([0-9]+)\z// ? ($1 + 0) : undef; + [ $_, $len ]; + } @{$req->{A}}; +} + +sub dump_ibx_iter ($$$) { + my ($req, $ibx_id, $it) = @_; + my $out = $req->{0}; + eval { + my $doc = $it->get_document; + for my $pair (@{$req->{A_len}}) { + my ($pfx, $len) = @$pair; + my @t = xap_terms($pfx, $doc); + @t = grep { length == $len } @t if defined($len); + for (@t) { + print $out "$_ $ibx_id\n" or die "print: $!"; + ++$req->{nr_out}; + } + } + }; + $@ ? iter_retry_check($req) : 0; +} + +sub emit_mset_stats ($$) { + my ($req, $mset) = @_; + my $err = $req->{1} or croak "BUG: caller only passed 1 FD"; + say $err 'mset.size='.$mset->size.' nr_out='.$req->{nr_out} +} + +sub cmd_dump_ibx { + my ($req, $ibx_id, $qry_str) = @_; + $qry_str // die 'usage: dump_ibx [OPTIONS] IBX_ID QRY_STR'; + $req->{A} or die 'dump_ibx requires -A PREFIX'; + term_length_extract $req; + my $max = $req->{'m'} // $req->{srch}->{xdb}->get_doccount; + my $opt = { relevance => -1, limit => $max, offset => $req->{o} // 0 }; + $opt->{eidx_key} = $req->{O} if defined $req->{O}; + my $mset = $req->{srch}->mset($qry_str, $opt); + $req->{0}->autoflush(1); + for my $it ($mset->items) { + for (my $t = 10; $t > 0; --$t) { + $t = dump_ibx_iter($req, $ibx_id, $it) // $t; + } + } + emit_mset_stats($req, $mset); +} + +sub dump_roots_iter ($$$) { + my ($req, $root2off, $it) = @_; + eval { + my $doc = $it->get_document; + my $G = join(' ', map { $root2off->{$_} } xap_terms('G', $doc)); + for my $pair (@{$req->{A_len}}) { + my ($pfx, $len) = @$pair; + my @t = xap_terms($pfx, $doc); + @t = grep { length == $len } @t if defined($len); + for (@t) { + $req->{wbuf} .= "$_ $G\n"; + ++$req->{nr_out}; + } + } + }; + $@ ? iter_retry_check($req) : 0; +} + +sub dump_roots_flush ($$) { + my ($req, $fh) = @_; + if ($req->{wbuf} ne '') { + until (flock($fh, LOCK_EX)) { die "LOCK_EX: $!" if !$!{EINTR} } + print { $req->{0} } $req->{wbuf} or die "print: $!"; + until (flock($fh, LOCK_UN)) { die "LOCK_UN: $!" if !$!{EINTR} } + $req->{wbuf} = ''; + } +} + +sub cmd_dump_roots { + my ($req, $root2off_file, $qry_str) = @_; + $qry_str // die 'usage: dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR'; + $req->{A} or die 'dump_roots requires -A PREFIX'; + term_length_extract $req; + open my $fh, '<', $root2off_file; + my $root2off; # record format: $OIDHEX "\0" uint32_t + my @x = split(/\0/, read_all $fh); + while (defined(my $oidhex = shift @x)) { + $root2off->{$oidhex} = shift @x; + } + my $opt = { relevance => -1, limit => $req->{'m'}, + offset => $req->{o} // 0 }; + my $mset = $req->{srch}->mset($qry_str, $opt); + $req->{0}->autoflush(1); + $req->{wbuf} = ''; + for my $it ($mset->items) { + for (my $t = 10; $t > 0; --$t) { + $t = dump_roots_iter($req, $root2off, $it) // $t; + } + if (!($req->{nr_out} & 0x3fff)) { + dump_roots_flush($req, $fh); + } + } + dump_roots_flush($req, $fh); + emit_mset_stats($req, $mset); +} + +sub mset_iter ($$) { + my ($req, $it) = @_; + say { $req->{0} } $it->get_docid, "\0", + $it->get_percent, "\0", $it->get_rank; +} + +sub cmd_mset { # to be used by WWW + IMAP + my ($req, $qry_str) = @_; + $qry_str // die 'usage: mset [OPTIONS] QRY_STR'; + my $opt = { limit => $req->{'m'}, offset => $req->{o} // 0 }; + $opt->{relevance} = 1 if $req->{r}; + $opt->{threads} = 1 if defined $req->{t}; + $opt->{git_dir} = $req->{g} if defined $req->{g}; + $opt->{eidx_key} = $req->{O} if defined $req->{O}; + $opt->{threadid} = $req->{T} if defined $req->{T}; + my $mset = $req->{srch}->mset($qry_str, $opt); + say { $req->{0} } 'mset.size=', $mset->size, + ' .get_matches_estimated=', $mset->get_matches_estimated; + for my $it ($mset->items) { + for (my $t = 10; $t > 0; --$t) { + $t = mset_iter($req, $it) // $t; + } + } +} + +sub srch_init_extra ($) { + my ($req) = @_; + my $qp = $req->{srch}->{qp}; + for (@{$req->{Q}}) { + my ($upfx, $m, $xpfx) = split /([:=])/; + $xpfx // die "E: bad -Q $_"; + $m = $m eq '=' ? 'add_boolean_prefix' : 'add_prefix'; + $qp->$m($upfx, $xpfx); + } + $req->{srch}->{qp_extra_done} = 1; +} + +sub dispatch { + my ($req, $cmd, @argv) = @_; + my $fn = $req->can("cmd_$cmd") or return; + $GLP->getoptionsfromarray(\@argv, $req, @PublicInbox::Search::XH_SPEC) + or return; + my $dirs = delete $req->{d} or die 'no -d args'; + my $key = join("\0", @$dirs); + my $new; + $req->{srch} = $SRCH{$key} //= do { + $new = { qp_flags => $PublicInbox::Search::QP_FLAGS }; + my $first = shift @$dirs; + my $slow_phrase = -f "$first/iamchert"; + $new->{xdb} = $X->{Database}->new($first); + for (@$dirs) { + $slow_phrase ||= -f "$_/iamchert"; + $new->{xdb}->add_database($X->{Database}->new($_)); + } + $slow_phrase or + $new->{qp_flags} |= PublicInbox::Search::FLAG_PHRASE(); + bless $new, $req->{c} ? 'PublicInbox::CodeSearch' : + 'PublicInbox::Search'; + $new->{qp} = $new->qparse_new; + $new; + }; + $req->{srch}->{xdb}->reopen unless $new; + $req->{Q} && !$req->{srch}->{qp_extra_done} and + srch_init_extra $req; + my $timeo = $req->{K}; + alarm($timeo) if $timeo; + $fn->($req, @argv); + alarm(0) if $timeo; +} + +sub recv_loop { + local $SIG{__WARN__} = sub { print $stderr @_ }; + my $rbuf; + local $SIG{TERM} = sub { undef $in }; + local $SIG{USR1} = \&reopen_logs; + while (defined($in)) { + PublicInbox::DS::sig_setmask($workerset); + my @fds = eval { # we undef $in in SIG{TERM} + $PublicInbox::IPC::recv_cmd->($in, $rbuf, 4096*33) + }; + if ($@) { + exit if !$in; # hit by SIGTERM + die; + } + scalar(@fds) or exit(66); # EX_NOINPUT + die "recvmsg: $!" if !defined($fds[0]); + PublicInbox::DS::block_signals(POSIX::SIGALRM); + my $req = bless {}, __PACKAGE__; + my $i = 0; + open($req->{$i++}, '+<&=', $_) for @fds; + local $stderr = $req->{1} // \*STDERR; + die "not NUL-terminated" if chop($rbuf) ne "\0"; + my @argv = split(/\0/, $rbuf); + $req->{nr_out} = 0; + $req->dispatch(@argv) if @argv; + } +} + +sub reap_worker { # awaitpid CB + my ($pid, $nr) = @_; + delete $WORKERS{$nr}; + if (($? >> 8) == 66) { # EX_NOINPUT + undef $in; + } elsif ($?) { + warn "worker[$nr] died \$?=$?\n"; + } + PublicInbox::DS::requeue(\&start_workers) if $in; +} + +sub start_worker ($) { + my ($nr) = @_; + my $pid = eval { PublicInbox::DS::fork_persist } // return(warn($@)); + if ($pid == 0) { + undef %WORKERS; + $SIG{TTIN} = $SIG{TTOU} = 'IGNORE'; + $SIG{CHLD} = 'DEFAULT'; # Xapian may use this + recv_loop(); + exit(0); + } else { + $WORKERS{$nr} = $pid; + awaitpid($pid, \&reap_worker, $nr); + } +} + +sub start_workers { + for my $nr (grep { !defined($WORKERS{$_}) } (0..($nworker - 1))) { + start_worker($nr) if $in; + } +} + +sub do_sigttou { + if ($in && $nworker > 1) { + --$nworker; + my @nr = grep { $_ >= $nworker } keys %WORKERS; + kill('TERM', @WORKERS{@nr}); + } +} + +sub reopen_logs { + my $p = $ENV{STDOUT_PATH}; + defined($p) && open(STDOUT, '>>', $p) and STDOUT->autoflush(1); + $p = $ENV{STDERR_PATH}; + defined($p) && open(STDERR, '>>', $p) and STDERR->autoflush(1); +} + +sub parent_reopen_logs { + reopen_logs(); + kill('USR1', values %WORKERS); +} + +sub xh_alive { $in || scalar(keys %WORKERS) } + +sub start (@) { + my (@argv) = @_; + my $c = getsockopt(local $in = \*STDIN, SOL_SOCKET, SO_TYPE); + unpack('i', $c) == SOCK_SEQPACKET or die 'stdin is not SOCK_SEQPACKET'; + + local (%SRCH, %WORKERS); + PublicInbox::Search::load_xapian(); + $GLP->getoptionsfromarray(\@argv, my $opt = { j => 1 }, 'j=i') or + die 'bad args'; + local $workerset = POSIX::SigSet->new; + $workerset->fillset or die "fillset: $!"; + for (@PublicInbox::DS::UNBLOCKABLE, POSIX::SIGUSR1) { + $workerset->delset($_) or die "delset($_): $!"; + } + + local $nworker = $opt->{j}; + return recv_loop() if $nworker == 0; + die '-j must be >= 0' if $nworker < 0; + for (POSIX::SIGTERM, POSIX::SIGCHLD) { + $workerset->delset($_) or die "delset($_): $!"; + } + my $sig = { + TTIN => sub { + if ($in) { + ++$nworker; + PublicInbox::DS::requeue(\&start_workers) + } + }, + TTOU => \&do_sigttou, + CHLD => \&PublicInbox::DS::enqueue_reap, + USR1 => \&parent_reopen_logs, + }; + PublicInbox::DS::block_signals(); + start_workers(); + @PublicInbox::DS::post_loop_do = \&xh_alive; + PublicInbox::DS::event_loop($sig); +} + +1; diff --git a/lib/PublicInbox/XapHelperCxx.pm b/lib/PublicInbox/XapHelperCxx.pm new file mode 100644 index 00000000..74852ad1 --- /dev/null +++ b/lib/PublicInbox/XapHelperCxx.pm @@ -0,0 +1,141 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Just-ahead-of-time builder for the lib/PublicInbox/xap_helper.h shim. +# I never want users to be without source code for repairs, so this +# aims to replicate the feel of a scripting language using C++. +# The resulting executable is not linked to Perl in any way. +package PublicInbox::XapHelperCxx; +use v5.12; +use PublicInbox::Spawn qw(run_die run_qx run_wait which); +use PublicInbox::IO qw(try_cat write_file); +use PublicInbox::Search; +use Fcntl qw(SEEK_SET); +use Config; +use autodie; +my $cxx = which($ENV{CXX} // 'c++') // which('clang') // die 'no C++ compiler'; +my $dir = substr("$cxx-$Config{archname}", 1); # drop leading '/' +$dir =~ tr!/!-!; +my $idir; +if ((defined($ENV{XDG_CACHE_HOME}) && -d $ENV{XDG_CACHE_HOME}) || + (defined($ENV{HOME}) && -d $ENV{HOME})) { + $idir = ($ENV{XDG_CACHE_HOME} // + (($ENV{HOME} // die('HOME unset')).'/.cache') + ).'/public-inbox/jaot'; +} +$idir //= $ENV{PERL_INLINE_DIRECTORY} // + die 'HOME and PERL_INLINE_DIRECTORY unset'; +substr($dir, 0, 0) = "$idir/"; +my $bin = "$dir/xap_helper"; +my ($srcpfx) = (__FILE__ =~ m!\A(.+/)[^/]+\z!); +my @srcs = map { $srcpfx.$_ } qw(xh_mset.h xh_cidx.h xap_helper.h); +my @pm_dep = map { $srcpfx.$_ } qw(Search.pm CodeSearch.pm); +my $ldflags = '-Wl,-O1'; +$ldflags .= ' -Wl,--compress-debug-sections=zlib' if $^O ne 'openbsd'; +my $xflags = ($ENV{CXXFLAGS} // '-Wall -ggdb3 -pipe') . ' ' . + ' -DTHREADID=' . PublicInbox::Search::THREADID . + ' -DXH_SPEC="'.join('', + map { s/=.*/:/; $_ } @PublicInbox::Search::XH_SPEC) . '" ' . + ($ENV{LDFLAGS} // $ldflags); +substr($xflags, 0, 0, '-O2 ') if !defined($ENV{CXXFLAGS}) && !-w __FILE__; +my $xap_modversion; + +sub xap_cfg (@) { + my $cmd = [ $ENV{PKG_CONFIG} // 'pkg-config', @_, 'xapian-core' ]; + chomp(my $ret = run_qx($cmd, undef, { 2 => \(my $err) })); + return $ret if !$?; + die <<EOM; +@$cmd failed: Xapian development files missing? (\$?=$?) +$err +EOM +} + +sub needs_rebuild () { + my $prev = try_cat("$dir/XFLAGS") or return 1; + chomp $prev; + return 1 if $prev ne $xflags; + + $prev = try_cat("$dir/xap_modversion") or return 1; + chomp $prev; + + $xap_modversion = xap_cfg('--modversion'); + $xap_modversion ne $prev; +} + +sub build () { + if (!-d $dir) { + require File::Path; + eval { File::Path::make_path($dir) }; + if (!-d $dir && defined($ENV{PERL_INLINE_DIRECTORY})) { + $dir = $ENV{PERL_INLINE_DIRECTORY}; + File::Path::make_path($dir); + } + } + require PublicInbox::CodeSearch; + require PublicInbox::Lock; + my ($prog) = ($bin =~ m!/([^/]+)\z!); + my $lk = PublicInbox::Lock->new("$dir/$prog.lock")->lock_for_scope; + write_file '>', "$dir/$prog.cpp", qq{#include "xap_helper.h"\n}, + PublicInbox::Search::generate_cxx(), + PublicInbox::CodeSearch::generate_cxx(); + + # xap_modversion may be set by needs_rebuild + $xap_modversion //= xap_cfg('--modversion'); + my $fl = xap_cfg(qw(--libs --cflags)); + + # Using rpath seems acceptable/encouraged in the NetBSD packaging world + # since /usr/pkg/lib isn't searched by the dynamic loader by default. + # Not sure if other OSes need this, but rpath seems fine for JAOT + # binaries (like this one) even if other distros discourage it for + # distributed packages. + $^O eq 'netbsd' and $fl =~ s/(\A|[ \t])\-L([^ \t]+)([ \t]|\z)/ + "$1-L$2 -Wl,-rpath=$2$3"/egsx; + my @xflags = split(' ', "$fl $xflags"); # ' ' awk-mode eats leading WS + my @cflags = ('-I', $srcpfx, grep(!/\A-(?:Wl|l|L)/, @xflags)); + run_die([$cxx, '-o', "$dir/$prog.o", '-c', "$dir/$prog.cpp", @cflags]); + + # xapian on Alpine Linux (tested 3.19.0) is linked against libstdc++, + # and clang needs to be told to use it (rather than libc++): + my @try = rindex($cxx, 'clang') >= 0 ? qw(-lstdc++) : (); + my @cmd = ($cxx, '-o', "$dir/$prog.tmp", "$dir/$prog.o", @xflags); + while (run_wait(\@cmd) and @try) { + warn("# attempting to link again with $try[0]...\n"); + push(@cmd, shift(@try)); + } + die "# @cmd failed: \$?=$?" if $?; + unlink "$dir/$prog.cpp", "$dir/$prog.o"; + write_file '>', "$dir/XFLAGS.tmp", $xflags, "\n"; + write_file '>', "$dir/xap_modversion.tmp", $xap_modversion, "\n"; + undef $xap_modversion; # do we ever build() twice? + # not quite atomic, but close enough :P + rename("$dir/$_.tmp", "$dir/$_") for ($prog, qw(XFLAGS xap_modversion)); +} + +sub check_build () { + use Time::HiRes qw(stat); + my $ctime = 0; + my @bin = stat($bin) or return build(); + for (@srcs, @pm_dep) { + my @st = stat($_) or die "stat $_: $!"; + if ($st[10] > $ctime) { + $ctime = $st[10]; + return build() if $ctime > $bin[10]; + } + } + needs_rebuild() ? build() : 0; +} + +# returns spawn arg +sub cmd { + die 'PI_NO_CXX set' if $ENV{PI_NO_CXX}; + check_build(); + my @cmd; + if (my $v = $ENV{VALGRIND}) { + $v = 'valgrind -v' if $v eq '1'; + @cmd = split(/\s+/, $v); + } + push @cmd, $bin; + \@cmd; +} + +1; diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm index 6a74daf9..9a148ae4 100644 --- a/lib/PublicInbox/Xapcmd.pm +++ b/lib/PublicInbox/Xapcmd.pm @@ -1,15 +1,17 @@ -# Copyright (C) 2018-2020 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::Xapcmd; -use strict; -use PublicInbox::Spawn qw(which popen_rd nodatacow_dir); +use v5.12; +use PublicInbox::Spawn qw(which popen_rd); +use PublicInbox::Syscall; use PublicInbox::Admin qw(setup_signals); use PublicInbox::Over; +use PublicInbox::Search qw(xap_terms); use PublicInbox::SearchIdx; use File::Temp 0.19 (); # ->newdir use File::Path qw(remove_tree); -use File::Basename qw(dirname); -use POSIX qw(WNOHANG); +use POSIX qw(WNOHANG _exit); +use PublicInbox::DS; # support testing with dev versions of Xapian which installs # commands with a version number suffix (e.g. "xapian-compact-1.5") @@ -21,13 +23,25 @@ sub commit_changes ($$$$) { my $reshard = $opt->{reshard}; $SIG{INT} or die 'BUG: $SIG{INT} not handled'; - my @old_shard; - my $over_chg; - - while (my ($old, $newdir) = each %$tmp) { + my (@old_shard, $over_chg); + + # Sort shards highest-to-lowest, since ->xdb_shards_flat + # determines the number of shards to load based on the max; + # and we'd rather xdb_shards_flat to momentarily fail rather + # than load out-of-date shards + my @order = sort { + my ($x) = ($a =~ m!/([0-9]+)/*\z!); + my ($y) = ($b =~ m!/([0-9]+)/*\z!); + ($y // -1) <=> ($x // -1) # we may have non-shards + } keys %$tmp; + + my ($dname) = ($order[0] =~ m!(.*/)[^/]+/*\z!); + my $mode = (stat($dname))[2]; + for my $old (@order) { next if $old eq ''; # no invalid paths - my @st = stat($old); - if (!@st && !defined($opt->{reshard})) { + my $newdir = $tmp->{$old}; + my $have_old = -e $old; + if (!$have_old && !defined($opt->{reshard})) { die "failed to stat($old): $!"; } @@ -47,35 +61,31 @@ sub commit_changes ($$$$) { next; } - if (@st) { - chmod($st[2] & 07777, $new) or die "chmod $old: $!\n"; + chmod($mode & 07777, $new) or die "chmod($new): $!\n"; + if ($have_old) { rename($old, "$new/old") or die "rename $old => $new/old: $!\n"; } rename($new, $old) or die "rename $new => $old: $!\n"; - if (@st) { - my $prev = "$old/old"; - remove_tree($prev) or - die "failed to remove $prev: $!\n"; - } + push @old_shard, "$old/old" if $have_old; } # trigger ->check_inodes in read-only daemons - syswrite($im->{lockfh}, '.') if $over_chg; + syswrite($im->{lockfh}, '.') if $over_chg && $im; remove_tree(@old_shard); $tmp = undef; if (!$opt->{-coarse_lock}) { $opt->{-skip_lock} = 1; - - if ($im->can('count_shards')) { + $im //= $ibx if $ibx->can('eidx_sync') || $ibx->can('cidx_run'); + if ($im->can('count_shards')) { # v2w, eidx, cidx my $pr = $opt->{-progress}; my $n = $im->count_shards; if (defined $reshard && $n != $reshard) { die "BUG: counted $n shards after resharding to $reshard"; } - my $prev = $im->{shards}; + my $prev = $im->{shards} // $ibx->{nshard}; if ($pr && $prev != $n) { $pr->("shard count changed: $prev => $n\n"); $im->{shards} = $n; @@ -83,16 +93,21 @@ sub commit_changes ($$$$) { } my $env = $opt->{-idx_env}; local %ENV = (%ENV, %$env) if $env; - PublicInbox::Admin::index_inbox($ibx, $im, $opt); + if ($ibx->can('eidx_sync')) { + $ibx->eidx_sync($opt); + } elsif (!$ibx->can('cidx_run')) { + PublicInbox::Admin::index_inbox($ibx, $im, $opt); + } } } sub cb_spawn { my ($cb, $args, $opt) = @_; # $cb = cpdb() or compact() - defined(my $pid = fork) or die "fork: $!"; + my $pid = PublicInbox::DS::fork_persist; return $pid if $pid > 0; + $SIG{__DIE__} = sub { warn @_; _exit(1) }; # don't jump up stack $cb->($args, $opt); - POSIX::_exit(0); + _exit(0); } sub runnable_or_die ($) { @@ -100,17 +115,18 @@ sub runnable_or_die ($) { which($exe) or die "$exe not found in PATH\n"; } -sub prepare_reindex ($$$) { - my ($ibx, $im, $opt) = @_; - if ($ibx->version == 1) { +sub prepare_reindex ($$) { + my ($ibx, $opt) = @_; + if ($ibx->can('eidx_sync') || $ibx->can('cidx_run')) { + # no prep needed for ExtSearchIdx nor CodeSearchIdx + } elsif ($ibx->version == 1) { my $dir = $ibx->search->xdir(1); my $xdb = $PublicInbox::Search::X{Database}->new($dir); if (my $lc = $xdb->get_metadata('last_commit')) { $opt->{reindex}->{from} = $lc; } } else { # v2 - my $max; - $im->git_dir_latest(\$max) or return; + my $max = $ibx->max_git_epoch // return; my $from = $opt->{reindex}->{from}; my $mm = $ibx->mm; my $v = PublicInbox::Search::SCHEMA_VERSION(); @@ -132,8 +148,9 @@ sub kill_pids { } sub process_queue { - my ($queue, $cb, $opt) = @_; + my ($queue, $task, $opt) = @_; my $max = $opt->{jobs} // scalar(@$queue); + my $cb = \&$task; if ($max <= 1) { while (defined(my $args = shift @$queue)) { $cb->($args, $opt); @@ -143,7 +160,7 @@ sub process_queue { # run in parallel: my %pids; - local %SIG = %SIG; + local @SIG{keys %SIG} = values %SIG; setup_signals(\&kill_pids, \%pids); while (@$queue) { while (scalar(keys(%pids)) < $max && scalar(@$queue)) { @@ -170,9 +187,16 @@ sub prepare_run { my ($ibx, $opt) = @_; my $tmp = {}; # old shard dir => File::Temp->newdir object or undef my @queue; # ([old//src,newdir]) - list of args for cpdb() or compact() - my $old; - if (my $srch = $ibx->search) { + my ($old, $misc_ok); + if ($ibx->can('cidx_run')) { + $old = $ibx->xdir(1); + } elsif ($ibx->can('eidx_sync')) { + $misc_ok = 1; + $old = $ibx->xdir(1); + } elsif (my $srch = $ibx->search) { $old = $srch->xdir(1); + } + if (defined $old) { -d $old or die "$old does not exist\n"; } my $reshard = $opt->{reshard}; @@ -182,32 +206,33 @@ sub prepare_run { # we want temporary directories to be as deep as possible, # so v2 shards can keep "xap$SCHEMA_VERSION" on a separate FS. - if ($old && $ibx->version == 1) { + if (defined($old) && $ibx->can('version') && $ibx->version == 1) { if (defined $reshard) { warn "--reshard=$reshard ignored for v1 $ibx->{inboxdir}\n"; } - my $dir = dirname($old); + my ($dir) = ($old =~ m!(.*?/)[^/]+/*\z!); same_fs_or_die($dir, $old); my $v = PublicInbox::Search::SCHEMA_VERSION(); - my $wip = File::Temp->newdir("xapian$v-XXXXXXXX", DIR => $dir); + my $wip = File::Temp->newdir("xapian$v-XXXX", DIR => $dir); $tmp->{$old} = $wip; - nodatacow_dir($wip->dirname); + PublicInbox::Syscall::nodatacow_dir($wip->dirname); push @queue, [ $old, $wip ]; - } elsif ($old) { + } elsif (defined $old) { opendir my $dh, $old or die "Failed to opendir $old: $!\n"; my @old_shards; while (defined(my $dn = readdir($dh))) { if ($dn =~ /\A[0-9]+\z/) { - push @old_shards, $dn; + push(@old_shards, $dn + 0); } elsif ($dn eq '.' || $dn eq '..') { } elsif ($dn =~ /\Aover\.sqlite3/) { + } elsif ($dn eq 'misc' && $misc_ok) { } else { warn "W: skipping unknown dir: $old/$dn\n" } } die "No Xapian shards found in $old\n" unless @old_shards; - + @old_shards = sort { $a <=> $b } @old_shards; my ($src, $max_shard); if (!defined($reshard) || $reshard == scalar(@old_shards)) { # 1:1 copy @@ -218,12 +243,11 @@ sub prepare_run { $src = [ map { "$old/$_" } @old_shards ]; } foreach my $dn (0..$max_shard) { - my $tmpl = "$dn-XXXXXXXX"; - my $wip = File::Temp->newdir($tmpl, DIR => $old); + my $wip = File::Temp->newdir("$dn-XXXX", DIR => $old); same_fs_or_die($old, $wip->dirname); my $cur = "$old/$dn"; push @queue, [ $src // $cur , $wip ]; - nodatacow_dir($wip->dirname); + PublicInbox::Syscall::nodatacow_dir($wip->dirname); $tmp->{$cur} = $wip; } # mark old shards to be unlinked @@ -236,43 +260,49 @@ sub prepare_run { sub check_compact () { runnable_or_die($XAPIAN_COMPACT) } -sub _run { - my ($ibx, $cb, $opt) = @_; - my $im = $ibx->importer(0); - $im->lock_acquire; - my ($tmp, $queue) = prepare_run($ibx, $opt); - - # fine-grained locking if we prepare for reindex - if (!$opt->{-coarse_lock}) { - prepare_reindex($ibx, $im, $opt); - $im->lock_release; - } - - $ibx->cleanup; - process_queue($queue, $cb, $opt); - $im->lock_acquire if !$opt->{-coarse_lock}; - commit_changes($ibx, $im, $tmp, $opt); -} - sub run { my ($ibx, $task, $opt) = @_; # task = 'cpdb' or 'compact' - my $cb = \&$task; PublicInbox::Admin::progress_prepare($opt ||= {}); - defined(my $dir = $ibx->{inboxdir}) or die "no inboxdir defined\n"; - -d $dir or die "inboxdir=$dir does not exist\n"; - check_compact() if $opt->{compact} && $ibx->search; + my $dir; + for my $fld (qw(inboxdir topdir cidx_dir)) { + my $d = $ibx->{$fld} // next; + -d $d or die "$fld=$d does not exist\n"; + $dir = $d; + last; + } + check_compact() if $opt->{compact} && + ($ibx->can('cidx_run') || $ibx->search); - if (!$opt->{-coarse_lock}) { + if (!$ibx->can('eidx_sync') && $ibx->can('version') && + !$opt->{-coarse_lock}) { # per-epoch ranges for v2 # v1:{ from => $OID }, v2:{ from => [ $OID, $OID, $OID ] } } $opt->{reindex} = { from => $ibx->version == 1 ? '' : [] }; PublicInbox::SearchIdx::load_xapian_writable(); } - local %SIG = %SIG; + local @SIG{keys %SIG} = values %SIG; setup_signals(); - $ibx->umask_prepare; - $ibx->with_umask(\&_run, $ibx, $cb, $opt); + my $restore = $ibx->with_umask; + + my $im = $ibx->can('importer') ? $ibx->importer(0) : undef; + ($im // $ibx)->lock_acquire; + my ($tmp, $queue) = prepare_run($ibx, $opt); + + # fine-grained locking if we prepare for reindex + if (!$opt->{-coarse_lock}) { + prepare_reindex($ibx, $opt); + ($im // $ibx)->lock_release; + } + + $ibx->cleanup if $ibx->can('cleanup'); + if ($task eq 'cpdb' && $opt->{reshard} && $ibx->can('cidx_run')) { + cidx_reshard($ibx, $queue, $opt); + } else { + process_queue($queue, $task, $opt); + } + ($im // $ibx)->lock_acquire if !$opt->{-coarse_lock}; + commit_changes($ibx, $im, $tmp, $opt); } sub cpdb_retryable ($$) { @@ -290,20 +320,21 @@ sub cpdb_retryable ($$) { } sub progress_pfx ($) { - my ($wip) = @_; # tempdir v2: ([0-9])+-XXXXXXXX - my @p = split('/', $wip); + my ($wip) = @_; # tempdir v2: ([0-9])+-XXXX + my @p = split(m'/', $wip); - # return "xap15/0" for v2, or "xapian15" for v1: - ($p[-1] =~ /\A([0-9]+)/) ? "$p[-2]/$1" : $p[-1]; + # "basename(inboxdir)/xap15/0" for v2, + # "basename(inboxdir)/xapian15" for v1: + ($p[-1] =~ /\A([0-9]+)/) ? "$p[-3]/$p[-2]/$1" : "$p[-2]/$p[-1]"; } sub kill_compact { # setup_signals callback - my ($sig, $pidref) = @_; - kill($sig, $$pidref) if defined($$pidref); + my ($sig, $ioref) = @_; + kill($sig, $$ioref->attached_pid // return) if defined($$ioref); } # xapian-compact wrapper -sub compact ($$) { +sub compact ($$) { # cb_spawn callback my ($args, $opt) = @_; my ($src, $newdir) = @$args; my $dst = ref($newdir) ? $newdir->dirname : $newdir; @@ -327,18 +358,16 @@ sub compact ($$) { } $pr->("$pfx `".join(' ', @$cmd)."'\n") if $pr; push @$cmd, $src, $dst; - my ($rd, $pid); - local %SIG = %SIG; - setup_signals(\&kill_compact, \$pid); - ($rd, $pid) = popen_rd($cmd, undef, $rdr); + local @SIG{keys %SIG} = values %SIG; + setup_signals(\&kill_compact, \my $rd); + $rd = popen_rd($cmd, undef, $rdr); while (<$rd>) { if ($pr) { s/\r/\r$pfx /g; $pr->("$pfx $_"); } } - waitpid($pid, 0); - die "@$cmd failed: \$?=$?\n" if $?; + $rd->close or die "@$cmd failed: \$?=$?\n"; } sub cpdb_loop ($$$;$$) { @@ -382,18 +411,96 @@ sub cpdb_loop ($$$;$$) { } while (cpdb_retryable($src, $pfx)); } +sub xapian_write_prep ($) { + my ($opt) = @_; + PublicInbox::SearchIdx::load_xapian_writable(); + my $flag = eval($PublicInbox::Search::Xap.'::DB_CREATE()'); + die if $@; + $flag |= $PublicInbox::SearchIdx::DB_NO_SYNC if !$opt->{fsync}; + (\%PublicInbox::Search::X, $flag); +} + +sub compact_tmp_shard ($) { + my ($wip) = @_; + my $new = $wip->dirname; + my ($dir) = ($new =~ m!(.*?/)[^/]+/*\z!); + same_fs_or_die($dir, $new); + my $ft = File::Temp->newdir("$new.compact-XXXX", DIR => $dir); + PublicInbox::Syscall::nodatacow_dir($ft->dirname); + $ft; +} + +sub cidx_reshard { # not docid based + my ($cidx, $queue, $opt) = @_; + my ($X, $flag) = xapian_write_prep($opt); + my $src = $cidx->xdb; + delete($cidx->{xdb}) == $src or die "BUG: xdb != $src"; + my $pfx = $opt->{-progress_pfx} = progress_pfx($cidx->xdir.'/0'); + my $pr = $opt->{-progress}; + my $pr_data = { pr => $pr, pfx => $pfx, nr => 0 } if $pr; + local @SIG{keys %SIG} = values %SIG; + + # like copydatabase(1), be sure we don't overwrite anything in case + # of other bugs: + setup_signals() if $opt->{compact}; + my @tmp; + my @dst = map { + my $wip = $_->[1]; + my $tmp = $opt->{compact} ? compact_tmp_shard($wip) : $wip; + push @tmp, $tmp; + $X->{WritableDatabase}->new($tmp->dirname, $flag); + } @$queue; + my $l = $src->get_metadata('indexlevel'); + $dst[0]->set_metadata('indexlevel', $l) if $l eq 'medium'; + my $fmt; + if ($pr_data) { + my $tot = $src->get_doccount; + $fmt = "$pfx % ".length($tot)."u/$tot\n"; + $pr->("$pfx copying $tot documents\n"); + } + my $cur = $src->postlist_begin(''); + my $end = $src->postlist_end(''); + my $git_dir_hash = $cidx->can('git_dir_hash'); + my ($n, $nr); + for (; $cur != $end; $cur++) { + my $doc = $src->get_document($cur->get_docid); + if (my @cmt = xap_terms('Q', $doc)) { + $n = hex(substr($cmt[0], 0, 8)) % scalar(@dst); + warn "W: multi-commit: @cmt" if scalar(@cmt) != 1; + } elsif (my @P = xap_terms('P', $doc)) { + $n = $git_dir_hash->($P[0]) % scalar(@dst); + warn "W: multi-path @P " if scalar(@P) != 1; + } else { + warn "W: skipped, no terms in ".$cur->get_docid; + next; + } + $dst[$n]->add_document($doc); + $pr->(sprintf($fmt, $nr)) if $pr_data && !(++$nr & 1023); + } + return if !$opt->{compact}; + $src = undef; + @dst = (); # flushes and closes + my @q; + for my $tmp (@tmp) { + my $arg = shift @$queue // die 'BUG: $queue empty'; + my $wip = $arg->[1] // die 'BUG: no $wip'; + push @q, [ "$tmp", $wip ]; + } + delete $opt->{-progress_pfx}; + process_queue(\@q, 'compact', $opt); +} + # Like copydatabase(1), this is horribly slow; and it doesn't seem due # to the overhead of Perl. -sub cpdb ($$) { +sub cpdb ($$) { # cb_spawn callback my ($args, $opt) = @_; - my ($old, $newdir) = @$args; - my $new = $newdir->dirname; + my ($old, $wip) = @$args; my ($src, $cur_shard); my $reshard; - PublicInbox::SearchIdx::load_xapian_writable() or die; - my $XapianDatabase = $PublicInbox::Search::X{Database}; + my ($X, $flag) = xapian_write_prep($opt); if (ref($old) eq 'ARRAY') { - ($cur_shard) = ($new =~ m!xap[0-9]+/([0-9]+)\b!); + my $new = $wip->dirname; + ($cur_shard) = ($new =~ m!(?:xap|ei)[0-9]+/([0-9]+)\b!); defined $cur_shard or die "BUG: could not extract shard # from $new"; $reshard = $opt->{reshard}; @@ -402,36 +509,27 @@ sub cpdb ($$) { # resharding, M:N copy means have full read access foreach (@$old) { if ($src) { - my $sub = $XapianDatabase->new($_); + my $sub = $X->{Database}->new($_); $src->add_database($sub); } else { - $src = $XapianDatabase->new($_); + $src = $X->{Database}->new($_); } } } else { - $src = $XapianDatabase->new($old); + $src = $X->{Database}->new($old); } - my ($tmp, $ft); - local %SIG = %SIG; + my $tmp = $wip; + local @SIG{keys %SIG} = values %SIG; if ($opt->{compact}) { - my $dir = dirname($new); - same_fs_or_die($dir, $new); - $ft = File::Temp->newdir("$new.compact-XXXXXX", DIR => $dir); + $tmp = compact_tmp_shard($wip); setup_signals(); - $tmp = $ft->dirname; - nodatacow_dir($tmp); - } else { - $tmp = $new; } # like copydatabase(1), be sure we don't overwrite anything in case # of other bugs: - my $flag = eval($PublicInbox::Search::Xap.'::DB_CREATE()'); - die if $@; - my $XapianWritableDatabase = $PublicInbox::Search::X{WritableDatabase}; - $flag |= $PublicInbox::SearchIdx::DB_NO_SYNC if !$opt->{fsync}; - my $dst = $XapianWritableDatabase->new($tmp, $flag); + my $new = $wip->dirname; + my $dst = $X->{WritableDatabase}->new($tmp->dirname, $flag); my $pr = $opt->{-progress}; my $pfx = $opt->{-progress_pfx} = progress_pfx($new); my $pr_data = { pr => $pr, pfx => $pfx, nr => 0 } if $pr; @@ -443,11 +541,10 @@ sub cpdb ($$) { $dst->set_metadata('last_commit', $lc) if $lc; # only the first xapian shard (0) gets 'indexlevel' - if ($new =~ m!(?:xapian[0-9]+|xap[0-9]+/0)\b!) { + if ($new =~ m!/(?:xapian[0-9]+|(?:ei|xap)[0-9]+/0)\b!) { my $l = $src->get_metadata('indexlevel'); - if ($l eq 'medium') { + $l eq 'medium' and $dst->set_metadata('indexlevel', $l); - } } if ($pr_data) { my $tot = $src->get_doccount; @@ -474,7 +571,7 @@ sub cpdb ($$) { # individually. $src = undef; foreach (@$old) { - my $old = $XapianDatabase->new($_); + my $old = $X->{Database}->new($_); cpdb_loop($old, $dst, $pr_data, $cur_shard, $reshard); } } else { @@ -489,7 +586,6 @@ sub cpdb ($$) { # this is probably the best place to do xapian-compact # since $dst isn't readable by HTTP or NNTP clients, yet: compact([ $tmp, $new ], $opt); - remove_tree($tmp) or die "failed to remove $tmp: $!\n"; } 1; diff --git a/lib/PublicInbox/XhcMset.pm b/lib/PublicInbox/XhcMset.pm new file mode 100644 index 00000000..ac25eece --- /dev/null +++ b/lib/PublicInbox/XhcMset.pm @@ -0,0 +1,51 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# mocks Xapian::Mset and allows slow queries from blocking the event loop +package PublicInbox::XhcMset; +use v5.12; +use parent qw(PublicInbox::DS); +use PublicInbox::XhcMsetIterator; +use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); + +sub event_step { + my ($self) = @_; + my ($cb, @args) = @{delete $self->{cb_args} // return}; + my $rd = $self->{sock}; + eval { + my $hdr = <$rd> // die "E: reading mset header: $!"; + for (split /\s+/, $hdr) { # read mset.size + estimated_matches + my ($k, $v) = split /=/, $_, 2; + $k =~ s/\A[^\.]*\.//; # s/(mset)?\./ + $self->{$k} = $v; + } + my $size = $self->{size} // die "E: bad xhc header: `$hdr'"; + my @it = map { PublicInbox::XhcMsetIterator::make($_) } <$rd>; + $self->{items} = \@it; + scalar(@it) == $size or die + 'E: got ',scalar(@it),", expected mset.size=$size"; + }; + my $err = $@; + $self->close; + eval { $cb->(@args, $self, $err) }; + warn "E: $@\n" if $@; +} + +sub maybe_new { + my (undef, $rd, $srch, @cb_args) = @_; + my $self = bless { cb_args => \@cb_args, srch => $srch }, __PACKAGE__; + if ($PublicInbox::DS::in_loop) { # async + $self->SUPER::new($rd, EPOLLIN|EPOLLONESHOT); + } else { # synchronous + $self->{sock} = $rd; + event_step($self); + undef; + } +} + +eval(join('', map { "sub $_ { \$_[0]->{$_} }\n" } qw(size + get_matches_estimated))); + +sub items { @{$_[0]->{items}} } + +1; diff --git a/lib/PublicInbox/XhcMsetIterator.pm b/lib/PublicInbox/XhcMsetIterator.pm new file mode 100644 index 00000000..dcfc61e4 --- /dev/null +++ b/lib/PublicInbox/XhcMsetIterator.pm @@ -0,0 +1,20 @@ +# Copyright (C) all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# mocks Xapian::MsetIterator, there's many of these allocated at once +package PublicInbox::XhcMsetIterator; +use v5.12; + +sub make ($) { + chomp($_[0]); + my @self = map { $_ + 0 } split /\0/, $_[0]; # docid, pct, rank + # we don't store $xdb in self[4] since we avoid $it->get_document + # in favor of $xdb->get_document($it->get_docid) + bless \@self, __PACKAGE__; +} + +sub get_docid { $_[0]->[0] } +sub get_percent { $_[0]->[1] } +sub get_rank { $_[0]->[2] } + +1; diff --git a/lib/PublicInbox/gcf2_libgit2.h b/lib/PublicInbox/gcf2_libgit2.h index 800c6bad..e1f0ef39 100644 --- a/lib/PublicInbox/gcf2_libgit2.h +++ b/lib/PublicInbox/gcf2_libgit2.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020 all contributors <meta@public-inbox.org> + * Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> * License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> * * libgit2 for Inline::C diff --git a/lib/PublicInbox/xap_helper.h b/lib/PublicInbox/xap_helper.h new file mode 100644 index 00000000..a30a8768 --- /dev/null +++ b/lib/PublicInbox/xap_helper.h @@ -0,0 +1,1134 @@ +/* + * Copyright (C) all contributors <meta@public-inbox.org> + * License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt> + * Note: GPL-2+ since it'll incorporate approxidate from git someday + * + * Standalone helper process using C and minimal C++ for Xapian, + * this is not linked to Perl in any way. + * C (not C++) is used as much as possible to lower the contribution + * barrier for hackers who mainly know C (this includes the maintainer). + * Yes, that means we use C stdlib stuff like hsearch and open_memstream + * instead their equivalents in the C++ stdlib :P + * Everything here is an unstable internal API of public-inbox and + * NOT intended for ordinary users; only public-inbox hackers + */ +#ifndef _ALL_SOURCE +# define _ALL_SOURCE +#endif +#if defined(__NetBSD__) && !defined(_OPENBSD_SOURCE) // for reallocarray(3) +# define _OPENBSD_SOURCE +#endif +#include <sys/file.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/uio.h> +#include <sys/wait.h> +#include <poll.h> + +#include <assert.h> +#include <err.h> // BSD, glibc, and musl all have this +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <search.h> +#include <signal.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> +#include <xapian.h> // our only reason for using C++ + +#define MY_VER(maj,min,rev) ((maj) << 16 | (min) << 8 | (rev)) +#define XAP_VER \ + MY_VER(XAPIAN_MAJOR_VERSION,XAPIAN_MINOR_VERSION,XAPIAN_REVISION) + +#if XAP_VER >= MY_VER(1,3,6) +# define NRP Xapian::NumberRangeProcessor +# define ADD_RP add_rangeprocessor +# define SET_MAX_EXPANSION set_max_expansion // technically 1.3.3 +#else +# define NRP Xapian::NumberValueRangeProcessor +# define ADD_RP add_valuerangeprocessor +# define SET_MAX_EXPANSION set_max_wildcard_expansion +#endif + +#if defined(__GLIBC__) +# define MY_DO_OPTRESET() do { optind = 0; } while (0) +#else /* FreeBSD, musl, dfly, NetBSD, OpenBSD */ +# define MY_DO_OPTRESET() do { optind = optreset = 1; } while (0) +#endif + +#if defined(__DragonFly__) || defined(__FreeBSD__) || defined(__GLIBC__) +# define STDERR_ASSIGNABLE (1) +#else +# define STDERR_ASSIGNABLE (0) +#endif + +// assert functions are used correctly (e.g. ensure hackers don't +// cause EINVAL/EFAULT). Not for stuff that can fail due to HW +// failures. +# define CHECK(type, expect, expr) do { \ + type ckvar______ = (expr); \ + assert(ckvar______ == (expect) && "BUG" && __FILE__ && __LINE__); \ +} while (0) + +// coredump on most usage errors since our only users are internal +#define ABORT(...) do { warnx(__VA_ARGS__); abort(); } while (0) +#define EABORT(...) do { warn(__VA_ARGS__); abort(); } while (0) + +// sock_fd is modified in signal handler, yes, it's SOCK_SEQPACKET +static volatile int sock_fd = STDIN_FILENO; +static sigset_t fullset, workerset; +static bool alive = true; +#if STDERR_ASSIGNABLE +static FILE *orig_err = stderr; +#endif +static int orig_err_fd = -1; +static void *srch_tree; // tsearch + tdelete + twalk +static pid_t *worker_pids; // nr => pid +#define WORKER_MAX USHRT_MAX +static unsigned long nworker, nworker_hwm; +static int pipefds[2]; +static const char *stdout_path, *stderr_path; // for SIGUSR1 +static sig_atomic_t worker_needs_reopen; + +// PublicInbox::Search and PublicInbox::CodeSearch generate these: +static void mail_nrp_init(void); +static void code_nrp_init(void); +static void qp_init_mail_search(Xapian::QueryParser *); +static void qp_init_code_search(Xapian::QueryParser *); + +enum exc_iter { + ITER_OK = 0, + ITER_RETRY, + ITER_ABORT +}; + +struct srch { + int paths_len; // int for comparisons + unsigned qp_flags; + bool qp_extra_done; + Xapian::Database *db; + Xapian::QueryParser *qp; + char paths[]; // $shard_path0\0$shard_path1\0... +}; + +#define MY_ARG_MAX 256 +typedef bool (*cmd)(struct req *); + +// only one request per-process since we have RLIMIT_CPU timeout +struct req { // argv and pfxv point into global rbuf + char *argv[MY_ARG_MAX]; + char *pfxv[MY_ARG_MAX]; // -A <prefix> + char *qpfxv[MY_ARG_MAX]; // -Q <user_prefix>[:=]<INTERNAL_PREFIX> + size_t *lenv; // -A <prefix>LENGTH + struct srch *srch; + char *Pgit_dir; + char *Oeidx_key; + cmd fn; + unsigned long long max; + unsigned long long off; + unsigned long long threadid; + unsigned long timeout_sec; + size_t nr_out; + long sort_col; // value column, negative means BoolWeight + int argc; + int pfxc; + int qpfxc; + FILE *fp[2]; // [0] response pipe or sock, [1] status/errors (optional) + bool has_input; // fp[0] is bidirectional + bool collapse_threads; + bool code_search; + bool relevance; // sort by relevance before column + bool asc; // ascending sort +}; + +struct worker { + pid_t pid; + unsigned nr; +}; + +struct fbuf { + FILE *fp; + char *ptr; + size_t len; +}; + +#define SPLIT2ARGV(dst,buf,len) split2argv(dst,buf,len,MY_ARRAY_SIZE(dst)) +static size_t split2argv(char **dst, char *buf, size_t len, size_t limit) +{ + if (buf[0] == 0 || len == 0 || buf[len - 1] != 0) + ABORT("bogus argument given"); + size_t nr = 0; + char *c = buf; + for (size_t i = 1; i < len; i++) { + if (!buf[i]) { + dst[nr++] = c; + c = buf + i + 1; + } + if (nr == limit) + ABORT("too many args: %zu == %zu", nr, limit); + } + if (nr == 0) ABORT("no argument given"); + if ((long)nr < 0) ABORT("too many args: %zu", nr); + return (long)nr; +} + +static bool has_threadid(const struct srch *srch) +{ + return srch->db->get_metadata("has_threadid") == "1"; +} + +static Xapian::Enquire prep_enquire(const struct req *req) +{ + Xapian::Enquire enq(*req->srch->db); + if (req->sort_col < 0) { + enq.set_weighting_scheme(Xapian::BoolWeight()); + enq.set_docid_order(req->asc ? Xapian::Enquire::ASCENDING + : Xapian::Enquire::DESCENDING); + } else if (req->relevance) { + enq.set_sort_by_relevance_then_value(req->sort_col, !req->asc); + } else { + enq.set_sort_by_value_then_relevance(req->sort_col, !req->asc); + } + return enq; +} + +static Xapian::MSet enquire_mset(struct req *req, Xapian::Enquire *enq) +{ + if (!req->max) { + switch (sizeof(Xapian::doccount)) { + case 4: req->max = UINT_MAX; break; + default: req->max = ULLONG_MAX; + } + } + for (int i = 0; i < 9; i++) { + try { + Xapian::MSet mset = enq->get_mset(req->off, req->max); + return mset; + } catch (const Xapian::DatabaseModifiedError & e) { + req->srch->db->reopen(); + } + } + return enq->get_mset(req->off, req->max); +} + +// for v1, v2, and extindex +static Xapian::MSet mail_mset(struct req *req, const char *qry_str) +{ + struct srch *srch = req->srch; + Xapian::Query qry = srch->qp->parse_query(qry_str, srch->qp_flags); + if (req->Oeidx_key) { + req->Oeidx_key[0] = 'O'; // modifies static rbuf + qry = Xapian::Query(Xapian::Query::OP_FILTER, qry, + Xapian::Query(req->Oeidx_key)); + } + // TODO: uid_range + if (req->threadid != ULLONG_MAX) { + std::string tid = Xapian::sortable_serialise(req->threadid); + qry = Xapian::Query(Xapian::Query::OP_FILTER, qry, + Xapian::Query(Xapian::Query::OP_VALUE_RANGE, THREADID, + tid, tid)); + } + Xapian::Enquire enq = prep_enquire(req); + enq.set_query(qry); + // THREADID is a CPP macro defined on CLI (see) XapHelperCxx.pm + if (req->collapse_threads && has_threadid(srch)) + enq.set_collapse_key(THREADID); + + return enquire_mset(req, &enq); +} + +static bool starts_with(const std::string *s, const char *pfx, size_t pfx_len) +{ + return s->size() >= pfx_len && !memcmp(pfx, s->c_str(), pfx_len); +} + +static void apply_roots_filter(struct req *req, Xapian::Query *qry) +{ + if (!req->Pgit_dir) return; + req->Pgit_dir[0] = 'P'; // modifies static rbuf + Xapian::Database *xdb = req->srch->db; + for (int i = 0; i < 9; i++) { + try { + std::string P = req->Pgit_dir; + Xapian::PostingIterator p = xdb->postlist_begin(P); + if (p == xdb->postlist_end(P)) { + warnx("W: %s not indexed?", req->Pgit_dir + 1); + return; + } + Xapian::TermIterator cur = xdb->termlist_begin(*p); + Xapian::TermIterator end = xdb->termlist_end(*p); + cur.skip_to("G"); + if (cur == end) { + warnx("W: %s has no root commits?", + req->Pgit_dir + 1); + return; + } + Xapian::Query f = Xapian::Query(*cur); + for (++cur; cur != end; ++cur) { + std::string tn = *cur; + if (!starts_with(&tn, "G", 1)) + continue; + f = Xapian::Query(Xapian::Query::OP_OR, f, tn); + } + *qry = Xapian::Query(Xapian::Query::OP_FILTER, *qry, f); + return; + } catch (const Xapian::DatabaseModifiedError & e) { + xdb->reopen(); + } + } +} + +// for cindex +static Xapian::MSet commit_mset(struct req *req, const char *qry_str) +{ + struct srch *srch = req->srch; + Xapian::Query qry = srch->qp->parse_query(qry_str, srch->qp_flags); + apply_roots_filter(req, &qry); + + // we only want commits: + qry = Xapian::Query(Xapian::Query::OP_FILTER, qry, + Xapian::Query("T" "c")); + Xapian::Enquire enq = prep_enquire(req); + enq.set_query(qry); + return enquire_mset(req, &enq); +} + +static void emit_mset_stats(struct req *req, const Xapian::MSet *mset) +{ + if (req->fp[1]) + fprintf(req->fp[1], "mset.size=%llu nr_out=%zu\n", + (unsigned long long)mset->size(), req->nr_out); + else + ABORT("BUG: %s caller only passed 1 FD", req->argv[0]); +} + +static int my_setlinebuf(FILE *fp) // glibc setlinebuf(3) can't report errors +{ + return setvbuf(fp, NULL, _IOLBF, 0); +} + +// n.b. __cleanup__ works fine with C++ exceptions, but not longjmp +// Only clang and g++ are supported, as AFAIK there's no other +// relevant Free(-as-in-speech) C++ compilers. +#define CLEANUP_FBUF __attribute__((__cleanup__(fbuf_ensure))) +static void fbuf_ensure(void *ptr) +{ + struct fbuf *fbuf = (struct fbuf *)ptr; + if (fbuf->fp && fclose(fbuf->fp)) + err(EXIT_FAILURE, "fclose(fbuf->fp)"); // ENOMEM? + fbuf->fp = NULL; + free(fbuf->ptr); +} + +static void fbuf_init(struct fbuf *fbuf) +{ + assert(!fbuf->ptr); + fbuf->fp = open_memstream(&fbuf->ptr, &fbuf->len); + if (!fbuf->fp) err(EXIT_FAILURE, "open_memstream(fbuf)"); +} + +static bool write_all(int fd, const struct fbuf *wbuf, size_t len) +{ + const char *p = wbuf->ptr; + assert(wbuf->len >= len); + do { // write to client FD + ssize_t n = write(fd, p, len); + if (n > 0) { + len -= n; + p += n; + } else { + perror(n ? "write" : "write (zero bytes)"); + return false; + } + } while (len); + return true; +} + +#define ERR_FLUSH(f) do { \ + if (ferror(f) | fflush(f)) err(EXIT_FAILURE, "ferror|fflush "#f); \ +} while (0) + +#define ERR_CLOSE(f, e) do { \ + if (ferror(f) | fclose(f)) \ + e ? err(e, "ferror|fclose "#f) : perror("ferror|fclose "#f); \ +} while (0) + +static void xclose(int fd) +{ + if (close(fd) < 0 && errno != EINTR) + EABORT("BUG: close"); +} + +static size_t off2size(off_t n) +{ + if (n < 0 || (uintmax_t)n > SIZE_MAX) + ABORT("off_t out of size_t range: %lld\n", (long long)n); + return (size_t)n; +} + +static char *hsearch_enter_key(char *s) +{ +#if defined(__OpenBSD__) || defined(__DragonFly__) + // hdestroy frees each key on some platforms, + // so give it something to free: + char *ret = strdup(s); + if (!ret) err(EXIT_FAILURE, "strdup"); + return ret; +// AFAIK there's no way to detect musl, assume non-glibc Linux is musl: +#elif defined(__GLIBC__) || defined(__linux__) || \ + defined(__FreeBSD__) || defined(__NetBSD__) + // do nothing on these platforms +#else +#warning untested platform detected, unsure if hdestroy(3) frees keys +#warning contact us at meta@public-inbox.org if you get segfaults +#endif + return s; +} + +// for test usage only, we need to ensure the compiler supports +// __cleanup__ when exceptions are thrown +struct inspect { struct req *req; }; + +static void inspect_ensure(struct inspect *x) +{ + fprintf(x->req->fp[0], "pid=%d has_threadid=%d", + (int)getpid(), has_threadid(x->req->srch) ? 1 : 0); +} + +static bool cmd_test_inspect(struct req *req) +{ + __attribute__((__cleanup__(inspect_ensure))) struct inspect x; + x.req = req; + try { + throw Xapian::InvalidArgumentError("test"); + } catch (Xapian::InvalidArgumentError) { + return true; + } + fputs("this should not be printed", req->fp[0]); + return false; +} + +static bool cmd_test_sleep(struct req *req) +{ + for (;;) poll(NULL, 0, 10); + return false; +} +#include "xh_mset.h" // read-only (WWW, IMAP, lei) stuff +#include "xh_cidx.h" // CodeSearchIdx.pm stuff + +#define CMD(n) { .fn_len = sizeof(#n) - 1, .fn_name = #n, .fn = cmd_##n } +static const struct cmd_entry { + size_t fn_len; + const char *fn_name; + cmd fn; +} cmds[] = { // should be small enough to not need bsearch || gperf + // most common commands first + CMD(mset), // WWW and IMAP requests + CMD(dump_ibx), // many inboxes + CMD(dump_roots), // per-cidx shard + CMD(test_inspect), // least common commands last + CMD(test_sleep), // least common commands last +}; + +#define MY_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) +#define RECV_FD_CAPA 2 +#define RECV_FD_SPACE (RECV_FD_CAPA * sizeof(int)) +union my_cmsg { + struct cmsghdr hdr; + char pad[sizeof(struct cmsghdr) + 16 + RECV_FD_SPACE]; +}; + +static bool recv_req(struct req *req, char *rbuf, size_t *len) +{ + union my_cmsg cmsg = {}; + struct msghdr msg = {}; + struct iovec iov; + ssize_t r; + iov.iov_base = rbuf; + iov.iov_len = *len; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = &cmsg.hdr; + msg.msg_controllen = CMSG_SPACE(RECV_FD_SPACE); + + // allow SIGTERM to hit + CHECK(int, 0, sigprocmask(SIG_SETMASK, &workerset, NULL)); + +again: + r = recvmsg(sock_fd, &msg, 0); + if (r == 0) { + exit(EX_NOINPUT); /* grandparent went away */ + } else if (r < 0) { + switch (errno) { + case EINTR: goto again; + case EBADF: if (sock_fd < 0) exit(0); + // fall-through + default: err(EXIT_FAILURE, "recvmsg"); + } + } + + // success! no signals for the rest of the request/response cycle + CHECK(int, 0, sigprocmask(SIG_SETMASK, &fullset, NULL)); + if (r > 0 && msg.msg_flags) + ABORT("unexpected msg_flags"); + + *len = r; + if (cmsg.hdr.cmsg_level == SOL_SOCKET && + cmsg.hdr.cmsg_type == SCM_RIGHTS) { + size_t clen = cmsg.hdr.cmsg_len; + int *fdp = (int *)CMSG_DATA(&cmsg.hdr); + size_t i; + for (i = 0; CMSG_LEN((i + 1) * sizeof(int)) <= clen; i++) { + int fd = *fdp++; + const char *mode = NULL; + int fl = fcntl(fd, F_GETFL); + if (fl == -1) { + errx(EXIT_FAILURE, "invalid fd=%d", fd); + } else if (fl & O_WRONLY) { + mode = "w"; + } else if (fl & O_RDWR) { + mode = "r+"; + if (i == 0) req->has_input = true; + } else { + errx(EXIT_FAILURE, + "invalid mode from F_GETFL: 0x%x", fl); + } + req->fp[i] = fdopen(fd, mode); + if (!req->fp[i]) + err(EXIT_FAILURE, "fdopen(fd=%d)", fd); + } + return true; + } + errx(EXIT_FAILURE, "no FD received in %zd-byte request", r); + return false; +} + +static int srch_cmp(const void *pa, const void *pb) // for tfind|tsearch +{ + const struct srch *a = (const struct srch *)pa; + const struct srch *b = (const struct srch *)pb; + int diff = a->paths_len - b->paths_len; + + return diff ? diff : memcmp(a->paths, b->paths, (size_t)a->paths_len); +} + +static bool is_chert(const char *dir) +{ + char iamchert[PATH_MAX]; + struct stat sb; + int rc = snprintf(iamchert, sizeof(iamchert), "%s/iamchert", dir); + + if (rc <= 0 || rc >= (int)sizeof(iamchert)) + err(EXIT_FAILURE, "BUG: snprintf(%s/iamchert)", dir); + if (stat(iamchert, &sb) == 0 && S_ISREG(sb.st_mode)) + return true; + return false; +} + +static bool srch_init(struct req *req) +{ + char *dirv[MY_ARG_MAX]; + int i; + struct srch *srch = req->srch; + int dirc = (int)SPLIT2ARGV(dirv, srch->paths, (size_t)srch->paths_len); + const unsigned FLAG_PHRASE = Xapian::QueryParser::FLAG_PHRASE; + srch->qp_flags = FLAG_PHRASE | + Xapian::QueryParser::FLAG_BOOLEAN | + Xapian::QueryParser::FLAG_LOVEHATE | + Xapian::QueryParser::FLAG_WILDCARD; + if (is_chert(dirv[0])) + srch->qp_flags &= ~FLAG_PHRASE; + try { + srch->db = new Xapian::Database(dirv[0]); + } catch (...) { + warn("E: Xapian::Database(%s)", dirv[0]); + return false; + } + try { + for (i = 1; i < dirc; i++) { + if (srch->qp_flags & FLAG_PHRASE && is_chert(dirv[i])) + srch->qp_flags &= ~FLAG_PHRASE; + srch->db->add_database(Xapian::Database(dirv[i])); + } + } catch (...) { + warn("E: add_database(%s)", dirv[i]); + return false; + } + try { + srch->qp = new Xapian::QueryParser; + } catch (...) { + perror("E: Xapian::QueryParser"); + return false; + } + srch->qp->set_default_op(Xapian::Query::OP_AND); + srch->qp->set_database(*srch->db); + try { + srch->qp->set_stemmer(Xapian::Stem("english")); + } catch (...) { + perror("E: Xapian::Stem"); + return false; + } + srch->qp->set_stemming_strategy(Xapian::QueryParser::STEM_SOME); + srch->qp->SET_MAX_EXPANSION(100); + + if (req->code_search) + qp_init_code_search(srch->qp); // CodeSearch.pm + else + qp_init_mail_search(srch->qp); // Search.pm + return true; +} + +// setup query parser for altid and arbitrary headers +static void srch_init_extra(struct req *req) +{ + const char *XPFX; + for (int i = 0; i < req->qpfxc; i++) { + size_t len = strlen(req->qpfxv[i]); + char *c = (char *)memchr(req->qpfxv[i], '=', len); + + if (c) { // it's boolean "gmane=XGMANE" + XPFX = c + 1; + *c = 0; + req->srch->qp->add_boolean_prefix(req->qpfxv[i], XPFX); + continue; + } + // maybe it's a non-boolean prefix "blob:XBLOBID" + c = (char *)memchr(req->qpfxv[i], ':', len); + if (!c) + errx(EXIT_FAILURE, "bad -Q %s", req->qpfxv[i]); + XPFX = c + 1; + *c = 0; + req->srch->qp->add_prefix(req->qpfxv[i], XPFX); + } + req->srch->qp_extra_done = true; +} + +static void free_srch(void *p) // tdestroy +{ + struct srch *srch = (struct srch *)p; + delete srch->qp; + delete srch->db; + free(srch); +} + +static void dispatch(struct req *req) +{ + int c; + size_t size = strlen(req->argv[0]); + union { + struct srch *srch; + char *ptr; + } kbuf; + char *end; + FILE *kfp; + struct srch **s; + req->threadid = ULLONG_MAX; + for (c = 0; c < (int)MY_ARRAY_SIZE(cmds); c++) { + if (cmds[c].fn_len == size && + !memcmp(cmds[c].fn_name, req->argv[0], size)) { + req->fn = cmds[c].fn; + break; + } + } + if (!req->fn) ABORT("not handled: `%s'", req->argv[0]); + + kfp = open_memstream(&kbuf.ptr, &size); + if (!kfp) err(EXIT_FAILURE, "open_memstream(kbuf)"); + // write padding, first (contents don't matter) + fwrite(&req->argv[0], offsetof(struct srch, paths), 1, kfp); + + // global getopt variables: + optopt = 0; + optarg = NULL; + MY_DO_OPTRESET(); + + // XH_SPEC is generated from @PublicInbox::Search::XH_SPEC + while ((c = getopt(req->argc, req->argv, XH_SPEC)) != -1) { + switch (c) { + case 'a': req->asc = true; break; + case 'c': req->code_search = true; break; + case 'd': fwrite(optarg, strlen(optarg) + 1, 1, kfp); break; + case 'g': req->Pgit_dir = optarg - 1; break; // pad "P" prefix + case 'k': + req->sort_col = strtol(optarg, &end, 10); + if (*end) ABORT("-k %s", optarg); + switch (req->sort_col) { + case LONG_MAX: case LONG_MIN: ABORT("-k %s", optarg); + } + break; + case 'm': + req->max = strtoull(optarg, &end, 10); + if (*end || req->max == ULLONG_MAX) + ABORT("-m %s", optarg); + break; + case 'o': + req->off = strtoull(optarg, &end, 10); + if (*end || req->off == ULLONG_MAX) + ABORT("-o %s", optarg); + break; + case 'r': req->relevance = true; break; + case 't': req->collapse_threads = true; break; + case 'A': + req->pfxv[req->pfxc++] = optarg; + if (MY_ARG_MAX == req->pfxc) + ABORT("too many -A"); + break; + case 'K': + req->timeout_sec = strtoul(optarg, &end, 10); + if (*end || req->timeout_sec == ULONG_MAX) + ABORT("-K %s", optarg); + break; + case 'O': req->Oeidx_key = optarg - 1; break; // pad "O" prefix + case 'T': + req->threadid = strtoull(optarg, &end, 10); + if (*end || req->threadid == ULLONG_MAX) + ABORT("-T %s", optarg); + break; + case 'Q': + req->qpfxv[req->qpfxc++] = optarg; + if (MY_ARG_MAX == req->qpfxc) ABORT("too many -Q"); + break; + default: ABORT("bad switch `-%c'", c); + } + } + ERR_CLOSE(kfp, EXIT_FAILURE); // may ENOMEM, sets kbuf.srch + kbuf.srch->db = NULL; + kbuf.srch->qp = NULL; + kbuf.srch->qp_extra_done = false; + kbuf.srch->paths_len = size - offsetof(struct srch, paths); + if (kbuf.srch->paths_len <= 0) + ABORT("no -d args"); + s = (struct srch **)tsearch(kbuf.srch, &srch_tree, srch_cmp); + if (!s) err(EXIT_FAILURE, "tsearch"); // likely ENOMEM + req->srch = *s; + if (req->srch != kbuf.srch) { // reuse existing + free_srch(kbuf.srch); + req->srch->db->reopen(); + } else if (!srch_init(req)) { + assert(kbuf.srch == *((struct srch **)tfind( + kbuf.srch, &srch_tree, srch_cmp))); + void *del = tdelete(kbuf.srch, &srch_tree, srch_cmp); + assert(del); + free_srch(kbuf.srch); + goto cmd_err; // srch_init already warned + } + if (req->qpfxc && !req->srch->qp_extra_done) + srch_init_extra(req); + if (req->timeout_sec) + alarm(req->timeout_sec > UINT_MAX ? + UINT_MAX : (unsigned)req->timeout_sec); + try { + if (!req->fn(req)) + warnx("`%s' failed", req->argv[0]); + } catch (const Xapian::Error & e) { + warnx("Xapian::Error: %s", e.get_description().c_str()); + } catch (...) { + warn("unhandled exception"); + } + if (req->timeout_sec) + alarm(0); +cmd_err: + return; // just be silent on errors, for now +} + +static void cleanup_pids(void) +{ + free(worker_pids); + worker_pids = NULL; +} + +static void stderr_set(FILE *tmp_err) +{ +#if STDERR_ASSIGNABLE + if (my_setlinebuf(tmp_err)) + perror("W: setlinebuf(tmp_err)"); + stderr = tmp_err; + return; +#endif + int fd = fileno(tmp_err); + if (fd < 0) err(EXIT_FAILURE, "BUG: fileno(tmp_err)"); + while (dup2(fd, STDERR_FILENO) < 0) { + if (errno != EINTR) + err(EXIT_FAILURE, "dup2(%d => 2)", fd); + } +} + +static void stderr_restore(FILE *tmp_err) +{ +#if STDERR_ASSIGNABLE + stderr = orig_err; + return; +#endif + ERR_FLUSH(stderr); + while (dup2(orig_err_fd, STDERR_FILENO) < 0) { + if (errno != EINTR) + err(EXIT_FAILURE, "dup2(%d => 2)", orig_err_fd); + } + clearerr(stderr); +} + +static void sigw(int sig) // SIGTERM+SIGUSR1 handler for worker +{ + switch (sig) { + case SIGUSR1: worker_needs_reopen = 1; break; + default: sock_fd = -1; // break out of recv_loop + } +} + +#define CLEANUP_REQ __attribute__((__cleanup__(req_cleanup))) +static void req_cleanup(void *ptr) +{ + struct req *req = (struct req *)ptr; + free(req->lenv); +} + +static void reopen_logs(void) +{ + if (stdout_path && *stdout_path && !freopen(stdout_path, "a", stdout)) + err(EXIT_FAILURE, "freopen %s", stdout_path); + if (stderr_path && *stderr_path) { + if (!freopen(stderr_path, "a", stderr)) + err(EXIT_FAILURE, "freopen %s", stderr_path); + if (my_setlinebuf(stderr)) + err(EXIT_FAILURE, "setlinebuf(stderr)"); + } +} + +static void recv_loop(void) // worker process loop +{ + static char rbuf[4096 * 33]; // per-process + struct sigaction sa = {}; + sa.sa_handler = sigw; + + CHECK(int, 0, sigaction(SIGTERM, &sa, NULL)); + CHECK(int, 0, sigaction(SIGUSR1, &sa, NULL)); + + while (sock_fd == 0) { + size_t len = sizeof(rbuf); + CLEANUP_REQ struct req req = {}; + + if (!recv_req(&req, rbuf, &len)) + continue; + if (req.fp[1]) + stderr_set(req.fp[1]); + req.argc = (int)SPLIT2ARGV(req.argv, rbuf, len); + dispatch(&req); + ERR_CLOSE(req.fp[0], 0); + if (req.fp[1]) { + stderr_restore(req.fp[1]); + ERR_CLOSE(req.fp[1], 0); + } + if (worker_needs_reopen) { + worker_needs_reopen = 0; + reopen_logs(); + } + } +} + +static void insert_pid(pid_t pid, unsigned nr) +{ + assert(!worker_pids[nr]); + worker_pids[nr] = pid; +} + +static void start_worker(unsigned nr) +{ + pid_t pid = fork(); + if (pid < 0) { + warn("E: fork(worker=%u)", nr); + } else if (pid > 0) { + insert_pid(pid, nr); + } else { + cleanup_pids(); + xclose(pipefds[0]); + xclose(pipefds[1]); + if (signal(SIGCHLD, SIG_DFL) == SIG_ERR) + err(EXIT_FAILURE, "signal CHLD"); + if (signal(SIGTTIN, SIG_IGN) == SIG_ERR) + err(EXIT_FAILURE, "signal TTIN"); + if (signal(SIGTTOU, SIG_IGN) == SIG_ERR) + err(EXIT_FAILURE, "signal TTIN"); + recv_loop(); + exit(0); + } +} + +static void start_workers(void) +{ + sigset_t old; + + CHECK(int, 0, sigprocmask(SIG_SETMASK, &fullset, &old)); + for (unsigned long nr = 0; nr < nworker; nr++) { + if (!worker_pids[nr]) + start_worker(nr); + } + CHECK(int, 0, sigprocmask(SIG_SETMASK, &old, NULL)); +} + +static void cleanup_all(void) +{ + cleanup_pids(); +#ifdef __GLIBC__ + tdestroy(srch_tree, free_srch); + srch_tree = NULL; +#endif +} + +static void parent_reopen_logs(void) +{ + reopen_logs(); + for (unsigned long nr = nworker; nr < nworker_hwm; nr++) { + pid_t pid = worker_pids[nr]; + if (pid != 0 && kill(pid, SIGUSR1)) + warn("BUG?: kill(%d, SIGUSR1)", (int)pid); + } +} + +static void sigp(int sig) // parent signal handler +{ + static const char eagain[] = "signals coming in too fast"; + static const char bad_sig[] = "BUG: bad sig\n"; + static const char write_errno[] = "BUG: sigp write (errno)"; + static const char write_zero[] = "BUG: sigp write wrote zero bytes"; + char c = 0; + + switch (sig) { + case SIGCHLD: c = '.'; break; + case SIGTTOU: c = '-'; break; + case SIGTTIN: c = '+'; break; + case SIGUSR1: c = '#'; break; + default: + write(STDERR_FILENO, bad_sig, sizeof(bad_sig) - 1); + _exit(EXIT_FAILURE); + } + ssize_t w = write(pipefds[1], &c, 1); + if (w > 0) return; + if (w < 0 && errno == EAGAIN) { + write(STDERR_FILENO, eagain, sizeof(eagain) - 1); + return; + } else if (w == 0) { + write(STDERR_FILENO, write_zero, sizeof(write_zero) - 1); + } else { + // strerror isn't technically async-signal-safe, and + // strerrordesc_np+strerrorname_np isn't portable + write(STDERR_FILENO, write_errno, sizeof(write_errno) - 1); + } + _exit(EXIT_FAILURE); +} + +static void reaped_worker(pid_t pid, int st) +{ + unsigned long nr = 0; + for (; nr < nworker_hwm; nr++) { + if (worker_pids[nr] == pid) { + worker_pids[nr] = 0; + break; + } + } + if (nr >= nworker_hwm) { + warnx("W: unknown pid=%d reaped $?=%d", (int)pid, st); + return; + } + if (WIFEXITED(st) && WEXITSTATUS(st) == EX_NOINPUT) + alive = false; + else if (st) + warnx("worker[%lu] died $?=%d alive=%d", nr, st, (int)alive); + if (alive) + start_workers(); +} + +static void do_sigchld(void) +{ + while (1) { + int st; + pid_t pid = waitpid(-1, &st, WNOHANG); + if (pid > 0) { + reaped_worker(pid, st); + } else if (pid == 0) { + return; + } else { + switch (errno) { + case ECHILD: return; + case EINTR: break; // can it happen w/ WNOHANG? + default: err(EXIT_FAILURE, "BUG: waitpid"); + } + } + } +} + +static void do_sigttin(void) +{ + if (!alive) return; + if (nworker >= WORKER_MAX) { + warnx("workers cannot exceed %zu", (size_t)WORKER_MAX); + return; + } + void *p = realloc(worker_pids, (nworker + 1) * sizeof(pid_t)); + if (!p) { + warn("realloc worker_pids"); + } else { + worker_pids = (pid_t *)p; + worker_pids[nworker++] = 0; + if (nworker_hwm < nworker) + nworker_hwm = nworker; + start_workers(); + } +} + +static void do_sigttou(void) +{ + if (!alive || nworker <= 1) return; + + // worker_pids array does not shrink + --nworker; + for (unsigned long nr = nworker; nr < nworker_hwm; nr++) { + pid_t pid = worker_pids[nr]; + if (pid != 0 && kill(pid, SIGTERM)) + warn("BUG?: kill(%d, SIGTERM)", (int)pid); + } +} + +static size_t living_workers(void) +{ + size_t ret = 0; + + for (unsigned long nr = 0; nr < nworker_hwm; nr++) { + if (worker_pids[nr]) + ret++; + } + return ret; +} + +int main(int argc, char *argv[]) +{ + int c; + socklen_t slen = (socklen_t)sizeof(c); + stdout_path = getenv("STDOUT_PATH"); + stderr_path = getenv("STDERR_PATH"); + + if (getsockopt(sock_fd, SOL_SOCKET, SO_TYPE, &c, &slen)) + err(EXIT_FAILURE, "getsockopt"); + if (c != SOCK_SEQPACKET) + errx(EXIT_FAILURE, "stdin is not SOCK_SEQPACKET"); + + mail_nrp_init(); + code_nrp_init(); + atexit(cleanup_all); + + if (!STDERR_ASSIGNABLE) { + orig_err_fd = dup(STDERR_FILENO); + if (orig_err_fd < 0) + err(EXIT_FAILURE, "dup(2)"); + } + + nworker = 1; + // make warn/warnx/err multi-process friendly: + if (my_setlinebuf(stderr)) + err(EXIT_FAILURE, "setlinebuf(stderr)"); + // not using -W<workers> like Daemon.pm, since -W is reserved (glibc) + while ((c = getopt(argc, argv, "j:")) != -1) { + char *end; + + switch (c) { + case 'j': + nworker = strtoul(optarg, &end, 10); + if (*end != 0 || nworker > WORKER_MAX) + errx(EXIT_FAILURE, "-j %s invalid", optarg); + break; + case ':': + errx(EXIT_FAILURE, "missing argument: `-%c'", optopt); + case '?': + errx(EXIT_FAILURE, "unrecognized: `-%c'", optopt); + default: + errx(EXIT_FAILURE, "BUG: `-%c'", c); + } + } + sigset_t pset; // parent-only + CHECK(int, 0, sigfillset(&pset)); + + // global sigsets: + CHECK(int, 0, sigfillset(&fullset)); + CHECK(int, 0, sigfillset(&workerset)); + +#define DELSET(sig) do { \ + CHECK(int, 0, sigdelset(&fullset, sig)); \ + CHECK(int, 0, sigdelset(&workerset, sig)); \ + CHECK(int, 0, sigdelset(&pset, sig)); \ +} while (0) + DELSET(SIGABRT); + DELSET(SIGBUS); + DELSET(SIGFPE); + DELSET(SIGILL); + DELSET(SIGSEGV); + DELSET(SIGXCPU); + DELSET(SIGXFSZ); +#undef DELSET + CHECK(int, 0, sigdelset(&workerset, SIGUSR1)); + CHECK(int, 0, sigdelset(&fullset, SIGALRM)); + + if (nworker == 0) { // no SIGTERM handling w/o workers + recv_loop(); + return 0; + } + CHECK(int, 0, sigdelset(&workerset, SIGTERM)); + CHECK(int, 0, sigdelset(&workerset, SIGCHLD)); + nworker_hwm = nworker; + worker_pids = (pid_t *)calloc(nworker, sizeof(pid_t)); + if (!worker_pids) err(EXIT_FAILURE, "calloc"); + + if (pipe(pipefds)) err(EXIT_FAILURE, "pipe"); + int fl = fcntl(pipefds[1], F_GETFL); + if (fl == -1) err(EXIT_FAILURE, "F_GETFL"); + if (fcntl(pipefds[1], F_SETFL, fl | O_NONBLOCK)) + err(EXIT_FAILURE, "F_SETFL"); + + CHECK(int, 0, sigdelset(&pset, SIGCHLD)); + CHECK(int, 0, sigdelset(&pset, SIGTTIN)); + CHECK(int, 0, sigdelset(&pset, SIGTTOU)); + CHECK(int, 0, sigdelset(&pset, SIGUSR1)); + + struct sigaction sa = {}; + sa.sa_handler = sigp; + + CHECK(int, 0, sigaction(SIGUSR1, &sa, NULL)); + CHECK(int, 0, sigaction(SIGTTIN, &sa, NULL)); + CHECK(int, 0, sigaction(SIGTTOU, &sa, NULL)); + sa.sa_flags = SA_NOCLDSTOP; + CHECK(int, 0, sigaction(SIGCHLD, &sa, NULL)); + + CHECK(int, 0, sigprocmask(SIG_SETMASK, &pset, NULL)); + + start_workers(); + + char sbuf[64]; + while (alive || living_workers()) { + ssize_t n = read(pipefds[0], &sbuf, sizeof(sbuf)); + if (n < 0) { + if (errno == EINTR) continue; + err(EXIT_FAILURE, "read"); + } else if (n == 0) { + errx(EXIT_FAILURE, "read EOF"); + } + do_sigchld(); + for (ssize_t i = 0; i < n; i++) { + switch (sbuf[i]) { + case '.': break; // do_sigchld already called + case '-': do_sigttou(); break; + case '+': do_sigttin(); break; + case '#': parent_reopen_logs(); break; + default: errx(EXIT_FAILURE, "BUG: c=%c", sbuf[i]); + } + } + } + + return 0; +} diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h new file mode 100644 index 00000000..311ca05f --- /dev/null +++ b/lib/PublicInbox/xh_cidx.h @@ -0,0 +1,277 @@ +// Copyright (C) all contributors <meta@public-inbox.org> +// License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt> +// This file is only intended to be included by xap_helper.h +// it implements pieces used by CodeSearchIdx.pm + +static void term_length_extract(struct req *req) +{ + req->lenv = (size_t *)calloc(req->pfxc, sizeof(size_t)); + if (!req->lenv) + EABORT("lenv = calloc(%d %zu)", req->pfxc, sizeof(size_t)); + for (int i = 0; i < req->pfxc; i++) { + char *pfx = req->pfxv[i]; + // extract trailing digits as length: + // $len = s/([0-9]+)\z// ? ($1+0) : 0 + for (size_t j = 0; pfx[j]; j++) { + if (pfx[j] < '0' || pfx[j] > '9') + continue; + if (j == 0) { + warnx("W: `%s' not a valid prefix", pfx); + continue; + } + char *end; + unsigned long long tmp = strtoull(pfx + j, &end, 10); + if (*end || tmp >= (unsigned long long)SIZE_MAX) { + warnx("W: `%s' not recognized", pfx); + } else { + req->lenv[i] = (size_t)tmp; + pfx[j] = 0; + break; + } + } + } +} + +static void dump_ibx_term(struct req *req, int p, + Xapian::Document *doc, const char *ibx_id) +{ + Xapian::TermIterator cur = doc->termlist_begin(); + Xapian::TermIterator end = doc->termlist_end(); + const char *pfx = req->pfxv[p]; + size_t pfx_len = strlen(pfx); + size_t term_len = req->lenv[p]; + + for (cur.skip_to(pfx); cur != end; cur++) { + std::string tn = *cur; + if (!starts_with(&tn, pfx, pfx_len)) break; + if (term_len > 0 && (tn.length() - pfx_len) != term_len) + continue; + fprintf(req->fp[0], "%s %s\n", tn.c_str() + pfx_len, ibx_id); + ++req->nr_out; + } +} + +static enum exc_iter dump_ibx_iter(struct req *req, const char *ibx_id, + Xapian::MSetIterator *i) +{ + try { + Xapian::Document doc = i->get_document(); + for (int p = 0; p < req->pfxc; p++) + dump_ibx_term(req, p, &doc, ibx_id); + } catch (const Xapian::DatabaseModifiedError & e) { + req->srch->db->reopen(); + return ITER_RETRY; + } catch (const Xapian::DocNotFoundError & e) { // oh well... + warnx("doc not found: %s", e.get_description().c_str()); + } + return ITER_OK; +} + +static bool cmd_dump_ibx(struct req *req) +{ + if ((optind + 1) >= req->argc) + ABORT("usage: dump_ibx [OPTIONS] IBX_ID QRY_STR"); + if (!req->pfxc) + ABORT("dump_ibx requires -A PREFIX"); + + const char *ibx_id = req->argv[optind]; + if (my_setlinebuf(req->fp[0])) // for sort(1) pipe + EABORT("setlinebuf(fp[0])"); // WTF? + req->asc = true; + req->sort_col = -1; + term_length_extract(req); + Xapian::MSet mset = mail_mset(req, req->argv[optind + 1]); + + // @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine + // in case we need to retry on DB reopens + for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { + for (int t = 10; t > 0; --t) + switch (dump_ibx_iter(req, ibx_id, &i)) { + case ITER_OK: t = 0; break; // leave inner loop + case ITER_RETRY: break; // continue for-loop + case ITER_ABORT: return false; // error + } + } + emit_mset_stats(req, &mset); + return true; +} + +struct dump_roots_tmp { + struct stat sb; + void *mm_ptr; + char **entries; + struct fbuf wbuf; + int root2off_fd; +}; + +#define CLEANUP_DUMP_ROOTS __attribute__((__cleanup__(dump_roots_ensure))) +static void dump_roots_ensure(void *ptr) +{ + struct dump_roots_tmp *drt = (struct dump_roots_tmp *)ptr; + if (drt->root2off_fd >= 0) + xclose(drt->root2off_fd); + hdestroy(); // idempotent + size_t size = off2size(drt->sb.st_size); + if (drt->mm_ptr && munmap(drt->mm_ptr, size)) + EABORT("BUG: munmap(%p, %zu)", drt->mm_ptr, size); + free(drt->entries); + fbuf_ensure(&drt->wbuf); +} + +static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc) +{ + Xapian::TermIterator cur = doc->termlist_begin(); + Xapian::TermIterator end = doc->termlist_end(); + ENTRY e, *ep; + fbuf_init(root_offs); + for (cur.skip_to("G"); cur != end; cur++) { + std::string tn = *cur; + if (!starts_with(&tn, "G", 1)) break; + union { const char *in; char *out; } u; + u.in = tn.c_str() + 1; + e.key = u.out; + ep = hsearch(e, FIND); + if (!ep) ABORT("hsearch miss `%s'", e.key); + // ep->data is a NUL-terminated string matching /[0-9]+/ + fputc(' ', root_offs->fp); + fputs((const char *)ep->data, root_offs->fp); + } + fputc('\n', root_offs->fp); + ERR_CLOSE(root_offs->fp, EXIT_FAILURE); // ENOMEM + root_offs->fp = NULL; + return true; +} + +// writes term values matching @pfx for a given @doc, ending the line +// with the contents of @root_offs +static void dump_roots_term(struct req *req, int p, + struct dump_roots_tmp *drt, + struct fbuf *root_offs, + Xapian::Document *doc) +{ + Xapian::TermIterator cur = doc->termlist_begin(); + Xapian::TermIterator end = doc->termlist_end(); + const char *pfx = req->pfxv[p]; + size_t pfx_len = strlen(pfx); + size_t term_len = req->lenv[p]; + + for (cur.skip_to(pfx); cur != end; cur++) { + std::string tn = *cur; + if (!starts_with(&tn, pfx, pfx_len)) break; + if (term_len > 0 && (tn.length() - pfx_len) != term_len) + continue; + fputs(tn.c_str() + pfx_len, drt->wbuf.fp); + fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp); + ++req->nr_out; + } +} + +// we may have lines which exceed PIPE_BUF, so we do our own +// buffering and rely on flock(2), here +static bool dump_roots_flush(struct req *req, struct dump_roots_tmp *drt) +{ + bool ok = true; + off_t off = ftello(drt->wbuf.fp); + if (off < 0) EABORT("ftello"); + if (!off) return ok; + + ERR_FLUSH(drt->wbuf.fp); // ENOMEM + int fd = fileno(req->fp[0]); + + while (flock(drt->root2off_fd, LOCK_EX)) { + if (errno == EINTR) continue; + err(EXIT_FAILURE, "LOCK_EX"); // ENOLCK? + } + ok = write_all(fd, &drt->wbuf, (size_t)off); + while (flock(drt->root2off_fd, LOCK_UN)) { + if (errno == EINTR) continue; + err(EXIT_FAILURE, "LOCK_UN"); // ENOLCK? + } + if (fseeko(drt->wbuf.fp, 0, SEEK_SET)) EABORT("fseeko"); + return ok; +} + +static enum exc_iter dump_roots_iter(struct req *req, + struct dump_roots_tmp *drt, + Xapian::MSetIterator *i) +{ + CLEANUP_FBUF struct fbuf root_offs = {}; // " $ID0 $ID1 $IDx..\n" + try { + Xapian::Document doc = i->get_document(); + if (!root2offs_str(&root_offs, &doc)) + return ITER_ABORT; // bad request, abort + for (int p = 0; p < req->pfxc; p++) + dump_roots_term(req, p, drt, &root_offs, &doc); + } catch (const Xapian::DatabaseModifiedError & e) { + req->srch->db->reopen(); + return ITER_RETRY; + } catch (const Xapian::DocNotFoundError & e) { // oh well... + warnx("doc not found: %s", e.get_description().c_str()); + } + return ITER_OK; +} + +static bool cmd_dump_roots(struct req *req) +{ + CLEANUP_DUMP_ROOTS struct dump_roots_tmp drt = {}; + drt.root2off_fd = -1; + if ((optind + 1) >= req->argc) + ABORT("usage: dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR"); + if (!req->pfxc) + ABORT("dump_roots requires -A PREFIX"); + const char *root2off_file = req->argv[optind]; + drt.root2off_fd = open(root2off_file, O_RDONLY); + if (drt.root2off_fd < 0) + EABORT("open(%s)", root2off_file); + if (fstat(drt.root2off_fd, &drt.sb)) // ENOMEM? + err(EXIT_FAILURE, "fstat(%s)", root2off_file); + // each entry is at least 43 bytes ({OIDHEX}\0{INT}\0), + // so /32 overestimates the number of expected entries by + // ~%25 (as recommended by Linux hcreate(3) manpage) + size_t size = off2size(drt.sb.st_size); + size_t est = (size / 32) + 1; //+1 for "\0" termination + drt.mm_ptr = mmap(NULL, size, PROT_READ, + MAP_PRIVATE, drt.root2off_fd, 0); + if (drt.mm_ptr == MAP_FAILED) + err(EXIT_FAILURE, "mmap(%zu, %s)", size, root2off_file); + size_t asize = est * 2; + if (asize < est) ABORT("too many entries: %zu", est); + drt.entries = (char **)calloc(asize, sizeof(char *)); + if (!drt.entries) + err(EXIT_FAILURE, "calloc(%zu * 2, %zu)", est, sizeof(char *)); + size_t tot = split2argv(drt.entries, (char *)drt.mm_ptr, size, asize); + if (tot <= 0) return false; // split2argv already warned on error + if (!hcreate(est)) + err(EXIT_FAILURE, "hcreate(%zu)", est); + for (size_t i = 0; i < tot; ) { + ENTRY e; + e.key = hsearch_enter_key(drt.entries[i++]); // dies on ENOMEM + e.data = drt.entries[i++]; + if (!hsearch(e, ENTER)) + err(EXIT_FAILURE, "hsearch(%s => %s, ENTER)", e.key, + (const char *)e.data); + } + req->asc = true; + req->sort_col = -1; + Xapian::MSet mset = commit_mset(req, req->argv[optind + 1]); + term_length_extract(req); + + fbuf_init(&drt.wbuf); + + // @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine + // in case we need to retry on DB reopens + for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { + for (int t = 10; t > 0; --t) + switch (dump_roots_iter(req, &drt, &i)) { + case ITER_OK: t = 0; break; // leave inner loop + case ITER_RETRY: break; // continue for-loop + case ITER_ABORT: return false; // error + } + if (!(req->nr_out & 0x3fff) && !dump_roots_flush(req, &drt)) + return false; + } + if (!dump_roots_flush(req, &drt)) + return false; + emit_mset_stats(req, &mset); + return true; +} diff --git a/lib/PublicInbox/xh_mset.h b/lib/PublicInbox/xh_mset.h new file mode 100644 index 00000000..db2692c9 --- /dev/null +++ b/lib/PublicInbox/xh_mset.h @@ -0,0 +1,53 @@ +// Copyright (C) all contributors <meta@public-inbox.org> +// License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt> +// This file is only intended to be included by xap_helper.h +// it implements pieces used by WWW, IMAP and lei + +#ifndef WBUF_FLUSH_THRESHOLD +# define WBUF_FLUSH_THRESHOLD (BUFSIZ - 1000) +#endif +#if WBUF_FLUSH_THRESHOLD < 0 +# undef WBUF_FLUSH_THRESHOLD +# define WBUF_FLUSH_THRESHOLD BUFSIZ +#endif + +static bool cmd_mset(struct req *req) +{ + if (optind >= req->argc) ABORT("usage: mset [OPTIONS] WANT QRY_STR"); + if (req->fp[1]) ABORT("mset only accepts 1 FD"); + const char *qry_str = req->argv[optind]; + CLEANUP_FBUF struct fbuf wbuf = {}; + Xapian::MSet mset = req->code_search ? commit_mset(req, qry_str) : + mail_mset(req, qry_str); + fbuf_init(&wbuf); + fprintf(wbuf.fp, "mset.size=%llu .get_matches_estimated=%llu\n", + (unsigned long long)mset.size(), + (unsigned long long)mset.get_matches_estimated()); + int fd = fileno(req->fp[0]); + for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { + off_t off = ftello(wbuf.fp); + if (off < 0) EABORT("ftello"); + /* + * TODO verify our fflush + fseeko use isn't affected by a + * glibc <2.25 bug: + * https://sourceware.org/bugzilla/show_bug.cgi?id=20181 + * CentOS 7.x only has glibc 2.17. In any case, bug #20181 + * shouldn't affect us since our use of fseeko is used to + * effectively discard data. + */ + if (off > WBUF_FLUSH_THRESHOLD) { + ERR_FLUSH(wbuf.fp); + if (!write_all(fd, &wbuf, (size_t)off)) return false; + if (fseeko(wbuf.fp, 0, SEEK_SET)) EABORT("fseeko"); + off = 0; + } + fprintf(wbuf.fp, "%llu" "%c" "%d" "%c" "%llu\n", + (unsigned long long)(*i), // get_docid + 0, i.get_percent(), + 0, (unsigned long long)i.get_rank()); + } + off_t off = ftello(wbuf.fp); + if (off < 0) EABORT("ftello"); + ERR_FLUSH(wbuf.fp); + return off > 0 ? write_all(fd, &wbuf, (size_t)off) : true; +} |