From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 3E5041F9FF for ; Wed, 17 Mar 2021 07:02:19 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 2/5] extindex: add some validation and config knobs for WWW Date: Tue, 16 Mar 2021 23:02:15 -0800 Message-Id: <20210317070218.7971-3-e@80x24.org> In-Reply-To: <20210317070218.7971-1-e@80x24.org> References: <20210317070218.7971-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We'll try to share a bit more configuration with extindex entries for WWW PSGI usage. --- MANIFEST | 1 + lib/PublicInbox/Config.pm | 76 +++++++++++++++++--------------- lib/PublicInbox/ExtSearch.pm | 2 +- lib/PublicInbox/TestCommon.pm | 28 +++++++++++- lib/PublicInbox/WwwAtomStream.pm | 6 ++- t/config.t | 4 +- t/extindex-psgi.t | 48 ++++++++++++++++++++ t/psgi_v2.t | 30 +++---------- 8 files changed, 130 insertions(+), 65 deletions(-) create mode 100644 t/extindex-psgi.t diff --git a/MANIFEST b/MANIFEST index 49c10d62..775de5cd 100644 --- a/MANIFEST +++ b/MANIFEST @@ -324,6 +324,7 @@ t/eml.t t/eml_content_disposition.t t/eml_content_type.t t/epoll.t +t/extindex-psgi.t t/extsearch.t t/fail-bin/spamc t/fake_inotify.t diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 87a03fd3..113975dd 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -88,12 +88,12 @@ sub lookup_list_id { sub lookup_name ($$) { my ($self, $name) = @_; - $self->{-by_name}->{$name} // _fill($self, "publicinbox.$name"); + $self->{-by_name}->{$name} // _fill_ibx($self, $name); } sub lookup_ei { my ($self, $name) = @_; - $self->{-ei_by_name}->{$name} //= _fill_ei($self, "extindex.$name"); + $self->{-ei_by_name}->{$name} //= _fill_ei($self, $name); } # special case for [extindex "all"] @@ -167,8 +167,8 @@ sub git_config_dump { $rv; } -sub valid_inbox_name ($) { - my ($name) = @_; +sub valid_foo_name ($;$) { + my ($name, $pfx) = @_; # Similar rules found in git.git/remote.c::valid_remote_nick # and git.git/refs.c::check_refname_component @@ -176,6 +176,7 @@ sub valid_inbox_name ($) { if ($name eq '' || $name =~ /\@\{/ || $name =~ /\.\./ || $name =~ m![/:\?\[\]\^~\s\f[:cntrl:]\*]! || $name =~ /\A\./ || $name =~ /\.\z/) { + warn "invalid $pfx name: `$name'\n" if $pfx; return 0; } @@ -375,26 +376,26 @@ sub rel2abs_collapsed { Cwd::abs_path($p); } -sub _fill { - my ($self, $pfx) = @_; - my $ibx = {}; +sub _one_val { + my ($self, $pfx, $k) = @_; + my $v = $self->{"$pfx.$k"} // return; + return $v if !ref($v); + warn "W: $pfx.$k has multiple values, only using `$v->[-1]'\n"; + $v->[-1]; +} +sub _fill_ibx { + my ($self, $name) = @_; + my $pfx = "publicinbox.$name"; + my $ibx = {}; for my $k (qw(watch nntpserver)) { my $v = $self->{"$pfx.$k"}; $ibx->{$k} = $v if defined $v; } for my $k (qw(filter inboxdir newsgroup replyto httpbackendmax feedmax indexlevel indexsequentialshard)) { - if (defined(my $v = $self->{"$pfx.$k"})) { - if (ref($v) eq 'ARRAY') { - warn <[-1]' -EOF - $ibx->{$k} = $v->[-1]; - } else { - $ibx->{$k} = $v; - } - } + my $v = _one_val($self, $pfx, $k) // next; + $ibx->{$k} = $v; } # "mainrepo" is backwards compatibility: @@ -403,9 +404,8 @@ EOF warn "E: `$dir' must not contain `\\n'\n"; return; } - foreach my $k (qw(obfuscate)) { - my $v = $self->{"$pfx.$k"}; - defined $v or next; + for my $k (qw(obfuscate)) { + my $v = $self->{"$pfx.$k"} // next; if (defined(my $bval = git_bool($v))) { $ibx->{$k} = $bval; } else { @@ -414,19 +414,13 @@ EOF } # TODO: more arrays, we should support multi-value for # more things to encourage decentralization - foreach my $k (qw(address altid nntpmirror coderepo hide listid url + for my $k (qw(address altid nntpmirror coderepo hide listid url infourl watchheader)) { - if (defined(my $v = $self->{"$pfx.$k"})) { - $ibx->{$k} = _array($v); - } - } - - my $name = substr($pfx, length('publicinbox.')); - if (!valid_inbox_name($name)) { - warn "invalid inbox name: '$name'\n"; - return; + my $v = $self->{"$pfx.$k"} // next; + $ibx->{$k} = _array($v); } + return unless valid_foo_name($name, 'publicinbox'); $ibx->{name} = $name; $ibx->{-pi_cfg} = $self; $ibx = PublicInbox::Inbox->new($ibx); @@ -473,14 +467,13 @@ EOF $ibx->{-no_obfuscate_re} = $self->{-no_obfuscate_re}; fill_all($self); # noop to populate -no_obfuscate } - if (my $ibx_code_repos = $ibx->{coderepo}) { my $code_repos = $self->{-code_repos}; my $repo_objs = $ibx->{-repo_objs} = []; foreach my $nick (@$ibx_code_repos) { my @parts = split(m!/!, $nick); my $valid = 0; - $valid += valid_inbox_name($_) foreach (@parts); + $valid += valid_foo_name($_) foreach (@parts); $valid == scalar(@parts) or next; my $repo = $code_repos->{$nick} //= @@ -496,10 +489,23 @@ EOF } sub _fill_ei ($$) { - my ($self, $pfx) = @_; + my ($self, $name) = @_; eval { require PublicInbox::ExtSearch } or return; - my $d = $self->{"$pfx.topdir"}; - defined($d) && -d $d ? PublicInbox::ExtSearch->new($d) : undef; + my $pfx = "extindex.$name"; + my $d = $self->{"$pfx.topdir"} // return; + -d $d or return; + my $es = PublicInbox::ExtSearch->new($d); + for my $k (qw(indexlevel indexsequentialshard)) { + my $v = _one_val($self, $pfx, $k) // next; + $es->{$k} = $v; + } + for my $k (qw(altid coderepo hide url infourl)) { + my $v = $self->{"$pfx.$k"} // next; + $es->{$k} = _array($v); + } + return unless valid_foo_name($name, 'extindex'); + $es->{name} = $name; + $es; } sub urlmatch { diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm index 8ba4d396..c2cfc338 100644 --- a/lib/PublicInbox/ExtSearch.pm +++ b/lib/PublicInbox/ExtSearch.pm @@ -107,10 +107,10 @@ sub description { sub cloneurl { [] } # TODO -sub base_url { 'https://example.com/TODO/' } sub nntp_url { [] } no warnings 'once'; +*base_url = \&PublicInbox::Inbox::base_url; *smsg_eml = \&PublicInbox::Inbox::smsg_eml; *smsg_by_mid = \&PublicInbox::Inbox::smsg_by_mid; *msg_by_mid = \&PublicInbox::Inbox::msg_by_mid; diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm index c2d07e59..0d15514e 100644 --- a/lib/PublicInbox/TestCommon.pm +++ b/lib/PublicInbox/TestCommon.pm @@ -15,7 +15,8 @@ BEGIN { @EXPORT = qw(tmpdir tcp_server tcp_connect require_git require_mods run_script start_script key2sub xsys xsys_e xqx eml_load tick have_xapian_compact json_utf8 setup_public_inboxes create_inbox - tcp_host_port test_lei lei lei_ok $lei_out $lei_err $lei_opt); + tcp_host_port test_lei lei lei_ok $lei_out $lei_err $lei_opt + test_httpd); require Test::More; my @methods = grep(!/\W/, @Test::More::EXPORT); eval(join('', map { "*$_=\\&Test::More::$_;" } @methods)); @@ -636,6 +637,31 @@ sub create_inbox ($$;@) { $ibx; } +sub test_httpd ($$;$) { + my ($env, $client, $skip) = @_; + for (qw(PI_CONFIG TMPDIR)) { + $env->{$_} or BAIL_OUT "$_ unset"; + } + SKIP: { + require_mods(qw(Plack::Test::ExternalServer), $skip // 1); + my $sock = tcp_server() or die; + my ($out, $err) = map { "$env->{TMPDIR}/std$_.log" } qw(out err); + my $cmd = [ qw(-httpd -W0), "--stdout=$out", "--stderr=$err" ]; + my $td = start_script($cmd, $env, { 3 => $sock }); + my ($h, $p) = tcp_host_port($sock); + local $ENV{PLACK_TEST_EXTERNALSERVER_URI} = "http://$h:$p"; + Plack::Test::ExternalServer::test_psgi(client => $client); + $td->join('TERM'); + open my $fh, '<', $err or BAIL_OUT $!; + my $e = do { local $/; <$fh> }; + if ($e =~ s/^Plack::Middleware::ReverseProxy missing,\n//gms) { + $e =~ s/^URL generation for redirects .*\n//gms; + } + is($e, '', 'no errors'); + } +}; + + package PublicInboxTestProcess; use strict; diff --git a/lib/PublicInbox/WwwAtomStream.pm b/lib/PublicInbox/WwwAtomStream.pm index 361e61f6..f60251b7 100644 --- a/lib/PublicInbox/WwwAtomStream.pm +++ b/lib/PublicInbox/WwwAtomStream.pm @@ -100,7 +100,11 @@ sub atom_header { } else { $title = title_tag($ibx->description); $self_url .= 'new.atom'; - $page_id = "mailto:$ibx->{-primary_address}"; + if (defined(my $addr = $ibx->{-primary_address})) { + $page_id = "mailto:$addr"; + } else { + $page_id = to_uuid($self_url); + } } qq(\n) . qq( 1% (parens) &more eql= +plus], '#hash'; for my $s (@valid) { - ok(PublicInbox::Config::valid_inbox_name($s), "`$s' name accepted"); + ok(PublicInbox::Config::valid_foo_name($s), "`$s' name accepted"); } { diff --git a/t/extindex-psgi.t b/t/extindex-psgi.t new file mode 100644 index 00000000..6f62b5a0 --- /dev/null +++ b/t/extindex-psgi.t @@ -0,0 +1,48 @@ +#!perl -w +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ +use strict; +use v5.10.1; +use PublicInbox::TestCommon; +use PublicInbox::Config; +use File::Copy qw(cp); +use IO::Handle (); +require_git(2.6); +require_mods(qw(json DBD::SQLite Search::Xapian + HTTP::Request::Common Plack::Test URI::Escape Plack::Builder)); +use_ok($_) for (qw(HTTP::Request::Common Plack::Test)); +require PublicInbox::WWW; +my ($ro_home, $cfg_path) = setup_public_inboxes; +my ($tmpdir, $for_destroy) = tmpdir; +my $home = "$tmpdir/home"; +mkdir $home or BAIL_OUT $!; +mkdir "$home/.public-inbox" or BAIL_OUT $!; +my $pi_config = "$home/.public-inbox/config"; +cp("$ro_home/.public-inbox/config", $pi_config) or BAIL_OUT; +my $env = { HOME => $home }; +run_script([qw(-extindex --all), "$tmpdir/eidx"], $env) or BAIL_OUT; +{ + open my $cfgfh, '>', $pi_config or BAIL_OUT; + $cfgfh->autoflush(1); + print $cfgfh <new(PublicInbox::Config->new($pi_config)); +my $client = sub { + my ($cb) = @_; + my $res = $cb->(GET('/all/')); + is($res->code, 200, '/all/ good'); + $res = $cb->(GET('/all/new.atom', Host => 'usethis.example.com')); + like($res->content, qr!http://usethis\.example\.com/!s, + 'Host: header respected in Atom feed'); + unlike($res->content, qr!http://bogus\.example\.com/!s, + 'default URL ignored with different host header'); +}; +test_psgi(sub { $www->call(@_) }, $client); +%$env = (%$env, TMPDIR => $tmpdir, PI_CONFIG => $pi_config); +test_httpd($env, $client); + +done_testing; diff --git a/t/psgi_v2.t b/t/psgi_v2.t index 487317b6..1f190708 100644 --- a/t/psgi_v2.t +++ b/t/psgi_v2.t @@ -56,27 +56,6 @@ EOF close $fh or BAIL_OUT; } -my $run_httpd = sub { - my ($client, $skip) = @_; - SKIP: { - require_mods(qw(Plack::Test::ExternalServer), $skip); - my $env = { PI_CONFIG => $cfgpath }; - my $sock = tcp_server() or die; - my ($out, $err) = map { "$tmpdir/std$_.log" } qw(out err); - my $cmd = [ qw(-httpd -W0), "--stdout=$out", "--stderr=$err" ]; - my $td = start_script($cmd, $env, { 3 => $sock }); - my ($h, $p) = tcp_host_port($sock); - local $ENV{PLACK_TEST_EXTERNALSERVER_URI} = "http://$h:$p"; - Plack::Test::ExternalServer::test_psgi(client => $client); - $td->join('TERM'); - open my $fh, '<', $err or BAIL_OUT $!; - my $e = do { local $/; <$fh> }; - if ($e =~ s/^Plack::Middleware::ReverseProxy missing,\n//gms) { - $e =~ s/^URL generation for redirects .*\n//gms; - } - is($e, '', 'no errors'); - } -}; my $msg = $ibx->msg_by_mid('a-mid@b'); like($$msg, qr/\AFrom oldbug/s, '"From_" line stored to test old bug workaround'); @@ -115,7 +94,8 @@ my $client0 = sub { 'new.html ordering is chronological'); }; test_psgi(sub { $www->call(@_) }, $client0); -$run_httpd->($client0, 9); +my $env = { TMPDIR => $tmpdir, PI_CONFIG => $cfgpath }; +test_httpd($env, $client0, 9); $eml->header_set('Message-ID', 'a-mid@b'); $eml->body_set("hello ghosts\n"); @@ -225,7 +205,7 @@ my $client1 = sub { }; test_psgi(sub { $www->call(@_) }, $client1); -$run_httpd->($client1, 38); +test_httpd($env, $client1, 38); { my $exp = [ qw( ) ]; @@ -267,7 +247,7 @@ my $client2 = sub { }; test_psgi(sub { $www->call(@_) }, $client2); -$run_httpd->($client2, 8); +test_httpd($env, $client2, 8); { # ensure conflicted attachments can be resolved local $SIG{__WARN__} = sub {}; @@ -298,6 +278,6 @@ my $client3 = sub { is_deeply(\@warn, [], 'no warnings on YYYYMMDD only'); }; test_psgi(sub { $www->call(@_) }, $client3); -$run_httpd->($client3, 4); +test_httpd($env, $client3, 4); done_testing;