From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id E98AF1F5A2 for ; Wed, 1 Jan 2020 10:39:00 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 6/6] wwwstatic: add directory listing + index.html support Date: Wed, 1 Jan 2020 10:38:59 +0000 Message-Id: <20200101103859.15401-7-e@80x24.org> In-Reply-To: <20200101103859.15401-1-e@80x24.org> References: <20200101103859.15401-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: It's now possible to use WwwStatic as a standalone PSGI app to serve static files and recreate the award-winning web design of https://public-inbox.org/ :> --- MANIFEST | 1 + lib/PublicInbox/Cgit.pm | 6 +- lib/PublicInbox/WWW.pm | 15 +-- lib/PublicInbox/WwwStatic.pm | 198 ++++++++++++++++++++++++++++++++++- t/www_static.t | 96 +++++++++++++++++ 5 files changed, 294 insertions(+), 22 deletions(-) create mode 100644 t/www_static.t diff --git a/MANIFEST b/MANIFEST index f649bbef..16c92c36 100644 --- a/MANIFEST +++ b/MANIFEST @@ -290,6 +290,7 @@ t/watch_filter_rubylang.t t/watch_maildir.t t/watch_maildir_v2.t t/www_listing.t +t/www_static.t t/xcpdb-reshard.t xt/git-http-backend.t xt/git_async_cmp.t diff --git a/lib/PublicInbox/Cgit.pm b/lib/PublicInbox/Cgit.pm index c0b1a73b..c42f8847 100644 --- a/lib/PublicInbox/Cgit.pm +++ b/lib/PublicInbox/Cgit.pm @@ -16,7 +16,6 @@ use PublicInbox::Git; use warnings; use PublicInbox::Qspawn; use PublicInbox::WwwStatic qw(r); -use Plack::MIME; sub locate_cgit ($) { my ($pi_config) = @_; @@ -114,9 +113,8 @@ sub call { } } elsif ($path_info =~ m!$self->{static}! && defined($cgit_data = $self->{cgit_data})) { - my $f = $1; - return PublicInbox::WwwStatic::response($env, [], $cgit_data.$f, - Plack::MIME->mime_type($f)); + my $f = $cgit_data.$1; # {static} only matches leading slash + return PublicInbox::WwwStatic::response($env, [], $f); } my $cgi_env = { PATH_INFO => $path_info }; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 99f9f1dc..efe7c8ca 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -22,7 +22,7 @@ use PublicInbox::MID qw(mid_escape); require PublicInbox::Git; use PublicInbox::GitHTTPBackend; use PublicInbox::UserContent; -use PublicInbox::WwwStatic qw(r); +use PublicInbox::WwwStatic qw(r path_info_raw); # TODO: consider a routing tree now that we have more endpoints: our $INBOX_RE = qr!\A/([\w\-][\w\.\-]*)!; @@ -43,19 +43,6 @@ sub run { PublicInbox::WWW->new->call($req->env); } -# PATH_INFO is decoded, and we want the undecoded original -my %path_re_cache; -sub path_info_raw ($) { - my ($env) = @_; - my $sn = $env->{SCRIPT_NAME}; - my $re = $path_re_cache{$sn} ||= do { - $sn = '/'.$sn unless index($sn, '/') == 0; - $sn =~ s!/\z!!; - qr!\A(?:https?://[^/]+)?\Q$sn\E(/[^\?\#]+)!; - }; - $env->{REQUEST_URI} =~ $re ? $1 : $env->{PATH_INFO}; -} - sub call { my ($self, $env) = @_; my $ctx = { env => $env, www => $self }; diff --git a/lib/PublicInbox/WwwStatic.pm b/lib/PublicInbox/WwwStatic.pm index ce4bfe9b..bc42236e 100644 --- a/lib/PublicInbox/WwwStatic.pm +++ b/lib/PublicInbox/WwwStatic.pm @@ -1,19 +1,48 @@ # Copyright (C) 2016-2019 all contributors # License: AGPL-3.0+ +# This package can either be a PSGI response body for a static file +# OR a standalone PSGI app which returns the above PSGI response body +# (or an HTML directory listing). +# +# It encapsulates the "autoindex", "index", and "gzip_static" +# functionality of nginx. package PublicInbox::WwwStatic; use strict; use parent qw(Exporter); +use bytes (); use Fcntl qw(SEEK_SET O_RDONLY O_NONBLOCK); +use POSIX qw(strftime lround); use HTTP::Date qw(time2str); use HTTP::Status qw(status_message); use Errno qw(EACCES ENOTDIR ENOENT); -our @EXPORT_OK = qw(@NO_CACHE r); +use URI::Escape qw(uri_escape_utf8); +use PublicInbox::Hval qw(ascii_html); +use Plack::MIME; +our @EXPORT_OK = qw(@NO_CACHE r path_info_raw); our @NO_CACHE = ('Expires', 'Fri, 01 Jan 1980 00:00:00 GMT', 'Pragma', 'no-cache', 'Cache-Control', 'no-cache, max-age=0, must-revalidate'); +our $STYLE = <<'EOF'; + +EOF + +$STYLE =~ s/^\s*//gm; +$STYLE =~ tr/\n//d; + sub r ($;$) { my ($code, $msg) = @_; $msg ||= status_message($code); @@ -69,8 +98,28 @@ sub prepare_range { ($code, $len); } -sub response { +# returns a PSGI arrayref response iff .gz and non-.gz mtimes match +sub try_gzip_static ($$$$) { my ($env, $h, $path, $type) = @_; + return unless ($env->{HTTP_ACCEPT_ENCODING} // '') =~ /\bgzip\b/i; + my $mtime; + return unless -f $path && defined(($mtime = (stat(_))[9])); + my $gz = "$path.gz"; + return unless -f $gz && (stat(_))[9] == $mtime; + my $res = response($env, $h, $gz, $type); + return if ($res->[0] > 300 || $res->[0] < 200); + push @{$res->[1]}, qw(Cache-Control no-transform Content-Encoding gzip); + $res; +} + +sub response ($$$;$) { + my ($env, $h, $path, $type) = @_; + $type //= Plack::MIME->mime_type($path) // 'application/octet-stream'; + if ($path !~ /\.gz\z/i) { + if (my $res = try_gzip_static($env, $h, $path, $type)) { + return $res; + } + } my $in; if ($env->{REQUEST_METHOD} eq 'HEAD') { @@ -108,7 +157,7 @@ sub response { [ $code, $h, $body ]; } -# called by PSGI servers: +# called by PSGI servers on each response chunk: sub getline { my ($self) = @_; my $len = $self->{len} or return; # undef, tells server we're done @@ -132,6 +181,147 @@ sub getline { undef; } -sub close {} # noop, just let everything go out-of-scope +sub close {} # noop, called by PSGI server, just let everything go out-of-scope + +# OO interface for use as a Plack app +sub new { + my ($class, %opt) = @_; + my $index = $opt{'index'} // [ 'index.html' ]; + $index = [ $index ] if defined($index) && ref($index) ne 'ARRAY'; + $index = undef if scalar(@$index) == 0; + my $style = $opt{style}; + if (defined $style) { + $style = \$style unless ref($style); + } + my $docroot = $opt{docroot}; + die "`docroot' not set" unless defined($docroot) && $docroot ne ''; + bless { + docroot => $docroot, + index => $index, + autoindex => $opt{autoindex}, + style => $style // \$STYLE, + }, $class; +} + +# PATH_INFO is decoded, and we want the undecoded original +my %path_re_cache; +sub path_info_raw ($) { + my ($env) = @_; + my $sn = $env->{SCRIPT_NAME}; + my $re = $path_re_cache{$sn} ||= do { + $sn = '/'.$sn unless index($sn, '/') == 0; + $sn =~ s!/\z!!; + qr!\A(?:https?://[^/]+)?\Q$sn\E(/[^\?\#]+)!; + }; + $env->{REQUEST_URI} =~ $re ? $1 : $env->{PATH_INFO}; +} + +sub redirect_slash ($) { + my ($env) = @_; + my $url = $env->{'psgi.url_scheme'} . '://'; + my $host_port = $env->{HTTP_HOST} // + "$env->{SERVER_NAME}:$env->{SERVER_PORT}"; + $url .= $host_port . path_info_raw($env) . '/'; + my $body = "Redirecting to $url\n"; + [ 302, [ qw(Content-Type text/plain), 'Location', $url, + 'Content-Length', length($body) ], [ $body ] ] +} + +sub human_size ($) { + my ($size) = @_; + my $suffix = ''; + for my $s (qw(K M G T P)) { + last if $size < 1024; + $size /= 1024; + if ($size <= 1024) { + $suffix = $s; + last; + } + } + lround($size).$suffix; +} + +# by default, this returns "index.html" if it exists for a given directory +# It'll generate a directory listing, (autoindex). +# May be disabled by setting autoindex => 0 +sub dir_response ($$$) { + my ($self, $env, $fs_path) = @_; + if (my $index = $self->{'index'}) { # serve index.html or similar + for my $html (@$index) { + my $p = $fs_path . $html; + my $res = response($env, [], $p); + return $res if $res->[0] != 404; + } + } + return r(404) unless $self->{autoindex}; + opendir(my $dh, $fs_path) or do { + return r(404) if ($! == ENOENT || $! == ENOTDIR); + return r(403) if $! == EACCES; + return r(500); + }; + my @entries = grep(!/\A\./, readdir($dh)); + $dh = undef; + my (%dirs, %other, %want_gz); + my $path_info = $env->{PATH_INFO}; + push @entries, '..' if $path_info ne '/'; + for my $base (@entries) { + my $href = ascii_html(uri_escape_utf8($base)); + my $name = ascii_html($base); + my @st = stat($fs_path . $base) or next; # unlikely + my ($gzipped, $uncompressed, $hsize); + my $entry = ''; + my $mtime = $st[9]; + if (-d _) { + $href .= '/'; + $name .= '/'; + $hsize = '-'; + $dirs{"$base\0$mtime"} = \$entry; + } elsif (-f _) { + $other{"$base\0$mtime"} = \$entry; + if ($base !~ /\.gz\z/i) { + $want_gz{"$base.gz\0$mtime"} = undef; + } + $hsize = human_size($st[7]); + } else { + next; + } + # 54 = 80 - (SP length(strftime(%Y-%m-%d %k:%M)) SP human_size) + $hsize = sprintf('% 8s', $hsize); + my $pad = 54 - length($name); + $pad = 1 if $pad <= 0; + $entry .= qq($name) . (' ' x $pad); + $mtime = strftime('%Y-%m-%d %k:%M', gmtime($mtime)); + $entry .= $mtime . $hsize; + } + + # filter out '.gz' files as long as the mtime matches the + # uncompressed version + delete(@other{keys %want_gz}); + @entries = ((map { ${$dirs{$_}} } sort keys %dirs), + (map { ${$other{$_}} } sort keys %other)); + + my $path_info_html = ascii_html($path_info); + my $body = "Index of $path_info_html" . + ${$self->{style}} . + "
Index of $path_info_html

\n";
+	$body .= join("\n", @entries) . "

\n"; + [ 200, [ qw(Content-Type text/html + Content-Length), bytes::length($body) ], [ $body ] ] +} + +sub call { # PSGI app endpoint + my ($self, $env) = @_; + return r(405) if $env->{REQUEST_METHOD} !~ /\A(?:GET|HEAD)\z/; + my $path_info = $env->{PATH_INFO}; + return r(403) if index($path_info, "\0") >= 0; + my (@parts) = split(m!/+!, $path_info, -1); + return r(403) if grep(/\A(?:\.\.)\z/, @parts) || $parts[0] ne ''; + + my $fs_path = join('/', $self->{docroot}, @parts); + return dir_response($self, $env, $fs_path) if $parts[-1] eq ''; + + my $res = response($env, [], $fs_path); + $res->[0] == 404 && -d $fs_path ? redirect_slash($env) : $res; +} 1; diff --git a/t/www_static.t b/t/www_static.t new file mode 100644 index 00000000..5f2e3380 --- /dev/null +++ b/t/www_static.t @@ -0,0 +1,96 @@ +# Copyright (C) 2019 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use PublicInbox::TestCommon; +my ($tmpdir, $for_destroy) = tmpdir(); +my @mods = qw(HTTP::Request::Common Plack::Test URI::Escape); +require_mods(@mods); +use_ok $_ foreach @mods; +use_ok 'PublicInbox::WwwStatic'; + +my $app = sub { + my $ws = PublicInbox::WwwStatic->new(docroot => $tmpdir, @_); + sub { $ws->call(shift) }; +}; + +test_psgi($app->(), sub { + my $cb = shift; + my $res = $cb->(GET('/')); + is($res->code, 404, '404 on "/" by default'); + open my $fh, '>', "$tmpdir/index.html" or die; + print $fh 'hi' or die; + close $fh or die; + $res = $cb->(GET('/')); + is($res->code, 200, '200 with index.html'); + is($res->content, 'hi', 'default index.html returned'); + $res = $cb->(HEAD('/')); + is($res->code, 200, '200 on HEAD /'); + is($res->content, '', 'no content'); + is($res->header('Content-Length'), '2', 'content-length set'); + like($res->header('Content-Type'), qr!^text/html\b!, + 'content-type is html'); +}); + +test_psgi($app->(autoindex => 1, index => []), sub { + my $cb = shift; + my $res = $cb->(GET('/')); + my $updir = 'href="../">../'; + is($res->code, 200, '200 with autoindex default'); + my $ls = $res->content; + like($ls, qr/index\.html/, 'got listing with index.html'); + ok(index($ls, $updir) < 0, 'no updir at /'); + mkdir("$tmpdir/dir") or die; + rename("$tmpdir/index.html", "$tmpdir/dir/index.html") or die; + + $res = $cb->(GET('/dir/')); + is($res->code, 200, '200 with autoindex for dir/'); + $ls = $res->content; + ok(index($ls, $updir) > 0, 'updir at /dir/'); + + for my $up (qw(/../ .. /dir/.. /dir/../)) { + is($cb->(GET($up))->code, 403, "`$up' traversal rejected"); + } + + $res = $cb->(GET('/dir')); + is($res->code, 302, '302 w/o slash'); + like($res->header('Location'), qr!://[^/]+/dir/\z!, + 'redirected w/ slash'); + + rename("$tmpdir/dir/index.html", "$tmpdir/dir/foo") or die; + link("$tmpdir/dir/foo", "$tmpdir/dir/foo.gz") or die; + $res = $cb->(GET('/dir/')); + unlike($res->content, qr/>foo\.gzcontent, qr/>foo(GET('/dir/foo/bar')); + is($res->code, 404, 'using file as dir fails'); + + unlink("$tmpdir/dir/foo") or die; + $res = $cb->(GET('/dir/')); + like($res->content, qr/>foo\.gz', "$tmpdir/dir/foo" or die; + print $fh "uncompressed\n" or die; + close $fh or die; + utime(0, 0, "$tmpdir/dir/foo") or die; + $res = $cb->(GET('/dir/')); + my $html = $res->content; + like($html, qr/>foofoo\.gz(GET('/dir/foo')); + is($res->content, "uncompressed\n", + 'got uncompressed on mtime mismatch'); + + utime(0, 0, "$tmpdir/dir/foo.gz") or die; + my $get = GET('/dir/foo'); + $get->header('Accept-Encoding' => 'gzip'); + $res = $cb->($get); + is($res->content, "hi", 'got compressed on mtime match'); +}); + +done_testing();