diff options
Diffstat (limited to 'lib/PublicInbox')
-rw-r--r-- | lib/PublicInbox/Config.pm | 96 | ||||
-rw-r--r-- | lib/PublicInbox/GetlineBody.pm | 16 | ||||
-rw-r--r-- | lib/PublicInbox/Git.pm | 87 | ||||
-rw-r--r-- | lib/PublicInbox/GitHTTPBackend.pm | 64 | ||||
-rw-r--r-- | lib/PublicInbox/HTTPD/Async.pm | 27 | ||||
-rw-r--r-- | lib/PublicInbox/HlMod.pm | 125 | ||||
-rw-r--r-- | lib/PublicInbox/Hval.pm | 46 | ||||
-rw-r--r-- | lib/PublicInbox/Qspawn.pm | 143 | ||||
-rw-r--r-- | lib/PublicInbox/SolverGit.pm | 530 | ||||
-rw-r--r-- | lib/PublicInbox/UserContent.pm | 98 | ||||
-rw-r--r-- | lib/PublicInbox/View.pm | 55 | ||||
-rw-r--r-- | lib/PublicInbox/ViewDiff.pm | 161 | ||||
-rw-r--r-- | lib/PublicInbox/ViewVCS.pm | 146 | ||||
-rw-r--r-- | lib/PublicInbox/WWW.pm | 152 | ||||
-rw-r--r-- | lib/PublicInbox/WwwHighlight.pm | 74 | ||||
-rw-r--r-- | lib/PublicInbox/WwwStream.pm | 4 | ||||
-rw-r--r-- | lib/PublicInbox/WwwText.pm | 35 |
17 files changed, 1733 insertions, 126 deletions
diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index bea26176..ccfc114f 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -2,12 +2,19 @@ # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Used throughout the project for reading configuration +# +# Note: I hate camelCase; but git-config(1) uses it, but it's better +# than alllowercasewithoutunderscores, so use lc('configKey') where +# applicable for readability + package PublicInbox::Config; use strict; use warnings; require PublicInbox::Inbox; use PublicInbox::Spawn qw(popen_rd); +sub _array ($) { ref($_[0]) eq 'ARRAY' ? $_[0] : [ $_[0] ] } + # returns key-value pairs of config directives in a hash # if keys may be multi-value, the value is an array ref containing all values sub new { @@ -22,6 +29,7 @@ sub new { $self->{-by_newsgroup} ||= {}; $self->{-no_obfuscate} ||= {}; $self->{-limiters} ||= {}; + $self->{-code_repos} ||= {}; # nick => PublicInbox::Git object if (my $no = delete $self->{'publicinbox.noobfuscate'}) { $no = [ $no ] if ref($no) ne 'ARRAY'; @@ -41,6 +49,9 @@ sub new { my $nod = join('|', @domains); $self->{-no_obfuscate_re} = qr/(?:$nod)\z/i; } + if (my $css = delete $self->{'publicinbox.css'}) { + $self->{css} = _array($css); + } $self; } @@ -79,13 +90,22 @@ sub lookup_name ($$) { sub each_inbox { my ($self, $cb) = @_; - my %seen; - foreach my $k (keys %$self) { - $k =~ m!\Apublicinbox\.([^/]+)\.mainrepo\z! or next; - next if $seen{$1}; - $seen{$1} = 1; - my $ibx = lookup_name($self, $1) or next; - $cb->($ibx); + if (my $section_order = $self->{-section_order}) { + foreach my $section (@$section_order) { + next if $section !~ m!\Apublicinbox\.([^/]+)\z!; + $self->{"publicinbox.$1.mainrepo"} or next; + my $ibx = lookup_name($self, $1) or next; + $cb->($ibx); + } + } else { + my %seen; + foreach my $k (keys %$self) { + $k =~ m!\Apublicinbox\.([^/]+)\.mainrepo\z! or next; + next if $seen{$1}; + $seen{$1} = 1; + my $ibx = lookup_name($self, $1) or next; + $cb->($ibx); + } } } @@ -126,7 +146,7 @@ sub default_file { sub git_config_dump { my ($file) = @_; - my ($in, $out); + my (%section_seen, @section_order); my @cmd = (qw/git config/, "--file=$file", '-l'); my $cmd = join(' ', @cmd); my $fh = popen_rd(\@cmd) or die "popen_rd failed for $file: $!\n"; @@ -135,8 +155,14 @@ sub git_config_dump { while (defined(my $line = <$fh>)) { chomp $line; my ($k, $v) = split(/=/, $line, 2); - my $cur = $rv{$k}; + my ($section) = ($k =~ /\A(\S+)\.[^\.]+\z/); + unless (defined $section_seen{$section}) { + $section_seen{$section} = 1; + push @section_order, $section; + } + + my $cur = $rv{$k}; if (defined $cur) { if (ref($cur) eq "ARRAY") { push @$cur, $v; @@ -148,6 +174,7 @@ sub git_config_dump { } } close $fh or die "failed to close ($cmd) pipe: $?"; + $rv{-section_order} = \@section_order; \%rv; } @@ -169,6 +196,41 @@ sub valid_inbox_name ($) { 1; } +# parse a code repo +# Only git is supported at the moment, but SVN and Hg are possibilities +sub _fill_code_repo { + my ($self, $nick) = @_; + my $pfx = "coderepo.$nick"; + + my $dir = $self->{"$pfx.dir"}; # aka "GIT_DIR" + unless (defined $dir) { + warn "$pfx.repodir unset"; + return; + } + + my $git = PublicInbox::Git->new($dir); + foreach my $t (qw(blob commit tree tag)) { + $git->{$t.'_url_format'} = + _array($self->{lc("$pfx.${t}UrlFormat")}); + } + + if (my $cgits = $self->{lc("$pfx.cgitUrl")}) { + $git->{cgit_url} = $cgits = _array($cgits); + + # cgit supports "/blob/?id=%s", but it's only a plain-text + # display and requires an unabbreviated id= + foreach my $t (qw(blob commit tag)) { + $git->{$t.'_url_format'} ||= map { + "$_/$t/?id=%s" + } @$cgits; + } + } + # TODO: support gitweb and other repository viewers? + # TODO: parse cgitrc + + $git; +} + sub _fill { my ($self, $pfx) = @_; my $rv = {}; @@ -192,9 +254,9 @@ sub _fill { } # TODO: more arrays, we should support multi-value for # more things to encourage decentralization - foreach my $k (qw(address altid nntpmirror)) { + foreach my $k (qw(address altid nntpmirror coderepo)) { if (defined(my $v = $self->{"$pfx.$k"})) { - $rv->{$k} = ref($v) eq 'ARRAY' ? $v : [ $v ]; + $rv->{$k} = _array($v); } } @@ -224,6 +286,18 @@ sub _fill { $rv->{-no_obfuscate_re} = $self->{-no_obfuscate_re}; each_inbox($self, sub {}); # noop to populate -no_obfuscate } + + if (my $ibx_code_repos = $rv->{coderepo}) { + my $code_repos = $self->{-code_repos}; + my $repo_objs = $rv->{-repo_objs} = []; + foreach my $nick (@$ibx_code_repos) { + valid_inbox_name($nick) or next; + my $repo = $code_repos->{$nick} ||= + _fill_code_repo($self, $nick); + push @$repo_objs, $repo if $repo; + } + } + $rv } diff --git a/lib/PublicInbox/GetlineBody.pm b/lib/PublicInbox/GetlineBody.pm index ea07f3d6..0a922fd2 100644 --- a/lib/PublicInbox/GetlineBody.pm +++ b/lib/PublicInbox/GetlineBody.pm @@ -13,8 +13,13 @@ use strict; use warnings; sub new { - my ($class, $rpipe, $end, $buf) = @_; - bless { rpipe => $rpipe, end => $end, buf => $buf }, $class; + my ($class, $rpipe, $end, $buf, $filter) = @_; + bless { + rpipe => $rpipe, + end => $end, + buf => $buf, + filter => $filter || 0, + }, $class; } # close should always be called after getline returns undef, @@ -24,8 +29,13 @@ sub DESTROY { $_[0]->close } sub getline { my ($self) = @_; + my $filter = $self->{filter}; + return if $filter == -1; # last call was EOF + my $buf = delete $self->{buf}; # initial buffer - defined $buf ? $buf : $self->{rpipe}->getline; + $buf = $self->{rpipe}->getline unless defined $buf; + $self->{filter} = -1 unless defined $buf; # set EOF for next call + $filter ? $filter->($buf) : $buf; } sub close { diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index 90b9214a..3ad08112 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -13,7 +13,7 @@ use POSIX qw(dup2); require IO::Handle; use PublicInbox::Spawn qw(spawn popen_rd); use base qw(Exporter); -our @EXPORT_OK = qw(git_unquote); +our @EXPORT_OK = qw(git_unquote git_quote); my %GIT_ESC = ( a => "\a", @@ -26,6 +26,8 @@ my %GIT_ESC = ( '"' => '"', '\\' => '\\', ); +my %ESC_GIT = map { $GIT_ESC{$_} => $_ } keys %GIT_ESC; + # unquote pathnames used by git, see quote.c::unquote_c_style.c in git.git sub git_unquote ($) { @@ -36,10 +38,19 @@ sub git_unquote ($) { $_[0]; } +sub git_quote ($) { + if ($_[0] =~ s/([\\"\a\b\f\n\r\t\013]|[^[:print:]])/ + '\\'.($ESC_GIT{$1}||sprintf("%0o",ord($1)))/egs) { + return qq{"$_[0]"}; + } + $_[0]; +} + sub new { my ($class, $git_dir) = @_; my @st; $st[7] = $st[10] = 0; + # may contain {-tmp} field for File::Temp::Dir bless { git_dir => $git_dir, st => \@st }, $class } @@ -53,9 +64,25 @@ sub alternates_changed { $self->{st} = \@st; } +sub last_check_err { + my ($self) = @_; + my $fh = $self->{err_c} or return; + sysseek($fh, 0, 0) or fail($self, "sysseek failed: $!"); + defined(sysread($fh, my $buf, -s $fh)) or + fail($self, "sysread failed: $!"); + $buf; +} + sub _bidi_pipe { - my ($self, $batch, $in, $out, $pid) = @_; - return if $self->{$pid}; + my ($self, $batch, $in, $out, $pid, $err) = @_; + if ($self->{$pid}) { + if (defined $err) { # "err_c" + my $fh = $self->{$err}; + sysseek($fh, 0, 0) or fail($self, "sysseek failed: $!"); + truncate($fh, 0) or fail($self, "truncate failed: $!"); + } + return; + } my ($in_r, $in_w, $out_r, $out_w); pipe($in_r, $in_w) or fail($self, "pipe failed: $!"); @@ -65,8 +92,14 @@ sub _bidi_pipe { fcntl($in_w, 1031, 4096) if $batch eq '--batch-check'; } - my @cmd = ('git', "--git-dir=$self->{git_dir}", qw(cat-file), $batch); + my @cmd = (qw(git), "--git-dir=$self->{git_dir}", + qw(-c core.abbrev=40 cat-file), $batch); my $redir = { 0 => fileno($out_r), 1 => fileno($in_w) }; + if ($err) { + open(my $fh, '+>', undef) or fail($self, "open.err failed: $!"); + $self->{$err} = $fh; + $redir->{2} = fileno($fh); + } my $p = spawn(\@cmd, undef, $redir); defined $p or fail($self, "spawn failed: $!"); $self->{$pid} = $p; @@ -141,12 +174,25 @@ sub batch_prepare ($) { _bidi_pipe($_[0], qw(--batch in out pid)) } sub check { my ($self, $obj) = @_; - $self->_bidi_pipe(qw(--batch-check in_c out_c pid_c)); + _bidi_pipe($self, qw(--batch-check in_c out_c pid_c err_c)); $self->{out_c}->print($obj, "\n") or fail($self, "write error: $!"); local $/ = "\n"; chomp(my $line = $self->{in_c}->getline); my ($hex, $type, $size) = split(' ', $line); - return if $type eq 'missing'; + + # Future versions of git.git may show 'ambiguous', but for now, + # we must handle 'dangling' below (and maybe some other oddball + # stuff): + # https://public-inbox.org/git/20190118033845.s2vlrb3wd3m2jfzu@dcvr/T/ + return if $type eq 'missing' || $type eq 'ambiguous'; + + if ($hex eq 'dangling' || $hex eq 'notdir' || $hex eq 'loop') { + $size = $type + length("\n"); + my $r = read($self->{in_c}, my $buf, $size); + defined($r) or fail($self, "read failed: $!"); + return; + } + ($hex, $type, $size); } @@ -201,6 +247,35 @@ sub packed_bytes { sub DESTROY { cleanup(@_) } +sub local_nick ($) { + my ($self) = @_; + my $ret = '???'; + # don't show full FS path, basename should be OK: + if ($self->{git_dir} =~ m!/([^/]+)(?:/\.git)?\z!) { + $ret = "/path/to/$1"; + } + wantarray ? ($ret) : $ret; +} + +# show the blob URL for cgit/gitweb/whatever +sub src_blob_url { + my ($self, $oid) = @_; + # blob_url_format = "https://example.com/foo.git/blob/%s" + if (my $bfu = $self->{blob_url_format}) { + return map { sprintf($_, $oid) } @$bfu if wantarray; + return sprintf($bfu->[0], $oid); + } + local_nick($self); +} + +sub pub_urls { + my ($self) = @_; + if (my $urls = $self->{cgit_url}) { + return @$urls; + } + local_nick($self); +} + 1; __END__ =pod diff --git a/lib/PublicInbox/GitHTTPBackend.pm b/lib/PublicInbox/GitHTTPBackend.pm index 54ccfa05..ab43a009 100644 --- a/lib/PublicInbox/GitHTTPBackend.pm +++ b/lib/PublicInbox/GitHTTPBackend.pm @@ -200,69 +200,15 @@ sub serve_smart { $env{$name} = $val if defined $val; } my $limiter = $git->{-httpbackend_limiter} || $default_limiter; - my $git_dir = $git->{git_dir}; $env{GIT_HTTP_EXPORT_ALL} = '1'; - $env{PATH_TRANSLATED} = "$git_dir/$path"; + $env{PATH_TRANSLATED} = "$git->{git_dir}/$path"; my $rdr = { 0 => fileno($in) }; my $qsp = PublicInbox::Qspawn->new([qw(git http-backend)], \%env, $rdr); - my ($fh, $rpipe); - my $end = sub { - if (my $err = $qsp->finish) { - err($env, "git http-backend ($git_dir): $err"); - } - $fh->close if $fh; # async-only - }; - - # Danga::Socket users, we queue up the read_enable callback to - # fire after pending writes are complete: - my $buf = ''; - my $rd_hdr = sub { - my $r = sysread($rpipe, $buf, 1024, length($buf)); - return if !defined($r) && ($!{EINTR} || $!{EAGAIN}); - return r(500, 'http-backend error') unless $r; - $r = parse_cgi_headers(\$buf) or return; # incomplete headers + $qsp->psgi_return($env, $limiter, sub { + my ($r, $bref) = @_; + $r = parse_cgi_headers($bref) or return; # incomplete headers $r->[0] == 403 ? serve_dumb($env, $git, $path) : $r; - }; - my $res; - my $async = $env->{'pi-httpd.async'}; # XXX unstable API - my $cb = sub { - my $r = $rd_hdr->() or return; - $rd_hdr = undef; - if (scalar(@$r) == 3) { # error: - if ($async) { - $async->close; # calls rpipe->close - } else { - $rpipe->close; - $end->(); - } - $res->($r); - } elsif ($async) { - $fh = $res->($r); - $async->async_pass($env->{'psgix.io'}, $fh, \$buf); - } else { # for synchronous PSGI servers - require PublicInbox::GetlineBody; - $r->[2] = PublicInbox::GetlineBody->new($rpipe, $end, - $buf); - $res->($r); - } - }; - sub { - ($res) = @_; - - # hopefully this doesn't break any middlewares, - # holding the input here is a waste of FDs and memory - $env->{'psgi.input'} = undef; - - $qsp->start($limiter, sub { # may run later, much later... - ($rpipe) = @_; - $in = undef; - if ($async) { - $async = $async->($rpipe, $cb, $end); - } else { # generic PSGI - $cb->() while $rd_hdr; - } - }); - }; + }); } sub input_to_file { diff --git a/lib/PublicInbox/HTTPD/Async.pm b/lib/PublicInbox/HTTPD/Async.pm index 842aaf62..a647f10d 100644 --- a/lib/PublicInbox/HTTPD/Async.pm +++ b/lib/PublicInbox/HTTPD/Async.pm @@ -14,6 +14,15 @@ require PublicInbox::EvCleanup; sub new { my ($class, $io, $cb, $cleanup) = @_; + + # no $io? call $cb at the top of the next event loop to + # avoid recursion: + unless (defined($io)) { + PublicInbox::EvCleanup::asap($cb) if $cb; + PublicInbox::EvCleanup::next_tick($cleanup) if $cleanup; + return; + } + my $self = fields::new($class); IO::Handle::blocking($io, 0); $self->SUPER::new($io); @@ -23,6 +32,7 @@ sub new { $self; } +# fires after pending writes are complete: sub restart_read_cb ($) { my ($self) = @_; sub { $self->watch_read(1) } @@ -35,14 +45,16 @@ sub main_cb ($$$) { my $r = sysread($self->{sock}, $$bref, 8192); if ($r) { $fh->write($$bref); - return if $http->{closed}; - if ($http->{write_buf_size}) { - $self->watch_read(0); - $http->write(restart_read_cb($self)); + unless ($http->{closed}) { # Danga::Socket sets this + if ($http->{write_buf_size}) { + $self->watch_read(0); + $http->write(restart_read_cb($self)); + } + # stay in watch_read, but let other clients + # get some work done, too. + return; } - # stay in watch_read, but let other clients - # get some work done, too. - return; + # fall through to close below... } elsif (!defined $r) { return if $!{EAGAIN} || $!{EINTR}; } @@ -66,7 +78,6 @@ sub async_pass { sub event_read { $_[0]->{cb}->(@_) } sub event_hup { $_[0]->{cb}->(@_) } sub event_err { $_[0]->{cb}->(@_) } -sub sysread { shift->{sock}->sysread(@_) } sub close { my $self = shift; diff --git a/lib/PublicInbox/HlMod.pm b/lib/PublicInbox/HlMod.pm new file mode 100644 index 00000000..237ffaca --- /dev/null +++ b/lib/PublicInbox/HlMod.pm @@ -0,0 +1,125 @@ +# Copyright (C) 2019 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# I have no idea how stable or safe this is for handling untrusted +# input, but it seems to have been around for a while, and the +# highlight(1) executable is supported by gitweb and cgit. +# +# I'm also unsure about API stability, but highlight 3.x seems to +# have been around a few years and ikiwiki (apparently the only +# user of the SWIG/Perl bindings, at least in Debian) hasn't needed +# major changes to support it in recent years. +# +# Some code stolen from ikiwiki (GPL-2.0+) +# wrapper for SWIG-generated highlight.pm bindings +package PublicInbox::HlMod; +use strict; +use warnings; +use highlight; # SWIG-generated stuff + +sub _parse_filetypes ($) { + my $ft_conf = $_[0]->searchFile('filetypes.conf') or + die 'filetypes.conf not found by highlight'; + open my $fh, '<', $ft_conf or die "failed to open($ft_conf): $!"; + local $/; + my $cfg = <$fh>; + my %ext2lang; + my @shebang; # order matters + + # Hrm... why isn't this exposed by the highlight API? + # highlight >= 3.2 format (bind-style) (from ikiwiki) + while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+ + Extensions\s*=\s*{([^}]+)}/sgx) { + my $lang = $1; + foreach my $bit (split(/,/, $2)) { + $bit =~ s/.*"(.*)".*/$1/s; + $ext2lang{$bit} = $lang; + } + } + # AFAIK, all the regexps used by in filetypes.conf distributed + # by highlight work as Perl REs + while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+ + Shebang\s*=\s*\[\s*\[([^}]+)\s*\]\s*\]\s*}\s*,/sgx) { + my ($lang, $re) = ($1, $2); + eval { + my $perl_re = qr/$re/; + push @shebang, [ $lang, $perl_re ]; + }; + if ($@) { + warn "$lang shebang=[[$re]] did not work in Perl: $@"; + } + } + (\%ext2lang, \@shebang); +} + +sub new { + my ($class) = @_; + my $dir = highlight::DataDir->new; + $dir->initSearchDirectories(''); + my ($ext2lang, $shebang) = _parse_filetypes($dir); + bless { + -dir => $dir, + -ext2lang => $ext2lang, + -shebang => $shebang, + }, $class; +} + +sub _shebang2lang ($$) { + my ($self, $str) = @_; + my $shebang = $self->{-shebang}; + foreach my $s (@$shebang) { + return $s->[0] if $$str =~ $s->[1]; + } + undef; +} + +sub _path2lang ($$) { + my ($self, $path) = @_; + my ($ext) = ($path =~ m!([^\\/\.]+)\z!); + $ext = lc($ext); + $self->{-ext2lang}->{$ext} || $ext; +} + +sub do_hl { + my ($self, $str, $path) = @_; + my $lang = _path2lang($self, $path) if defined $path; + my $dir = $self->{-dir}; + my $langpath; + if (defined $lang) { + $langpath = $dir->getLangPath("$lang.lang") or return; + $langpath = undef unless -f $langpath; + } + unless (defined $langpath) { + $lang = _shebang2lang($self, $str) or return; + $langpath = $dir->getLangPath("$lang.lang") or return; + $langpath = undef unless -f $langpath; + } + return unless defined $langpath; + + my $gen = $self->{$langpath} ||= do { + my $g = highlight::CodeGenerator::getInstance($highlight::HTML); + $g->setFragmentCode(1); # generate html fragment + + # whatever theme works + my $themepath = $dir->getThemePath('print.theme'); + $g->initTheme($themepath); + $g->loadLanguage($langpath); + $g->setEncoding('utf-8'); + $g; + }; + \($gen->generateString($$str)) +} + +# SWIG instances aren't reference-counted, but $self is; +# so we need to delete all the CodeGenerator instances manually +# at our own destruction +sub DESTROY { + my ($self) = @_; + foreach my $gen (values %$self) { + if (ref($gen) eq 'highlight::CodeGenerator') { + highlight::CodeGenerator::deleteInstance($gen); + } + } +} + +1; diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm index ccfa3242..53810b33 100644 --- a/lib/PublicInbox/Hval.pm +++ b/lib/PublicInbox/Hval.pm @@ -9,17 +9,7 @@ use warnings; use Encode qw(find_encoding); use PublicInbox::MID qw/mid_clean mid_escape/; use base qw/Exporter/; -our @EXPORT_OK = qw/ascii_html obfuscate_addrs to_filename/; - -# User-generated content (UGC) may have excessively long lines -# and screw up rendering on some browsers, so we use pre-wrap. -# -# We also force everything to the same scaled font-size because GUI -# browsers (tested both Firefox and surf (webkit)) uses a larger font -# for the Search <form> element than the rest of the page. Font size -# uniformity is important to people who rely on gigantic fonts. -use constant STYLE => - '<style>pre{white-space:pre-wrap}*{font-size:100%}</style>'; +our @EXPORT_OK = qw/ascii_html obfuscate_addrs to_filename src_escape/; my $enc_ascii = find_encoding('us-ascii'); @@ -47,6 +37,21 @@ sub new_oneline { $class->new($raw); } +# some of these overrides are standard C escapes so they're +# easy-to-understand when rendered. +my %escape_sequence = ( + "\x00" => '\\0', # NUL + "\x07" => '\\a', # bell + "\x08" => '\\b', # backspace + "\x09" => "\t", # obvious to show as-is + "\x0a" => "\n", # obvious to show as-is + "\x0b" => '\\v', # vertical tab + "\x0c" => '\\f', # form feed + "\x0d" => '\\r', # carriage ret (not preceding \n) + "\x1b" => '^[', # ASCII escape (mutt seems to escape this way) + "\x7f" => '\\x7f', # DEL +); + my %xhtml_map = ( '"' => '"', '&' => '&', @@ -56,18 +61,13 @@ my %xhtml_map = ( ); $xhtml_map{chr($_)} = sprintf('\\x%02x', $_) for (0..31); -# some of these overrides are standard C escapes so they're -# easy-to-understand when rendered. -$xhtml_map{"\x00"} = '\\0'; # NUL -$xhtml_map{"\x07"} = '\\a'; # bell -$xhtml_map{"\x08"} = '\\b'; # backspace -$xhtml_map{"\x09"} = "\t"; # obvious to show as-is -$xhtml_map{"\x0a"} = "\n"; # obvious to show as-is -$xhtml_map{"\x0b"} = '\\v'; # vertical tab -$xhtml_map{"\x0c"} = '\\f'; # form feed -$xhtml_map{"\x0d"} = '\\r'; # carriage ret (not preceding \n) -$xhtml_map{"\x1b"} = '^['; # ASCII escape (mutt seems to escape this way) -$xhtml_map{"\x7f"} = '\\x7f'; # DEL +%xhtml_map = (%xhtml_map, %escape_sequence); + +sub src_escape ($) { + $_[0] =~ s/\r\n/\n/sg; + $_[0] =~ s/([\x7f\x00-\x1f])/$xhtml_map{$1}/sge; + $_[0] = $enc_ascii->encode($_[0], Encode::HTMLCREF); +} sub ascii_html { my ($s) = @_; diff --git a/lib/PublicInbox/Qspawn.pm b/lib/PublicInbox/Qspawn.pm index 3500f8a4..913fac81 100644 --- a/lib/PublicInbox/Qspawn.pm +++ b/lib/PublicInbox/Qspawn.pm @@ -9,6 +9,8 @@ package PublicInbox::Qspawn; use strict; use warnings; use PublicInbox::Spawn qw(popen_rd); +require Plack::Util; +my $def_limiter; sub new ($$$;) { my ($class, $cmd, $env, $opt) = @_; @@ -28,13 +30,22 @@ sub _do_spawn { $cb->($self->{rpipe}); } +sub child_err ($) { + my ($child_error) = @_; # typically $? + my $exitstatus = ($child_error >> 8) or return; + my $sig = $child_error & 127; + my $msg = "exit status=$exitstatus"; + $msg .= " signal=$sig" if $sig; + $msg; +} + sub finish ($) { my ($self) = @_; my $limiter = $self->{limiter}; my $running; if (delete $self->{rpipe}) { my $pid = delete $self->{pid}; - $self->{err} = $pid == waitpid($pid, 0) ? $? : + $self->{err} = $pid == waitpid($pid, 0) ? child_err($?) : "PID:$pid still running?"; $running = --$limiter->{running}; } @@ -59,6 +70,119 @@ sub start { } } +sub _psgi_finish ($$) { + my ($self, $env) = @_; + my $err = $self->finish; + if ($err && !$env->{'qspawn.quiet'}) { + $err = join(' ', @{$self->{args}->[0]}).": $err\n"; + $env->{'psgi.errors'}->print($err); + } +} + +sub psgi_qx { + my ($self, $env, $limiter, $qx_cb) = @_; + my $qx = PublicInbox::Qspawn::Qx->new; + my $end = sub { + _psgi_finish($self, $env); + eval { $qx_cb->($qx) }; + $qx = undef; + }; + my $rpipe; + my $async = $env->{'pi-httpd.async'}; + my $cb = sub { + my $r = sysread($rpipe, my $buf, 8192); + if ($async) { + $async->async_pass($env->{'psgix.io'}, $qx, \$buf); + } elsif (defined $r) { + $r ? $qx->write($buf) : $end->(); + } else { + return if $!{EAGAIN} || $!{EINTR}; # loop again + $end->(); + } + }; + $limiter ||= $def_limiter ||= PublicInbox::Qspawn::Limiter->new(32); + $self->start($limiter, sub { # may run later, much later... + ($rpipe) = @_; + if ($async) { + # PublicInbox::HTTPD::Async->new($rpipe, $cb, $end) + $async = $async->($rpipe, $cb, $end); + } else { # generic PSGI + $cb->() while $qx; + } + }); +} + +# create a filter for "push"-based streaming PSGI writes used by HTTPD::Async +sub filter_fh ($$) { + my ($fh, $filter) = @_; + Plack::Util::inline_object( + close => sub { + $fh->write($filter->(undef)); + $fh->close; + }, + write => sub { + $fh->write($filter->($_[0])); + }); +} + +sub psgi_return { + my ($self, $env, $limiter, $parse_hdr) = @_; + my ($fh, $rpipe); + my $end = sub { + _psgi_finish($self, $env); + $fh->close if $fh; # async-only + }; + + my $buf = ''; + my $rd_hdr = sub { + my $r = sysread($rpipe, $buf, 1024, length($buf)); + return if !defined($r) && ($!{EINTR} || $!{EAGAIN}); + $parse_hdr->($r, \$buf); + }; + my $res = delete $env->{'qspawn.response'}; + my $async = $env->{'pi-httpd.async'}; + my $cb = sub { + my $r = $rd_hdr->() or return; + $rd_hdr = undef; + my $filter = delete $env->{'qspawn.filter'}; + if (scalar(@$r) == 3) { # error + if ($async) { + $async->close; # calls rpipe->close and $end + } else { + $rpipe->close; + $end->(); + } + $res->($r); + } elsif ($async) { + $fh = $res->($r); # scalar @$r == 2 + $fh = filter_fh($fh, $filter) if $filter; + $async->async_pass($env->{'psgix.io'}, $fh, \$buf); + } else { # for synchronous PSGI servers + require PublicInbox::GetlineBody; + $r->[2] = PublicInbox::GetlineBody->new($rpipe, $end, + $buf, $filter); + $res->($r); + } + }; + $limiter ||= $def_limiter ||= PublicInbox::Qspawn::Limiter->new(32); + my $start_cb = sub { # may run later, much later... + ($rpipe) = @_; + if ($async) { + # PublicInbox::HTTPD::Async->new($rpipe, $cb, $end) + $async = $async->($rpipe, $cb, $end); + } else { # generic PSGI + $cb->() while $rd_hdr; + } + }; + + return $self->start($limiter, $start_cb) if $res; + + sub { + ($res) = @_; + $self->start($limiter, $start_cb); + }; +} + package PublicInbox::Qspawn::Limiter; use strict; use warnings; @@ -73,4 +197,21 @@ sub new { }, $class; } +# captures everything into a buffer and executes a callback when done +package PublicInbox::Qspawn::Qx; +use strict; +use warnings; + +sub new { + my ($class) = @_; + my $buf = ''; + bless \$buf, $class; +} + +# called by PublicInbox::HTTPD::Async ($fh->write) +sub write { + ${$_[0]} .= $_[1]; + undef; +} + 1; diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm new file mode 100644 index 00000000..d7875333 --- /dev/null +++ b/lib/PublicInbox/SolverGit.pm @@ -0,0 +1,530 @@ +# Copyright (C) 2019 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# "Solve" blobs which don't exist in git code repositories by +# searching inboxes for post-image blobs. + +# this emits a lot of debugging/tracing information which may be +# publically viewed over HTTP(S). Be careful not to expose +# local filesystem layouts in the process. +package PublicInbox::SolverGit; +use strict; +use warnings; +use File::Temp qw(); +use Fcntl qw(SEEK_SET); +use PublicInbox::Git qw(git_unquote git_quote); +use PublicInbox::Spawn qw(spawn popen_rd); +use PublicInbox::MsgIter qw(msg_iter msg_part_text); +use PublicInbox::Qspawn; +use URI::Escape qw(uri_escape_utf8); + +# POSIX requires _POSIX_ARG_MAX >= 4096, and xargs is required to +# subtract 2048 bytes. We also don't factor in environment variable +# headroom into this. +use POSIX qw(sysconf _SC_ARG_MAX); +my $ARG_SIZE_MAX = (sysconf(_SC_ARG_MAX) || 4096) - 2048; + +# By default, "git format-patch" generates filenames with a four-digit +# prefix, so that means 9999 patch series are OK, right? :> +# Maybe we can make this configurable, main concern is disk space overhead +# for uncompressed patch fragments. Aside from space, public-inbox-httpd +# is otherwise unaffected by having many patches, here, as it can share +# work fairly. Other PSGI servers may have trouble, though. +my $MAX_PATCH = 9999; + +# di = diff info / a hashref with information about a diff ($di): +# { +# oid_a => abbreviated pre-image oid, +# oid_b => abbreviated post-image oid, +# tmp => anonymous file handle with the diff, +# hdr_lines => arrayref of various header lines for mode information +# mode_a => original mode of oid_a (string, not integer), +# ibx => PublicInbox::Inbox object containing the diff +# smsg => PublicInbox::SearchMsg object containing diff +# path_a => pre-image path +# path_b => post-image path +# } + +# don't bother if somebody sends us a patch with these path components, +# it's junk at best, an attack attempt at worse: +my %bad_component = map { $_ => 1 } ('', '.', '..'); + +sub dbg ($$) { + print { $_[0]->{out} } $_[1], "\n" or ERR($_[0], "print(dbg): $!"); +} + +sub ERR ($$) { + my ($self, $err) = @_; + print { $self->{out} } $err, "\n"; + my $ucb = delete($self->{user_cb}); + eval { $ucb->($err) } if $ucb; + die $err; +} + +# look for existing blobs already in git repos +sub solve_existing ($$) { + my ($self, $want) = @_; + my $oid_b = $want->{oid_b}; + my @ambiguous; # Array of [ git, $oids] + foreach my $git (@{$self->{gits}}) { + my ($oid_full, $type, $size) = $git->check($oid_b); + if (defined($type) && $type eq 'blob') { + return [ $git, $oid_full, $type, int($size) ]; + } + + next if length($oid_b) == 40; + + # parse stderr of "git cat-file --batch-check" + my $err = $git->last_check_err; + my (@oids) = ($err =~ /\b([a-f0-9]{40})\s+blob\b/g); + next unless scalar(@oids); + + # TODO: do something with the ambiguous array? + # push @ambiguous, [ $git, @oids ]; + + dbg($self, "`$oid_b' ambiguous in " . + join("\n\t", $git->pub_urls) . "\n" . + join('', map { "$_ blob\n" } @oids)); + } + scalar(@ambiguous) ? \@ambiguous : undef; +} + +sub extract_diff ($$$$$) { + my ($self, $p, $re, $ibx, $smsg) = @_; + my ($part) = @$p; # ignore $depth and @idx; + my $hdr_lines; # diff --git a/... b/... + my $tmp; + my $ct = $part->content_type || 'text/plain'; + my ($s, undef) = msg_part_text($part, $ct); + defined $s or return; + my $di = {}; + + # Email::MIME::Encodings forces QP to be CRLF upon decoding, + # change it back to LF: + my $cte = $part->header('Content-Transfer-Encoding') || ''; + if ($cte =~ /\bquoted-printable\b/i && $part->crlf eq "\n") { + $s =~ s/\r\n/\n/sg; + } + + foreach my $l (split(/^/m, $s)) { + if ($l =~ $re) { + $di->{oid_a} = $1; + $di->{oid_b} = $2; + if (defined($3)) { + my $mode_a = $3; + if ($mode_a =~ /\A(?:100644|120000|100755)\z/) { + $di->{mode_a} = $mode_a; + } + } + + + # start writing the diff out to a tempfile + my $pn = ++$self->{tot}; + open($tmp, '>', $self->{tmp}->dirname . "/$pn") or + die "open(tmp): $!"; + + push @$hdr_lines, $l; + $di->{hdr_lines} = $hdr_lines; + utf8::encode($_) for @$hdr_lines; + print $tmp @$hdr_lines or die "print(tmp): $!"; + + # for debugging/diagnostics: + $di->{ibx} = $ibx; + $di->{smsg} = $smsg; + } elsif ($l =~ m!\Adiff --git ("?[^/]+/.+) ("?[^/]+/.+)$!) { + last if $tmp; # got our blob, done! + + my ($path_a, $path_b) = ($1, $2); + + # diff header lines won't have \r because git + # will quote them, but Email::MIME gives CRLF + # for quoted-printable: + $path_b =~ tr/\r//d; + + # don't care for leading 'a/' and 'b/' + my (undef, @a) = split(m{/}, git_unquote($path_a)); + my (undef, @b) = split(m{/}, git_unquote($path_b)); + + # get rid of path-traversal attempts and junk patches: + foreach (@a, @b) { + return if $bad_component{$_}; + } + + $di->{path_a} = join('/', @a); + $di->{path_b} = join('/', @b); + $hdr_lines = [ $l ]; + } elsif ($tmp) { + utf8::encode($l); + print $tmp $l or die "print(tmp): $!"; + } elsif ($hdr_lines) { + push @$hdr_lines, $l; + if ($l =~ /\Anew file mode (100644|120000|100755)$/) { + $di->{mode_a} = $1; + } + } + } + return undef unless $tmp; + close $tmp or die "close(tmp): $!"; + $di; +} + +sub path_searchable ($) { defined($_[0]) && $_[0] =~ m!\A[\w/\. \-]+\z! } + +sub find_extract_diff ($$$) { + my ($self, $ibx, $want) = @_; + my $srch = $ibx->search or return; + + my $post = $want->{oid_b} or die 'BUG: no {oid_b}'; + $post =~ /\A[a-f0-9]+\z/ or die "BUG: oid_b not hex: $post"; + + my $q = "dfpost:$post"; + my $pre = $want->{oid_a}; + if (defined $pre && $pre =~ /\A[a-f0-9]+\z/) { + $q .= " dfpre:$pre"; + } else { + $pre = '[a-f0-9]{7}'; # for $re below + } + + my $path_b = $want->{path_b}; + if (path_searchable($path_b)) { + $q .= qq{ dfn:"$path_b"}; + + my $path_a = $want->{path_a}; + if (path_searchable($path_a) && $path_a ne $path_b) { + $q .= qq{ dfn:"$path_a"}; + } + } + + my $msgs = $srch->query($q, { relevance => 1 }); + my $re = qr/\Aindex ($pre[a-f0-9]*)\.\.($post[a-f0-9]*)(?: (\d+))?/; + + my $di; + foreach my $smsg (@$msgs) { + $ibx->smsg_mime($smsg) or next; + msg_iter(delete($smsg->{mime}), sub { + $di ||= extract_diff($self, $_[0], $re, $ibx, $smsg); + }); + return $di if $di; + } +} + +sub prepare_index ($) { + my ($self) = @_; + my $patches = $self->{patches}; + $self->{nr} = 0; + + my $di = $patches->[0] or die 'no patches'; + my $oid_a = $di->{oid_a} or die '{oid_a} unset'; + my $existing = $self->{found}->{$oid_a}; + + # no index creation for added files + $oid_a =~ /\A0+\z/ and return next_step($self); + + die "BUG: $oid_a not not found" unless $existing; + + my $oid_full = $existing->[1]; + my $path_a = $di->{path_a} or die "BUG: path_a missing for $oid_full"; + my $mode_a = $di->{mode_a} || extract_old_mode($di); + + open my $in, '+>', undef or die "open: $!"; + print $in "$mode_a $oid_full\t$path_a\0" or die "print: $!"; + $in->flush or die "flush: $!"; + sysseek($in, 0, 0) or die "seek: $!"; + + dbg($self, 'preparing index'); + my $rdr = { 0 => fileno($in) }; + my $cmd = [ qw(git update-index -z --index-info) ]; + my $qsp = PublicInbox::Qspawn->new($cmd, $self->{git_env}, $rdr); + $qsp->psgi_qx($self->{psgi_env}, undef, sub { + my ($bref) = @_; + if (my $err = $qsp->{err}) { + ERR($self, "git update-index error: $err"); + } + dbg($self, "index prepared:\n" . + "$mode_a $oid_full\t" . git_quote($path_a)); + next_step($self); # onto do_git_apply + }); +} + +# pure Perl "git init" +sub do_git_init ($) { + my ($self) = @_; + my $dir = $self->{tmp}->dirname; + my $git_dir = "$dir/git"; + + foreach ('', qw(objects refs objects/info refs/heads)) { + mkdir("$git_dir/$_") or die "mkdir $_: $!"; + } + open my $fh, '>', "$git_dir/config" or die "open git/config: $!"; + print $fh <<'EOF' or die "print git/config $!"; +[core] + repositoryFormatVersion = 0 + filemode = true + bare = false + fsyncObjectfiles = false + logAllRefUpdates = false +EOF + close $fh or die "close git/config: $!"; + + open $fh, '>', "$git_dir/HEAD" or die "open git/HEAD: $!"; + print $fh "ref: refs/heads/master\n" or die "print git/HEAD: $!"; + close $fh or die "close git/HEAD: $!"; + + my $f = 'objects/info/alternates'; + open $fh, '>', "$git_dir/$f" or die "open: $f: $!"; + print($fh (map { "$_->{git_dir}/objects\n" } @{$self->{gits}})) or + die "print $f: $!"; + close $fh or die "close: $f: $!"; + my $tmp_git = $self->{tmp_git} = PublicInbox::Git->new($git_dir); + $tmp_git->{-tmp} = $self->{tmp}; + $self->{git_env} = { + GIT_DIR => $git_dir, + GIT_INDEX_FILE => "$git_dir/index", + }; + prepare_index($self); +} + +sub extract_old_mode ($) { + my ($di) = @_; + if (join('', @{$di->{hdr_lines}}) =~ + /^old mode (100644|100755|120000)\b/) { + return $1; + } + '100644'; +} + +sub do_step ($) { + my ($self) = @_; + eval { + # step 1: resolve blobs to patches in the todo queue + if (my $want = pop @{$self->{todo}}) { + # this populates {patches} and {todo} + resolve_patch($self, $want); + + # step 2: then we instantiate a working tree once + # the todo queue is finally empty: + } elsif (!defined($self->{tmp_git})) { + do_git_init($self); + + # step 3: apply each patch in the stack + } elsif (scalar @{$self->{patches}}) { + do_git_apply($self); + + # step 4: execute the user-supplied callback with + # our result: (which may be undef) + # Other steps may call user_cb to terminate prematurely + # on error + } elsif (my $ucb = delete($self->{user_cb})) { + $ucb->($self->{found}->{$self->{oid_want}}); + } else { + die 'about to call user_cb twice'; # Oops :x + } + }; # eval + my $err = $@; + if ($err) { + $err =~ s/^\s*Exception:\s*//; # bad word to show users :P + dbg($self, "E: $err"); + my $ucb = delete($self->{user_cb}); + eval { $ucb->($err) } if $ucb; + } +} + +sub step_cb ($) { + my ($self) = @_; + sub { do_step($self) }; +} + +sub next_step ($) { + my ($self) = @_; + # if outside of public-inbox-httpd, caller is expected to be + # looping step_cb, anyways + my $async = $self->{psgi_env}->{'pi-httpd.async'} or return; + # PublicInbox::HTTPD::Async->new + $async->(undef, step_cb($self)); +} + +sub mark_found ($$$) { + my ($self, $oid, $found_info) = @_; + $self->{found}->{$oid} = $found_info; +} + +sub parse_ls_files ($$$$) { + my ($self, $qsp, $bref, $di) = @_; + if (my $err = $qsp->{err}) { + die "git ls-files error: $err"; + } + + my ($line, @extra) = split(/\0/, $$bref); + scalar(@extra) and die "BUG: extra files in index: <", + join('> <', @extra), ">"; + + my ($info, $file) = split(/\t/, $line, 2); + my ($mode_b, $oid_b_full, $stage) = split(/ /, $info); + if ($file ne $di->{path_b}) { + die +"BUG: index mismatch: file=$file != path_b=$di->{path_b}"; + } + + my $tmp_git = $self->{tmp_git} or die 'no git working tree'; + my (undef, undef, $size) = $tmp_git->check($oid_b_full); + defined($size) or die "check $oid_b_full failed"; + + dbg($self, "index at:\n$mode_b $oid_b_full\t$file"); + my $created = [ $tmp_git, $oid_b_full, 'blob', $size, $di ]; + mark_found($self, $di->{oid_b}, $created); + next_step($self); # onto the next patch +} + +sub start_ls_files ($$) { + my ($self, $di) = @_; + my $cmd = [qw(git ls-files -s -z)]; + my $qsp = PublicInbox::Qspawn->new($cmd, $self->{git_env}); + $qsp->psgi_qx($self->{psgi_env}, undef, sub { + my ($bref) = @_; + eval { parse_ls_files($self, $qsp, $bref, $di) }; + ERR($self, $@) if $@; + }); +} + +sub do_git_apply ($) { + my ($self) = @_; + my $dn = $self->{tmp}->dirname; + my $patches = $self->{patches}; + + # we need --ignore-whitespace because some patches are CRLF + my @cmd = (qw(git -C), $dn, qw(apply --cached --ignore-whitespace + --whitespace=warn --verbose)); + my $len = length(join(' ', @cmd)); + my $total = $self->{tot}; + my $di; # keep track of the last one for "git ls-files" + + do { + my $i = ++$self->{nr}; + $di = shift @$patches; + dbg($self, "\napplying [$i/$total] " . di_url($self, $di) . + "\n" . join('', @{$di->{hdr_lines}})); + my $path = $total + 1 - $i; + $len += length($path) + 1; + push @cmd, $path; + } while (@$patches && $len < $ARG_SIZE_MAX); + + my $rdr = { 2 => 1 }; + my $qsp = PublicInbox::Qspawn->new(\@cmd, $self->{git_env}, $rdr); + $qsp->psgi_qx($self->{psgi_env}, undef, sub { + my ($bref) = @_; + dbg($self, $$bref); + if (my $err = $qsp->{err}) { + ERR($self, "git apply error: $err"); + } + eval { start_ls_files($self, $di) }; + ERR($self, $@) if $@; + }); +} + +sub di_url ($$) { + my ($self, $di) = @_; + # note: we don't pass the PSGI env unconditionally, here, + # different inboxes can have different HTTP_HOST on the same instance. + my $ibx = $di->{ibx}; + my $env = $self->{psgi_env} if $ibx eq $self->{inboxes}->[0]; + my $url = $ibx->base_url($env); + my $mid = $di->{smsg}->{mid}; + defined($url) ? "$url$mid/" : "<$mid>"; +} + +sub resolve_patch ($$) { + my ($self, $want) = @_; + + if (scalar(@{$self->{patches}}) > $MAX_PATCH) { + die "Aborting, too many steps to $self->{oid_want}"; + } + + # see if we can find the blob in an existing git repo: + my $cur_want = $want->{oid_b}; + if ($self->{seen_oid}->{$cur_want}++) { + die "Loop detected solving $cur_want\n"; + } + if (my $existing = solve_existing($self, $want)) { + dbg($self, "found $cur_want in " . + join("\n", $existing->[0]->pub_urls)); + + if ($cur_want eq $self->{oid_want}) { # all done! + eval { delete($self->{user_cb})->($existing) }; + die "E: $@" if $@; + return; + } + mark_found($self, $cur_want, $existing); + return next_step($self); # onto patch application + } + + # scan through inboxes to look for emails which results in + # the oid we want: + my $di; + foreach my $ibx (@{$self->{inboxes}}) { + $di = find_extract_diff($self, $ibx, $want) or next; + + unshift @{$self->{patches}}, $di; + dbg($self, "found $cur_want in ".di_url($self, $di)); + + # good, we can find a path to the oid we $want, now + # lets see if we need to apply more patches: + my $src = $di->{oid_a}; + + unless ($src =~ /\A0+\z/) { + # we have to solve it using another oid, fine: + my $job = { oid_b => $src, path_b => $di->{path_a} }; + push @{$self->{todo}}, $job; + } + return next_step($self); # onto the next todo item + } + dbg($self, "could not find $cur_want"); + eval { delete($self->{user_cb})->(undef) }; # not found! :< + die "E: $@" if $@; +} + +# this API is designed to avoid creating self-referential structures; +# so user_cb never references the SolverGit object +sub new { + my ($class, $ibx, $user_cb) = @_; + + bless { + gits => $ibx->{-repo_objs}, + user_cb => $user_cb, + + # TODO: config option for searching related inboxes + inboxes => [ $ibx ], + }, $class; +} + +# recreate $oid_want using $hints +# Calls {user_cb} with: [ ::Git object, oid_full, type, size, di (diff_info) ] +# with found object, or undef if nothing was found +# Calls {user_cb} with a string error on fatal errors +sub solve ($$$$$) { + my ($self, $env, $out, $oid_want, $hints) = @_; + + # should we even get here? Probably not, but somebody + # could be manually typing URLs: + return (delete $self->{user_cb})->(undef) if $oid_want =~ /\A0+\z/; + + $self->{oid_want} = $oid_want; + $self->{out} = $out; + $self->{seen_oid} = {}; + $self->{tot} = 0; + $self->{psgi_env} = $env; + $self->{todo} = [ { %$hints, oid_b => $oid_want } ]; + $self->{patches} = []; # [ $di, $di, ... ] + $self->{found} = {}; # { abbr => [ ::Git, oid, type, size, $di ] } + $self->{tmp} = File::Temp->newdir('solver.tmp-XXXXXXXX', TMPDIR => 1); + + dbg($self, "solving $oid_want ..."); + my $step_cb = step_cb($self); + if (my $async = $env->{'pi-httpd.async'}) { + # PublicInbox::HTTPD::Async->new + $async->(undef, $step_cb); + } else { + $step_cb->() while $self->{user_cb}; + } +} + +1; diff --git a/lib/PublicInbox/UserContent.pm b/lib/PublicInbox/UserContent.pm new file mode 100644 index 00000000..df0429c3 --- /dev/null +++ b/lib/PublicInbox/UserContent.pm @@ -0,0 +1,98 @@ +# Copyright (C) 2019 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Self-updating module containing a sample CSS for client-side +# customization by users of public-inbox. Used by Makefile.PL +package PublicInbox::UserContent; +use strict; +use warnings; + +# this sub is updated automatically: +sub CSS () { + <<'_' + /* + * Dark color scheme using 216 web-safe colors, inspired + * somewhat by the default color scheme in mutt. + * It reduces eyestrain for me, and energy usage for all: + * https://en.wikipedia.org/wiki/Light-on-dark_color_scheme + */ + * { background:#000; color:#ccc } + + /* + * Underlined links add visual noise which make them hard-to-read. + * Use colors to make them stand out, instead. + */ + a { color:#69f; text-decoration:none } + a:visited { color:#96f } + + /* quoted text gets a different color */ + *.q { color:#09f } + + /* + * these may be used with cgit, too + * (cgit uses <div>, public-inbox uses <span>) + */ + *.add { color:#0ff } + *.del { color:#f0f } + *.head { color:#fff } + *.hunk { color:#c93 } + + /* + * highlight 3.x colors (tested 3.18) + * this doesn't use most of the colors available (I find too many + * colors overwhelming). So the #ccc default is commented out. + */ + .hl.num { color:#f30 } /* number */ + .hl.esc { color:#f0f } /* escape character */ + .hl.str { color:#f30 } /* string */ + .hl.ppc { color:#f0f } /* preprocessor */ + .hl.pps { color:#f30 } /* preprocessor string */ + .hl.slc { color:#09f } /* single-line comment */ + .hl.com { color:#09f } + /* .hl.opt { color:#ccc } */ + /* .hl.ipl { color:#ccc } */ + /* .hl.lin { color:#ccc } */ + .hl.kwa { color:#ff0 } + .hl.kwb { color:#0f0 } + .hl.kwc { color:#ff0 } + /* .hl.kwd { color:#ccc } */ +_ +} +# end of auto-updated sub + +# return a sample CSS +sub sample ($$) { + my ($ibx, $env) = @_; + my $url_prefix = $ibx->base_url($env); + my $preamble = <<""; +/* + * Firefox users: this goes in \$PROFILE_FOLDER/chrome/userContent.css + * where \$PROFILE_FOLDER is platform-specific + * + * cf. http://kb.mozillazine.org/UserContent.css + * http://kb.mozillazine.org/Profile_folder_-_Firefox + * + * Users of dillo can remove the entire lines with "moz-only" + * in them and place the resulting file in ~/.dillo/style.css + */ +\@-moz-document url-prefix($url_prefix) { /* moz-only */ + + $preamble . CSS() . "\n} /* moz-only */\n"; +} + +# Auto-update this file based on the contents of a CSS file: +# usage: perl -I lib __FILE__ contrib/css/216dark.css +# (See Makefile.PL) +if (scalar(@ARGV) == 1 && -r __FILE__) { + use autodie; + open my $ro, '<', $ARGV[0]; + my $css = do { local $/; <$ro> }; + $css =~ s/^([ \t]*\S)/\t$1/smg; + open my $rw, '+<', __FILE__; + my $out = do { local $/; <$rw> }; + $out =~ s/^sub CSS.*^_\n\}/sub CSS () {\n\t<<'_'\n${css}_\n}/sm; + seek $rw, 0, 0; + print $rw $out; +} + +1; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 4c1c6705..ca9b9550 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -14,6 +14,7 @@ use PublicInbox::MsgIter; use PublicInbox::Address; use PublicInbox::WwwStream; use PublicInbox::Reply; +use PublicInbox::ViewDiff qw(flush_diff); require POSIX; use Time::Local qw(timegm); @@ -28,7 +29,7 @@ sub msg_html { my ($ctx, $mime, $more, $smsg) = @_; my $hdr = $mime->header_obj; my $ibx = $ctx->{-inbox}; - my $obfs_ibx = $ctx->{-obfs_ibx} = $ibx->{obfuscate} ? $ibx : undef; + $ctx->{-obfs_ibx} = $ibx->{obfuscate} ? $ibx : undef; my $tip = _msg_html_prepare($hdr, $ctx, $more, 0); my $end = 2; PublicInbox::WwwStream->response($ctx, 200, sub { @@ -36,7 +37,7 @@ sub msg_html { if ($nr == 1) { # $more cannot be true w/o $smsg being defined: my $upfx = $more ? '../'.mid_escape($smsg->mid).'/' : ''; - $tip . multipart_text_as_html($mime, $upfx, $obfs_ibx) . + $tip . multipart_text_as_html($mime, $upfx, $ibx) . '</pre><hr>' } elsif ($more && @$more) { ++$end; @@ -81,15 +82,15 @@ sub msg_html_more { my $str = eval { my ($id, $prev, $smsg) = @$more; my $mid = $ctx->{mid}; - $smsg = $ctx->{-inbox}->smsg_mime($smsg); + my $ibx = $ctx->{-inbox}; + $smsg = $ibx->smsg_mime($smsg); my $next = $ctx->{srch}->next_by_mid($mid, \$id, \$prev); @$more = $next ? ($id, $prev, $next) : (); if ($smsg) { my $mime = $smsg->{mime}; my $upfx = '../' . mid_escape($smsg->mid) . '/'; _msg_html_prepare($mime->header_obj, $ctx, $more, $nr) . - multipart_text_as_html($mime, $upfx, - $ctx->{-obfs_ibx}) . + multipart_text_as_html($mime, $upfx, $ibx) . '</pre><hr>' } else { ''; @@ -260,7 +261,8 @@ sub index_entry { $rv .= "\n"; # scan through all parts, looking for displayable text - msg_iter($mime, sub { $rv .= add_text_body($mhref, $obfs_ibx, $_[0]) }); + my $ibx = $ctx->{-inbox}; + msg_iter($mime, sub { $rv .= add_text_body($mhref, $ibx, $_[0]) }); # add the footer $rv .= "\n<a\nhref=#$id_m\nid=e$id>^</a> ". @@ -488,11 +490,11 @@ sub thread_html { } sub multipart_text_as_html { - my ($mime, $upfx, $obfs_ibx) = @_; + my ($mime, $upfx, $ibx) = @_; my $rv = ""; # scan through all parts, looking for displayable text - msg_iter($mime, sub { $rv .= add_text_body($upfx, $obfs_ibx, $_[0]) }); + msg_iter($mime, sub { $rv .= add_text_body($upfx, $ibx, $_[0]) }); $rv; } @@ -545,7 +547,8 @@ sub attach_link ($$$$;$) { } sub add_text_body { - my ($upfx, $obfs_ibx, $p) = @_; + my ($upfx, $ibx, $p) = @_; + my $obfs_ibx = $ibx->{obfuscate} ? $ibx : undef; # $p - from msg_iter: [ Email::MIME, depth, @idx ] my ($part, $depth) = @$p; # attachment @idx is unused my $ct = $part->content_type || 'text/plain'; @@ -554,6 +557,25 @@ sub add_text_body { return attach_link($upfx, $ct, $p, $fn) unless defined $s; + # makes no difference to browsers, and don't screw up filename + # link generation in diffs with the extra '%0D' + $s =~ s/\r\n/\n/sg; + + my ($diff, $spfx); + if ($s =~ /^(?:diff|---|\+{3}) /ms) { + $diff = []; + if ($ibx->{-repo_objs}) { + my $n_slash = $upfx =~ tr!/!/!; + if ($n_slash == 0) { + $spfx = '../'; + } elsif ($n_slash == 1) { + $spfx = ''; + } else { # nslash == 2 + $spfx = '../../'; + } + } + }; + my @lines = split(/^/m, $s); $s = ''; if (defined($fn) || $depth > 0 || $err) { @@ -568,19 +590,26 @@ sub add_text_body { # show the previously buffered quote inline flush_quote(\$s, $l, \@quot) if @quot; - # regular line, OK - $l->linkify_1($cur); - $s .= $l->linkify_2(ascii_html($cur)); + if ($diff) { + push @$diff, $cur; + } else { + # regular line, OK + $l->linkify_1($cur); + $s .= $l->linkify_2(ascii_html($cur)); + } } else { + flush_diff(\$s, $spfx, $l, $diff) if $diff && @$diff; push @quot, $cur; } } if (@quot) { # ugh, top posted flush_quote(\$s, $l, \@quot); + flush_diff(\$s, $spfx, $l, $diff) if $diff && @$diff; obfuscate_addrs($obfs_ibx, $s) if $obfs_ibx; $s; } else { + flush_diff(\$s, $spfx, $l, $diff) if $diff && @$diff; obfuscate_addrs($obfs_ibx, $s) if $obfs_ibx; if ($s =~ /\n\z/s) { # common, last line ends with a newline $s; @@ -1070,7 +1099,7 @@ sub dump_topics { my $mbox = qq(<a\nhref="$href/t.mbox.gz">mbox.gz</a>); my $atom = qq(<a\nhref="$href/t.atom">Atom</a>); - my $s = "<a\nhref=\"$href/T/$anchor\"><b>$top</b></a>\n" . + my $s = "<a\nhref=\"$href/T/$anchor\">$top</a>\n" . " $ds UTC $n - $mbox / $atom\n"; for (my $i = 0; $i < scalar(@ex); $i += 2) { my $level = $ex[$i]; diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm new file mode 100644 index 00000000..a8045687 --- /dev/null +++ b/lib/PublicInbox/ViewDiff.pm @@ -0,0 +1,161 @@ +# Copyright (C) 2019 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# used by PublicInbox::View +# This adds CSS spans for diff highlighting. +# It also generates links for ViewVCS + SolverGit to show +# (or reconstruct) blobs. + +package PublicInbox::ViewDiff; +use strict; +use warnings; +use base qw(Exporter); +our @EXPORT_OK = qw(flush_diff); +use URI::Escape qw(uri_escape_utf8); +use PublicInbox::Hval qw(ascii_html); +use PublicInbox::Git qw(git_unquote); + +sub DSTATE_INIT () { 0 } +sub DSTATE_STAT () { 1 } # TODO +sub DSTATE_HEAD () { 2 } # /^diff --git /, /^index /, /^--- /, /^\+\+\+ / +sub DSTATE_CTX () { 3 } # /^ / +sub DSTATE_ADD () { 4 } # /^\+/ +sub DSTATE_DEL () { 5 } # /^\-/ +my @state2class = ( + '', # init + '', # stat + 'head', + '', # ctx + 'add', + 'del' +); + +sub UNSAFE () { "^A-Za-z0-9\-\._~/" } + +my $OID_NULL = '0{7,40}'; +my $OID_BLOB = '[a-f0-9]{7,40}'; +my $PATH_A = '"?a/.+|/dev/null'; +my $PATH_B = '"?b/.+|/dev/null'; + +sub to_html ($$) { + $_[0]->linkify_1($_[1]); + $_[0]->linkify_2(ascii_html($_[1])); +} + +# link to line numbers in blobs +sub diff_hunk ($$$$) { + my ($dctx, $spfx, $ca, $cb) = @_; + my $oid_a = $dctx->{oid_a}; + my $oid_b = $dctx->{oid_b}; + + (defined($spfx) && defined($oid_a) && defined($oid_b)) or + return "@@ $ca $cb @@"; + + my ($n) = ($ca =~ /^-(\d+)/); + $n = defined($n) ? do { ++$n; "#n$n" } : ''; + + my $rv = qq(@@ <a\nhref="$spfx$oid_a/s/$dctx->{Q}$n">$ca</a>); + + ($n) = ($cb =~ /^\+(\d+)/); + $n = defined($n) ? do { ++$n; "#n$n" } : ''; + + $rv .= qq( <a\nhref="$spfx$oid_b/s/$dctx->{Q}$n">$cb</a> @@); +} + +sub oid ($$$) { + my ($dctx, $spfx, $oid) = @_; + defined($spfx) ? qq(<a\nhref="$spfx$oid/s/$dctx->{Q}">$oid</a>) : $oid; +} + +sub to_state ($$$) { + my ($dst, $state, $new_state) = @_; + $$dst .= '</span>' if $state2class[$state]; + $_[1] = $new_state; + my $class = $state2class[$new_state] or return; + $$dst .= qq(<span\nclass="$class">); +} + +sub flush_diff ($$$$) { + my ($dst, $spfx, $linkify, $diff) = @_; + my $state = DSTATE_INIT; + my $dctx = { Q => '' }; # {}, keys: oid_a, oid_b, path_a, path_b + + foreach my $s (@$diff) { + if ($s =~ /^ /) { + if ($state2class[$state]) { + to_state($dst, $state, DSTATE_CTX); + } + $$dst .= to_html($linkify, $s); + } elsif ($s =~ /^-- $/) { # email signature begins + $state == DSTATE_INIT or + to_state($dst, $state, DSTATE_INIT); + $$dst .= $s; + } elsif ($s =~ m!^diff --git ($PATH_A) ($PATH_B)$!) { + if ($state != DSTATE_HEAD) { + my ($pa, $pb) = ($1, $2); + to_state($dst, $state, DSTATE_HEAD); + $pa = (split('/', git_unquote($pa), 2))[1]; + $pb = (split('/', git_unquote($pb), 2))[1]; + $dctx = { + Q => "?b=".uri_escape_utf8($pb, UNSAFE), + }; + if ($pa ne $pb) { + $dctx->{Q} .= + "&a=".uri_escape_utf8($pa, UNSAFE); + } + } + $$dst .= to_html($linkify, $s); + } elsif ($s =~ s/^(index $OID_NULL\.\.)($OID_BLOB)\b//o) { + $$dst .= $1 . oid($dctx, $spfx, $2); + $dctx = { Q => '' }; + $$dst .= to_html($linkify, $s) ; + } elsif ($s =~ s/^index ($OID_BLOB)(\.\.$OID_NULL)\b//o) { + $$dst .= 'index ' . oid($dctx, $spfx, $1) . $2; + $dctx = { Q => '' }; + $$dst .= to_html($linkify, $s); + } elsif ($s =~ /^index ($OID_BLOB)\.\.($OID_BLOB)/o) { + $dctx->{oid_a} = $1; + $dctx->{oid_b} = $2; + $$dst .= to_html($linkify, $s); + } elsif ($s =~ s/^@@ (\S+) (\S+) @@//) { + $$dst .= '</span>' if $state2class[$state]; + $$dst .= qq(<span\nclass="hunk">); + $$dst .= diff_hunk($dctx, $spfx, $1, $2); + $$dst .= '</span>'; + $state = DSTATE_CTX; + $$dst .= to_html($linkify, $s); + } elsif ($s =~ m!^--- $PATH_A! || + $s =~ m!^\+{3} $PATH_B!) { + # color only (no oid link) + $state == DSTATE_INIT and + to_state($dst, $state, DSTATE_HEAD); + $$dst .= to_html($linkify, $s); + } elsif ($s =~ /^\+/) { + if ($state != DSTATE_ADD && $state != DSTATE_INIT) { + to_state($dst, $state, DSTATE_ADD); + } + $$dst .= to_html($linkify, $s); + } elsif ($s =~ /^-/) { + if ($state != DSTATE_DEL && $state != DSTATE_INIT) { + to_state($dst, $state, DSTATE_DEL); + } + $$dst .= to_html($linkify, $s); + # ignore the following lines in headers: + } elsif ($s =~ /^(?:dis)similarity index/ || + $s =~ /^(?:old|new) mode/ || + $s =~ /^(?:deleted|new) file mode/ || + $s =~ /^(?:copy|rename) (?:from|to) / || + $s =~ /^(?:dis)?similarity index /) { + $$dst .= to_html($linkify, $s); + } else { + $state == DSTATE_INIT or + to_state($dst, $state, DSTATE_INIT); + $$dst .= to_html($linkify, $s); + } + } + @$diff = (); + $$dst .= '</span>' if $state2class[$state]; + undef; +} + +1; diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm new file mode 100644 index 00000000..85edf22f --- /dev/null +++ b/lib/PublicInbox/ViewVCS.pm @@ -0,0 +1,146 @@ +# Copyright (C) 2019 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# show any VCS object, similar to "git show" +# FIXME: we only show blobs for now +# +# This can use a "solver" to reconstruct blobs based on git +# patches (with abbreviated OIDs in the header). However, the +# abbreviated OIDs must match exactly what's in the original +# email (unless a normal code repo already has the blob). +# +# In other words, we can only reliably reconstruct blobs based +# on links generated by ViewDiff (and only if the emailed +# patches apply 100% cleanly to published blobs). + +package PublicInbox::ViewVCS; +use strict; +use warnings; +use Encode qw(find_encoding); +use PublicInbox::SolverGit; +use PublicInbox::WwwStream; +use PublicInbox::Linkify; +use PublicInbox::Hval qw(ascii_html to_filename src_escape); +my $hl = eval { + require PublicInbox::HlMod; + PublicInbox::HlMod->new; +}; + +# we need to trigger highlight::CodeGenerator::deleteInstance +# in HlMod::DESTROY before the rest of Perl shuts down to avoid +# a segfault at shutdown +END { $hl = undef }; + +my %QP_MAP = ( A => 'oid_a', B => 'oid_b', a => 'path_a', b => 'path_b' ); +my $max_size = 1024 * 1024; # TODO: configurable +my $enc_utf8 = find_encoding('UTF-8'); + +sub html_page ($$$) { + my ($ctx, $code, $strref) = @_; + my $wcb = delete $ctx->{-wcb}; + $ctx->{-upfx} = '../../'; # from "/$INBOX/$OID/s/" + my $res = PublicInbox::WwwStream->response($ctx, $code, sub { + my ($nr, undef) = @_; + $nr == 1 ? $$strref : undef; + }); + $wcb->($res); +} + +sub solve_result { + my ($ctx, $res, $log, $hints, $fn) = @_; + + unless (seek($log, 0, 0)) { + $ctx->{env}->{'psgi.errors'}->print("seek(log): $!\n"); + return html_page($ctx, 500, \'seek error'); + } + $log = do { local $/; <$log> }; + + my $ref = ref($res); + my $l = PublicInbox::Linkify->new; + $l->linkify_1($log); + $log = '<pre>debug log:</pre><hr /><pre>' . + $l->linkify_2(ascii_html($log)) . '</pre>'; + + $res or return html_page($ctx, 404, \$log); + $ref eq 'ARRAY' or return html_page($ctx, 500, \$log); + + my ($git, $oid, $type, $size, $di) = @$res; + if ($size > $max_size) { + # TODO: stream the raw file if it's gigantic, at least + $log = '<pre><b>Too big to show</b></pre>' . $log; + return html_page($ctx, 500, \$log); + } + + my $blob = $git->cat_file($oid); + if (!$blob) { # WTF? + my $e = "Failed to retrieve generated blob ($oid)"; + $ctx->{env}->{'psgi.errors'}->print("$e ($git->{git_dir})\n"); + $log = "<pre><b>$e</b></pre>" . $log; + return html_page($ctx, 500, \$log); + } + + my $binary = index($$blob, "\0") >= 0; + if ($fn) { + my $h = [ 'Content-Length', $size, 'Content-Type' ]; + push(@$h, ($binary ? 'application/octet-stream' : 'text/plain')); + return delete($ctx->{-wcb})->([200, $h, [ $$blob ]]); + } + + my $path = to_filename($di->{path_b} || $hints->{path_b} || 'blob'); + my $raw_link = "(<a\nhref=$path>raw</a>)"; + if ($binary) { + $log = "<pre>$oid $type $size bytes (binary)" . + " $raw_link</pre>" . $log; + return html_page($ctx, 200, \$log); + } + + $$blob = $enc_utf8->decode($$blob); + my $nl = ($$blob =~ tr/\n/\n/); + my $pad = length($nl); + + $l->linkify_1($$blob); + my $ok = $hl->do_hl($blob, $path) if $hl; + if ($ok) { + $$ok = $enc_utf8->decode($$ok); + src_escape($$ok); + $blob = $ok; + } else { + $$blob = ascii_html($$blob); + } + + # using some of the same CSS class names and ids as cgit + $log = "<pre>$oid $type $size bytes $raw_link</pre>" . + "<hr /><table\nclass=blob>". + "<tr><td\nclass=linenumbers><pre>" . join('', map { + sprintf("<a id=n$_ href=#n$_>% ${pad}u</a>\n", $_) + } (1..$nl)) . '</pre></td>' . + '<td><pre> </pre></td>'. # pad for non-CSS users + "<td\nclass=lines><pre\nstyle='white-space:pre'><code>" . + $l->linkify_2($$blob) . + '</code></pre></td></tr></table>' . $log; + + html_page($ctx, 200, \$log); +} + +sub show ($$;$) { + my ($ctx, $oid_b, $fn) = @_; + my $qp = $ctx->{qp}; + my $hints = {}; + while (my ($from, $to) = each %QP_MAP) { + defined(my $v = $qp->{$from}) or next; + $hints->{$to} = $v; + } + + open my $log, '+>', undef or die "open: $!"; + my $solver = PublicInbox::SolverGit->new($ctx->{-inbox}, sub { + solve_result($ctx, $_[0], $log, $hints, $fn); + }); + + # PSGI server will call this and give us a callback + sub { + $ctx->{-wcb} = $_[0]; # HTTP write callback + $solver->solve($ctx->{env}, $log, $oid_b, $hints); + }; +} + +1; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 3562e46c..406802a9 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -6,6 +6,7 @@ # We focus on the lowest common denominators here: # - targeted at text-only console browsers (w3m, links, etc..) # - Only basic HTML, CSS only for line-wrapping <pre> text content for GUIs +# and diff/syntax-highlighting (optional) # - No JavaScript, graphics or icons allowed. # - Must not rely on static content # - UTF-8 is only for user-content, 7-bit US-ASCII for us @@ -19,12 +20,14 @@ use URI::Escape qw(uri_unescape); use PublicInbox::MID qw(mid_escape); require PublicInbox::Git; use PublicInbox::GitHTTPBackend; +use PublicInbox::UserContent; # TODO: consider a routing tree now that we have more endpoints: our $INBOX_RE = qr!\A/([\w\-][\w\.\-]*)!; our $MID_RE = qr!([^/]+)!; our $END_RE = qr!(T/|t/|t\.mbox(?:\.gz)?|t\.atom|raw|)!; our $ATTACH_RE = qr!(\d[\.\d]*)-([[:alnum:]][\w\.-]+[[:alnum:]])!i; +our $OID_RE = qr![a-f0-9]{7,40}!; sub new { my ($class, $pi_config) = @_; @@ -117,7 +120,14 @@ sub call { r301($ctx, $1, $2); } elsif ($path_info =~ m!$INBOX_RE/_/text(?:/(.*))?\z!o) { get_text($ctx, $1, $2); - + } elsif ($path_info =~ m!$INBOX_RE/([\w\-\.]+)\.css\z!o) { + get_css($ctx, $1, $2); + } elsif ($path_info =~ m!$INBOX_RE/($OID_RE)/s/\z!o) { + get_vcs_object($ctx, $1, $2); + } elsif ($path_info =~ m!$INBOX_RE/($OID_RE)/s/([\w\.\-]+)\z!o) { + get_vcs_object($ctx, $1, $2, $3); + } elsif ($path_info =~ m!$INBOX_RE/($OID_RE)/s\z!o) { + r301($ctx, $1, $2, 's/'); # convenience redirects order matters } elsif ($path_info =~ m!$INBOX_RE/([^/]{2,})\z!o) { r301($ctx, $1, $2); @@ -129,6 +139,7 @@ sub call { # for CoW-friendliness, MOOOOO! sub preload { + my ($self) = @_; require PublicInbox::Feed; require PublicInbox::View; require PublicInbox::SearchThread; @@ -141,6 +152,9 @@ sub preload { PublicInbox::NewsWWW)) { eval "require $_;"; } + if (ref($self)) { + $self->stylesheets_prepare($_) for ('', '../', '../../'); + } } # private functions below @@ -259,6 +273,18 @@ sub get_text { PublicInbox::WwwText::get_text($ctx, $key); } +# show git objects (blobs and commits) +# /$INBOX/_/$OBJECT_ID/show +# /$INBOX/_/${OBJECT_ID}_${FILENAME} +# KEY may contain slashes +sub get_vcs_object ($$$;$) { + my ($ctx, $inbox, $oid, $filename) = @_; + my $r404 = invalid_inbox($ctx, $inbox); + return $r404 if $r404; + require PublicInbox::ViewVCS; + PublicInbox::ViewVCS::show($ctx, $oid, $filename); +} + sub ctx_get { my ($ctx, $key) = @_; my $val = $ctx->{$key}; @@ -446,4 +472,128 @@ sub get_attach { PublicInbox::WwwAttach::get_attach($ctx, $idx, $fn); } +# User-generated content (UGC) may have excessively long lines +# and screw up rendering on some browsers, so we use pre-wrap. +# +# We also force everything to the same scaled font-size because GUI +# browsers (tested both Firefox and surf (webkit)) uses a larger font +# for the Search <form> element than the rest of the page. Font size +# uniformity is important to people who rely on gigantic fonts. +# Finally, we use monospace to ensure the Search field and button +# has the same size and spacing as everything else which is +# <pre>-formatted anyways. +our $STYLE = 'pre{white-space:pre-wrap}*{font-size:100%;font-family:monospace}'; + +sub stylesheets_prepare ($$) { + my ($self, $upfx) = @_; + my $mini = eval { + require CSS::Minifier; + sub { CSS::Minifier::minify(input => $_[0]) }; + } || eval { + require CSS::Minifier::XS; + sub { CSS::Minifier::XS::minify($_[0]) }; + } || sub { $_[0] }; + + my $css_map = {}; + my $stylesheets = $self->{pi_config}->{css} || []; + my $links = []; + my $inline_ok = 1; + + foreach my $s (@$stylesheets) { + my $attr = {}; + local $_ = $s; + foreach my $k (qw(media title href)) { + if (s/\s*$k='([^']+)'// || s/\s*$k=(\S+)//) { + $attr->{$k} = $1; + } + } + + if (defined $attr->{href}) { + $inline_ok = 0; + } else { + open(my $fh, '<', $_) or do { + warn "failed to open $_: $!\n"; + next; + }; + my ($key) = (m!([^/]+?)(?:\.css)?\z!i); + my $ctime = 0; + my $local = do { local $/; <$fh> }; + if ($local =~ /\S/) { + $ctime = sprintf('%x',(stat($fh))[10]); + $local = $mini->($local); + } + $css_map->{$key} = $local; + $attr->{href} = "$upfx$key.css?$ctime"; + if (defined($attr->{title})) { + $inline_ok = 0; + } elsif (($attr->{media}||'screen') eq 'screen') { + $attr->{-inline} = $local; + } + } + push @$links, $attr; + } + + my $buf = "<style>$STYLE"; + if ($inline_ok) { + my @ext; # for media=print and whatnot + foreach my $attr (@$links) { + if (defined(my $str = delete $attr->{-inline})) { + $buf .= $str; + } else { + push @ext, $attr; + } + } + $links = \@ext; + } + $buf .= '</style>'; + + if (@$links) { + foreach my $attr (@$links) { + delete $attr->{-inline}; + $buf .= "<link\ntype=text/css\nrel=stylesheet"; + while (my ($k, $v) = each %$attr) { + $v = qq{"$v"} if $v =~ /[\s=]/; + $buf .= qq{\n$k=$v}; + } + $buf .= ' />'; + } + $self->{"-style-$upfx"} = $buf; + } else { + $self->{-style_inline} = $buf; + } + $self->{-css_map} = $css_map; +} + +# returns an HTML fragment with <style> or <link> tags in them +# Called by WwwStream by nearly every HTML page +sub style { + my ($self, $upfx) = @_; + $self->{-style_inline} || $self->{"-style-$upfx"} || do { + stylesheets_prepare($self, $upfx); + $self->{-style_inline} || $self->{"-style-$upfx"} + }; +} + +# /$INBOX/$KEY.css endpoint +# CSS is configured globally for all inboxes, but we access them on +# a per-inbox basis. This allows administrators to setup per-inbox +# static routes to intercept the request before it hits PSGI +sub get_css ($$$) { + my ($ctx, $inbox, $key) = @_; + my $r404 = invalid_inbox($ctx, $inbox); + return $r404 if $r404; + my $self = $ctx->{www}; + my $css_map = $self->{-css_map} || stylesheets_prepare($self, ''); + my $css = $css_map->{$key}; + if (!defined($css) && $key eq 'userContent') { + my $env = $ctx->{env}; + $css = PublicInbox::UserContent::sample($ctx->{-inbox}, $env); + } + defined $css or return r404(); + my $h = [ 'Content-Length', bytes::length($css), + 'Content-Type', 'text/css' ]; + PublicInbox::GitHTTPBackend::cache_one_year($h); + [ 200, $h, [ $css ] ]; +} + 1; diff --git a/lib/PublicInbox/WwwHighlight.pm b/lib/PublicInbox/WwwHighlight.pm new file mode 100644 index 00000000..09fc48ab --- /dev/null +++ b/lib/PublicInbox/WwwHighlight.pm @@ -0,0 +1,74 @@ +# Copyright (C) 2019 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Standalone PSGI app to provide syntax highlighting as-a-service +# via "highlight" Perl module ("libhighlight-perl" in Debian). +# +# This allows exposing highlight as a persistent HTTP service for +# other scripts via HTTP PUT requests. PATH_INFO will be used +# as a hint for detecting the language for highlight. +# +# The following example using curl(1) will do the right thing +# regarding the file extension: +# +# curl -HExpect: -T /path/to/file http://example.com/ +# +# You can also force a file extension by giving a path +# (in this case, "c") via: +# +# curl -HExpect: -T /path/to/file http://example.com/x.c + +package PublicInbox::WwwHighlight; +use strict; +use warnings; +use HTTP::Status qw(status_message); +use parent qw(PublicInbox::HlMod); + +# TODO: support highlight(1) for distros which don't package the +# SWIG extension. Also, there may be admins who don't want to +# have ugly SWIG-generated code in a long-lived Perl process. + +sub r ($) { + my ($code) = @_; + my $msg = status_message($code); + my $len = length($msg); + [ $code, [qw(Content-Type text/plain Content-Length), $len], [$msg] ] +} + +# another slurp API hogging up all my memory :< +# This is capped by whatever the PSGI server allows, +# $ENV{GIT_HTTP_MAX_REQUEST_BUFFER} for PublicInbox::HTTP (10 MB) +sub read_in_full ($) { + my ($env) = @_; + + my $in = $env->{'psgi.input'}; + my $off = 0; + my $buf = ''; + my $len = $env->{CONTENT_LENGTH} || 8192; + while (1) { + my $r = $in->read($buf, $len, $off); + last unless defined $r; + return \$buf if $r == 0; + $off += $r; + } + $env->{'psgi.errors'}->print("input read error: $!\n"); + undef; +} + +# entry point for PSGI +sub call { + my ($self, $env) = @_; + my $req_method = $env->{REQUEST_METHOD}; + + return r(405) if $req_method ne 'PUT'; + + my $bref = read_in_full($env) or return r(500); + $bref = $self->do_hl($bref, $env->{PATH_INFO}); + + my $h = [ 'Content-Type', 'text/html; charset=UTF-8' ]; + push @$h, 'Content-Length', bytes::length($$bref); + + [ 200, $h, [ $$bref ] ] +} + +1; diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm index e548f00f..8ae35c73 100644 --- a/lib/PublicInbox/WwwStream.pm +++ b/lib/PublicInbox/WwwStream.pm @@ -38,10 +38,12 @@ sub _html_top ($) { my $title = $ctx->{-title_html} || $desc; my $upfx = $ctx->{-upfx} || ''; my $help = $upfx.'_/text/help'; + my $color = $upfx.'_/text/color'; my $atom = $ctx->{-atom} || $upfx.'new.atom'; my $tip = $ctx->{-html_tip} || ''; my $top = "<b>$desc</b>"; my $links = "<a\nhref=\"$help\">help</a> / ". + "<a\nhref=\"$color\">color</a> / ". "<a\nhref=\"$atom\">Atom feed</a>"; if ($obj->search) { my $q_val = $ctx->{-q_value_html}; @@ -65,7 +67,7 @@ sub _html_top ($) { "<html><head><title>$title</title>" . "<link\nrel=alternate\ntitle=\"Atom feed\"\n". "href=\"$atom\"\ntype=\"application/atom+xml\"/>" . - PublicInbox::Hval::STYLE . + $ctx->{www}->style($upfx) . "</head><body>". $top . $tip; } diff --git a/lib/PublicInbox/WwwText.pm b/lib/PublicInbox/WwwText.pm index b5874cf6..d3413ad7 100644 --- a/lib/PublicInbox/WwwText.pm +++ b/lib/PublicInbox/WwwText.pm @@ -88,9 +88,44 @@ sub _srch_prefix ($$) { 1; } +sub _colors_help ($$) { + my ($ctx, $txt) = @_; + my $ibx = $ctx->{-inbox}; + my $base_url = $ibx->base_url($ctx->{env}); + $$txt .= "color customization for $base_url\n"; + $$txt .= <<EOF; + +public-inbox provides a stable set of CSS classes for users to +customize colors for highlighting diffs and code. + +Users of browsers such as dillo, Firefox, or some browser +extensions may start by downloading the following sample CSS file +to control the colors they see: + + ${base_url}userContent.css + +CSS classes +----------- + + span.q - quoted text in email messages + +For diff highlighting, we try to match class names with those +used by cgit: https://git.zx2c4.com/cgit/ + + span.add - diff post-image lines + + span.del - diff pre-image lines + + span.head - diff header (metainformation) + + span.hunk - diff hunk-header + +EOF +} sub _default_text ($$$) { my ($ctx, $key, $txt) = @_; + return _colors_help($ctx, $txt) if $key eq 'color'; return if $key ne 'help'; # TODO more keys? my $ibx = $ctx->{-inbox}; |