diff options
-rw-r--r-- | MANIFEST | 4 | ||||
-rw-r--r-- | examples/highlight.psgi | 13 | ||||
-rw-r--r-- | lib/PublicInbox/HlMod.pm | 126 | ||||
-rw-r--r-- | lib/PublicInbox/WwwHighlight.pm | 73 | ||||
-rw-r--r-- | t/hl_mod.t | 54 |
5 files changed, 270 insertions, 0 deletions
@@ -38,6 +38,7 @@ examples/apache2_perl.conf examples/apache2_perl_old.conf examples/cgi-webrick.rb examples/cgit-commit-filter.lua +examples/highlight.psgi examples/logrotate.conf examples/public-inbox-config examples/public-inbox-httpd.socket @@ -74,6 +75,7 @@ lib/PublicInbox/GitHTTPBackend.pm lib/PublicInbox/HTTP.pm lib/PublicInbox/HTTPD.pm lib/PublicInbox/HTTPD/Async.pm +lib/PublicInbox/HlMod.pm lib/PublicInbox/Hval.pm lib/PublicInbox/Import.pm lib/PublicInbox/Inbox.pm @@ -120,6 +122,7 @@ lib/PublicInbox/WWW.pod lib/PublicInbox/WatchMaildir.pm lib/PublicInbox/WwwAtomStream.pm lib/PublicInbox/WwwAttach.pm +lib/PublicInbox/WwwHighlight.pm lib/PublicInbox/WwwStream.pm lib/PublicInbox/WwwText.pm sa_config/Makefile @@ -170,6 +173,7 @@ t/git-http-backend.psgi t/git-http-backend.t t/git.fast-import-data t/git.t +t/hl_mod.t t/html_index.t t/httpd-corner.psgi t/httpd-corner.t diff --git a/examples/highlight.psgi b/examples/highlight.psgi new file mode 100644 index 00000000..244b128e --- /dev/null +++ b/examples/highlight.psgi @@ -0,0 +1,13 @@ +#!/usr/bin/perl -w +# Copyright (C) 2019 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Usage: plackup [OPTIONS] /path/to/this/file +# A startup command for development which monitors changes: +# plackup -I lib -o 127.0.0.1 -R lib -r examples/highlight.psgi +use strict; +use warnings; +use PublicInbox::WwwHighlight; +use Plack::Builder; +my $hl = PublicInbox::WwwHighlight->new; +builder { sub { $hl->call(@_) }; } diff --git a/lib/PublicInbox/HlMod.pm b/lib/PublicInbox/HlMod.pm new file mode 100644 index 00000000..5cbfb298 --- /dev/null +++ b/lib/PublicInbox/HlMod.pm @@ -0,0 +1,126 @@ +# Copyright (C) 2019 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# I have no idea how stable or safe this is for handling untrusted +# input, but it seems to have been around for a while, and the +# highlight(1) executable is supported by gitweb and cgit. +# +# I'm also unsure about API stability, but highlight 3.x seems to +# have been around a few years and ikiwiki (apparently the only +# user of the SWIG/Perl bindings, at least in Debian) hasn't needed +# major changes to support it in recent years. +# +# Some code stolen from ikiwiki (GPL-2.0+) +# wrapper for SWIG-generated highlight.pm bindings +package PublicInbox::HlMod; +use strict; +use warnings; +use highlight; # SWIG-generated stuff + +sub _parse_filetypes ($) { + my $ft_conf = $_[0]->searchFile('filetypes.conf') or + die 'filetypes.conf not found by highlight'; + open my $fh, '<', $ft_conf or die "failed to open($ft_conf): $!"; + local $/; + my $cfg = <$fh>; + my %ext2lang; + my @shebang; # order matters + + # Hrm... why isn't this exposed by the highlight API? + # highlight >= 3.2 format (bind-style) (from ikiwiki) + while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+ + Extensions\s*=\s*{([^}]+)}/sgx) { + my $lang = $1; + foreach my $bit (split(/,/, $2)) { + $bit =~ s/.*"(.*)".*/$1/s; + $ext2lang{$bit} = $lang; + } + } + # AFAIK, all the regexps used by in filetypes.conf distributed + # by highlight work as Perl REs + while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+ + Shebang\s*=\s*\[\s*\[([^}]+)\s*\]\s*\]\s*}\s*,/sgx) { + my ($lang, $re) = ($1, $2); + eval { + my $perl_re = qr/$re/; + push @shebang, [ $lang, $perl_re ]; + }; + if ($@) { + warn "$lang shebang=[[$re]] did not work in Perl: $@"; + } + } + (\%ext2lang, \@shebang); +} + +sub new { + my ($class) = @_; + my $dir = highlight::DataDir->new; + $dir->initSearchDirectories(''); + my ($ext2lang, $shebang) = _parse_filetypes($dir); + bless { + -dir => $dir, + -ext2lang => $ext2lang, + -shebang => $shebang, + }, $class; +} + +sub _shebang2lang ($$) { + my ($self, $str) = @_; + my $shebang = $self->{-shebang}; + foreach my $s (@$shebang) { + return $s->[0] if $$str =~ $s->[1]; + } + undef; +} + +sub _path2lang ($$) { + my ($self, $path) = @_; + my ($ext) = ($path =~ m!([^\\/\.]+)\z!); + $ext = lc($ext); + $self->{-ext2lang}->{$ext} || $ext; +} + +sub do_hl { + my ($self, $str, $path) = @_; + my $lang = _path2lang($self, $path) if defined $path; + my $dir = $self->{-dir}; + my $langpath; + if (defined $lang) { + $langpath = $dir->getLangPath("$lang.lang") or return; + $langpath = undef unless -f $langpath; + } + unless (defined $langpath) { + $lang = _shebang2lang($self, $str) or return; + $langpath = $dir->getLangPath("$lang.lang") or return; + $langpath = undef unless -f $langpath; + } + return unless defined $langpath; + + my $gen = $self->{$langpath} ||= do { + my $g = highlight::CodeGenerator::getInstance($highlight::HTML); + $g->setFragmentCode(1); # generate html fragment + $g->setHTMLEnclosePreTag(1); # include <pre> + + # whatever theme works + my $themepath = $dir->getThemePath('print.theme'); + $g->initTheme($themepath); + $g->loadLanguage($langpath); + $g->setEncoding('utf-8'); + $g; + }; + \($gen->generateString($$str)) +} + +# SWIG instances aren't reference-counted, but $self is; +# so we need to delete all the CodeGenerator instances manually +# at our own destruction +sub DESTROY { + my ($self) = @_; + foreach my $gen (values %$self) { + if (ref($gen) eq 'highlight::CodeGenerator') { + highlight::CodeGenerator::deleteInstance($gen); + } + } +} + +1; diff --git a/lib/PublicInbox/WwwHighlight.pm b/lib/PublicInbox/WwwHighlight.pm new file mode 100644 index 00000000..3d6ca03b --- /dev/null +++ b/lib/PublicInbox/WwwHighlight.pm @@ -0,0 +1,73 @@ +# Copyright (C) 2019 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Standalone PSGI app to provide syntax highlighting as-a-service +# via "highlight" Perl module ("libhighlight-perl" in Debian). +# +# This allows exposing highlight as a persistent HTTP service for +# other scripts via HTTP PUT requests. PATH_INFO will be used +# as a hint for detecting the language for highlight. +# +# The following example using curl(1) will do the right thing +# regarding the file extension: +# +# curl -HExpect: -T /path/to/file http://example.com/ +# +# You can also force a file extension by giving a path +# (in this case, "c") via: +# +# curl -HExpect: -T /path/to/file http://example.com/x.c + +package PublicInbox::WwwHighlight; +use strict; +use warnings; +use HTTP::Status qw(status_message); +use parent qw(PublicInbox::HlMod); + +# TODO: support highlight(1) for distros which don't package the +# SWIG extension. Also, there may be admins who don't want to +# have ugly SWIG-generated code in a long-lived Perl process. + +sub r ($) { + my ($code) = @_; + my $msg = status_message($code); + my $len = length($msg); + [ $code, [qw(Content-Type text/plain Content-Length), $len], [$msg] ] +} + +# another slurp API hogging up all my memory :< +# This is capped by whatever the PSGI server allows, +# $ENV{GIT_HTTP_MAX_REQUEST_BUFFER} for PublicInbox::HTTP (10 MB) +sub read_in_full ($) { + my ($env) = @_; + + my $in = $env->{'psgi.input'}; + my $off = 0; + my $buf = ''; + my $len = $env->{CONTENT_LENGTH} || 8192; + while (1) { + my $r = $in->read($buf, $len, $off); + last unless defined $r; + return \$buf if $r == 0; + $off += $r; + } + $env->{'psgi.errors'}->print("input read error: $!\n"); +} + +# entry point for PSGI +sub call { + my ($self, $env) = @_; + my $req_method = $env->{REQUEST_METHOD}; + + return r(405) if $req_method ne 'PUT'; + + my $bref = read_in_full($env) or return r(500); + $bref = $self->do_hl($bref, $env->{PATH_INFO}); + + my $h = [ 'Content-Type', 'text/html; charset=UTF-8' ]; + push @$h, 'Content-Length', bytes::length($$bref); + + [ 200, $h, [ $$bref ] ] +} + +1; diff --git a/t/hl_mod.t b/t/hl_mod.t new file mode 100644 index 00000000..b8b8eb9d --- /dev/null +++ b/t/hl_mod.t @@ -0,0 +1,54 @@ +#!/usr/bin/perl -w +# Copyright (C) 2019 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +eval { require highlight } or + plan skip_all => 'failed to load highlight.pm'; +use_ok 'PublicInbox::HlMod'; +my $hls = PublicInbox::HlMod->new; +ok($hls, 'initialized OK'); +is($hls->_shebang2lang(\"#!/usr/bin/perl -w\n"), 'perl', 'perl shebang OK'); +is($hls->{-ext2lang}->{'pm'}, 'perl', '.pm suffix OK'); +is($hls->{-ext2lang}->{'pl'}, 'perl', '.pl suffix OK'); +is($hls->_path2lang('Makefile'), 'make', 'Makefile OK'); +my $str = do { local $/; open(my $fh, __FILE__); <$fh> }; +my $orig = $str; + +{ + my $ref = $hls->do_hl(\$str, 'foo.perl'); + is(ref($ref), 'SCALAR', 'got a scalar reference back'); + like($$ref, qr/I can see you!/, 'we can see ourselves in output'); + + use PublicInbox::Spawn qw(which); + if (eval { require IPC::Run } && which('w3m')) { + require File::Temp; + my $cmd = [ qw(w3m -T text/html -dump -config /dev/null) ]; + my ($out, $err) = ('', ''); + IPC::Run::run($cmd, $ref, \$out, \$err); + # expand tabs and normalize whitespace, + # w3m doesn't preserve tabs + $orig =~ s/\t/ /gs; + $out =~ s/\s*\z//sg; + $orig =~ s/\s*\z//sg; + is($out, $orig, 'w3m output matches'); + } +} + +my $nr = $ENV{TEST_MEMLEAK}; +if ($nr && -r "/proc/$$/status") { + my $fh; + open $fh, '<', "/proc/$$/status"; + diag "starting at memtest at ".join('', grep(/VmRSS:/, <$fh>)); + PublicInbox::HlMod->new->do_hl(\$orig) for (1..$nr); + open $fh, '<', "/proc/$$/status"; + diag "creating $nr instances: ".join('', grep(/VmRSS:/, <$fh>)); + my $hls = PublicInbox::HlMod->new; + $hls->do_hl(\$orig) for (1..$nr); + $hls = undef; + open $fh, '<', "/proc/$$/status"; + diag "reused instance $nr times: ".join('', grep(/VmRSS:/, <$fh>)); +} + +done_testing; |