user/dev discussion of public-inbox itself
 help / color / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 35/37] highlight: initial wrapper and PSGI service
Date: Mon, 21 Jan 2019 20:52:51 +0000
Message-ID: <20190121205253.10455-36-e@80x24.org> (raw)
In-Reply-To: <20190121205253.10455-1-e@80x24.org>

I'll probably expose the PSGI service for cgit;
but it could be useful to others as well.
---
 MANIFEST                        |   4 +
 examples/highlight.psgi         |  13 ++++
 lib/PublicInbox/HlMod.pm        | 126 ++++++++++++++++++++++++++++++++
 lib/PublicInbox/WwwHighlight.pm |  73 ++++++++++++++++++
 t/hl_mod.t                      |  54 ++++++++++++++
 5 files changed, 270 insertions(+)
 create mode 100644 examples/highlight.psgi
 create mode 100644 lib/PublicInbox/HlMod.pm
 create mode 100644 lib/PublicInbox/WwwHighlight.pm
 create mode 100644 t/hl_mod.t

diff --git a/MANIFEST b/MANIFEST
index 53d51b2..e627206 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -38,6 +38,7 @@ examples/apache2_perl.conf
 examples/apache2_perl_old.conf
 examples/cgi-webrick.rb
 examples/cgit-commit-filter.lua
+examples/highlight.psgi
 examples/logrotate.conf
 examples/public-inbox-config
 examples/public-inbox-httpd.socket
@@ -74,6 +75,7 @@ lib/PublicInbox/GitHTTPBackend.pm
 lib/PublicInbox/HTTP.pm
 lib/PublicInbox/HTTPD.pm
 lib/PublicInbox/HTTPD/Async.pm
+lib/PublicInbox/HlMod.pm
 lib/PublicInbox/Hval.pm
 lib/PublicInbox/Import.pm
 lib/PublicInbox/Inbox.pm
@@ -120,6 +122,7 @@ lib/PublicInbox/WWW.pod
 lib/PublicInbox/WatchMaildir.pm
 lib/PublicInbox/WwwAtomStream.pm
 lib/PublicInbox/WwwAttach.pm
+lib/PublicInbox/WwwHighlight.pm
 lib/PublicInbox/WwwStream.pm
 lib/PublicInbox/WwwText.pm
 sa_config/Makefile
@@ -170,6 +173,7 @@ t/git-http-backend.psgi
 t/git-http-backend.t
 t/git.fast-import-data
 t/git.t
+t/hl_mod.t
 t/html_index.t
 t/httpd-corner.psgi
 t/httpd-corner.t
diff --git a/examples/highlight.psgi b/examples/highlight.psgi
new file mode 100644
index 0000000..244b128
--- /dev/null
+++ b/examples/highlight.psgi
@@ -0,0 +1,13 @@
+#!/usr/bin/perl -w
+# Copyright (C) 2019 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+#
+# Usage: plackup [OPTIONS] /path/to/this/file
+# A startup command for development which monitors changes:
+#	plackup -I lib -o 127.0.0.1 -R lib -r examples/highlight.psgi
+use strict;
+use warnings;
+use PublicInbox::WwwHighlight;
+use Plack::Builder;
+my $hl = PublicInbox::WwwHighlight->new;
+builder { sub { $hl->call(@_) }; }
diff --git a/lib/PublicInbox/HlMod.pm b/lib/PublicInbox/HlMod.pm
new file mode 100644
index 0000000..5cbfb29
--- /dev/null
+++ b/lib/PublicInbox/HlMod.pm
@@ -0,0 +1,126 @@
+# Copyright (C) 2019 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# I have no idea how stable or safe this is for handling untrusted
+# input, but it seems to have been around for a while, and the
+# highlight(1) executable is supported by gitweb and cgit.
+#
+# I'm also unsure about API stability, but highlight 3.x seems to
+# have been around a few years and ikiwiki (apparently the only
+# user of the SWIG/Perl bindings, at least in Debian) hasn't needed
+# major changes to support it in recent years.
+#
+# Some code stolen from ikiwiki (GPL-2.0+)
+# wrapper for SWIG-generated highlight.pm bindings
+package PublicInbox::HlMod;
+use strict;
+use warnings;
+use highlight; # SWIG-generated stuff
+
+sub _parse_filetypes ($) {
+	my $ft_conf = $_[0]->searchFile('filetypes.conf') or
+				die 'filetypes.conf not found by highlight';
+	open my $fh, '<', $ft_conf or die "failed to open($ft_conf): $!";
+	local $/;
+	my $cfg = <$fh>;
+	my %ext2lang;
+	my @shebang; # order matters
+
+	# Hrm... why isn't this exposed by the highlight API?
+	# highlight >= 3.2 format (bind-style) (from ikiwiki)
+	while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+
+			 Extensions\s*=\s*{([^}]+)}/sgx) {
+		my $lang = $1;
+		foreach my $bit (split(/,/, $2)) {
+			$bit =~ s/.*"(.*)".*/$1/s;
+			$ext2lang{$bit} = $lang;
+		}
+	}
+	# AFAIK, all the regexps used by in filetypes.conf distributed
+	# by highlight work as Perl REs
+	while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+
+			Shebang\s*=\s*\[\s*\[([^}]+)\s*\]\s*\]\s*}\s*,/sgx) {
+		my ($lang, $re) = ($1, $2);
+		eval {
+			my $perl_re = qr/$re/;
+			push @shebang, [ $lang, $perl_re ];
+		};
+		if ($@) {
+			warn "$lang shebang=[[$re]] did not work in Perl: $@";
+		}
+	}
+	(\%ext2lang, \@shebang);
+}
+
+sub new {
+	my ($class) = @_;
+	my $dir = highlight::DataDir->new;
+	$dir->initSearchDirectories('');
+	my ($ext2lang, $shebang) = _parse_filetypes($dir);
+	bless {
+		-dir => $dir,
+		-ext2lang => $ext2lang,
+		-shebang => $shebang,
+	}, $class;
+}
+
+sub _shebang2lang ($$) {
+	my ($self, $str) = @_;
+	my $shebang = $self->{-shebang};
+	foreach my $s (@$shebang) {
+		return $s->[0] if $$str =~ $s->[1];
+	}
+	undef;
+}
+
+sub _path2lang ($$) {
+	my ($self, $path) = @_;
+	my ($ext) = ($path =~ m!([^\\/\.]+)\z!);
+	$ext = lc($ext);
+	$self->{-ext2lang}->{$ext} || $ext;
+}
+
+sub do_hl {
+	my ($self, $str, $path) = @_;
+	my $lang = _path2lang($self, $path) if defined $path;
+	my $dir = $self->{-dir};
+	my $langpath;
+	if (defined $lang) {
+		$langpath = $dir->getLangPath("$lang.lang") or return;
+		$langpath = undef unless -f $langpath;
+	}
+	unless (defined $langpath) {
+		$lang = _shebang2lang($self, $str) or return;
+		$langpath = $dir->getLangPath("$lang.lang") or return;
+		$langpath = undef unless -f $langpath;
+	}
+	return unless defined $langpath;
+
+	my $gen = $self->{$langpath} ||= do {
+		my $g = highlight::CodeGenerator::getInstance($highlight::HTML);
+		$g->setFragmentCode(1); # generate html fragment
+		$g->setHTMLEnclosePreTag(1); # include <pre>
+
+		# whatever theme works
+		my $themepath = $dir->getThemePath('print.theme');
+		$g->initTheme($themepath);
+		$g->loadLanguage($langpath);
+		$g->setEncoding('utf-8');
+		$g;
+	};
+	\($gen->generateString($$str))
+}
+
+# SWIG instances aren't reference-counted, but $self is;
+# so we need to delete all the CodeGenerator instances manually
+# at our own destruction
+sub DESTROY {
+	my ($self) = @_;
+	foreach my $gen (values %$self) {
+		if (ref($gen) eq 'highlight::CodeGenerator') {
+			highlight::CodeGenerator::deleteInstance($gen);
+		}
+	}
+}
+
+1;
diff --git a/lib/PublicInbox/WwwHighlight.pm b/lib/PublicInbox/WwwHighlight.pm
new file mode 100644
index 0000000..3d6ca03
--- /dev/null
+++ b/lib/PublicInbox/WwwHighlight.pm
@@ -0,0 +1,73 @@
+# Copyright (C) 2019 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# Standalone PSGI app to provide syntax highlighting as-a-service
+# via "highlight" Perl module ("libhighlight-perl" in Debian).
+#
+# This allows exposing highlight as a persistent HTTP service for
+# other scripts via HTTP PUT requests.  PATH_INFO will be used
+# as a hint for detecting the language for highlight.
+#
+# The following example using curl(1) will do the right thing
+# regarding the file extension:
+#
+#   curl -HExpect: -T /path/to/file http://example.com/
+#
+# You can also force a file extension by giving a path
+# (in this case, "c") via:
+#
+#   curl -HExpect: -T /path/to/file http://example.com/x.c
+
+package PublicInbox::WwwHighlight;
+use strict;
+use warnings;
+use HTTP::Status qw(status_message);
+use parent qw(PublicInbox::HlMod);
+
+# TODO: support highlight(1) for distros which don't package the
+# SWIG extension.  Also, there may be admins who don't want to
+# have ugly SWIG-generated code in a long-lived Perl process.
+
+sub r ($) {
+	my ($code) = @_;
+	my $msg = status_message($code);
+	my $len = length($msg);
+	[ $code, [qw(Content-Type text/plain Content-Length), $len], [$msg] ]
+}
+
+# another slurp API hogging up all my memory :<
+# This is capped by whatever the PSGI server allows,
+# $ENV{GIT_HTTP_MAX_REQUEST_BUFFER} for PublicInbox::HTTP (10 MB)
+sub read_in_full ($) {
+	my ($env) = @_;
+
+	my $in = $env->{'psgi.input'};
+	my $off = 0;
+	my $buf = '';
+	my $len = $env->{CONTENT_LENGTH} || 8192;
+	while (1) {
+		my $r = $in->read($buf, $len, $off);
+		last unless defined $r;
+		return \$buf if $r == 0;
+		$off += $r;
+	}
+	$env->{'psgi.errors'}->print("input read error: $!\n");
+}
+
+# entry point for PSGI
+sub call {
+	my ($self, $env) = @_;
+	my $req_method = $env->{REQUEST_METHOD};
+
+	return r(405) if $req_method ne 'PUT';
+
+	my $bref = read_in_full($env) or return r(500);
+	$bref = $self->do_hl($bref, $env->{PATH_INFO});
+
+	my $h = [ 'Content-Type', 'text/html; charset=UTF-8' ];
+	push @$h, 'Content-Length', bytes::length($$bref);
+
+	[ 200, $h, [ $$bref ] ]
+}
+
+1;
diff --git a/t/hl_mod.t b/t/hl_mod.t
new file mode 100644
index 0000000..b8b8eb9
--- /dev/null
+++ b/t/hl_mod.t
@@ -0,0 +1,54 @@
+#!/usr/bin/perl -w
+# Copyright (C) 2019 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use warnings;
+use Test::More;
+eval { require highlight } or
+	plan skip_all => 'failed to load highlight.pm';
+use_ok 'PublicInbox::HlMod';
+my $hls = PublicInbox::HlMod->new;
+ok($hls, 'initialized OK');
+is($hls->_shebang2lang(\"#!/usr/bin/perl -w\n"), 'perl', 'perl shebang OK');
+is($hls->{-ext2lang}->{'pm'}, 'perl', '.pm suffix OK');
+is($hls->{-ext2lang}->{'pl'}, 'perl', '.pl suffix OK');
+is($hls->_path2lang('Makefile'), 'make', 'Makefile OK');
+my $str = do { local $/; open(my $fh, __FILE__); <$fh> };
+my $orig = $str;
+
+{
+	my $ref = $hls->do_hl(\$str, 'foo.perl');
+	is(ref($ref), 'SCALAR', 'got a scalar reference back');
+	like($$ref, qr/I can see you!/, 'we can see ourselves in output');
+
+	use PublicInbox::Spawn qw(which);
+	if (eval { require IPC::Run } && which('w3m')) {
+		require File::Temp;
+		my $cmd = [ qw(w3m -T text/html -dump -config /dev/null) ];
+		my ($out, $err) = ('', '');
+		IPC::Run::run($cmd, $ref, \$out, \$err);
+		# expand tabs and normalize whitespace,
+		# w3m doesn't preserve tabs
+		$orig =~ s/\t/        /gs;
+		$out =~ s/\s*\z//sg;
+		$orig =~ s/\s*\z//sg;
+		is($out, $orig, 'w3m output matches');
+	}
+}
+
+my $nr = $ENV{TEST_MEMLEAK};
+if ($nr && -r "/proc/$$/status") {
+	my $fh;
+	open $fh, '<', "/proc/$$/status";
+	diag "starting at memtest at ".join('', grep(/VmRSS:/, <$fh>));
+	PublicInbox::HlMod->new->do_hl(\$orig) for (1..$nr);
+	open $fh, '<', "/proc/$$/status";
+	diag "creating $nr instances: ".join('', grep(/VmRSS:/, <$fh>));
+	my $hls = PublicInbox::HlMod->new;
+	$hls->do_hl(\$orig) for (1..$nr);
+	$hls = undef;
+	open $fh, '<', "/proc/$$/status";
+	diag "reused instance $nr times: ".join('', grep(/VmRSS:/, <$fh>));
+}
+
+done_testing;
-- 
EW


  parent reply index

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-01-21 20:52 [PATCH 00/37] viewvcs: diff highlighting and more Eric Wong
2019-01-21 20:52 ` [PATCH 01/37] view: disable bold in topic display Eric Wong
2019-01-21 20:52 ` [PATCH 02/37] hval: force monospace for <form> elements, too Eric Wong
2019-01-21 20:52 ` [PATCH 03/37] t/perf-msgview: add test to check msg_html performance Eric Wong
2019-01-21 20:52 ` [PATCH 04/37] solver: initial Perl implementation Eric Wong
2019-01-21 20:52 ` [PATCH 05/37] git: support multiple URL endpoints Eric Wong
2019-01-21 20:52 ` [PATCH 06/37] git: add git_quote Eric Wong
2019-01-21 20:52 ` [PATCH 07/37] git: check saves error on disambiguation Eric Wong
2019-01-21 20:52 ` [PATCH 08/37] solver: various bugfixes and cleanups Eric Wong
2019-01-21 20:52 ` [PATCH 09/37] view: wire up diff and vcs viewers with solver Eric Wong
2019-01-21 20:52 ` [PATCH 10/37] git: disable abbreviations with cat-file hints Eric Wong
2019-01-21 20:52 ` [PATCH 11/37] solver: operate directly on git index Eric Wong
2019-01-21 20:52 ` [PATCH 12/37] view: enable naming hints for raw blob downloads Eric Wong
2019-01-21 20:52 ` [PATCH 13/37] git: support 'ambiguous' result from --batch-check Eric Wong
2019-01-21 20:52 ` [PATCH 14/37] solver: more verbose blob resolution Eric Wong
2019-01-21 20:52 ` [PATCH 15/37] solver: break up patch application steps Eric Wong
2019-01-21 20:52 ` [PATCH 16/37] solver: switch patch application to use a callback Eric Wong
2019-01-21 20:52 ` [PATCH 17/37] solver: simplify control flow for initial loop Eric Wong
2019-01-21 20:52 ` [PATCH 18/37] solver: break @todo loop into a callback Eric Wong
2019-01-21 20:52 ` [PATCH 19/37] solver: note the synchronous nature of index preparation Eric Wong
2019-01-21 20:52 ` [PATCH 20/37] solver: add a TODO note about making this fully evented Eric Wong
2019-01-21 20:52 ` [PATCH 21/37] view: enforce trailing slash for /$INBOX/$OID/s/ endpoints Eric Wong
2019-01-21 20:52 ` [PATCH 22/37] solver: restore diagnostics and deal with CRLF Eric Wong
2019-01-21 20:52 ` [PATCH 23/37] www: admin-configurable CSS via "publicinbox.css" Eric Wong
2019-01-21 20:52 ` [PATCH 24/37] $INBOX/_/text/color/ and sample user-side CSS Eric Wong
2019-01-21 20:52 ` [PATCH 25/37] viewdiff: support diff-highlighting w/o coderepo Eric Wong
2019-01-21 20:52 ` [PATCH 26/37] viewdiff: cleanup state transitions a bit Eric Wong
2019-01-21 20:52 ` [PATCH 27/37] viewdiff: quote attributes for Atom feed Eric Wong
2019-01-21 20:52 ` [PATCH 28/37] t/check-www-inbox: use xmlstarlet to validate Atom if available Eric Wong
2019-01-21 20:52 ` [PATCH 29/37] viewdiff: do not link to 0{7,40} blobs (again) Eric Wong
2019-01-21 20:52 ` [PATCH 30/37] viewvcs: disable white-space prewrap in blob view Eric Wong
2019-01-21 20:52 ` [PATCH 31/37] solver: force quoted-printable bodies to LF Eric Wong
2019-01-21 20:52 ` [PATCH 32/37] solver: remove extra "^index $OID..$OID" line Eric Wong
2019-01-21 20:52 ` [PATCH 33/37] config: each_inbox iteration preserves config order Eric Wong
2019-01-21 20:52 ` [PATCH 34/37] t/check-www-inbox: warn on missing Content-Type Eric Wong
2019-01-21 20:52 ` Eric Wong [this message]
2019-01-21 20:52 ` [PATCH 36/37] hval: split out escape sequences to a separate table Eric Wong
2019-01-21 20:52 ` [PATCH 37/37] t/check-www-inbox: trap SIGINT for File::Temp destruction Eric Wong

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190121205253.10455-36-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.org/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/

AGPL code for this site: git clone https://public-inbox.org/ public-inbox