# Copyright (C) 2019-2020 all contributors # License: AGPL-3.0+ # I have no idea how stable or safe this is for handling untrusted # input, but it seems to have been around for a while, and the # highlight(1) executable is supported by gitweb and cgit. # # I'm also unsure about API stability, but highlight 3.x seems to # have been around a few years and ikiwiki (apparently the only # user of the SWIG/Perl bindings, at least in Debian) hasn't needed # major changes to support it in recent years. # # Some code stolen from ikiwiki (GPL-2.0+) # wrapper for SWIG-generated highlight.pm bindings package PublicInbox::HlMod; use strict; use warnings; use highlight; # SWIG-generated stuff use PublicInbox::Hval qw(src_escape ascii_html); my $hl; sub _parse_filetypes ($) { my $ft_conf = $_[0]->getFiletypesConfPath('filetypes') or die 'filetypes.conf not found by highlight'; open my $fh, '<', $ft_conf or die "failed to open($ft_conf): $!"; local $/; my $cfg = <$fh>; my %ext2lang; my @shebang; # order matters # Hrm... why isn't this exposed by the highlight API? # highlight >= 3.2 format (bind-style) (from ikiwiki) while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+ Extensions\s*=\s*{([^}]+)}/sgx) { my $lang = $1; foreach my $bit (split(/,/, $2)) { $bit =~ s/.*"(.*)".*/$1/s; $ext2lang{$bit} = $lang; } } # AFAIK, all the regexps used by in filetypes.conf distributed # by highlight work as Perl REs while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+ Shebang\s*=\s*\[\s*\[([^}]+)\s*\]\s*\]\s*}\s*,/sgx) { my ($lang, $re) = ($1, $2); eval { my $perl_re = qr/$re/; push @shebang, [ $lang, $perl_re ]; }; if ($@) { warn "$lang shebang=[[$re]] did not work in Perl: $@"; } } (\%ext2lang, \@shebang); } # We only need one instance, so we don't need to do # highlight::CodeGenerator::deleteInstance sub new { my ($class) = @_; $hl ||= do { my $dir = highlight::DataDir->new; $dir->initSearchDirectories(''); my ($ext2lang, $shebang) = _parse_filetypes($dir); bless { -dir => $dir, -ext2lang => $ext2lang, -shebang => $shebang, }, $class; }; } sub _shebang2lang ($$) { my ($self, $str) = @_; my $shebang = $self->{-shebang}; foreach my $s (@$shebang) { return $s->[0] if $$str =~ $s->[1]; } undef; } sub _path2lang ($$) { my ($self, $path) = @_; my ($ext) = ($path =~ m!([^\\/\.]+)\z!); $ext = lc($ext); $self->{-ext2lang}->{$ext} || $ext; } sub do_hl { my ($self, $str, $path) = @_; my $lang = _path2lang($self, $path) if defined $path; do_hl_lang($self, $str, $lang); } sub do_hl_lang { my ($self, $str, $lang) = @_; my $dir = $self->{-dir}; my $langpath; if (defined $lang) { $langpath = $dir->getLangPath("$lang.lang") or return; $lang = undef unless -f $langpath } unless (defined $lang) { $lang = _shebang2lang($self, $str) or return; $langpath = $dir->getLangPath("$lang.lang") or return; return unless -f $langpath } my $gen = $self->{$langpath} ||= do { my $g = highlight::CodeGenerator::getInstance($highlight::HTML); $g->setFragmentCode(1); # generate html fragment # whatever theme works my $themepath = $dir->getThemePath('print.theme'); $g->initTheme($themepath); $g->loadLanguage($langpath); $g->setEncoding('utf-8'); $g; }; # we assume $$str is valid UTF-8, but the SWIG binding doesn't # know that, so ensure it's marked as UTF-8 even if it isnt... my $out = $gen->generateString($$str); utf8::decode($out); src_escape($out); \$out; } # Highlight text, but support Markdown "```$LANG" notation # while preserving WYSIWYG of plain-text documentation. # This is NOT to be enabled by default or encouraged for parsing # emails, since it is NOT stable and can lead to standards # proliferation of email. sub do_hl_text { my ($self, $str) = @_; $$str = join('', map { if (/\A(``` ?)(\w+)\s*?\n(.+)(^```\s*)\z/sm) { my ($pfx, $lang, $code, $post) = ($1, $2, $3, $4); my $hl = do_hl_lang($self, \$code, $lang) || \$code; $pfx . $lang . "\n" . $$hl . $post; } else { ascii_html($_); } } split(/(^``` ?\w+\s*?\n.+?^```\s*$)/sm, $$str)); } 1;