user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
blob 36e311060dcf52bb41eed5f6a2cff38bbb2de006 4168 bytes (raw)
name: lib/PublicInbox/HlMod.pm 	 # note: path name is non-authoritative(*)

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
 
# Copyright (C) 2019 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>

# I have no idea how stable or safe this is for handling untrusted
# input, but it seems to have been around for a while, and the
# highlight(1) executable is supported by gitweb and cgit.
#
# I'm also unsure about API stability, but highlight 3.x seems to
# have been around a few years and ikiwiki (apparently the only
# user of the SWIG/Perl bindings, at least in Debian) hasn't needed
# major changes to support it in recent years.
#
# Some code stolen from ikiwiki (GPL-2.0+)
# wrapper for SWIG-generated highlight.pm bindings
package PublicInbox::HlMod;
use strict;
use warnings;
use highlight; # SWIG-generated stuff
use PublicInbox::Hval qw(src_escape ascii_html);
my $hl;

sub _parse_filetypes ($) {
	my $ft_conf = $_[0]->searchFile('filetypes.conf') or
				die 'filetypes.conf not found by highlight';
	open my $fh, '<', $ft_conf or die "failed to open($ft_conf): $!";
	local $/;
	my $cfg = <$fh>;
	my %ext2lang;
	my @shebang; # order matters

	# Hrm... why isn't this exposed by the highlight API?
	# highlight >= 3.2 format (bind-style) (from ikiwiki)
	while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+
			 Extensions\s*=\s*{([^}]+)}/sgx) {
		my $lang = $1;
		foreach my $bit (split(/,/, $2)) {
			$bit =~ s/.*"(.*)".*/$1/s;
			$ext2lang{$bit} = $lang;
		}
	}
	# AFAIK, all the regexps used by in filetypes.conf distributed
	# by highlight work as Perl REs
	while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+
			Shebang\s*=\s*\[\s*\[([^}]+)\s*\]\s*\]\s*}\s*,/sgx) {
		my ($lang, $re) = ($1, $2);
		eval {
			my $perl_re = qr/$re/;
			push @shebang, [ $lang, $perl_re ];
		};
		if ($@) {
			warn "$lang shebang=[[$re]] did not work in Perl: $@";
		}
	}
	(\%ext2lang, \@shebang);
}

# We only need one instance, so we don't need to do
# highlight::CodeGenerator::deleteInstance
sub new {
	my ($class) = @_;
	$hl ||= do {
		my $dir = highlight::DataDir->new;
		$dir->initSearchDirectories('');
		my ($ext2lang, $shebang) = _parse_filetypes($dir);
		bless {
			-dir => $dir,
			-ext2lang => $ext2lang,
			-shebang => $shebang,
		}, $class;
	};
}

sub _shebang2lang ($$) {
	my ($self, $str) = @_;
	my $shebang = $self->{-shebang};
	foreach my $s (@$shebang) {
		return $s->[0] if $$str =~ $s->[1];
	}
	undef;
}

sub _path2lang ($$) {
	my ($self, $path) = @_;
	my ($ext) = ($path =~ m!([^\\/\.]+)\z!);
	$ext = lc($ext);
	$self->{-ext2lang}->{$ext} || $ext;
}

sub do_hl {
	my ($self, $str, $path) = @_;
	my $lang = _path2lang($self, $path) if defined $path;
	do_hl_lang($self, $str, $lang);
}

sub do_hl_lang {
	my ($self, $str, $lang) = @_;

	my $dir = $self->{-dir};
	my $langpath;

	if (defined $lang) {
		$langpath = $dir->getLangPath("$lang.lang") or return;
		$lang = undef unless -f $langpath
	}
	unless (defined $lang) {
		$lang = _shebang2lang($self, $str) or return;
		$langpath = $dir->getLangPath("$lang.lang") or return;
		return unless -f $langpath
	}
	my $gen = $self->{$langpath} ||= do {
		my $g = highlight::CodeGenerator::getInstance($highlight::HTML);
		$g->setFragmentCode(1); # generate html fragment

		# whatever theme works
		my $themepath = $dir->getThemePath('print.theme');
		$g->initTheme($themepath);
		$g->loadLanguage($langpath);
		$g->setEncoding('utf-8');
		$g;
	};

	# we assume $$str is valid UTF-8, but the SWIG binding doesn't
	# know that, so ensure it's marked as UTF-8 even if it isnt...
	my $out = $gen->generateString($$str);
	utf8::decode($out);
	src_escape($out);
	\$out;
}

# Highlight text, but support Markdown "```$LANG" notation
# while preserving WYSIWYG of plain-text documentation.
# This is NOT to be enabled by default or encouraged for parsing
# emails, since it is NOT stable and can lead to standards
# proliferation of email.
sub do_hl_text {
	my ($self, $str) = @_;

	$$str = join('', map {
		if (/\A(``` ?)(\w+)\s*?\n(.+)(^```\s*)\z/sm) {
			my ($pfx, $lang, $code, $post) = ($1, $2, $3, $4);
			my $hl = do_hl_lang($self, \$code, $lang) || \$code;
			$pfx . $lang . "\n" . $$hl . $post;
		} else {
			ascii_html($_);
		}
	} split(/(^``` ?\w+\s*?\n.+?^```\s*$)/sm, $$str));
}

1;

debug log:

solving 36e3110 ...
found 36e3110 in https://80x24.org/public-inbox.git

(*) Git path names are given by the tree(s) the blob belongs to.
    Blobs themselves have no identifier aside from the hash of its contents.^

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).