user/dev discussion of public-inbox itself
 help / color / Atom feed
36e311060dcf52bb41eed5f6a2cff38bbb2de006 blob 4168 bytes (raw)

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
 
# Copyright (C) 2019 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>

# I have no idea how stable or safe this is for handling untrusted
# input, but it seems to have been around for a while, and the
# highlight(1) executable is supported by gitweb and cgit.
#
# I'm also unsure about API stability, but highlight 3.x seems to
# have been around a few years and ikiwiki (apparently the only
# user of the SWIG/Perl bindings, at least in Debian) hasn't needed
# major changes to support it in recent years.
#
# Some code stolen from ikiwiki (GPL-2.0+)
# wrapper for SWIG-generated highlight.pm bindings
package PublicInbox::HlMod;
use strict;
use warnings;
use highlight; # SWIG-generated stuff
use PublicInbox::Hval qw(src_escape ascii_html);
my $hl;

sub _parse_filetypes ($) {
	my $ft_conf = $_[0]->searchFile('filetypes.conf') or
				die 'filetypes.conf not found by highlight';
	open my $fh, '<', $ft_conf or die "failed to open($ft_conf): $!";
	local $/;
	my $cfg = <$fh>;
	my %ext2lang;
	my @shebang; # order matters

	# Hrm... why isn't this exposed by the highlight API?
	# highlight >= 3.2 format (bind-style) (from ikiwiki)
	while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+
			 Extensions\s*=\s*{([^}]+)}/sgx) {
		my $lang = $1;
		foreach my $bit (split(/,/, $2)) {
			$bit =~ s/.*"(.*)".*/$1/s;
			$ext2lang{$bit} = $lang;
		}
	}
	# AFAIK, all the regexps used by in filetypes.conf distributed
	# by highlight work as Perl REs
	while ($cfg =~ /\bLang\s*=\s*\"([^"]+)\"[,\s]+
			Shebang\s*=\s*\[\s*\[([^}]+)\s*\]\s*\]\s*}\s*,/sgx) {
		my ($lang, $re) = ($1, $2);
		eval {
			my $perl_re = qr/$re/;
			push @shebang, [ $lang, $perl_re ];
		};
		if ($@) {
			warn "$lang shebang=[[$re]] did not work in Perl: $@";
		}
	}
	(\%ext2lang, \@shebang);
}

# We only need one instance, so we don't need to do
# highlight::CodeGenerator::deleteInstance
sub new {
	my ($class) = @_;
	$hl ||= do {
		my $dir = highlight::DataDir->new;
		$dir->initSearchDirectories('');
		my ($ext2lang, $shebang) = _parse_filetypes($dir);
		bless {
			-dir => $dir,
			-ext2lang => $ext2lang,
			-shebang => $shebang,
		}, $class;
	};
}

sub _shebang2lang ($$) {
	my ($self, $str) = @_;
	my $shebang = $self->{-shebang};
	foreach my $s (@$shebang) {
		return $s->[0] if $$str =~ $s->[1];
	}
	undef;
}

sub _path2lang ($$) {
	my ($self, $path) = @_;
	my ($ext) = ($path =~ m!([^\\/\.]+)\z!);
	$ext = lc($ext);
	$self->{-ext2lang}->{$ext} || $ext;
}

sub do_hl {
	my ($self, $str, $path) = @_;
	my $lang = _path2lang($self, $path) if defined $path;
	do_hl_lang($self, $str, $lang);
}

sub do_hl_lang {
	my ($self, $str, $lang) = @_;

	my $dir = $self->{-dir};
	my $langpath;

	if (defined $lang) {
		$langpath = $dir->getLangPath("$lang.lang") or return;
		$lang = undef unless -f $langpath
	}
	unless (defined $lang) {
		$lang = _shebang2lang($self, $str) or return;
		$langpath = $dir->getLangPath("$lang.lang") or return;
		return unless -f $langpath
	}
	my $gen = $self->{$langpath} ||= do {
		my $g = highlight::CodeGenerator::getInstance($highlight::HTML);
		$g->setFragmentCode(1); # generate html fragment

		# whatever theme works
		my $themepath = $dir->getThemePath('print.theme');
		$g->initTheme($themepath);
		$g->loadLanguage($langpath);
		$g->setEncoding('utf-8');
		$g;
	};

	# we assume $$str is valid UTF-8, but the SWIG binding doesn't
	# know that, so ensure it's marked as UTF-8 even if it isnt...
	my $out = $gen->generateString($$str);
	utf8::decode($out);
	src_escape($out);
	\$out;
}

# Highlight text, but support Markdown "```$LANG" notation
# while preserving WYSIWYG of plain-text documentation.
# This is NOT to be enabled by default or encouraged for parsing
# emails, since it is NOT stable and can lead to standards
# proliferation of email.
sub do_hl_text {
	my ($self, $str) = @_;

	$$str = join('', map {
		if (/\A(``` ?)(\w+)\s*?\n(.+)(^```\s*)\z/sm) {
			my ($pfx, $lang, $code, $post) = ($1, $2, $3, $4);
			my $hl = do_hl_lang($self, \$code, $lang) || \$code;
			$pfx . $lang . "\n" . $$hl . $post;
		} else {
			ascii_html($_);
		}
	} split(/(^``` ?\w+\s*?\n.+?^```\s*$)/sm, $$str));
}

1;
debug log:

solving 36e3110 ...
found 36e3110 in https://80x24.org/public-inbox.git

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.org/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/

AGPL code for this site: git clone https://public-inbox.org/ public-inbox