From 970da289a8ea3532538846905c7c01778066e1a9 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 17 Apr 2014 20:10:38 +0000 Subject: HTML: various encoding fixups --- lib/PublicInbox/Feed.pm | 23 +++++++++++++------ lib/PublicInbox/View.pm | 61 ++++++++++++++++++++++++++++++++++--------------- 2 files changed, 58 insertions(+), 26 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index 0a652f6e..33406522 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -7,7 +7,7 @@ use XML::Atom::SimpleFeed; use Email::MIME; use Email::Address; use URI::Escape qw/uri_escape/; -use Encode qw/encode decode/; +use Encode qw/find_encoding/; use Encode::MIME::Header; use CGI qw(escapeHTML); use POSIX qw(strftime); @@ -15,6 +15,9 @@ use Date::Parse qw(strptime); use constant DATEFMT => '%Y-%m-%dT%H:%M:%SZ'; use PublicInbox::View; use Mail::Thread; +my $enc_utf8 = find_encoding('utf8'); +my $enc_ascii = find_encoding('us-ascii'); +my $enc_mime = find_encoding('MIME-Header'); # FIXME: workaround https://rt.cpan.org/Public/Bug/Display.html?id=22817 @@ -52,7 +55,7 @@ sub generate_html_index { my $top = $args->{top}; # bool local $ENV{GIT_DIR} = $args->{git_dir}; my $feed_opts = get_feedopts($args); - my $title = escapeHTML($feed_opts->{description} || ""); + my $title = xs_html($feed_opts->{description} || ""); my @messages; each_recent_blob($max, sub { my $str = `git cat-file blob $_[0]`; @@ -146,8 +149,9 @@ sub utf8_header { my ($simple, $name) = @_; my $val = $simple->header($name); return "" unless defined $val; - $val =~ tr/\t\r\n / /s; - encode('utf8', decode('MIME-Header', $val)); + $val =~ tr/\t\n / /s; + $val =~ tr/\r//d; + $enc_utf8->encode($enc_mime->decode($val)); } sub feed_date { @@ -220,9 +224,9 @@ sub dump_html_line { my @from = Email::Address->parse($from); $from = $from[0]->name; (defined($from) && length($from)) or $from = $from[0]->address; - $from = escapeHTML($from); - $subj = escapeHTML($subj); - $args->[0] .= "`-> $subj $from\n"; + $from = xs_html($from); + $subj = xs_html($subj); + $args->[0] .= "$subj $from\n"; } else { $args->[0] .= "[ Message not available ]\n"; } @@ -230,4 +234,9 @@ sub dump_html_line { dump_html_line($self->next, $level, $args) if $self->next; } +sub xs_html { + $enc_ascii->encode(escapeHTML($enc_utf8->decode($_[0])), + Encode::HTMLCREF); +} + 1; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 2efbf1b5..84c7393b 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -5,8 +5,13 @@ use strict; use warnings; use URI::Escape qw/uri_escape/; use CGI qw/escapeHTML/; -use Encode qw/decode encode/; +use Encode qw/find_encoding/; use Encode::MIME::Header; +use Email::MIME::ContentType qw/parse_content_type/; + +my $enc_utf8 = find_encoding('utf8'); +my $enc_ascii = find_encoding('us-ascii'); +my $enc_mime = find_encoding('MIME-Header'); # public functions: sub as_html { @@ -26,28 +31,42 @@ sub as_feed_entry { # only private functions below. +sub enc_for { + my ($ct) = @_; + defined $ct or return $enc_utf8; + my $ct_parsed = parse_content_type($ct); + if ($ct_parsed) { + if (my $charset = $ct_parsed->{attributes}->{charset}) { + my $enc = find_encoding($charset); + return $enc if $enc; + } + } + $enc_utf8; +} + sub multipart_text_as_html { my ($mime, $full_pfx) = @_; my $rv = ""; my $part_nr = 0; + my $enc_msg = enc_for($mime->header("Content-Type")); # scan through all parts, looking for displayable text $mime->walk_parts(sub { my ($part) = @_; return if $part->subparts; # walk_parts already recurses - - my $fn = $part->filename; + my $enc = enc_for($part->content_type) || $enc_msg || $enc_utf8; if ($part_nr > 0) { + my $fn = $part->filename; defined($fn) or $fn = "part #" . ($part_nr + 1); - $rv .= add_filename_line($fn); + $rv .= add_filename_line($enc->decode($fn)); } if (defined $full_pfx) { - $rv .= add_text_body_short($part, $part_nr, + $rv .= add_text_body_short($enc, $part, $part_nr, $full_pfx); } else { - $rv .= add_text_body_full($part, $part_nr); + $rv .= add_text_body_full($enc, $part, $part_nr); } $rv .= "\n" unless $rv =~ /\n\z/s; ++$part_nr; @@ -62,13 +81,13 @@ sub add_filename_line { $len -= length($fn); $pad x= ($len/2) if ($len > 0); - "$pad " . escapeHTML($fn) . " $pad\n"; + "$pad " . ascii_html($fn) . " $pad\n"; } sub add_text_body_short { - my ($part, $part_nr, $full_pfx) = @_; + my ($enc, $part, $part_nr, $full_pfx) = @_; my $n = 0; - my $s = escapeHTML($part->body); + my $s = ascii_html($enc->decode($part->body)); $s =~ s!^((?:(?:>[^\n]+)\n)+)! my $cur = $1; my @lines = split(/\n/, $cur); @@ -93,9 +112,9 @@ sub add_text_body_short { } sub add_text_body_full { - my ($part, $part_nr) = @_; + my ($enc, $part, $part_nr) = @_; my $n = 0; - my $s = escapeHTML($part->body); + my $s = ascii_html($enc->decode($part->body)); $s =~ s!^((?:(?:>[^\n]+)\n)+)! my $cur = $1; my @lines = split(/\n/, $cur); @@ -110,14 +129,19 @@ sub add_text_body_full { sub trim_message_id { my ($mid) = @_; - $mid =~ s/\A\z//; - my $html = escapeHTML($mid); - my $href = escapeHTML(uri_escape($mid)); + $mid = $enc_mime->decode($mid); + $mid =~ s/\A\s*\s*\z//; + my $html = ascii_html($mid); + my $href = ascii_html(uri_escape($mid)); ($html, $href); } +sub ascii_html { + $enc_ascii->encode(escapeHTML($_[0]), Encode::HTMLCREF); +} + sub headers_to_html_header { my ($simple) = @_; @@ -126,10 +150,9 @@ sub headers_to_html_header { foreach my $h (qw(From To Cc Subject Date)) { my $v = $simple->header($h); defined $v or next; - $v = decode("MIME-Header", $v); - $v = encode("utf8", $v); - $v = escapeHTML($v); - $v =~ tr/\n/ /; + $v =~ tr/\n/ /s; + $v =~ tr/\r//d; + $v = ascii_html($enc_mime->decode($v)); $rv .= "$h: $v\n"; if ($h eq "From" || $h eq "Subject") { -- cgit v1.2.3-24-ge0c7