user/dev discussion of public-inbox itself
 help / color / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 7/8] view: escape ampersand in Message-IDs
Date: Sat, 15 Feb 2020 09:46:39 +0000
Message-ID: <20200215094640.19425-8-e@yhbt.net> (raw)
In-Reply-To: <20200215094640.19425-1-e@yhbt.net>

We need to escape ampersands (and some other characters for href
attributes), so introduce a `mid_href' sub to do just that.

'<', '>' and '"'  were always escaped, so there's no risk of tag
or attribute injection, but creative Message-IDs could cause
confusion for some parsers and generate invalid URLs.

Start getting rid of the bloated, over-engineered OO Hval API
while we're at it, I only noticed this bug because I started
killing off Hval->new* callers.
---
 lib/PublicInbox/ExtMsg.pm        | 19 +++++-------
 lib/PublicInbox/Hval.pm          |  7 ++---
 lib/PublicInbox/Linkify.pm       |  9 +++---
 lib/PublicInbox/Reply.pm         |  6 ++--
 lib/PublicInbox/SearchView.pm    |  4 +--
 lib/PublicInbox/View.pm          | 51 ++++++++++++++------------------
 lib/PublicInbox/WwwAtomStream.pm |  9 +++---
 t/psgi_bad_mids.t                |  9 ++++--
 8 files changed, 53 insertions(+), 61 deletions(-)

diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm
index c48c2db4..44884ad2 100644
--- a/lib/PublicInbox/ExtMsg.pm
+++ b/lib/PublicInbox/ExtMsg.pm
@@ -8,7 +8,7 @@
 package PublicInbox::ExtMsg;
 use strict;
 use warnings;
-use PublicInbox::Hval qw(ascii_html prurl);
+use PublicInbox::Hval qw(ascii_html prurl mid_href);
 use PublicInbox::WwwStream;
 our $MIN_PARTIAL_LEN = 16;
 
@@ -129,9 +129,8 @@ sub ext_msg {
 	}
 
 	my $code = 404;
-	my $h = PublicInbox::Hval->new_msgid($mid);
-	my $href = $h->{href};
-	my $html = $h->as_html;
+	my $href = mid_href($mid);
+	my $html = ascii_html($mid);
 	my $title = "&lt;$html&gt; not found";
 	my $s = "<pre>Message-ID &lt;$html&gt;\nnot found\n";
 	if ($n_partial) {
@@ -145,10 +144,9 @@ sub ext_msg {
 			my $env = $ctx->{env} if $ibx->{name} eq $cur_name;
 			my $u = $ibx->base_url($env) or next;
 			foreach my $m (@$res) {
-				my $p = PublicInbox::Hval->new_msgid($m);
-				my $r = $p->{href};
-				my $t = $p->as_html;
-				$s .= qq{<a\nhref="$u$r/">$u$t/</a>\n};
+				my $href = mid_href($m);
+				my $html = ascii_html($m);
+				$s .= qq{<a\nhref="$u$href/">$u$html/</a>\n};
 			}
 		}
 	}
@@ -183,9 +181,8 @@ sub ext_urls {
 
 sub exact {
 	my ($ctx, $found, $mid) = @_;
-	my $h = PublicInbox::Hval->new_msgid($mid);
-	my $href = $h->{href};
-	my $html = $h->as_html;
+	my $href = mid_href($mid);
+	my $html = ascii_html($mid);
 	my $title = "&lt;$html&gt; found in ";
 	my $end = @$found == 1 ? 'another inbox' : 'other inboxes';
 	$ctx->{-title_html} = $title . $end;
diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm
index 2e883f81..40c97da4 100644
--- a/lib/PublicInbox/Hval.pm
+++ b/lib/PublicInbox/Hval.pm
@@ -10,7 +10,7 @@ use Encode qw(find_encoding);
 use PublicInbox::MID qw/mid_clean mid_escape/;
 use base qw/Exporter/;
 our @EXPORT_OK = qw/ascii_html obfuscate_addrs to_filename src_escape
-		to_attr prurl/;
+		to_attr prurl mid_href/;
 my $enc_ascii = find_encoding('us-ascii');
 
 # safe-ish acceptable filename pattern for portability
@@ -27,10 +27,7 @@ sub new {
 	}, $class;
 }
 
-sub new_msgid {
-	my ($class, $msgid) = @_;
-	$class->new($msgid, mid_escape($msgid));
-}
+sub mid_href { ascii_html(mid_escape($_[0])) }
 
 # some of these overrides are standard C escapes so they're
 # easy-to-understand when rendered.
diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm
index d176a7cc..2bd8f64a 100644
--- a/lib/PublicInbox/Linkify.pm
+++ b/lib/PublicInbox/Linkify.pm
@@ -13,7 +13,7 @@ package PublicInbox::Linkify;
 use strict;
 use warnings;
 use Digest::SHA qw/sha1_hex/;
-use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::Hval qw(ascii_html mid_href);
 
 my $SALT = rand;
 my $LINK_RE = qr{([\('!])?\b((?:ftps?|https?|nntps?|gopher)://
@@ -94,10 +94,9 @@ sub linkify_2 {
 sub linkify_mids {
 	my ($self, $pfx, $str, $raw) = @_;
 	$$str =~ s!<([^>]+)>!
-		my $msgid = PublicInbox::Hval->new_msgid($1);
-		my $html = $msgid->as_html;
-		my $href = $msgid->{href};
-		$href = ascii_html($href); # for IDN
+		my $mid = $1;
+		my $html = ascii_html($mid);
+		my $href = mid_href($mid);
 
 		# salt this, as this could be exploited to show
 		# links in the HTML which don't show up in the raw mail.
diff --git a/lib/PublicInbox/Reply.pm b/lib/PublicInbox/Reply.pm
index edeb1ac2..5058ff8c 100644
--- a/lib/PublicInbox/Reply.pm
+++ b/lib/PublicInbox/Reply.pm
@@ -6,9 +6,9 @@ package PublicInbox::Reply;
 use strict;
 use warnings;
 use URI::Escape qw/uri_escape_utf8/;
-use PublicInbox::Hval qw/ascii_html obfuscate_addrs/;
+use PublicInbox::Hval qw(ascii_html obfuscate_addrs mid_href);
 use PublicInbox::Address;
-use PublicInbox::MID qw/mid_clean mid_escape/;
+use PublicInbox::MID qw(mid_clean);
 
 sub squote_maybe ($) {
 	my ($val) = @_;
@@ -76,7 +76,7 @@ sub mailto_arg_link {
 	$subj = "Re: $subj" unless $subj =~ /\bRe:/i;
 	my $mid = $hdr->header_raw('Message-ID');
 	push @arg, '--in-reply-to='.squote_maybe(mid_clean($mid));
-	my $irt = mid_escape($mid);
+	my $irt = mid_href($mid);
 	delete $cc->{$to};
 	if ($obfs) {
 		my $arg_to = $to;
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 7e508bb7..9b67b045 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -7,7 +7,7 @@ use strict;
 use warnings;
 use URI::Escape qw(uri_unescape uri_escape);
 use PublicInbox::SearchMsg;
-use PublicInbox::Hval qw/ascii_html obfuscate_addrs/;
+use PublicInbox::Hval qw(ascii_html obfuscate_addrs mid_href);
 use PublicInbox::View;
 use PublicInbox::WwwAtomStream;
 use PublicInbox::SearchThread;
@@ -115,7 +115,7 @@ sub mset_summary {
 			obfuscate_addrs($obfs_ibx, $f);
 		}
 		my $date = PublicInbox::View::fmt_ts($smsg->{ds});
-		my $mid = PublicInbox::Hval->new_msgid($smsg->{mid})->{href};
+		my $mid = mid_href($smsg->{mid});
 		$s = '(no subject)' if $s eq '';
 		$$res .= qq{$rank. <b><a\nhref="$mid/">}.
 			$s . "</a></b>\n";
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index d4bfa62d..14b7d81d 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -8,9 +8,9 @@ use strict;
 use warnings;
 use bytes (); # only for bytes::length
 use PublicInbox::MsgTime qw(msg_datestamp);
-use PublicInbox::Hval qw(ascii_html obfuscate_addrs prurl);
+use PublicInbox::Hval qw(ascii_html obfuscate_addrs prurl mid_href);
 use PublicInbox::Linkify;
-use PublicInbox::MID qw/id_compress mid_escape mids mids_for_index references/;
+use PublicInbox::MID qw/id_compress mids mids_for_index references/;
 use PublicInbox::MsgIter;
 use PublicInbox::Address;
 use PublicInbox::WwwStream;
@@ -29,7 +29,7 @@ sub msg_page_i {
 	my $more = $ctx->{more};
 	if ($nr == 1) {
 		# $more cannot be true w/o $smsg being defined:
-		$ctx->{mhref} = $more ? '../'.mid_escape($ctx->{smsg}->mid).'/'
+		$ctx->{mhref} = $more ? '../'.mid_href($ctx->{smsg}->{mid}).'/'
 				      : '';
 		multipart_text_as_html(delete $ctx->{mime}, $ctx);
 		${delete $ctx->{obuf}} .= '</pre><hr>';
@@ -84,7 +84,7 @@ sub msg_page_more {
 	my $next = $ibx->over->next_by_mid($ctx->{mid}, \$id, \$prev);
 	$ctx->{more} = $next ? [ $id, $prev, $next ] : undef;
 	return '' unless $smsg;
-	$ctx->{mhref} = '../' . mid_escape($smsg->{mid}) . '/';
+	$ctx->{mhref} = '../' . mid_href($smsg->{mid}) . '/';
 	my $mime = delete $smsg->{mime};
 	_msg_page_prepare_obuf($mime->header_obj, $ctx, $nr);
 	multipart_text_as_html($mime, $ctx);
@@ -220,7 +220,7 @@ sub index_entry {
 	obfuscate_addrs($obfs_ibx, $from) if $obfs_ibx;
 	$rv .= "From: $from @ ".fmt_ts($ds)." UTC";
 	my $upfx = $ctx->{-upfx};
-	my $mhref = $upfx . mid_escape($mid_raw) . '/';
+	my $mhref = $upfx . mid_href($mid_raw) . '/';
 	$rv .= qq{ (<a\nhref="$mhref">permalink</a> / };
 	$rv .= qq{<a\nhref="${mhref}raw">raw</a>)\n};
 	my $to = fold_addresses(_hdr_names_html($hdr, 'To'));
@@ -244,9 +244,8 @@ sub index_entry {
 
 	my $mapping = $ctx->{mapping};
 	if (!$mapping && (defined($irt) || defined($irt = in_reply_to($hdr)))) {
-		my $mirt = PublicInbox::Hval->new_msgid($irt);
-		my $href = $upfx . $mirt->{href}. '/';
-		my $html = $mirt->as_html;
+		my $href = $upfx . mid_href($irt) . '/';
+		my $html = ascii_html($irt);
 		$rv .= qq(In-Reply-To: &lt;<a\nhref="$href">$html</a>&gt;\n)
 	}
 	$rv .= "\n";
@@ -672,8 +671,7 @@ sub _msg_page_prepare_obuf {
 	}
 	$ctx->{-title_html} = join(' - ', @title);
 	if (scalar(@$mids) == 1) { # common case
-		my $mid = PublicInbox::Hval->new_msgid($mids->[0]);
-		my $mhtml = $mid->as_html;
+		my $mhtml = ascii_html($mids->[0]);
 		$rv .= "Message-ID: &lt;$mhtml&gt; ";
 		$rv .= "(<a\nhref=\"raw\">raw</a>)\n";
 	} else {
@@ -751,9 +749,8 @@ sub _parent_headers {
 		$refs = references($hdr);
 		my $irt = pop @$refs;
 		if (defined $irt) {
-			my $v = PublicInbox::Hval->new_msgid($irt);
-			my $html = $v->as_html;
-			my $href = $v->{href};
+			my $html = ascii_html($irt);
+			my $href = mid_href($irt);
 			$rv .= "In-Reply-To: &lt;";
 			$rv .= "<a\nhref=\"../$href/\">$html</a>&gt;\n";
 		}
@@ -787,17 +784,17 @@ sub html_footer {
 		$next = $prev = '    ';
 
 		if (my $n = $ctx->{next_msg}) {
-			$n = PublicInbox::Hval->new_msgid($n)->{href};
+			$n = mid_href($n);
 			$next = "<a\nhref=\"$upfx$n/\"\nrel=next>next</a>";
 		}
 		my $u;
 		my $par = $ctx->{parent_msg};
 		if ($par) {
-			$u = PublicInbox::Hval->new_msgid($par)->{href};
+			$u = mid_href($par);
 			$u = "$upfx$u/";
 		}
 		if (my $p = $ctx->{prev_msg}) {
-			$prev = PublicInbox::Hval->new_msgid($p)->{href};
+			$prev = mid_href($p);
 			if ($p && $par && $p eq $par) {
 				$prev = "<a\nhref=\"$upfx$prev/\"\n" .
 					'rel=prev>prev parent</a>';
@@ -819,9 +816,9 @@ sub html_footer {
 }
 
 sub linkify_ref_no_over {
-	my $v = PublicInbox::Hval->new_msgid($_[0]);
-	my $html = $v->as_html;
-	my $href = $v->{href};
+	my ($mid) = @_;
+	my $href = mid_href($mid);
+	my $html = ascii_html($mid);
 	"&lt;<a\nhref=\"../$href/\">$html</a>&gt;";
 }
 
@@ -833,9 +830,8 @@ sub anchor_for {
 sub ghost_parent {
 	my ($upfx, $mid) = @_;
 
-	$mid = PublicInbox::Hval->new_msgid($mid);
-	my $href = $mid->{href};
-	my $html = $mid->as_html;
+	my $href = mid_href($mid);
+	my $html = ascii_html($mid);
 	qq{[parent not found: &lt;<a\nhref="$upfx$href/">$html</a>&gt;]};
 }
 
@@ -996,7 +992,7 @@ sub skel_dump { # walk_thread callback
 		$map->[0] = "$d<a\nhref=\"$m\">$end";
 		$id = "\nid=r".$id;
 	} else {
-		$m = $ctx->{-upfx}.mid_escape($mid).'/';
+		$m = $ctx->{-upfx}.mid_href($mid).'/';
 	}
 	$$skel .=  $d . "<a\nhref=\"$m\"$id>" . $end;
 	1;
@@ -1010,9 +1006,8 @@ sub _skel_ghost {
 	$d .= '    '  if exists $ctx->{searchview};
 	$d .= indent_for($level) . th_pfx($level);
 	my $upfx = $ctx->{-upfx};
-	my $m = PublicInbox::Hval->new_msgid($mid);
-	my $href = $upfx . $m->{href} . '/';
-	my $html = $m->as_html;
+	my $href = $upfx . mid_href($mid) . '/';
+	my $html = ascii_html($mid);
 
 	my $mapping = $ctx->{mapping};
 	my $map = $mapping->{$mid} if $mapping;
@@ -1088,7 +1083,7 @@ sub dump_topics {
 		@$topic = ();
 		next unless defined $top_subj;  # ghost topic
 		my $mid = delete $seen->{$top_subj};
-		my $href = mid_escape($mid);
+		my $href = mid_href($mid);
 		my $prev_subj = [ split(/ /, $top_subj) ];
 		$top_subj = ascii_html($top_subj);
 		$ds = fmt_ts($ds);
@@ -1118,7 +1113,7 @@ sub dump_topics {
 			$prev_subj = \@next_prev;
 			$subj = ascii_html($subj);
 			obfuscate_addrs($obfs_ibx, $subj) if $obfs_ibx;
-			$href = mid_escape($mid);
+			$href = mid_href($mid);
 			$s .= indent_for($level) . TCHILD;
 			$s .= qq(<a\nhref="$href/T/#u">$subj</a>$omit\n);
 		}
diff --git a/lib/PublicInbox/WwwAtomStream.pm b/lib/PublicInbox/WwwAtomStream.pm
index 658934a2..aa917ed8 100644
--- a/lib/PublicInbox/WwwAtomStream.pm
+++ b/lib/PublicInbox/WwwAtomStream.pm
@@ -12,8 +12,7 @@ use warnings;
 use POSIX qw(strftime);
 use Digest::SHA qw(sha1_hex);
 use PublicInbox::Address;
-use PublicInbox::Hval qw(ascii_html);
-use PublicInbox::MID qw(mid_escape);
+use PublicInbox::Hval qw(ascii_html mid_href);
 use PublicInbox::MsgTime qw(msg_timestamp);
 
 # called by PSGI server after getline:
@@ -71,7 +70,7 @@ sub atom_header {
 	my $mid = $ctx->{mid};
 	my $page_id;
 	if (defined $mid) { # per-thread
-		$self_url .= mid_escape($mid).'/t.atom';
+		$self_url .= mid_href($mid).'/t.atom';
 		$page_id = to_uuid("t\n".$mid)
 	} elsif (defined $search_q) {
 		my $query = $search_q->{'q'};
@@ -109,13 +108,13 @@ sub feed_entry {
 	my $base = $ctx->{feed_base_url};
 	if (defined $irt) {
 		my $irt_uuid = to_uuid($irt);
-		$irt = mid_escape($irt);
+		$irt = mid_href($irt);
 		$irt = qq(<thr:in-reply-to\nref="$irt_uuid"\n).
 			qq(href="$base$irt/"/>);
 	} else {
 		$irt = '';
 	}
-	my $href = $base . mid_escape($mid) . '/';
+	my $href = $base . mid_href($mid) . '/';
 	my $t = msg_timestamp($hdr);
 	my @t = gmtime(defined $t ? $t : time);
 	my $updated = feed_updated(@t);
diff --git a/t/psgi_bad_mids.t b/t/psgi_bad_mids.t
index d86c90bc..43025a4d 100644
--- a/t/psgi_bad_mids.t
+++ b/t/psgi_bad_mids.t
@@ -28,6 +28,7 @@ $im->{parallel} = 0;
 my $msgs = <<'';
 F1V5OR6NMF.3M649JTLO9IXD@tux.localdomain/hehe1"'<foo
 F1V5NB0PTU.3U0DCVGAJ750Z@tux.localdomain"'<>/foo
+F1V5NB0PTU.3U0DCVGAJ750Z@tux&.ampersand
 F1V5MIHGCU.2ABINKW6WBE8N@tux.localdomain/raw
 F1V5LF9D9C.2QT5PGXZQ050E@tux.localdomain/t.atom
 F1V58X3CMU.2DCCVAKQZGADV@tux.localdomain/../../../../foo
@@ -70,9 +71,13 @@ test_psgi(sub { $www->call(@_) }, sub {
 		'got escaped links to all messages');
 
 	@xmids = reverse @xmids;
+	my %uxs = ( gt => '>', lt => '<' );
 	foreach my $i (0..$#xmids) {
-		$res = $cb->(GET("/bad-mids/$xmids[$i]/raw"));
-		is($res->code, 200, 'got 200 OK raw message');
+		my $uri = $xmids[$i];
+		$uri =~ s/&#([0-9]+);/sprintf("%c", $1)/sge;
+		$uri =~ s/&(lt|gt);/$uxs{$1}/sge;
+		$res = $cb->(GET("/bad-mids/$uri/raw"));
+		is($res->code, 200, 'got 200 OK raw message '.$uri);
 		like($res->content, qr/Message-ID: <\Q$mids[$i]\E>/s,
 			'retrieved correct message');
 	}

  parent reply index

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-02-15  9:46 [PATCH 0/8] some view cleanups and minor bugfixes Eric Wong
2020-02-15  9:46 ` [PATCH 1/8] view: remove mhref arg from multipart_text_as_html Eric Wong
2020-02-15  9:46 ` [PATCH 2/8] view: single id="t" for multi-Subject messages Eric Wong
2020-02-15  9:46 ` [PATCH 3/8] view: dump_topics: better naming of top Subject Eric Wong
2020-02-15  9:46 ` [PATCH 4/8] view: cleanup topic accumulation and dumping Eric Wong
2020-02-15  9:46 ` [PATCH 5/8] view,searchview: avoid smsg method calls when using SQLite/Xapian Eric Wong
2020-02-15  9:46 ` [PATCH 6/8] view: escape Subject HTML directly Eric Wong
2020-02-15  9:46 ` Eric Wong [this message]
2020-02-15  9:46 ` [PATCH 8/8] view: remove last Hval->new caller Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200215094640.19425-8-e@yhbt.net \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror http://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Example config snippet for mirrors

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git