user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 5/6] www: linkify inbox addresses in To/Cc headers
Date: Tue,  9 Jan 2024 11:39:27 +0000	[thread overview]
Message-ID: <20240109113928.992464-6-e@80x24.org> (raw)
In-Reply-To: <20240109113928.992464-1-e@80x24.org>

This makes it easier to discover contemporary messages
crossposted to other groups within the same WWW instance.
The internal cache is necessary for giant threads, and the
expiry mechanism is necessary to prevent attackers from
trivially OOM-ing the server by sending requests with many
different Host: headers.
---
 lib/PublicInbox/SearchView.pm |  2 +-
 lib/PublicInbox/View.pm       | 70 +++++++++++++++++++++++++++++++----
 2 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 8f851738..2d3e942c 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -322,7 +322,7 @@ EOM
 
 	# link $INBOX_DIR/description text to "recent" view around
 	# the newest message in this result set:
-	$ctx->{-t_max} = max(map { delete $_->{ts} } @$msgs);
+	$ctx->{-t_max} = max(map { $_->{ts} } @$msgs);
 
 	@$msgs = reverse @$msgs if $r;
 	$ctx->{msgs} = $msgs;
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 02b93d7b..39ec35c3 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -38,7 +38,7 @@ sub msg_page_i {
 				: $ctx->gone('over');
 		$ctx->{mhref} = ($ctx->{nr} || $ctx->{smsg}) ?
 				"../${\mid_href($smsg->{mid})}/" : '';
-		if (_msg_page_prepare($eml, $ctx)) {
+		if (_msg_page_prepare($eml, $ctx, $smsg->{ts})) {
 			$eml->each_part(\&add_text_body, $ctx, 1);
 			print { $ctx->{zfh} } '</pre><hr>';
 		}
@@ -183,6 +183,59 @@ sub nr_to_s ($$$) {
 	$nr == 1 ? "$nr $singular" : "$nr $plural";
 }
 
+sub addr2urlmap ($) {
+	my ($ctx) = @_;
+	# cache makes a huge difference with /[tT] and large threads
+	my $key = PublicInbox::Git::host_prefix_url($ctx->{env}, '');
+	my $ent = $ctx->{www}->{pi_cfg}->{-addr2urlmap}->{$key} // do {
+		my $by_addr = $ctx->{www}->{pi_cfg}->{-by_addr};
+		my (%addr2url, $url);
+		while (my ($addr, $ibx) = each %$by_addr) {
+			$url = $ibx->base_url // $ibx->base_url($ctx->{env});
+			$addr2url{$addr} = ascii_html($url) if defined $url;
+		}
+		# don't allow attackers to randomly change Host: headers
+		# and OOM us if the server handles all hostnames:
+		my $tmp = $ctx->{www}->{pi_cfg}->{-addr2urlmap};
+		my @k = keys %$tmp; # random order
+		delete @$tmp{@k[0..3]} if scalar(@k) > 7;
+		my $re = join('|', map { quotemeta } keys %addr2url);
+		$tmp->{$key} = [ qr/\b($re)\b/i, \%addr2url ];
+	};
+	@$ent;
+}
+
+sub to_cc_html ($$$$) {
+	my ($ctx, $eml, $field, $t) = @_;
+	my @vals = $eml->header($field) or return ('', 0);
+	my (undef, $addr2url) = addr2urlmap($ctx);
+	my $pairs = PublicInbox::Address::pairs(join(', ', @vals));
+	my ($len, $line_len, $html) = (0, 0, '');
+	my ($pair, $url);
+	my ($cur_ibx, $env) = @$ctx{qw(ibx env)};
+	# avoid excessive ascii_html calls (already hot in profiles):
+	my @html = split /\n/, ascii_html(join("\n", map {
+		$_->[0] // (split(/\@/, $_->[1]))[0]; # addr user if no name
+	} @$pairs));
+	for my $n (@html) {
+		$pair = shift @$pairs;
+		if ($line_len) { # 9 = display width of ",\t":
+			if ($line_len + length($n) > COLS - 9) {
+				$html .= ",\n\t";
+				$len += $line_len;
+				$line_len = 0;
+			} else {
+				$html .= ', ';
+				$line_len += 2;
+			}
+		}
+		$line_len += length($n);
+		$url = $addr2url->{lc $pair->[1]};
+		$html .= $url ? qq(<a\nhref="$url$t">$n</a>) : $n;
+	}
+	($html, $len + $line_len);
+}
+
 # Displays the text of of the message for /$INBOX/$MSGID/[Tt]/ endpoint
 # this is already inside a <pre>
 sub eml_entry {
@@ -207,7 +260,8 @@ sub eml_entry {
 	my $ds = delete $smsg->{ds}; # for v1 non-Xapian/SQLite users
 
 	# Deleting these fields saves about 400K as we iterate across 1K msgs
-	delete @$smsg{qw(ts blob)};
+	my ($t, undef) = delete @$smsg{qw(ts blob)};
+	$t = $t ? '?t='.ts2str($t) : '';
 
 	my $from = _hdr_names_html($eml, 'From');
 	obfuscate_addrs($obfs_ibx, $from) if $obfs_ibx;
@@ -216,9 +270,8 @@ sub eml_entry {
 	my $mhref = $upfx . mid_href($mid_raw) . '/';
 	$rv .= qq{ (<a\nhref="$mhref">permalink</a> / };
 	$rv .= qq{<a\nhref="${mhref}raw">raw</a>)\n};
-	my $to = fold_addresses(_hdr_names_html($eml, 'To'));
-	my $cc = fold_addresses(_hdr_names_html($eml, 'Cc'));
-	my ($tlen, $clen) = (length($to), length($cc));
+	my ($to, $tlen) = to_cc_html($ctx, $eml, 'To', $t);
+	my ($cc, $clen) = to_cc_html($ctx, $eml, 'Cc', $t);
 	my $to_cc = '';
 	if (($tlen + $clen) > COLS) {
 		$to_cc .= '  To: '.$to."\n" if $tlen;
@@ -447,7 +500,7 @@ sub thread_html {
 
 	# link $INBOX_DIR/description text to "index_topics" view around
 	# the newest message in this thread
-	my $t = ts2str($ctx->{-t_max} = max(map { delete $_->{ts} } @$msgs));
+	my $t = ts2str($ctx->{-t_max} = max(map { $_->{ts} } @$msgs));
 	my $t_fmt = fmt_ts($ctx->{-t_max});
 
 	my $skel = '<hr><pre>';
@@ -613,7 +666,7 @@ sub add_text_body { # callback for each_part
 }
 
 sub _msg_page_prepare {
-	my ($eml, $ctx) = @_;
+	my ($eml, $ctx, $ts) = @_;
 	my $have_over = !!$ctx->{ibx}->over;
 	my $mids = mids_for_index($eml);
 	my $nr = $ctx->{nr}++;
@@ -649,6 +702,9 @@ href="d/">diff</a>)</pre><pre>];
 	$title[0] = $subj[0] // '(no subject)';
 	$hbuf .= "Date: $_\n" for $eml->header('Date');
 	$hbuf = ascii_html($hbuf);
+	my $t = $ts ? '?t='.ts2str($ts) : '';
+	my ($re, $addr2url) = addr2urlmap($ctx);
+	$hbuf =~ s!$re!qq(<a\nhref=").$addr2url->{lc $1}.qq($t">$1</a>)!sge;
 	$ctx->{-title_html} = ascii_html(join(' - ', @title));
 	if (my $obfs_ibx = $ctx->{-obfs_ibx}) {
 		obfuscate_addrs($obfs_ibx, $hbuf);

  parent reply	other threads:[~2024-01-09 11:39 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-01-09 11:39 [PATCH 0/6] WWW updates Eric Wong
2024-01-09 11:39 ` [PATCH 1/6] git: workaround occasional -watch error message Eric Wong
2024-01-09 11:39 ` [PATCH 2/6] doc: txt2pre: linkify -extindex(1), dedupe -config(5) Eric Wong
2024-01-09 11:39 ` [PATCH 3/6] test_common: key2sub: don't require final ';' in scripts Eric Wong
2024-01-09 11:39 ` [PATCH 4/6] git: lowercase host in host_prefix_url Eric Wong
2024-01-09 11:39 ` Eric Wong [this message]
2024-01-09 12:49   ` [PATCH 7/6] address: avoid [ undef, undef ] address pairs Eric Wong
2024-01-09 11:39 ` [PATCH 6/6] www: use autodie in more coderepo places Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240109113928.992464-6-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).