# Copyright (C) 2014-2021 all contributors # License: AGPL-3.0+ # two-step linkification. # intended usage is in the following order: # # linkify_1 # # linkify_2 # # Maybe this could be done more efficiently... package PublicInbox::Linkify; use strict; use warnings; use Digest::SHA qw/sha1_hex/; use PublicInbox::Hval qw(ascii_html mid_href); use PublicInbox::MID qw($MID_EXTRACT); my $SALT = rand; my $LINK_RE = qr{([\('!])?\b((?:ftps?|https?|nntps?|imaps?|s?news|gopher):// [\@:\w\.-]+(?:/ (?:[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]*) (?:\?[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]+)? (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)? )? )}xi; sub new { bless {}, $_[0] } # try to distinguish paired punctuation chars from the URL itself # Maybe other languages/formats can be supported here, too... my %pairs = ( "(" => qr/(\)[\.,;\+]?)\z/, # Markdown (,), Ruby (+) (, for arrays) "'" => qr/('[\.,;\+]?)\z/, # Perl / Ruby "!" => qr/(![\.,;\+]?)\z/, # Perl / Ruby ); sub linkify_1 { $_[1] =~ s^$LINK_RE^ my $beg = $1 || ''; my $url = $2; my $end = ''; # it's fairly common to end URLs in messages with # '.', ',' or ';' to denote the end of a statement; # assume the intent was to end the statement/sentence # in English if (defined(my $re = $pairs{$beg})) { if ($url =~ s/$re//) { $end = $1; } } elsif ($url =~ s/(\))?([\.,;])\z//) { $end = $2; # require ')' to be paired with '(' if (defined $1) { # ')' if (index($url, '(') < 0) { $end = ")$end"; } else { $url .= ')'; } } } elsif ($url !~ /\(/ && $url =~ s/\)\z//) { $end = ')'; } $url = ascii_html($url); # for IDN # salt this, as this could be exploited to show # links in the HTML which don't show up in the raw mail. my $key = sha1_hex($url . $SALT); $_[0]->{$key} = $url; $beg . 'PI-LINK-'. $key . $end; ^geo; $_[1]; } sub linkify_2 { # Added "PI-LINK-" prefix to avoid false-positives on git commits $_[1] =~ s!\bPI-LINK-([a-f0-9]{40})\b! my $key = $1; my $url = $_[0]->{$key}; if (defined $url) { "$url"; } else { # false positive or somebody tried to mess with us $key; } !ge; $_[1]; } # single pass linkification of within $str # with $pfx being the URL prefix sub linkify_mids { my ($self, $pfx, $str, $raw) = @_; $$str =~ s!$MID_EXTRACT! my $mid = $1; my $html = ascii_html($mid); my $href = mid_href($mid); # salt this, as this could be exploited to show # links in the HTML which don't show up in the raw mail. my $key = sha1_hex($html . $SALT); my $repl = qq(<$html>); $repl .= qq{ (raw)} if $raw; $self->{$key} = $repl; 'PI-LINK-'. $key; !ge; $$str = ascii_html($$str); $$str =~ s!\bPI-LINK-([a-f0-9]{40})\b! my $key = $1; my $repl = $_[0]->{$key}; if (defined $repl) { $repl; } else { # false positive or somebody tried to mess with us $key; } !ge; } sub to_html { linkify_2($_[0], ascii_html(linkify_1(@_))) } 1;