From 472d39de46603b180ab6e739e0b31ab7ef559870 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 1 Mar 2016 03:44:04 +0000 Subject: linkify: do not capture trailing '.' or ';' in URLs It seems common for users to end statements with URLs, while it is rare for a URL itself to end with a '.' or ';'. So make a guess and assume the URL was intended to not include the trailing '.' or ';' --- MANIFEST | 1 + lib/PublicInbox/Linkify.pm | 10 +++++++++- t/linkify.t | 26 ++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 t/linkify.t diff --git a/MANIFEST b/MANIFEST index 5d790f9c..259f42ce 100644 --- a/MANIFEST +++ b/MANIFEST @@ -80,6 +80,7 @@ t/httpd-corner.psgi t/httpd-corner.t t/httpd.t t/init.t +t/linkify.t t/main-bin/spamc t/mda.t t/msgmap.t diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm index 8f634f48..4eddedd0 100644 --- a/lib/PublicInbox/Linkify.pm +++ b/lib/PublicInbox/Linkify.pm @@ -25,6 +25,14 @@ sub linkify_1 { my ($self, $s) = @_; $s =~ s!$LINK_RE! my $url = $1; + my $end = ''; + + # it's fairly common to end URLs in messages with + # '.' or ';' to denote the end of a statement. + if ($url =~ s/(\.)\z// || $url =~ s/(;)\z//) { + $end = $1; + } + # salt this, as this could be exploited to show # links in the HTML which don't show up in the raw mail. my $key = sha1_hex($url . $SALT); @@ -32,7 +40,7 @@ sub linkify_1 { # only escape ampersands, others do not match LINK_RE $url =~ s/&/&amp;/g; $self->{$key} = $url; - 'PI-LINK-'. $key; + 'PI-LINK-'. $key . $end; !ge; $s; } diff --git a/t/linkify.t b/t/linkify.t new file mode 100644 index 00000000..586691ae --- /dev/null +++ b/t/linkify.t @@ -0,0 +1,26 @@ +# Copyright (C) 2016 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use PublicInbox::Linkify; + +{ + my $l = PublicInbox::Linkify->new; + my $u = 'http://example.com/url-with-trailing-period'; + my $s = $u .
'.'; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq($u.), 'trailing period not in URL'); +} + +{ + my $l = PublicInbox::Linkify->new; + my $u = 'http://example.com/url-with-trailing-semicolon'; + my $s = $u . ';'; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq($u;), 'trailing semicolon not in URL'); +} + +done_testing(); -- cgit v1.2.3-24-ge0c7