From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 2684D1FD99 for ; Thu, 18 Aug 2016 02:04:50 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] linkify: be stricter about matching RFC 3986 Date: Thu, 18 Aug 2016 02:04:50 +0000 Message-Id: <20160818020450.19475-1-e@80x24.org> List-Id: We're not to-the-letter about percent-encoding, but we should allow all the characters RFC 3986 permits. This is mainly so that links to Wikipedia pages with parentheses in their titles are matched in full: https://en.wikipedia.org/wiki/Atom_(standard) https://en.wikipedia.org/wiki/Git_(software) --- lib/PublicInbox/Linkify.pm | 5 ++++- t/linkify.t | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm index d4df689..ea7fd71 100644 --- a/lib/PublicInbox/Linkify.pm +++ b/lib/PublicInbox/Linkify.pm @@ -17,7 +17,10 @@ use Digest::SHA qw/sha1_hex/; my $SALT = rand; my $LINK_RE = qr{\b((?:ftps?|https?|nntps?|gopher):// [\@:\w\.-]+/ - ?[!,:~\$\@\w\+\&\?\.\%\;/#=-]*)}x; + (?:[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]*) + (?:\?[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]+)? + (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)? 
+ )}xi; sub new { bless {}, shift } diff --git a/t/linkify.t b/t/linkify.t index 586691a..49cbbd6 100644 --- a/t/linkify.t +++ b/t/linkify.t @@ -23,4 +23,38 @@ use PublicInbox::Linkify; is($s, qq($u;), 'trailing semicolon not in URL'); } +{ + my $l = PublicInbox::Linkify->new; + my $u = 'http://example.com/url-with-(parens)'; + my $s = "hello $u world"; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq(hello $u world), 'URL preserved'); + + $u .= "?query=a"; + $s = "hello $u world"; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq(hello $u world), 'query preserved'); + + $u .= "#fragment"; + $s = "hello $u world"; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq(hello $u world), + 'query + fragment preserved'); + + $u = "http://example.com/"; + $s = "hello $u world"; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq(hello $u world), "root URL preserved"); + + $u = "http://example.com/#fragment"; + $s = "hello $u world"; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq(hello $u world), "root + fragment"); +} + done_testing(); -- EW