From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.1 (2015-04-28) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.1 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 76E121F403; Wed, 13 Jun 2018 22:43:56 +0000 (UTC) Date: Wed, 13 Jun 2018 22:43:56 +0000 From: Eric Wong To: Leah Neukirchen Cc: meta@public-inbox.org Subject: [PATCH] www: use undecoded paths for Message-ID extraction Message-ID: <20180613224356.jz7abxkyg4i3tlf5@dcvr> References: <871sdfzy80.fsf@gmail.com> <20180612100915.shfo3ltn6aj55mrf@dcvr> <8736xsb5s5.fsf@vuxu.org> <20180613214055.2nudcx5e7w2y4q73@dcvr> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20180613214055.2nudcx5e7w2y4q73@dcvr> List-Id: > Leah Neukirchen wrote: > > During testing, we also found another thing when obscure characters > > are used in Message-IDs, esp. / and ?. > > > > E.g. using a Message-ID of > > will create a corrupt link. Some more "ideas" are at > > https://inbox.vuxu.org/pi-test/ I guess I'm spoiled by Rack where PATH_INFO is undecoded :x However, REQUEST_URI is specified in PSGI specs(*) Very lightly tested, but this seems to work; additions to the test suite will be necessary... ------8<---- Subject: [PATCH] www: use undecoded paths for Message-ID extraction In PSGI, PATH_INFO contains URI-decoded paths which cause problems when Message-IDs contain ambiguous characters for used for routing. Instead, extract the undecoded path from REQUEST_URI and use that. Reported-by: Leah Neukirchen https://public-inbox.org/meta/8736xsb5s5.fsf@vuxu.org/ --- lib/PublicInbox/WWW.pm | 40 ++++++++++++++++++++++++++++------------ t/cgi.t | 2 ++ 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 24e24f1..c1c3926 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -36,6 +36,17 @@ sub run { PublicInbox::WWW->new->call($req->env); } +my %path_re_cache; + +sub path_re ($) { + my $sn = $_[0]->{SCRIPT_NAME}; + $path_re_cache{$sn} ||= do { + $sn = '/'.$sn unless index($sn, '/') == 0; + $sn =~ s!/\z!!; + qr!\A(?:https?://[^/]+)?\Q$sn\E(/[^\?\#]+)!; + }; +} + sub call { my ($self, $env) = @_; my $ctx = { env => $env, www => $self }; @@ -50,7 +61,8 @@ sub call { } split(/[&;]+/, $env->{QUERY_STRING}); $ctx->{qp} = \%qp; - my $path_info = $env->{PATH_INFO}; + # not using $env->{PATH_INFO} here since that's already decoded + my ($path_info) = ($env->{REQUEST_URI} =~ path_re($env)); my $method = $env->{REQUEST_METHOD}; if ($method eq 'POST') { @@ -91,13 +103,13 @@ sub call { invalid_inbox_mid($ctx, $1, $2) || get_attach($ctx, $idx, $fn); # in case people leave off the trailing slash: } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/(T|t)\z!o) { - my ($inbox, $mid, $suffix) = ($1, $2, $3); + my ($inbox, $mid_ue, $suffix) = ($1, $2, $3); $suffix .= $suffix =~ /\A[tT]\z/ ? '/#u' : '/'; - r301($ctx, $inbox, $mid, $suffix); + r301($ctx, $inbox, $mid_ue, $suffix); } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/R/?\z!o) { - my ($inbox, $mid) = ($1, $2); - r301($ctx, $inbox, $mid, '#R'); + my ($inbox, $mid_ue) = ($1, $2); + r301($ctx, $inbox, $mid_ue, '#R'); } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/f/?\z!o) { r301($ctx, $1, $2); @@ -164,11 +176,11 @@ sub invalid_inbox ($$) { # returns undef if valid, array ref response if invalid sub invalid_inbox_mid { - my ($ctx, $inbox, $mid) = @_; + my ($ctx, $inbox, $mid_ue) = @_; my $ret = invalid_inbox($ctx, $inbox); return $ret if $ret; - $ctx->{mid} = $mid; + my $mid = $ctx->{mid} = uri_unescape($mid_ue); my $ibx = $ctx->{-inbox}; if ($mid =~ m!\A([a-f0-9]{2})([a-f0-9]{38})\z!) { my ($x2, $x38) = ($1, $2); @@ -177,7 +189,7 @@ sub invalid_inbox_mid { require Email::Simple; my $s = Email::Simple->new($str); $mid = PublicInbox::MID::mid_clean($s->header('Message-ID')); - return r301($ctx, $inbox, $mid); + return r301($ctx, $inbox, mid_escape($mid)); } undef; } @@ -352,7 +364,7 @@ sub legacy_redirects { } sub r301 { - my ($ctx, $inbox, $mid, $suffix) = @_; + my ($ctx, $inbox, $mid_ue, $suffix) = @_; my $obj = $ctx->{-inbox}; unless ($obj) { my $r404 = invalid_inbox($ctx, $inbox); @@ -361,7 +373,11 @@ sub r301 { } my $url = $obj->base_url($ctx->{env}); my $qs = $ctx->{env}->{QUERY_STRING}; - $url .= (mid_escape($mid) . '/') if (defined $mid); + if (defined $mid_ue) { + # common, and much nicer as '@' than '%40': + $mid_ue =~ s/%40/@/g; + $url .= $mid_ue . '/'; + } $url .= $suffix if (defined $suffix); $url .= "?$qs" if $qs ne ''; @@ -371,9 +387,9 @@ sub r301 { } sub msg_page { - my ($ctx, $inbox, $mid, $e) = @_; + my ($ctx, $inbox, $mid_ue, $e) = @_; my $ret; - $ret = invalid_inbox_mid($ctx, $inbox, $mid) and return $ret; + $ret = invalid_inbox_mid($ctx, $inbox, $mid_ue) and return $ret; '' eq $e and return get_mid_html($ctx); 'T/' eq $e and return get_thread($ctx, 1); 't/' eq $e and return get_thread($ctx); diff --git a/t/cgi.t b/t/cgi.t index bd92ca3..2e2476d 100644 --- a/t/cgi.t +++ b/t/cgi.t @@ -225,6 +225,8 @@ sub cgi_run { my %env = ( PATH_INFO => $_[0], QUERY_STRING => $_[1] || "", + SCRIPT_NAME => '', + REQUEST_URI => $_[0] . ($_[1] ? "?$_[1]" : ''), REQUEST_METHOD => $_[2] || "GET", GATEWAY_INTERFACE => 'CGI/1.1', HTTP_ACCEPT => '*/*', -- (*) git clone https://github.com/plack/psgi-specs.git