From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.3.2 (2011-06-06) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-2.9 required=3.0 tests=ALL_TRUSTED,BAYES_00, URIBL_BLOCKED shortcircuit=no autolearn=unavailable version=3.3.2 X-Original-To: meta@public-inbox.org Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 5FA3020310 for ; Fri, 26 Feb 2016 03:02:49 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] www: workaround for malformed NNTP links Date: Fri, 26 Feb 2016 03:02:49 +0000 Message-Id: <20160226030249.29126-1-e@80x24.org> List-Id: Some linkifiers create invalid HTTP links when they see a link intended for NNTP services. This means we may see links to news.public-inbox.org/inbox.comp.mail.public-inbox.meta point to "http://" on port 80 instead of 119. Try to redirect users to http://public-inbox.org/meta/ in this case. --- lib/PublicInbox/NewsWWW.pm | 80 ++++++++++++++++++++++++++++++++++++++++++++++ lib/PublicInbox/WWW.pm | 67 +++++++++++++++++++++++--------------- 2 files changed, 121 insertions(+), 26 deletions(-) create mode 100644 lib/PublicInbox/NewsWWW.pm diff --git a/lib/PublicInbox/NewsWWW.pm b/lib/PublicInbox/NewsWWW.pm new file mode 100644 index 0000000..e19765c --- /dev/null +++ b/lib/PublicInbox/NewsWWW.pm @@ -0,0 +1,80 @@ +# Copyright (C) 2016 all contributors +# License: AGPL-3.0+ +# +# Plack app redirector for mapping /$NEWSGROUP requests to +# the appropriate /$LISTNAME in PublicInbox::WWW because some +# auto-linkifiers cannot handle nntp:// redirects properly. 
+# This is also used directly by PublicInbox::WWW +package PublicInbox::NewsWWW; +use strict; +use warnings; +use PublicInbox::Config; +use URI::Escape qw(uri_escape_utf8); + +sub new { + my ($class, $pi_config) = @_; + $pi_config ||= PublicInbox::Config->new; + bless { pi_config => $pi_config }, $class; +} + +sub call { + my ($self, $env) = @_; + my $ng_map = $self->newsgroup_map; + my $path = $env->{PATH_INFO}; + $path =~ s!\A/+!!; + $path =~ s!/+\z!!; + + # some links may have the article number in them: + # /inbox.foo.bar/123456 + my ($ng, $article) = split(m!/+!, $path, 2); + if (my $info = $ng_map->{$ng}) { + my $url = PublicInbox::Hval::prurl($env, $info->{url}); + my $code = 301; + my $h = [ Location => $url, 'Content-Type' => 'text/plain' ]; + if (defined $article && $article =~ /\A\d+\z/) { + my $mid = eval { ng_mid_for($ng, $info, $article) }; + if (defined $mid) { + # article IDs are not stable across clones, + # do not encourage caching/bookmarking them + $code = 302; + $url .= uri_escape_utf8($mid) . '/'; + } + } + + return [ $code, $h, [ "Redirecting to $url\n" ] ] + } + [ 404, [ 'Content-Length' => 'text/plain' ], [] ]; +} + +sub ng_mid_for { + my ($ng, $info, $article) = @_; + # may fail due to lack of Danga::Socket + # for defer_weaken: + require PublicInbox::NewsGroup; + $ng = $info->{ng} ||= + PublicInbox::NewsGroup->new($ng, $info->{git_dir}, ''); + $ng->mm->mid_for($article); +} + +sub newsgroup_map { + my ($self) = @_; + my $rv; + $rv = $self->{ng_map} and return $rv; + my $pi_config = $self->{pi_config}; + my %ng_map; + foreach my $k (keys %$pi_config) { + $k =~ /\Apublicinbox\.([^\.]+)\.mainrepo\z/ or next; + my $listname = $1; + my $git_dir = $pi_config->{"publicinbox.$listname.mainrepo"}; + my $url = $pi_config->{"publicinbox.$listname.url"}; + defined $url or next; + my $ng = $pi_config->{"publicinbox.$listname.newsgroup"}; + next if (!defined $ng) || ($ng eq ''); # disabled + + $url =~ m!/\z! 
or $url .= '/'; + $ng_map{$ng} = { url => $url, git_dir => $git_dir }; + } + $self->{ng_map} = \%ng_map; +} + +1; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 8372927..fd09280 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -45,7 +45,7 @@ sub call { if ($method eq 'POST' && $path_info =~ m!$LISTNAME_RE/(git-upload-pack)\z!) { my $path = $2; - return (invalid_list($ctx, $1) || + return (invalid_list($self, $ctx, $1) || serve_git($cgi, $ctx->{git}, $path)); } elsif ($method !~ /\AGET|HEAD\z/) { @@ -56,18 +56,19 @@ sub call { if ($path_info eq '/') { r404(); } elsif ($path_info =~ m!$LISTNAME_RE\z!o) { - invalid_list($ctx, $1) || r301($ctx, $1); + invalid_list($self, $ctx, $1) || r301($ctx, $1); } elsif ($path_info =~ m!$LISTNAME_RE(?:/|/index\.html)?\z!o) { - invalid_list($ctx, $1) || get_index($ctx); + invalid_list($self, $ctx, $1) || get_index($ctx); } elsif ($path_info =~ m!$LISTNAME_RE/(?:atom\.xml|new\.atom)\z!o) { - invalid_list($ctx, $1) || get_atom($ctx); + invalid_list($self, $ctx, $1) || get_atom($ctx); } elsif ($path_info =~ m!$LISTNAME_RE/ ($PublicInbox::GitHTTPBackend::ANY)\z!ox) { my $path = $2; - invalid_list($ctx, $1) || serve_git($cgi, $ctx->{git}, $path); + invalid_list($self, $ctx, $1) || + serve_git($cgi, $ctx->{git}, $path); } elsif ($path_info =~ m!$LISTNAME_RE/$MID_RE/$END_RE\z!o) { - msg_page($ctx, $1, $2, $3); + msg_page($self, $ctx, $1, $2, $3); # in case people leave off the trailing slash: } elsif ($path_info =~ m!$LISTNAME_RE/$MID_RE/(f|T|t|R)\z!o) { @@ -80,7 +81,7 @@ sub call { r301($ctx, $1, $2); } else { - legacy_redirects($ctx, $path_info); + legacy_redirects($self, $ctx, $path_info); } } @@ -118,7 +119,7 @@ sub r { [ $_[0], ['Content-Type' => 'text/plain'], [ join(' ', @_, "\n") ] ] } # returns undef if valid, array ref response if invalid sub invalid_list { - my ($ctx, $listname) = @_; + my ($self, $ctx, $listname, $mid) = @_; my $git_dir = $ctx->{pi_config}->get($listname, "mainrepo"); if 
(defined $git_dir) { $ctx->{git_dir} = $git_dir; @@ -126,17 +127,23 @@ sub invalid_list { $ctx->{listname} = $listname; return; } - r404(); + + # sometimes linkifiers (not ours!) screw up automatic link + # generation and link things intended for nntp:// to https?://, + # so try to infer links and redirect them to the appropriate + # list URL. + $self->news_www->call($ctx->{cgi}->{env}); } # returns undef if valid, array ref response if invalid sub invalid_list_mid { - my ($ctx, $listname, $mid) = @_; - my $ret = invalid_list($ctx, $listname, $mid); + my ($self, $ctx, $listname, $mid) = @_; + my $ret = invalid_list($self, $ctx, $listname, $mid); return $ret if $ret; $ctx->{mid} = $mid = uri_unescape($mid); if ($mid =~ /\A[a-f0-9]{40}\z/) { + # this is horiffically wasteful for legacy URLs: if ($mid = mid2blob($ctx)) { require Email::Simple; use PublicInbox::MID qw/mid_clean/; @@ -339,7 +346,7 @@ sub get_thread_atom { } sub legacy_redirects { - my ($ctx, $path_info) = @_; + my ($self, $ctx, $path_info) = @_; # single-message pages if ($path_info =~ m!$LISTNAME_RE/m/(\S+)/\z!o) { @@ -384,13 +391,13 @@ sub legacy_redirects { # some Message-IDs have slashes in them and the HTTP server # may try to be clever and unescape them :< } elsif ($path_info =~ m!$LISTNAME_RE/(\S+/\S+)/$END_RE\z!o) { - msg_page($ctx, $1, $2, $3); + msg_page($self, $ctx, $1, $2, $3); # in case people leave off the trailing slash: } elsif ($path_info =~ m!$LISTNAME_RE/(\S+/\S+)/(f|T|t)\z!o) { r301($ctx, $1, $2, $3 eq 't' ? 
't/#u' : $3); } else { - r404(); + $self->news_www->call($ctx->{cgi}->{env}); } } @@ -410,18 +417,18 @@ sub r301 { } sub msg_page { - my ($ctx, $list, $mid, $e) = @_; - unless (invalid_list_mid($ctx, $list, $mid)) { - '' eq $e and return get_mid_html($ctx); - 't/' eq $e and return get_thread($ctx); - 't.atom' eq $e and return get_thread_atom($ctx); - 't.mbox' eq $e and return get_thread_mbox($ctx); - 't.mbox.gz' eq $e and return get_thread_mbox($ctx, '.gz'); - 'T/' eq $e and return get_thread($ctx, 1); - 'raw' eq $e and return get_mid_txt($ctx); - 'f/' eq $e and return get_full_html($ctx); - 'R/' eq $e and return get_reply_html($ctx); - } + my ($self, $ctx, $list, $mid, $e) = @_; + my $ret; + $ret = invalid_list_mid($self, $ctx, $list, $mid) and return $ret; + '' eq $e and return get_mid_html($ctx); + 't/' eq $e and return get_thread($ctx); + 't.atom' eq $e and return get_thread_atom($ctx); + 't.mbox' eq $e and return get_thread_mbox($ctx); + 't.mbox.gz' eq $e and return get_thread_mbox($ctx, '.gz'); + 'T/' eq $e and return get_thread($ctx, 1); + 'raw' eq $e and return get_mid_txt($ctx); + 'f/' eq $e and return get_full_html($ctx); + 'R/' eq $e and return get_reply_html($ctx); r404($ctx); } @@ -430,4 +437,12 @@ sub serve_git { PublicInbox::GitHTTPBackend::serve($cgi, $git, $path); } +sub news_www { + my ($self) = @_; + my $nw = $self->{news_www}; + return $nw if $nw; + require PublicInbox::NewsWWW; + $self->{news_www} = PublicInbox::NewsWWW->new($self->{pi_config}); +} + 1; -- EW