user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH] searchview: nofollow on mbox downloads
@ 2017-12-07 20:30 Eric Wong
  2017-12-08 21:01 ` [PATCH] search: force large mbox result downloads to POST Eric Wong
  0 siblings, 1 reply; 2+ messages in thread
From: Eric Wong @ 2017-12-07 20:30 UTC (permalink / raw)
  To: meta

Some search results are gigantic, and search engines are
unlikely to be able to handle gzipped mboxes anyway.
---
 lib/PublicInbox/SearchView.pm | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 3eff708..8e0c3cf 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -168,7 +168,8 @@ sub search_nav_top {
 	$rv .= qq{|<a\nhref="?$A">Atom feed</a>]};
 	my $m = $q->qs_html(x => 'm', r => undef);
 	warn "m: $m\n";
-	$rv .= qq{\n\t\t\t\t\t\tdownload: <a\nhref="?$m">mbox.gz</a>};
+	$rv .= qq{\n\t\t\t\t\t\tdownload: };
+	$rv .= qq{<a\nhref="?$m"\nrel="nofollow">mbox.gz</a>};
 }
 
 sub search_nav_bot {
-- 
EW


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* [PATCH] search: force large mbox result downloads to POST
  2017-12-07 20:30 [PATCH] searchview: nofollow on mbox downloads Eric Wong
@ 2017-12-08 21:01 ` Eric Wong
  0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2017-12-08 21:01 UTC (permalink / raw)
  To: meta

This should prevent crawlers (including most robots.txt-ignoring
ones) from burning our CPU time without severely compromising
usability for humans.
---
 Using nofollow doesn't help with some bots, or with results that
 have already ended up in their crawl lists.

 lib/PublicInbox/SearchView.pm | 19 ++++++++++++-------
 lib/PublicInbox/WWW.pm        | 22 ++++++++++++++++++----
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 8e0c3cf..13e9c17 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -18,12 +18,19 @@ our $LIM = 200;
 
 sub noop {}
 
+sub mbox_results {
+	my ($ctx) = @_;
+	my $q = PublicInbox::SearchQuery->new($ctx->{qp});
+	my $x = $q->{x};
+	return PublicInbox::Mbox::mbox_all($ctx, $q->{'q'}) if $x eq 'm';
+	sres_top_html($ctx);
+}
+
 sub sres_top_html {
 	my ($ctx) = @_;
 	my $q = PublicInbox::SearchQuery->new($ctx->{qp});
 	my $x = $q->{x};
 	my $query = $q->{'q'};
-	return PublicInbox::Mbox::mbox_all($ctx, $query) if $x eq 'm';
 
 	my $code = 200;
 	# double the limit for expanded views:
@@ -60,7 +67,7 @@ retry:
 	} else {
 		return adump($_[0], $mset, $q, $ctx) if $x eq 'A';
 
-		$ctx->{-html_tip} = search_nav_top($mset, $q, $ctx) . "\n\n";
+		$ctx->{-html_tip} = search_nav_top($mset, $q, $ctx);
 		if ($x eq 't') {
 			$cb = mset_thread($ctx, $mset, $q);
 		} else {
@@ -131,8 +138,8 @@ sub err_txt {
 
 sub search_nav_top {
 	my ($mset, $q, $ctx) = @_;
-
-	my $rv = '<pre>';
+	my $m = $q->qs_html(x => 'm', r => undef);
+	my $rv = qq{<form\naction="?$m"\nmethod="post"><pre>};
 	my $initial_q = $ctx->{-uxs_retried};
 	if (defined $initial_q) {
 		my $rewritten = $q->{'q'};
@@ -166,10 +173,8 @@ sub search_nav_top {
 	}
 	my $A = $q->qs_html(x => 'A', r => undef);
 	$rv .= qq{|<a\nhref="?$A">Atom feed</a>]};
-	my $m = $q->qs_html(x => 'm', r => undef);
-	warn "m: $m\n";
 	$rv .= qq{\n\t\t\t\t\t\tdownload: };
-	$rv .= qq{<a\nhref="?$m"\nrel="nofollow">mbox.gz</a>};
+	$rv .= qq{<input\ntype=submit\nvalue="mbox.gz"/></pre></form><pre>};
 }
 
 sub search_nav_bot {
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index f3c702e..3fd77d4 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -53,10 +53,14 @@ sub call {
 	my $path_info = $env->{PATH_INFO};
 	my $method = $env->{REQUEST_METHOD};
 
-	if ($method eq 'POST' &&
-		 $path_info =~ m!$INBOX_RE/(git-upload-pack)\z!) {
-		my $path = $2;
-		return invalid_inbox($ctx, $1) || serve_git($ctx, $path);
+	if ($method eq 'POST') {
+		if ($path_info =~ m!$INBOX_RE/(git-upload-pack)\z!) {
+			my $path = $2;
+			return invalid_inbox($ctx, $1) ||
+				serve_git($ctx, $path);
+		} elsif ($path_info =~ m!$INBOX_RE/!o) {
+			return invalid_inbox($ctx, $1) || mbox_results($ctx);
+		}
 	}
 	elsif ($method !~ /\AGET|HEAD\z/) {
 		return r(405, 'Method Not Allowed');
@@ -400,6 +404,16 @@ sub serve_git {
 	PublicInbox::GitHTTPBackend::serve($ctx->{env}, $ctx->{git}, $path);
 }
 
+sub mbox_results {
+	my ($ctx) = @_;
+	if ($ctx->{env}->{QUERY_STRING} =~ /(?:\A|[&;])q=/) {
+		searcher($ctx) or return need_search($ctx);
+		require PublicInbox::SearchView;
+		return PublicInbox::SearchView::mbox_results($ctx);
+	}
+	r404();
+}
+
 sub serve_mbox_range {
 	my ($ctx, $inbox, $range) = @_;
 	invalid_inbox($ctx, $inbox) || eval {
-- 
EW

^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2017-12-08 21:01 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-12-07 20:30 [PATCH] searchview: nofollow on mbox downloads Eric Wong
2017-12-08 21:01 ` [PATCH] search: force large mbox result downloads to POST Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).