From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id B69471FA12 for ; Sat, 22 Aug 2020 06:06:28 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 5/5] mbox: disable "&t" on existing Xapian until full reindex Date: Sat, 22 Aug 2020 06:06:27 +0000 Message-Id: <20200822060627.15595-6-e@yhbt.net> In-Reply-To: <20200822060627.15595-1-e@yhbt.net> References: <20200822060627.15595-1-e@yhbt.net> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit List-Id: Expanding threads via over.sqlite3 for mbox.gz downloads without Xapian effectively collapsing on the THREADID column leads to repeated messages getting downloaded. To avoid that situation, use a "has_threadid" Xapian metadata flag that's only set on --reindex (and brand new Xapian DBs). This allows admins to upgrade WWW or do --reindex in any order; without worrying about users eating up bandwidth and CPU cycles. --- lib/PublicInbox/Mbox.pm | 2 +- lib/PublicInbox/Search.pm | 10 ++++++++- lib/PublicInbox/SearchIdx.pm | 16 ++++++++++++-- lib/PublicInbox/SearchView.pm | 21 ++++++++++++------- lib/PublicInbox/V2Writable.pm | 11 ++++++++++ t/init.t | 1 + t/psgi_search.t | 39 +++++++++++++++++++++++++++++++++-- 7 files changed, 87 insertions(+), 13 deletions(-) diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index c9b11c21..0223bead 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -262,7 +262,7 @@ sub mbox_all { $ctx->{ids} = $srch->mset_to_artnums($mset); require PublicInbox::MboxGz; my $fn; - if ($q->{t}) { + if ($q->{t} && $srch->has_threadid) { $fn = 'results-thread-'.$q_string; PublicInbox::MboxGz::mbox_gz($ctx, \&results_thread_cb, $fn); } else { diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index bc820b64..01bbe73d 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -311,6 +311,12 @@ sub _do_enquire { retry_reopen($self, \&_enquire_once, [ $self, $query, $opts ]); } +# returns true if all docs have the THREADID value +sub has_threadid ($) { + my ($self) = @_; + (xdb($self)->get_metadata('has_threadid') // '') eq '1'; +} + sub _enquire_once { # retry_reopen callback my ($self, $query, $opts) = @{$_[0]}; my $xdb = xdb($self); @@ -328,7 +334,9 @@ sub _enquire_once { # retry_reopen callback } # `mairix -t / --threads' or JMAP collapseThreads - $enquire->set_collapse_key(THREADID) if $opts->{thread}; + if ($opts->{thread} && has_threadid($self)) { + $enquire->set_collapse_key(THREADID); + } my $offset = $opts->{offset} || 0; my $limit = $opts->{limit} || 50; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index baa6f41a..ade55756 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -131,6 +131,7 @@ sub idx_acquire { ($is_shard && need_xapian($self)))) { File::Path::mkpath($dir); nodatacow_dir($dir); + $self->{-set_has_threadid_once} = 1; } } return unless defined $flag; @@ -590,9 +591,17 @@ sub v1_checkpoint ($$;$) { $self->{mm}->{dbh}->commit; if ($newest && need_xapian($self)) { - my $cur = $self->{xdb}->get_metadata('last_commit'); + my $xdb = $self->{xdb}; + my $cur = $xdb->get_metadata('last_commit'); if (need_update($self, $cur, $newest)) { - $self->{xdb}->set_metadata('last_commit', $newest); + $xdb->set_metadata('last_commit', $newest); + } + + # let SearchView know a full --reindex was done so it can + # generate ->has_threadid-dependent links + if ($sync->{reindex} && !ref($sync->{reindex})) { + my $n = $xdb->get_metadata('has_threadid'); + $xdb->set_metadata('has_threadid', '1') if $n ne '1'; } } @@ -816,6 +825,9 @@ sub set_metadata_once { return if $self->{shard}; # only continue if undef or 0, not >0 my $xdb = $self->{xdb}; + if (delete($self->{-set_has_threadid_once})) { + $xdb->set_metadata('has_threadid', '1'); + } if (delete($self->{-set_indexlevel_once})) { my $level = $xdb->get_metadata('indexlevel'); if (!$level || $level ne 'medium') { diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index dd69564a..76428dfb 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -188,13 +188,20 @@ sub search_nav_top { $rv .= qq{summary|nested}; } my $A = $q->qs_html(x => 'A', r => undef); - $rv .= qq{|Atom feed]} . - qq{\n\t\t\tdownload mbox.gz: } . - # we set name=z w/o using it since it seems required for - # lynx (but works fine for w3m). - qq{|} . - qq{} . - qq{
};
+	$rv .= qq{|Atom feed]};
+	if ($ctx->{-inbox}->search->has_threadid) {
+		$rv .= qq{\n\t\t\tdownload mbox.gz: } .
+			# we set name=z w/o using it since it seems required for
+			# lynx (but works fine for w3m).
+			qq{} .
+			qq{|};
+	} else { # BOFH needs to --reindex
+		$rv .= qq{\n\t\t\t\t\t\tdownload: } .
+			qq{}
+	}
+	$rv .= qq{
};
 }
 
 sub search_nav_bot {
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 0a91a132..a32fdcf3 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -1337,6 +1337,17 @@ sub index_sync {
 		xapian_only($self, $opt, $sync, $art_beg);
 	}
 
+	# --reindex on the command-line
+	if ($opt->{reindex} && !ref($opt->{reindex}) && $idxlevel ne 'basic') {
+		local $self->{parallel} = 0;
+		my $s0 = PublicInbox::SearchIdx->new($self->{ibx}, 0, 0);
+		if (my $xdb = $s0->idx_acquire) {
+			my $n = $xdb->get_metadata('has_threadid');
+			$xdb->set_metadata('has_threadid', 1) if $n ne '1';
+		}
+		$s0->idx_release;
+	}
+
 	# reindex does not pick up new changes, so we rerun w/o it:
 	if ($opt->{reindex}) {
 		my %again = %$opt;
diff --git a/t/init.t b/t/init.t
index dad09435..a5a9debc 100644
--- a/t/init.t
+++ b/t/init.t
@@ -108,6 +108,7 @@ SKIP: {
 		is(PublicInbox::Admin::detect_indexlevel($ibx), 'full',
 			"detected default indexlevel -V$v");
 		ok($ibx->{-skip_docdata}, "docdata skip set -V$v");
+		ok($ibx->search->has_threadid, 'has_threadid flag set on new inbox');
 	}
 
 	# loop for idempotency
diff --git a/t/psgi_search.t b/t/psgi_search.t
index 5d537363..c1677eb3 100644
--- a/t/psgi_search.t
+++ b/t/psgi_search.t
@@ -3,6 +3,7 @@
 use strict;
 use warnings;
 use Test::More;
+use IO::Uncompress::Gunzip qw(gunzip);
 use PublicInbox::Eml;
 use PublicInbox::Config;
 use PublicInbox::Inbox;
@@ -39,6 +40,12 @@ To: git\@vger.kernel.org
 EOF
 $im->add($mime);
 
+$im->add(PublicInbox::Eml->new(<<""));
+Message-ID: 
+From: replier 
+In-Reply-To: <$mid>
+Subject: mismatch
+
 $mime = PublicInbox::Eml->new(<<'EOF');
 Subject:
 Message-ID: 
@@ -79,6 +86,9 @@ test_psgi(sub { $www->call(@_) }, sub {
 	ok(index($html, 'by Ævar Arnfjörð Bjarmason') >= 0,
 		"displayed Ævar's name properly in HTML");
 
+	like($html, qr/download mbox\.gz: .*?"full threads"/s,
+		'"full threads" download option shown');
+
 	my $warn = [];
 	local $SIG{__WARN__} = sub { push @$warn, @_ };
 	$res = $cb->(GET('/test/?q=s:test&l=5e'));
@@ -118,8 +128,33 @@ test_psgi(sub { $www->call(@_) }, sub {
 	$res = $cb->(GET('/test/no-subject-at-all@example.com/t.mbox.gz'));
 	like($res->header('Content-Disposition'),
 		qr/filename=no-subject\.mbox\.gz/);
+
+	# "full threads" mbox.gz download
+	$res = $cb->(POST('/test/?q=s:test&x=m&t'));
+	is($res->code, 200, 'successful mbox download with threads');
+	gunzip(\($res->content) => \(my $before));
+	is_deeply([ "Message-ID: <$mid>\n", "Message-ID: \n" ],
+		[ grep(/^Message-ID:/m, split(/^/m, $before)) ],
+		'got full thread');
+
+	# clobber has_threadid to emulate old versions:
+	{
+		my $sidx = PublicInbox::SearchIdx->new($ibx, 0);
+		my $xdb = $sidx->idx_acquire;
+		$xdb->set_metadata('has_threadid', '0');
+		$sidx->idx_release;
+	}
+	$config->each_inbox(sub { delete $_[0]->{search} });
+	$res = $cb->(GET('/test/?q=s:test'));
+	is($res->code, 200, 'successful search w/o has_threadid');
+	unlike($html, qr/download mbox\.gz: .*?"full threads"/s,
+		'"full threads" download option not shown w/o has_threadid');
+
+	# in case somebody uses curl to bypass 
+ $res = $cb->(POST('/test/?q=s:test&x=m&t')); + is($res->code, 200, 'successful mbox download w/ threads'); + gunzip(\($res->content) => \(my $after)); + isnt($before, $after); }); done_testing(); - -1;