From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 9F2AB1F66E; Sat, 22 Aug 2020 06:39:04 +0000 (UTC) Date: Sat, 22 Aug 2020 06:39:04 +0000 From: Eric Wong To: meta@public-inbox.org Subject: Re: [PATCH 5/5] mbox: disable "&t" on existing Xapian until full reindex Message-ID: <20200822063904.GA14079@dcvr> References: <20200822060627.15595-1-e@yhbt.net> <20200822060627.15595-6-e@yhbt.net> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: <20200822060627.15595-6-e@yhbt.net> List-Id: Eric Wong wrote: > Expanding threads via over.sqlite3 for mbox.gz downloads without > Xapian effectively collapsing on the THREADID column leads to > repeated messages getting downloaded. > > To avoid that situation, use a "has_threadid" Xapian metadata > flag that's only set on --reindex (and brand new Xapian DBs). > > This allows admins to upgrade WWW or do --reindex in any order; > without worrying about users eating up bandwidth and CPU cycles. > --- > lib/PublicInbox/Mbox.pm | 2 +- > lib/PublicInbox/Search.pm | 10 ++++++++- > lib/PublicInbox/SearchIdx.pm | 16 ++++++++++++-- > lib/PublicInbox/SearchView.pm | 21 ++++++++++++------- > lib/PublicInbox/V2Writable.pm | 11 ++++++++++ > t/init.t | 1 + > t/psgi_search.t | 39 +++++++++++++++++++++++++++++++++-- > 7 files changed, 87 insertions(+), 13 deletions(-) > > diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm > index c9b11c21..0223bead 100644 > --- a/lib/PublicInbox/Mbox.pm > +++ b/lib/PublicInbox/Mbox.pm > @@ -262,7 +262,7 @@ sub mbox_all { > $ctx->{ids} = $srch->mset_to_artnums($mset); > require PublicInbox::MboxGz; > my $fn; > - if ($q->{t}) { > + if ($q->{t} && $srch->has_threadid) { > $fn = 'results-thread-'.$q_string; > PublicInbox::MboxGz::mbox_gz($ctx, \&results_thread_cb, $fn); > } else { > diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm > index bc820b64..01bbe73d 100644 > --- a/lib/PublicInbox/Search.pm > +++ b/lib/PublicInbox/Search.pm > @@ -311,6 +311,12 @@ sub _do_enquire { > retry_reopen($self, \&_enquire_once, [ $self, $query, $opts ]); > } > > +# returns true if all docs have the THREADID value > +sub has_threadid ($) { > + my ($self) = @_; > + (xdb($self)->get_metadata('has_threadid') // '') eq '1'; > +} > + > sub _enquire_once { # retry_reopen callback > my ($self, $query, $opts) = @{$_[0]}; > my $xdb = xdb($self); > @@ -328,7 +334,9 @@ sub _enquire_once { # retry_reopen callback > } > > # `mairix -t / --threads' or JMAP collapseThreads > - $enquire->set_collapse_key(THREADID) if $opts->{thread}; > + if ($opts->{thread} && has_threadid($self)) { > + $enquire->set_collapse_key(THREADID); > + } > > my $offset = $opts->{offset} || 0; > my $limit = $opts->{limit} || 50; > diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm > index baa6f41a..ade55756 100644 > --- a/lib/PublicInbox/SearchIdx.pm > +++ b/lib/PublicInbox/SearchIdx.pm > @@ -131,6 +131,7 @@ sub idx_acquire { > ($is_shard && need_xapian($self)))) { > File::Path::mkpath($dir); > nodatacow_dir($dir); > + $self->{-set_has_threadid_once} = 1; > } > } > return unless defined $flag; > @@ -590,9 +591,17 @@ sub v1_checkpoint ($$;$) { > > $self->{mm}->{dbh}->commit; > if ($newest && need_xapian($self)) { > - my $cur = $self->{xdb}->get_metadata('last_commit'); > + my $xdb = $self->{xdb}; > + my $cur = $xdb->get_metadata('last_commit'); > if (need_update($self, $cur, $newest)) { > - $self->{xdb}->set_metadata('last_commit', $newest); > + $xdb->set_metadata('last_commit', $newest); > + } > + > + # let SearchView know a full --reindex was done so it can > + # generate ->has_threadid-dependent links > + if ($sync->{reindex} && !ref($sync->{reindex})) { > + my $n = $xdb->get_metadata('has_threadid'); > + $xdb->set_metadata('has_threadid', '1') if $n ne '1'; > } > } > > @@ -816,6 +825,9 @@ sub set_metadata_once { > return if $self->{shard}; # only continue if undef or 0, not >0 > my $xdb = $self->{xdb}; > > + if (delete($self->{-set_has_threadid_once})) { > + $xdb->set_metadata('has_threadid', '1'); > + } > if (delete($self->{-set_indexlevel_once})) { > my $level = $xdb->get_metadata('indexlevel'); > if (!$level || $level ne 'medium') { > diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm > index dd69564a..76428dfb 100644 > --- a/lib/PublicInbox/SearchView.pm > +++ b/lib/PublicInbox/SearchView.pm > @@ -188,13 +188,20 @@ sub search_nav_top { > $rv .= qq{summary|nested}; > } > my $A = $q->qs_html(x => 'A', r => undef); > - $rv .= qq{|Atom feed]} . > - qq{\n\t\t\tdownload mbox.gz: } . > - # we set name=z w/o using it since it seems required for > - # lynx (but works fine for w3m). > - qq{|} . > - qq{} . > - qq{
};
> +	$rv .= qq{|Atom feed]};
> +	if ($ctx->{-inbox}->search->has_threadid) {
> +		$rv .= qq{\n\t\t\tdownload mbox.gz: } .
> +			# we set name=z w/o using it since it seems required for
> +			# lynx (but works fine for w3m).
> +			qq{ +				q{value="results only"/>} .
> +			qq{| +				q{value="full threads"/>};
> +	} else { # BOFH needs to --reindex
> +		$rv .= qq{\n\t\t\t\t\t\tdownload: } .
> +			qq{}
> +	}
> +	$rv .= qq{
};
>  }
>  
>  sub search_nav_bot {
> diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
> index 0a91a132..a32fdcf3 100644
> --- a/lib/PublicInbox/V2Writable.pm
> +++ b/lib/PublicInbox/V2Writable.pm
> @@ -1337,6 +1337,17 @@ sub index_sync {
>  		xapian_only($self, $opt, $sync, $art_beg);
>  	}
>  
> +	# --reindex on the command-line
> +	if ($opt->{reindex} && !ref($opt->{reindex}) && $idxlevel ne 'basic') {
> +		local $self->{parallel} = 0;
> +		my $s0 = PublicInbox::SearchIdx->new($self->{ibx}, 0, 0);
> +		if (my $xdb = $s0->idx_acquire) {
> +			my $n = $xdb->get_metadata('has_threadid');
> +			$xdb->set_metadata('has_threadid', 1) if $n ne '1';
> +		}
> +		$s0->idx_release;
> +	}

Assigning {parallel} there is useless; I originally used
SearchIdxShard instead of SearchIdx but avoided the *Shard
bit to avoid an extra idx_acquire.

Anyways, we actually need to flock the inbox.lock there to
avoid contending with -index/-watch/-mda etc.

So I'll squash this in:

diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index a32fdcf3..b0148dba 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -1339,13 +1339,14 @@ sub index_sync {
 
 	# --reindex on the command-line
 	if ($opt->{reindex} && !ref($opt->{reindex}) && $idxlevel ne 'basic') {
-		local $self->{parallel} = 0;
+		$self->lock_acquire;
 		my $s0 = PublicInbox::SearchIdx->new($self->{ibx}, 0, 0);
 		if (my $xdb = $s0->idx_acquire) {
 			my $n = $xdb->get_metadata('has_threadid');
 			$xdb->set_metadata('has_threadid', 1) if $n ne '1';
 		}
 		$s0->idx_release;
+		$self->lock_release;
 	}
 
 	# reindex does not pick up new changes, so we rerun w/o it: