user/dev discussion of public-inbox itself
 help / color / mirror / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: Re: [PATCH 5/5] mbox: disable "&t" on existing Xapian until full reindex
Date: Sat, 22 Aug 2020 06:39:04 +0000
Message-ID: <20200822063904.GA14079@dcvr> (raw)
In-Reply-To: <20200822060627.15595-6-e@yhbt.net>

Eric Wong <e@yhbt.net> wrote:
> Expanding threads via over.sqlite3 for mbox.gz downloads without
> Xapian effectively collapsing on the THREADID column leads to
> repeated messages getting downloaded.
> 
> To avoid that situation, use a "has_threadid" Xapian metadata
> flag that's only set on --reindex (and brand new Xapian DBs).
> 
> This allows admins to upgrade WWW or do --reindex in any order;
> without worrying about users eating up bandwidth and CPU cycles.
> ---
>  lib/PublicInbox/Mbox.pm       |  2 +-
>  lib/PublicInbox/Search.pm     | 10 ++++++++-
>  lib/PublicInbox/SearchIdx.pm  | 16 ++++++++++++--
>  lib/PublicInbox/SearchView.pm | 21 ++++++++++++-------
>  lib/PublicInbox/V2Writable.pm | 11 ++++++++++
>  t/init.t                      |  1 +
>  t/psgi_search.t               | 39 +++++++++++++++++++++++++++++++++--
>  7 files changed, 87 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
> index c9b11c21..0223bead 100644
> --- a/lib/PublicInbox/Mbox.pm
> +++ b/lib/PublicInbox/Mbox.pm
> @@ -262,7 +262,7 @@ sub mbox_all {
>  	$ctx->{ids} = $srch->mset_to_artnums($mset);
>  	require PublicInbox::MboxGz;
>  	my $fn;
> -	if ($q->{t}) {
> +	if ($q->{t} && $srch->has_threadid) {
>  		$fn = 'results-thread-'.$q_string;
>  		PublicInbox::MboxGz::mbox_gz($ctx, \&results_thread_cb, $fn);
>  	} else {
> diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
> index bc820b64..01bbe73d 100644
> --- a/lib/PublicInbox/Search.pm
> +++ b/lib/PublicInbox/Search.pm
> @@ -311,6 +311,12 @@ sub _do_enquire {
>  	retry_reopen($self, \&_enquire_once, [ $self, $query, $opts ]);
>  }
>  
> +# returns true if all docs have the THREADID value
> +sub has_threadid ($) {
> +	my ($self) = @_;
> +	(xdb($self)->get_metadata('has_threadid') // '') eq '1';
> +}
> +
>  sub _enquire_once { # retry_reopen callback
>  	my ($self, $query, $opts) = @{$_[0]};
>  	my $xdb = xdb($self);
> @@ -328,7 +334,9 @@ sub _enquire_once { # retry_reopen callback
>  	}
>  
>  	# `mairix -t / --threads' or JMAP collapseThreads
> -	$enquire->set_collapse_key(THREADID) if $opts->{thread};
> +	if ($opts->{thread} && has_threadid($self)) {
> +		$enquire->set_collapse_key(THREADID);
> +	}
>  
>  	my $offset = $opts->{offset} || 0;
>  	my $limit = $opts->{limit} || 50;
> diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
> index baa6f41a..ade55756 100644
> --- a/lib/PublicInbox/SearchIdx.pm
> +++ b/lib/PublicInbox/SearchIdx.pm
> @@ -131,6 +131,7 @@ sub idx_acquire {
>  				($is_shard && need_xapian($self)))) {
>  			File::Path::mkpath($dir);
>  			nodatacow_dir($dir);
> +			$self->{-set_has_threadid_once} = 1;
>  		}
>  	}
>  	return unless defined $flag;
> @@ -590,9 +591,17 @@ sub v1_checkpoint ($$;$) {
>  
>  	$self->{mm}->{dbh}->commit;
>  	if ($newest && need_xapian($self)) {
> -		my $cur = $self->{xdb}->get_metadata('last_commit');
> +		my $xdb = $self->{xdb};
> +		my $cur = $xdb->get_metadata('last_commit');
>  		if (need_update($self, $cur, $newest)) {
> -			$self->{xdb}->set_metadata('last_commit', $newest);
> +			$xdb->set_metadata('last_commit', $newest);
> +		}
> +
> +		# let SearchView know a full --reindex was done so it can
> +		# generate ->has_threadid-dependent links
> +		if ($sync->{reindex} && !ref($sync->{reindex})) {
> +			my $n = $xdb->get_metadata('has_threadid');
> +			$xdb->set_metadata('has_threadid', '1') if $n ne '1';
>  		}
>  	}
>  
> @@ -816,6 +825,9 @@ sub set_metadata_once {
>  	return if $self->{shard}; # only continue if undef or 0, not >0
>  	my $xdb = $self->{xdb};
>  
> +	if (delete($self->{-set_has_threadid_once})) {
> +		$xdb->set_metadata('has_threadid', '1');
> +	}
>  	if (delete($self->{-set_indexlevel_once})) {
>  		my $level = $xdb->get_metadata('indexlevel');
>  		if (!$level || $level ne 'medium') {
> diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
> index dd69564a..76428dfb 100644
> --- a/lib/PublicInbox/SearchView.pm
> +++ b/lib/PublicInbox/SearchView.pm
> @@ -188,13 +188,20 @@ sub search_nav_top {
>  		$rv .= qq{<a\nhref="?$s">summary</a>|<b>nested</b>};
>  	}
>  	my $A = $q->qs_html(x => 'A', r => undef);
> -	$rv .= qq{|<a\nhref="?$A">Atom feed</a>]} .
> -		qq{\n\t\t\tdownload mbox.gz: } .
> -		# we set name=z w/o using it since it seems required for
> -		# lynx (but works fine for w3m).
> -		qq{<input\ntype=submit\nname=z\nvalue="results only"/>|} .
> -		qq{<input\ntype=submit\nname=x\nvalue="full threads"/>} .
> -		qq{</pre></form><pre>};
> +	$rv .= qq{|<a\nhref="?$A">Atom feed</a>]};
> +	if ($ctx->{-inbox}->search->has_threadid) {
> +		$rv .= qq{\n\t\t\tdownload mbox.gz: } .
> +			# we set name=z w/o using it since it seems required for
> +			# lynx (but works fine for w3m).
> +			qq{<input\ntype=submit\nname=z\n} .
> +				q{value="results only"/>} .
> +			qq{|<input\ntype=submit\nname=x\n} .
> +				q{value="full threads"/>};
> +	} else { # BOFH needs to --reindex
> +		$rv .= qq{\n\t\t\t\t\t\tdownload: } .
> +			qq{<input\ntype=submit\nname=z\nvalue="mbox.gz"/>}
> +	}
> +	$rv .= qq{</pre></form><pre>};
>  }
>  
>  sub search_nav_bot {
> diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
> index 0a91a132..a32fdcf3 100644
> --- a/lib/PublicInbox/V2Writable.pm
> +++ b/lib/PublicInbox/V2Writable.pm
> @@ -1337,6 +1337,17 @@ sub index_sync {
>  		xapian_only($self, $opt, $sync, $art_beg);
>  	}
>  
> +	# --reindex on the command-line
> +	if ($opt->{reindex} && !ref($opt->{reindex}) && $idxlevel ne 'basic') {
> +		local $self->{parallel} = 0;
> +		my $s0 = PublicInbox::SearchIdx->new($self->{ibx}, 0, 0);
> +		if (my $xdb = $s0->idx_acquire) {
> +			my $n = $xdb->get_metadata('has_threadid');
> +			$xdb->set_metadata('has_threadid', 1) if $n ne '1';
> +		}
> +		$s0->idx_release;
> +	}

Assigning {parallel} there is useless; I originally used
SearchIdxShard instead of SearchIdx but avoided the *Shard
bit to avoid an extra idx_acquire.

Anyways, we actually need to flock the inbox.lock there to
avoid contending with -index/-watch/-mda etc.

So I'll squash this in:

diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index a32fdcf3..b0148dba 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -1339,13 +1339,14 @@ sub index_sync {
 
 	# --reindex on the command-line
 	if ($opt->{reindex} && !ref($opt->{reindex}) && $idxlevel ne 'basic') {
-		local $self->{parallel} = 0;
+		$self->lock_acquire;
 		my $s0 = PublicInbox::SearchIdx->new($self->{ibx}, 0, 0);
 		if (my $xdb = $s0->idx_acquire) {
 			my $n = $xdb->get_metadata('has_threadid');
 			$xdb->set_metadata('has_threadid', 1) if $n ne '1';
 		}
 		$s0->idx_release;
+		$self->lock_release;
 	}
 
 	# reindex does not pick up new changes, so we rerun w/o it:

  reply	other threads:[~2020-08-22  6:39 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-08-22  6:06 [PATCH 0/5] "mairix -t" workalike for mbox.gz downloads Eric Wong
2020-08-22  6:06 ` [PATCH 1/5] searchidxshard: clear $msgref buffer properly Eric Wong
2020-08-22  6:06 ` [PATCH 2/5] searchidx: put all shard-related stuff in SearchIdxShard.pm Eric Wong
2020-08-22  6:06 ` [PATCH 3/5] searchidx: index THREADID in Xapian Eric Wong
2020-08-22  6:06 ` [PATCH 4/5] search: support downloading mboxes results with full thread Eric Wong
2020-08-22  6:06 ` [PATCH 5/5] mbox: disable "&t" on existing Xapian until full reindex Eric Wong
2020-08-22  6:39   ` Eric Wong [this message]
2020-08-22  6:42 ` [PATCH 0/5] "mairix -t" workalike for mbox.gz downloads Eric Wong
2020-08-22 20:12   ` Kyle Meyer
2020-08-22 20:30     ` Eric Wong
2020-08-22 21:04       ` Kyle Meyer

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200822063904.GA14079@dcvr \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V1 meta meta/ https://public-inbox.org/meta \
		meta@public-inbox.org
	public-inbox-index meta

Example config snippet for mirrors.
Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.io/gmane.mail.public-inbox.general
 note: .onion URLs require Tor: https://www.torproject.org/

code repositories for the project(s) associated with this inbox:

	https://80x24.org/public-inbox.git

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git