From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 5/5] mbox: disable "&t" on existing Xapian until full reindex
Date: Sat, 22 Aug 2020 06:06:27 +0000 [thread overview]
Message-ID: <20200822060627.15595-6-e@yhbt.net> (raw)
In-Reply-To: <20200822060627.15595-1-e@yhbt.net>
Expanding threads via over.sqlite3 for mbox.gz downloads without
Xapian effectively collapsing on the THREADID column leads to
repeated messages getting downloaded.
To avoid that situation, use a "has_threadid" Xapian metadata
flag that's only set on --reindex (and brand new Xapian DBs).
This allows admins to upgrade WWW or do --reindex in any order;
without worrying about users eating up bandwidth and CPU cycles.
---
lib/PublicInbox/Mbox.pm | 2 +-
lib/PublicInbox/Search.pm | 10 ++++++++-
lib/PublicInbox/SearchIdx.pm | 16 ++++++++++++--
lib/PublicInbox/SearchView.pm | 21 ++++++++++++-------
lib/PublicInbox/V2Writable.pm | 11 ++++++++++
t/init.t | 1 +
t/psgi_search.t | 39 +++++++++++++++++++++++++++++++++--
7 files changed, 87 insertions(+), 13 deletions(-)
diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index c9b11c21..0223bead 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -262,7 +262,7 @@ sub mbox_all {
$ctx->{ids} = $srch->mset_to_artnums($mset);
require PublicInbox::MboxGz;
my $fn;
- if ($q->{t}) {
+ if ($q->{t} && $srch->has_threadid) {
$fn = 'results-thread-'.$q_string;
PublicInbox::MboxGz::mbox_gz($ctx, \&results_thread_cb, $fn);
} else {
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index bc820b64..01bbe73d 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -311,6 +311,12 @@ sub _do_enquire {
retry_reopen($self, \&_enquire_once, [ $self, $query, $opts ]);
}
+# returns true if all docs have the THREADID value
+sub has_threadid ($) {
+ my ($self) = @_;
+ (xdb($self)->get_metadata('has_threadid') // '') eq '1';
+}
+
sub _enquire_once { # retry_reopen callback
my ($self, $query, $opts) = @{$_[0]};
my $xdb = xdb($self);
@@ -328,7 +334,9 @@ sub _enquire_once { # retry_reopen callback
}
# `mairix -t / --threads' or JMAP collapseThreads
- $enquire->set_collapse_key(THREADID) if $opts->{thread};
+ if ($opts->{thread} && has_threadid($self)) {
+ $enquire->set_collapse_key(THREADID);
+ }
my $offset = $opts->{offset} || 0;
my $limit = $opts->{limit} || 50;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index baa6f41a..ade55756 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -131,6 +131,7 @@ sub idx_acquire {
($is_shard && need_xapian($self)))) {
File::Path::mkpath($dir);
nodatacow_dir($dir);
+ $self->{-set_has_threadid_once} = 1;
}
}
return unless defined $flag;
@@ -590,9 +591,17 @@ sub v1_checkpoint ($$;$) {
$self->{mm}->{dbh}->commit;
if ($newest && need_xapian($self)) {
- my $cur = $self->{xdb}->get_metadata('last_commit');
+ my $xdb = $self->{xdb};
+ my $cur = $xdb->get_metadata('last_commit');
if (need_update($self, $cur, $newest)) {
- $self->{xdb}->set_metadata('last_commit', $newest);
+ $xdb->set_metadata('last_commit', $newest);
+ }
+
+ # let SearchView know a full --reindex was done so it can
+ # generate ->has_threadid-dependent links
+ if ($sync->{reindex} && !ref($sync->{reindex})) {
+ my $n = $xdb->get_metadata('has_threadid');
+ $xdb->set_metadata('has_threadid', '1') if $n ne '1';
}
}
@@ -816,6 +825,9 @@ sub set_metadata_once {
return if $self->{shard}; # only continue if undef or 0, not >0
my $xdb = $self->{xdb};
+ if (delete($self->{-set_has_threadid_once})) {
+ $xdb->set_metadata('has_threadid', '1');
+ }
if (delete($self->{-set_indexlevel_once})) {
my $level = $xdb->get_metadata('indexlevel');
if (!$level || $level ne 'medium') {
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index dd69564a..76428dfb 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -188,13 +188,20 @@ sub search_nav_top {
$rv .= qq{<a\nhref="?$s">summary</a>|<b>nested</b>};
}
my $A = $q->qs_html(x => 'A', r => undef);
- $rv .= qq{|<a\nhref="?$A">Atom feed</a>]} .
- qq{\n\t\t\tdownload mbox.gz: } .
- # we set name=z w/o using it since it seems required for
- # lynx (but works fine for w3m).
- qq{<input\ntype=submit\nname=z\nvalue="results only"/>|} .
- qq{<input\ntype=submit\nname=x\nvalue="full threads"/>} .
- qq{</pre></form><pre>};
+ $rv .= qq{|<a\nhref="?$A">Atom feed</a>]};
+ if ($ctx->{-inbox}->search->has_threadid) {
+ $rv .= qq{\n\t\t\tdownload mbox.gz: } .
+ # we set name=z w/o using it since it seems required for
+ # lynx (but works fine for w3m).
+ qq{<input\ntype=submit\nname=z\n} .
+ q{value="results only"/>} .
+ qq{|<input\ntype=submit\nname=x\n} .
+ q{value="full threads"/>};
+ } else { # BOFH needs to --reindex
+ $rv .= qq{\n\t\t\t\t\t\tdownload: } .
+ qq{<input\ntype=submit\nname=z\nvalue="mbox.gz"/>}
+ }
+ $rv .= qq{</pre></form><pre>};
}
sub search_nav_bot {
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 0a91a132..a32fdcf3 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -1337,6 +1337,17 @@ sub index_sync {
xapian_only($self, $opt, $sync, $art_beg);
}
+ # --reindex on the command-line
+ if ($opt->{reindex} && !ref($opt->{reindex}) && $idxlevel ne 'basic') {
+ local $self->{parallel} = 0;
+ my $s0 = PublicInbox::SearchIdx->new($self->{ibx}, 0, 0);
+ if (my $xdb = $s0->idx_acquire) {
+ my $n = $xdb->get_metadata('has_threadid');
+ $xdb->set_metadata('has_threadid', 1) if $n ne '1';
+ }
+ $s0->idx_release;
+ }
+
# reindex does not pick up new changes, so we rerun w/o it:
if ($opt->{reindex}) {
my %again = %$opt;
diff --git a/t/init.t b/t/init.t
index dad09435..a5a9debc 100644
--- a/t/init.t
+++ b/t/init.t
@@ -108,6 +108,7 @@ SKIP: {
is(PublicInbox::Admin::detect_indexlevel($ibx), 'full',
"detected default indexlevel -V$v");
ok($ibx->{-skip_docdata}, "docdata skip set -V$v");
+ ok($ibx->search->has_threadid, 'has_threadid flag set on new inbox');
}
# loop for idempotency
diff --git a/t/psgi_search.t b/t/psgi_search.t
index 5d537363..c1677eb3 100644
--- a/t/psgi_search.t
+++ b/t/psgi_search.t
@@ -3,6 +3,7 @@
use strict;
use warnings;
use Test::More;
+use IO::Uncompress::Gunzip qw(gunzip);
use PublicInbox::Eml;
use PublicInbox::Config;
use PublicInbox::Inbox;
@@ -39,6 +40,12 @@ To: git\@vger.kernel.org
EOF
$im->add($mime);
+$im->add(PublicInbox::Eml->new(<<""));
+Message-ID: <reply\@asdf>
+From: replier <r\@example.com>
+In-Reply-To: <$mid>
+Subject: mismatch
+
$mime = PublicInbox::Eml->new(<<'EOF');
Subject:
Message-ID: <blank-subject@example.com>
@@ -79,6 +86,9 @@ test_psgi(sub { $www->call(@_) }, sub {
ok(index($html, 'by Ævar Arnfjörð Bjarmason') >= 0,
"displayed Ævar's name properly in HTML");
+ like($html, qr/download mbox\.gz: .*?"full threads"/s,
+ '"full threads" download option shown');
+
my $warn = [];
local $SIG{__WARN__} = sub { push @$warn, @_ };
$res = $cb->(GET('/test/?q=s:test&l=5e'));
@@ -118,8 +128,33 @@ test_psgi(sub { $www->call(@_) }, sub {
$res = $cb->(GET('/test/no-subject-at-all@example.com/t.mbox.gz'));
like($res->header('Content-Disposition'),
qr/filename=no-subject\.mbox\.gz/);
+
+ # "full threads" mbox.gz download
+ $res = $cb->(POST('/test/?q=s:test&x=m&t'));
+ is($res->code, 200, 'successful mbox download with threads');
+ gunzip(\($res->content) => \(my $before));
+ is_deeply([ "Message-ID: <$mid>\n", "Message-ID: <reply\@asdf>\n" ],
+ [ grep(/^Message-ID:/m, split(/^/m, $before)) ],
+ 'got full thread');
+
+ # clobber has_threadid to emulate old versions:
+ {
+ my $sidx = PublicInbox::SearchIdx->new($ibx, 0);
+ my $xdb = $sidx->idx_acquire;
+ $xdb->set_metadata('has_threadid', '0');
+ $sidx->idx_release;
+ }
+ $config->each_inbox(sub { delete $_[0]->{search} });
+ $res = $cb->(GET('/test/?q=s:test'));
+ is($res->code, 200, 'successful search w/o has_threadid');
+ unlike($html, qr/download mbox\.gz: .*?"full threads"/s,
+ '"full threads" download option not shown w/o has_threadid');
+
+ # in case somebody uses curl to bypass <form>
+ $res = $cb->(POST('/test/?q=s:test&x=m&t'));
+ is($res->code, 200, 'successful mbox download w/ threads');
+ gunzip(\($res->content) => \(my $after));
+ isnt($before, $after);
});
done_testing();
-
-1;
next prev parent reply other threads:[~2020-08-22 6:06 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-08-22 6:06 [PATCH 0/5] "mairix -t" workalike for mbox.gz downloads Eric Wong
2020-08-22 6:06 ` [PATCH 1/5] searchidxshard: clear $msgref buffer properly Eric Wong
2020-08-22 6:06 ` [PATCH 2/5] searchidx: put all shard-related stuff in SearchIdxShard.pm Eric Wong
2020-08-22 6:06 ` [PATCH 3/5] searchidx: index THREADID in Xapian Eric Wong
2020-08-22 6:06 ` [PATCH 4/5] search: support downloading mboxes results with full thread Eric Wong
2020-08-22 6:06 ` Eric Wong [this message]
2020-08-22 6:39 ` [PATCH 5/5] mbox: disable "&t" on existing Xapian until full reindex Eric Wong
2020-08-22 6:42 ` [PATCH 0/5] "mairix -t" workalike for mbox.gz downloads Eric Wong
2020-08-22 20:12 ` Kyle Meyer
2020-08-22 20:30 ` Eric Wong
2020-08-22 21:04 ` Kyle Meyer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200822060627.15595-6-e@yhbt.net \
--to=e@yhbt.net \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).