From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 2/6] cindex: preserve indexlevel across invocations
Date: Fri, 7 Apr 2023 12:40:49 +0000 [thread overview]
Message-ID: <20230407124053.2233988-3-e@80x24.org> (raw)
In-Reply-To: <20230407124053.2233988-1-e@80x24.org>
This matches the behavior of mail indexers and ensures `medium'
indices don't grow unexpectedly to become `full' indices.
---
lib/PublicInbox/CodeSearchIdx.pm | 15 +++++++++--
lib/PublicInbox/SearchIdx.pm | 2 +-
t/cindex.t | 45 ++++++++++++++++++++++++++++++++
3 files changed, 59 insertions(+), 3 deletions(-)
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 5f20325a..3a3fc03e 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -85,7 +85,6 @@ sub new {
xpfx => "$dir/cidx". PublicInbox::CodeSearch::CIDX_SCHEMA_VER,
cidx_dir => $dir,
creat => 1, # TODO: get rid of this, should be implicit
- indexlevel => $l,
transact_bytes => 0, # for checkpoint
total_bytes => 0, # for lock_release
current_info => '',
@@ -617,16 +616,28 @@ sub cidx_init ($) {
}
$self->lock_acquire;
my @shards;
+ my $l = $self->{indexlevel} //= $self->{-opt}->{indexlevel};
+
for my $n (0..($self->{nshard} - 1)) {
my $shard = bless { %$self, shard => $n }, ref($self);
delete @$shard{qw(lockfh lock_path)};
- $shard->idx_acquire;
+ my $xdb = $shard->idx_acquire;
+ if (!$n) {
+ if (($l // '') eq 'medium') {
+ $xdb->set_metadata('indexlevel', $l);
+ } elsif (($l // '') eq 'full') {
+ $xdb->set_metadata('indexlevel', ''); # unset
+ }
+ $l ||= $xdb->get_metadata('indexlevel') || 'full';
+ }
+ $shard->{indexlevel} = $l;
$shard->idx_release;
$shard->wq_workers_start("cidx shard[$n]", 1, $SIGSET, {
siblings => \@shards, # for ipc_atfork_child
}, \&shard_done_wait, $self);
push @shards, $shard;
}
+ $self->{indexlevel} //= $l;
# this warning needs to happen after idx_acquire
state $once;
warn <<EOM if $PublicInbox::Search::X{CLOEXEC_UNSET} && !$once++;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index f36c8f97..699af432 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -90,7 +90,7 @@ sub new {
$self;
}
-sub need_xapian ($) { $_[0]->{indexlevel} =~ $xapianlevels }
+sub need_xapian ($) { ($_[0]->{indexlevel} // 'full') =~ $xapianlevels }
sub idx_release {
my ($self, $wake) = @_;
diff --git a/t/cindex.t b/t/cindex.t
index 9da0ba69..d40f73ff 100644
--- a/t/cindex.t
+++ b/t/cindex.t
@@ -4,11 +4,13 @@
use v5.12;
use PublicInbox::TestCommon;
use Cwd qw(getcwd abs_path);
+use List::Util qw(sum);
require_mods(qw(json Search::Xapian));
use_ok 'PublicInbox::CodeSearchIdx';
require PublicInbox::Import;
my ($tmp, $for_destroy) = tmpdir();
my $pwd = getcwd();
+my @unused_keys = qw(last_commit has_threadid skip_docdata);
# I reworked CodeSearchIdx->shard_worker to handle empty trees
# in the initial commit generated by cvs2svn for xapian.git
@@ -71,7 +73,48 @@ ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext", $zp, "$tmp/wt0"]),
ok(-e "$tmp/ext/cidx.lock", 'external dir created');
ok(!-d "$zp/.git/public-inbox-cindex", 'no cindex in original coderepo');
+ok(run_script([qw(-cindex -L medium --dangerous -q -d),
+ "$tmp/med", $zp, "$tmp/wt0"]), 'cindex external medium');
+
+my $no_metadata_set = sub {
+ my ($i, $extra, $xdb) = @_;
+ for my $xdb (@$xdb) {
+ for my $k (@unused_keys, @$extra) {
+ is($xdb->get_metadata($k) // '', '',
+ "metadata $k unset in shard #$i");
+ }
+ ++$i;
+ }
+};
+
+{
+ my $mid_size = sum(map { -s $_ } glob("$tmp/med/cidx*/*/*"));
+ my $full_size = sum(map { -s $_ } glob("$tmp/ext/cidx*/*/*"));
+ ok($full_size > $mid_size, 'full size > mid size') or
+ diag "full=$full_size mid=$mid_size";
+ for my $l (qw(med ext)) {
+ ok(run_script([qw(-cindex -q --reindex -u -d), "$tmp/$l"]),
+ "reindex $l");
+ }
+ $mid_size = sum(map { -s $_ } glob("$tmp/med/cidx*/*/*"));
+ $full_size = sum(map { -s $_ } glob("$tmp/ext/cidx*/*/*"));
+ ok($full_size > $mid_size, 'full size > mid size after reindex') or
+ diag "full=$full_size mid=$mid_size";
+ my $csrch = PublicInbox::CodeSearch->new("$tmp/med");
+ my ($xdb0, @xdb) = $csrch->xdb_shards_flat;
+ $no_metadata_set->(0, [], [ $xdb0 ]);
+ is($xdb0->get_metadata('indexlevel'), 'medium',
+ 'indexlevel set in shard #0');
+ $no_metadata_set->(1, ['indexlevel'], \@xdb);
+
+ ok(run_script([qw(-cindex -q -L full --reindex -u -d), "$tmp/med"]),
+ 'reindex medium as full');
+ @xdb = $csrch->xdb_shards_flat;
+ $no_metadata_set->(0, ['indexlevel'], \@xdb);
+}
+
use_ok 'PublicInbox::CodeSearch';
+
if ('multi-repo search') {
my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
my $mset = $csrch->mset('NUL');
@@ -86,6 +129,8 @@ if ('multi-repo search') {
$mset = $csrch->mset('NUL', { git_dir => abs_path("$zp/.git") });
@have = sort(map { $_->get_document->get_data } $mset->items);
is_xdeeply(\@have, $exp, 'got expected subjects w/ GIT_DIR filter');
+ my @xdb = $csrch->xdb_shards_flat;
+ $no_metadata_set->(0, ['indexlevel'], \@xdb);
}
if ('--update') {
next prev parent reply other threads:[~2023-04-07 12:40 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-04-07 12:40 [PATCH 0/6] cindex fixes, and some spring cleaning Eric Wong
2023-04-07 12:40 ` [PATCH 1/6] cindex: improve progress display Eric Wong
2023-04-07 12:40 ` Eric Wong [this message]
2023-04-07 12:40 ` [PATCH 3/6] umask: hoist out of InboxWritable Eric Wong
2023-04-07 12:40 ` [PATCH 4/6] umask: rely on the OnDestroy-based call where applicable Eric Wong
2023-04-07 12:40 ` [PATCH 5/6] searchidx: use vstring to improve readability Eric Wong
2023-04-07 12:40 ` [PATCH 6/6] switch git version comparisons to vstrings, too Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230407124053.2233988-3-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).