From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 697A01FA19 for ; Mon, 10 Aug 2020 02:12:07 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 11/14] convert: support new -index options Date: Mon, 10 Aug 2020 02:12:02 +0000 Message-Id: <20200810021205.18909-12-e@yhbt.net> In-Reply-To: <20200810021205.18909-1-e@yhbt.net> References: <20200810021205.18909-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Converting v1 inboxes to v2 can be a painful experience on HDD. Some of the new options in the CLI or config file make it less painful. --- Documentation/public-inbox-convert.pod | 19 +++++++ lib/PublicInbox/Admin.pm | 36 ++++++++++++ script/public-inbox-convert | 77 +++++++++++++++++++------- script/public-inbox-index | 47 ++-------------- 4 files changed, 117 insertions(+), 62 deletions(-) diff --git a/Documentation/public-inbox-convert.pod b/Documentation/public-inbox-convert.pod index a8a5658c..a7958cf8 100644 --- a/Documentation/public-inbox-convert.pod +++ b/Documentation/public-inbox-convert.pod @@ -33,6 +33,25 @@ at 4 due to various bottlenecks. The number of Xapian shards will be 1 less than the JOBS value, since there is a single process which distributes work to the Xapian shards. +=item -L LEVEL, --index-level=LEVEL + +=item -c, --compact + +=item -v, --verbose + +=item --no-fsync + +=item --sequential-shard + +=item --batch-size=BYTES + +=item --max-size=BYTES + +These options affect indexing. They have no effect if +L</--no-index> is specified + +See L<public-inbox-index(1)> for a description of these options. 
+ =back =head1 ENVIRONMENT diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index 8a9a81c9..ce720beb 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -265,4 +265,40 @@ sub parse_unsigned ($) { 1; } +sub index_prepare ($$) { + my ($opt, $cfg) = @_; + my $env; + if ($opt->{compact}) { + require PublicInbox::Xapcmd; + PublicInbox::Xapcmd::check_compact(); + $opt->{compact_opt} = { -coarse_lock => 1, compact => 1 }; + if (defined(my $jobs = $opt->{jobs})) { + $opt->{compact_opt}->{jobs} = $jobs; + } + } + for my $k (qw(max_size batch_size)) { + my $git_key = "publicInbox.index".ucfirst($k); + $git_key =~ s/_([a-z])/\U$1/g; + defined(my $v = $opt->{$k} // $cfg->{lc($git_key)}) or next; + parse_unsigned(\$v) or die "`$git_key=$v' not parsed\n"; + $v > 0 or die "`$git_key=$v' must be positive\n"; + $opt->{$k} = $v; + } + + # out-of-the-box builds of Xapian 1.4.x are still limited to 32-bit + # https://getting-started-with-xapian.readthedocs.io/en/latest/concepts/indexing/limitations.html + $opt->{batch_size} and + $env = { XAPIAN_FLUSH_THRESHOLD => '4294967295' }; + + for my $k (qw(sequential_shard)) { + my $git_key = "publicInbox.index".ucfirst($k); + $git_key =~ s/_([a-z])/\U$1/g; + defined(my $s = $opt->{$k} // $cfg->{lc($git_key)}) or next; + defined(my $v = $cfg->git_bool($s)) + or die "`$git_key=$s' not boolean\n"; + $opt->{$k} = $v; + } + $env; +} + 1; diff --git a/script/public-inbox-convert b/script/public-inbox-convert index dbb2bd38..ca16b0dc 100755 --- a/script/public-inbox-convert +++ b/script/public-inbox-convert @@ -12,26 +12,57 @@ use PublicInbox::Git; use PublicInbox::Spawn qw(spawn); use Cwd 'abs_path'; use File::Copy 'cp'; # preserves permissions: -my $usage = "Usage: public-inbox-convert OLD NEW\n"; -my $jobs; -my $index = 1; -my %opts = ( - '--jobs|j=i' => \$jobs, - '--index!' 
=> \$index, -); -GetOptions(%opts) or die "bad command-line args\n$usage"; +my $usage = 'Usage: public-inbox-convert [options] OLD NEW'; +my $help = < 1, + # index defaults: + quiet => -1, compact => 0, maxsize => undef, fsync => 1, + reindex => 1, # we always reindex +}; +GetOptions($opt, qw(jobs|j=i index! help|?), + # index options + qw(verbose|v+ rethread compact|c+ fsync|sync! + indexlevel|index-level|L=s max_size|max-size=s + batch_size|batch-size=s + sequential_shard|sequential-shard|seq-shard + )) or die <{help}) { print $help; exit 0 }; my $old_dir = shift(@ARGV) or die $usage; my $new_dir = shift(@ARGV) or die $usage; die "$new_dir exists\n" if -d $new_dir; die "$old_dir not a directory\n" unless -d $old_dir; -my $config = PublicInbox::Config->new; +my $cfg = PublicInbox::Config->new; $old_dir = abs_path($old_dir); my $old; -if ($config) { - $config->each_inbox(sub { - $old = $_[0] if abs_path($_[0]->{inboxdir}) eq $old_dir; - }); -} +$cfg->each_inbox(sub { + $old = $_[0] if abs_path($_[0]->{inboxdir}) eq $old_dir; +}); unless ($old) { warn "W: $old_dir not configured in " . PublicInbox::Config::default_file() . 
"\n"; @@ -48,16 +79,20 @@ if ($old->version >= 2) { } $old->{indexlevel} //= PublicInbox::Admin::detect_indexlevel($old); -if ($index) { +my $env; +if ($opt->{'index'}) { my $mods = {}; PublicInbox::Admin::scan_ibx_modules($mods, $old); PublicInbox::Admin::require_or_die(keys %$mods); + PublicInbox::Admin::progress_prepare($opt); + $env = PublicInbox::Admin::index_prepare($opt, $cfg); } - +local %ENV = (%$env, %ENV) if $env; my $new = { %$old }; $new->{inboxdir} = abs_path($new_dir); $new->{version} = 2; -$new = PublicInbox::InboxWritable->new($new); +$new = PublicInbox::InboxWritable->new($new, { nproc => $opt->{jobs} }); +$new->{-no_fsync} = 1 if !$opt->{fsync}; my $v2w; $old->umask_prepare; @@ -73,7 +108,7 @@ $old->with_umask(sub { local $ENV{GIT_CONFIG} = $old_cfg; my $new_cfg = "$new->{inboxdir}/all.git/config"; $v2w = PublicInbox::V2Writable->new($new, 1); - $v2w->init_inbox($jobs); + $v2w->init_inbox(delete $opt->{jobs}); unlink $new_cfg; link_or_copy($old_cfg, $new_cfg); if (my $alt = $new->{altid}) { @@ -98,7 +133,7 @@ $clone may not be valid after migrating to v2, not copying my $state = ''; my $head = $old->{ref_head} || 'HEAD'; my ($rd, $pid) = $old->git->popen(qw(fast-export --use-done-feature), $head); -$v2w->idx_init; +$v2w->idx_init($opt); my $im = $v2w->importer; my ($r, $w) = $im->gfi_start; my $h = '[0-9a-f]'; @@ -155,10 +190,10 @@ if (my $mm = $old->mm) { # we want to trigger a reindex, not a from scratch index if # we're reusing the msgmap from an existing v1 installation. 
- $v2w->idx_init; + $v2w->idx_init($opt); my $epoch0 = PublicInbox::Git->new($v2w->git_init(0)); chop(my $cmt = $epoch0->qx(qw(rev-parse --verify), $head)); $v2w->last_epoch_commit(0, $cmt); } -$v2w->index_sync({reindex => 1}) if $index; +$v2w->index_sync($opt) if delete $opt->{'index'}; $v2w->done; diff --git a/script/public-inbox-index b/script/public-inbox-index index b1d29ec1..14d3afd4 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -32,7 +32,6 @@ options: BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes) See public-inbox-index(1) man page for full documentation. EOF -my $compact_opt; my $opt = { quiet => -1, compact => 0, max_size => undef, fsync => 1 }; GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune fsync|sync! xapian_only|xapian-only @@ -51,47 +50,11 @@ if ($opt->{xapian_only} && !$opt->{reindex}) { require PublicInbox::Admin; PublicInbox::Admin::require_or_die('-index'); -if ($opt->{compact}) { - require PublicInbox::Xapcmd; - PublicInbox::Xapcmd::check_compact(); - $compact_opt = { -coarse_lock => 1, compact => 1 }; - if (defined(my $jobs = $opt->{jobs})) { - $compact_opt->{jobs} = $jobs; - } -} - my $cfg = PublicInbox::Config->new; # Config is loaded by Admin my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, undef, $cfg); PublicInbox::Admin::require_or_die('-index'); unless (@ibxs) { print STDERR "Usage: $usage\n"; exit 1 } -my $max_size = $opt->{max_size} // $cfg->{lc('publicInbox.indexMaxSize')}; -if (defined $max_size) { - PublicInbox::Admin::parse_unsigned(\$max_size) or - die "`publicInbox.indexMaxSize=$max_size' not parsed\n"; - $opt->{max_size} = $max_size; -} - -my $bs = $opt->{batch_size} // $cfg->{lc('publicInbox.indexBatchSize')}; -if (defined $bs) { - PublicInbox::Admin::parse_unsigned(\$bs) or - die "`publicInbox.indexBatchSize=$bs' not parsed\n"; - $opt->{batch_size} = $bs; -} - -# out-of-the-box builds of Xapian 1.4.x are still limited to 32-bit -# 
https://getting-started-with-xapian.readthedocs.io/en/latest/concepts/indexing/limitations.html -local $ENV{XAPIAN_FLUSH_THRESHOLD} ||= '4294967295' if defined($bs); - -my $s = $opt->{sequential_shard} // - $cfg->{lc('publicInbox.indexSequentialShard')}; -if (defined $s) { - my $v = $cfg->git_bool($s); - defined($v) or - die "`publicInbox.indexSequentialShard=$s' not boolean\n"; - $opt->{sequential_shard} = $v; -} - my $mods = {}; foreach my $ibx (@ibxs) { # XXX: users can shoot themselves in the foot, with opt->{indexlevel} @@ -101,12 +64,14 @@ foreach my $ibx (@ibxs) { } PublicInbox::Admin::require_or_die(keys %$mods); +my $env = PublicInbox::Admin::index_prepare($opt, $cfg); +local %ENV = (%ENV, %$env) if $env; require PublicInbox::InboxWritable; PublicInbox::Admin::progress_prepare($opt); for my $ibx (@ibxs) { $ibx = PublicInbox::InboxWritable->new($ibx); if ($opt->{compact} >= 2) { - PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt); + PublicInbox::Xapcmd::run($ibx, 'compact', $opt->{compact_opt}); } $ibx->{-no_fsync} = 1 if !$opt->{fsync}; @@ -118,8 +83,8 @@ EOL $ibx_opt = { %$opt, sequential_shard => $v }; } PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt); - if ($compact_opt) { - local $compact_opt->{jobs} = 0 if $ibx_opt->{sequential_shard}; - PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt); + if (my $copt = $opt->{compact_opt}) { + local $copt->{jobs} = 0 if $ibx_opt->{sequential_shard}; + PublicInbox::Xapcmd::run($ibx, 'compact', $copt); } }