diff options
-rw-r--r-- | Documentation/public-inbox-config.pod | 6 | ||||
-rw-r--r-- | Documentation/public-inbox-index.pod | 53 | ||||
-rw-r--r-- | Documentation/public-inbox-v2-format.pod | 11 | ||||
-rw-r--r-- | lib/PublicInbox/Config.pm | 9 | ||||
-rw-r--r-- | lib/PublicInbox/V2Writable.pm | 55 | ||||
-rw-r--r-- | lib/PublicInbox/WatchMaildir.pm | 2 | ||||
-rwxr-xr-x | script/public-inbox-index | 22 | ||||
-rw-r--r-- | t/config.t | 6 | ||||
-rw-r--r-- | t/v2mirror.t | 14 |
9 files changed, 161 insertions, 17 deletions
diff --git a/Documentation/public-inbox-config.pod b/Documentation/public-inbox-config.pod index e6108c35..05b84819 100644 --- a/Documentation/public-inbox-config.pod +++ b/Documentation/public-inbox-config.pod @@ -139,6 +139,10 @@ allow for searching for phrases using quoted text. Default: C<full> +=item publicinbox.<name>.indexSequentialShard + +See L<public-inbox-index(1)/publicInbox.indexSequentialShard> + =item publicinbox.<name>.httpbackendmax If a digit, the maximum number of parallel @@ -291,6 +295,8 @@ or /usr/share/cgit/ See L<public-inbox-edit(1)> =item publicinbox.indexMaxSize +=item publicinbox.indexBatchSize +=item publicinbox.indexSequentialShard See L<public-inbox-index(1)> diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod index aeb1b3a3..f525ba54 100644 --- a/Documentation/public-inbox-index.pod +++ b/Documentation/public-inbox-index.pod @@ -34,12 +34,16 @@ normal search functionality. =item --jobs=JOBS, -j -Control the number of Xapian indexing jobs in a +Influences the number of Xapian indexing shards in a (L<public-inbox-v2-format(5)>) inbox. C<--jobs=0> is accepted as of public-inbox 1.6.0 (PENDING) to disable parallel indexing. +If the inbox has not been indexed, C<JOBS - 1> shards +will be created (one job is always needed for indexing +the overview and article number mapping). + Default: the number of existing Xapian shards =item --compact / -c @@ -120,6 +124,14 @@ and Xapian. This is only effective with Xapian 1.4+. Available in public-inbox 1.6.0 (PENDING). +=item --sequential-shard + +Sets or overrides L</publicinbox.indexSequentialShard> on a +per-invocation basis. See L</publicinbox.indexSequentialShard> +below. + +Available in public-inbox 1.6.0 (PENDING). + =back =head1 FILES @@ -167,6 +179,45 @@ inbox with 3 shards will flush every 3 megabytes by default. Default: 1m (one megabyte) +=item publicinbox.indexBatchSize + +Flushes changes to the filesystem and releases locks after +indexing the given number of bytes. The default value of C<1m> +(one megabyte) is low to minimize memory use and reduce +contention with parallel invocations of L<public-inbox-mda(1)>, +L<public-inbox-learn(1)>, and L<public-inbox-watch(1)>. + +Increase this value on powerful systems to improve throughput at +the expense of memory use. The reduction of lock granularity +may not be noticeable on fast systems. + +This option is available in public-inbox 1.6 or later. +public-inbox 1.5 and earlier used the current default, C<1m>. + +For L<public-inbox-v2-format(5)> inboxes, this value is +multiplied by the number of Xapian shards. Thus a typical v2 +inbox with 3 shards will flush every 3 megabytes by default. + +Default: 1m (one megabyte) + +=item publicinbox.indexSequentialShard +=item publicinbox.<inbox_name>.indexSequentialShard + +For L<public-inbox-v2-format(5)> inboxes, setting this to C<true> +allows indexing Xapian shards in multiple passes. This speeds up +indexing on rotational storage with high seek latency by allowing +individual shards to fit into the kernel page cache. + +Using a higher-than-normal number of C<--jobs> with +L<public-inbox-init(1)> may be required to ensure individual +shards are small enough to fit into cache. + +Available in public-inbox 1.6.0 (PENDING). + +This is ignored on L<public-inbox-v1-format(5)> inboxes. + +Default: false, shards are indexed in parallel + =back =head1 ENVIRONMENT diff --git a/Documentation/public-inbox-v2-format.pod b/Documentation/public-inbox-v2-format.pod index 9e284a75..6876989c 100644 --- a/Documentation/public-inbox-v2-format.pod +++ b/Documentation/public-inbox-v2-format.pod @@ -113,9 +113,14 @@ improved with high-quality and high-quantity solid-state storage. Issuing TRIM commands with L<fstrim(8)> was necessary to maintain consistent performance while developing this feature. -Rotational storage devices are NOT recommended for indexing of -large mail archives; but are fine for backup and usable for -small instances. +Rotational storage devices perform significantly worse than +solid state storage for indexing of large mail archives; but are +fine for backup and usable for small instances. + +As of public-inbox 1.6.0, the C<--sequential-shard> option of +L<public-inbox-index(1)> may be used with a high shard count +to ensure individual shards fit into page cache when the entire +Xapian DB cannot. Our use of the L</OVERVIEW DB> requires Xapian document IDs to remain stable. Using L<public-inbox-compact(1)> and diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 67199bb3..f9184bd2 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -369,8 +369,8 @@ sub _fill_code_repo { $git; } -sub _git_config_bool ($) { - my ($val) = @_; +sub git_bool { + my ($val) = $_[-1]; # $_[0] may be $self, or $val if ($val =~ /\A(?:false|no|off|[\-\+]?(?:0x)?0+)\z/i) { 0; } elsif ($val =~ /\A(?:true|yes|on|[\-\+]?(?:0x)?[0-9]+)\z/i) { @@ -386,7 +386,8 @@ sub _fill { foreach my $k (qw(inboxdir filter newsgroup watch httpbackendmax - replyto feedmax nntpserver indexlevel)) { + replyto feedmax nntpserver + indexlevel indexsequentialshard)) { my $v = $self->{"$pfx.$k"}; $ibx->{$k} = $v if defined $v; } @@ -400,7 +401,7 @@ sub _fill { foreach my $k (qw(obfuscate)) { my $v = $self->{"$pfx.$k"}; defined $v or next; - if (defined(my $bval = _git_config_bool($v))) { + if (defined(my $bval = git_bool($v))) { $ibx->{$k} = $bval; } else { warn "Ignoring $pfx.$k=$v in config, not boolean\n"; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index f98afa61..7bc24592 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -875,7 +875,8 @@ sub reindex_checkpoint ($$) { $self->{ibx}->git->cleanup; # *async_wait ${$sync->{need_checkpoint}} = 0; - $sync->{mm_tmp}->atfork_prepare; + my $mm_tmp = $sync->{mm_tmp}; + $mm_tmp->atfork_prepare if $mm_tmp; $self->done; # release lock if (my $pr = $sync->{-opt}->{-progress}) { @@ -884,7 +885,7 @@ sub reindex_checkpoint ($$) { # allow -watch or -mda to write... $self->idx_init; # reacquire lock - $sync->{mm_tmp}->atfork_parent; + $mm_tmp->atfork_parent if $mm_tmp; } sub index_oid { # cat_async callback @@ -1085,7 +1086,10 @@ sub sync_prepare ($$$) { } $all->cat_async_wait; } - return 0 if (!$regen_max && !keys(%{$self->{unindex_range}})); + if (!$regen_max && !keys(%{$self->{unindex_range}})) { + $sync->{-regen_fmt} = "%u/?\n"; + return 0; + } # reindex should NOT see new commits anymore, if we do, # it's a problem and we need to notice it via die() @@ -1177,6 +1181,36 @@ sub sync_ranges ($$$) { $ranges; } +sub index_xap_only { # git->cat_async callback + my ($bref, $oid, $type, $size, $smsg) = @_; + my $self = $smsg->{v2w}; + my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); + $idx->begin_txn_lazy; + $idx->add_message(PublicInbox::Eml->new($bref), $smsg); + $self->{transact_bytes} += $size; +} + +sub index_seq_shard ($$$) { + my ($self, $sync, $off) = @_; + my $ibx = $self->{ibx}; + my $max = $ibx->mm->max or return; + my $all = $ibx->git; + my $over = $ibx->over; + my $batch_bytes = $PublicInbox::SearchIdx::BATCH_BYTES; + if (my $pr = $sync->{-opt}->{-progress}) { + $pr->("Xapian indexlevel=$ibx->{indexlevel} % $off\n"); + } + for (my $num = $off; $num <= $max; $num += $self->{shards}) { + my $smsg = $over->get_art($num) or next; + $smsg->{v2w} = $self; + $all->cat_async($smsg->{blob}, \&index_xap_only, $smsg); + if ($self->{transact_bytes} >= $batch_bytes) { + ${$sync->{nr}} = $num; + reindex_checkpoint($self, $sync); + } + } +} + sub index_epoch ($$$) { my ($self, $sync, $i) = @_; @@ -1218,6 +1252,11 @@ sub index_sync { my $epoch_max; my $latest = git_dir_latest($self, \$epoch_max); return unless defined $latest; + + my $seq = $opt->{sequentialshard}; + my $idxlevel = $self->{ibx}->{indexlevel}; + local $self->{ibx}->{indexlevel} = 'basic' if $seq; + $self->idx_init($opt); # acquire lock fill_alternates($self, $epoch_max); $self->{over}->rethread_prepare($opt); @@ -1252,6 +1291,16 @@ sub index_sync { $pr->('all.git '.sprintf($sync->{-regen_fmt}, $$nr)) if $pr; } + if ($seq) { # deal with Xapian shards sequentially + my $end = $self->{shards} - 1; + $self->{ibx}->{indexlevel} = $idxlevel; + delete $sync->{mm_tmp}; + $self->idx_init($opt); # re-acquire lock + index_seq_shard($self, $sync, $_) for (0..$end); + $self->{ibx}->git->cat_async_wait; + $self->done; + } + # reindex does not pick up new changes, so we rerun w/o it: if ($opt->{reindex}) { my %again = %$opt; diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm index 142118bd..2ba10a9e 100644 --- a/lib/PublicInbox/WatchMaildir.pm +++ b/lib/PublicInbox/WatchMaildir.pm @@ -285,7 +285,7 @@ sub cfg_intvl ($$$) { sub cfg_bool ($$$) { my ($cfg, $key, $url) = @_; my $orig = $cfg->urlmatch($key, $url) // return; - my $bool = PublicInbox::Config::_git_config_bool($orig); + my $bool = $cfg->git_bool($orig); warn "W: $key=$orig for $url is not boolean\n" unless defined($bool); $bool; } diff --git a/script/public-inbox-index b/script/public-inbox-index index 5a0ceab7..be518134 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -16,7 +16,8 @@ use PublicInbox::Xapcmd; my $compact_opt; my $opt = { quiet => -1, compact => 0, maxsize => undef, sync => 1 }; GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune sync! - indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s)) + indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s + sequentialshard|seq-shard|sequential-shard)) or die "bad command-line args\n$usage"; die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0; @@ -46,6 +47,15 @@ if (my $bs = $opt->{batchsize} // $cfg->{lc('publicInbox.indexBatchSize')}) { $PublicInbox::SearchIdx::BATCH_BYTES = $bs; } +my $s = $opt->{sequentialshard} // + $cfg->{lc('publicInbox.indexSequentialShard')}; +if (defined $s) { + my $v = $cfg->git_bool($s); + defined($v) or + die "`publicInbox.indexSequentialShard=$s' not boolean\n"; + $opt->{sequentialshard} = $v; +} + my $mods = {}; foreach my $ibx (@ibxs) { # XXX: users can shoot themselves in the foot, with opt->{indexlevel} @@ -63,6 +73,14 @@ for my $ibx (@ibxs) { PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt); } $ibx->{-no_sync} = 1 if !$opt->{sync}; - PublicInbox::Admin::index_inbox($ibx, undef, $opt); + + my $ibx_opt = $opt; + if (defined(my $s = $ibx->{indexsequentialshard})) { + defined(my $v = $cfg->git_bool($s)) or die <<EOL; +publicInbox.$ibx->{name}.indexSequentialShard not boolean +EOL + $ibx_opt = { %$opt, sequentialshard => $v }; + } + PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt); PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt) if $compact_opt; } @@ -220,18 +220,18 @@ EOF { for my $t (qw(TRUE true yes on 1 +1 -1 13 0x1 0x12 0X5)) { - is(PublicInbox::Config::_git_config_bool($t), 1, "$t is true"); + is(PublicInbox::Config::git_bool($t), 1, "$t is true"); is(xqx([qw(git -c), "test.val=$t", qw(config --bool test.val)]), "true\n", "$t matches git-config behavior"); } for my $f (qw(FALSE false no off 0 +0 +000 00 0x00 0X0)) { - is(PublicInbox::Config::_git_config_bool($f), 0, "$f is false"); + is(PublicInbox::Config::git_bool($f), 0, "$f is false"); is(xqx([qw(git -c), "test.val=$f", qw(config --bool test.val)]), "false\n", "$f matches git-config behavior"); } - is(PublicInbox::Config::_git_config_bool('bogus'), undef, + is(PublicInbox::Config::git_bool('bogus'), undef, 'bogus is undef'); } diff --git a/t/v2mirror.t b/t/v2mirror.t index b24528fe..a4ac682d 100644 --- a/t/v2mirror.t +++ b/t/v2mirror.t @@ -4,6 +4,7 @@ use strict; use warnings; use Test::More; use PublicInbox::TestCommon; +use File::Path qw(remove_tree); use Cwd qw(abs_path); require_git(2.6); local $ENV{HOME} = abs_path('t'); @@ -189,6 +190,19 @@ is($mibx->git->check($to_purge), undef, 'unindex+prune successful in mirror'); is(scalar($mset->items), 0, '1@example.com no longer visible in mirror'); } +if ('sequential-shard') { + $mset = $mibx->search->query('m:15@example.com', {mset => 1}); + is(scalar($mset->items), 1, 'large message not indexed'); + remove_tree(glob("$tmpdir/m/xap*"), glob("$tmpdir/m/msgmap.*")); + my $cmd = [ qw(-index -j9 --sequential-shard), "$tmpdir/m" ]; + ok(run_script($cmd), '--sequential-shard works'); + my @shards = glob("$tmpdir/m/xap*/?"); + is(scalar(@shards), 8, 'got expected shard count'); + PublicInbox::InboxWritable::cleanup($mibx); + $mset = $mibx->search->query('m:15@example.com', {mset => 1}); + is(scalar($mset->items), 1, 'search works after --sequential-shard'); +} + if ('max size') { $mime->header_set('Message-ID', '<2big@a>'); my $max = '2k'; |