From 06a2418fd053c9a5b80217e74d1b47b8e1ca85e1 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 7 Aug 2020 01:14:04 +0000 Subject: index: v2: --sequential-shard option This gives better page cache utilization for Xapian indexing on slow storage by improving locality for random I/O activity on the Xapian DB. Instead of doing a single pass to index both SQLite and Xapian, this indexes them separately. The first pass is identical to indexlevel=basic: it indexes both over.sqlite3 and msgmap.sqlite3. Subsequent passes only operate on a single Xapian shard for documents belonging to that shard. Given enough shards, each individual shard can be made small enough to fit into the kernel page cache and avoid HDD seeks for read activity. Doing rough tests with a busy system with a 7200 RPM HDD with ext4, full indexing of LKML (9 epochs) goes from ~80 hours (-j0) to ~30 hours (-j8) with 16GB RAM with 7 shards configured and fsync(2) disabled (--no-sync) and `--batch-size=10m'. --- script/public-inbox-index | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'script') diff --git a/script/public-inbox-index b/script/public-inbox-index index 5a0ceab7..be518134 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -16,7 +16,8 @@ use PublicInbox::Xapcmd; my $compact_opt; my $opt = { quiet => -1, compact => 0, maxsize => undef, sync => 1 }; GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune sync! 
- indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s)) + indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s + sequentialshard|seq-shard|sequential-shard)) or die "bad command-line args\n$usage"; die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0; @@ -46,6 +47,15 @@ if (my $bs = $opt->{batchsize} // $cfg->{lc('publicInbox.indexBatchSize')}) { $PublicInbox::SearchIdx::BATCH_BYTES = $bs; } +my $s = $opt->{sequentialshard} // + $cfg->{lc('publicInbox.indexSequentialShard')}; +if (defined $s) { + my $v = $cfg->git_bool($s); + defined($v) or + die "`publicInbox.indexSequentialShard=$s' not boolean\n"; + $opt->{sequentialshard} = $v; +} + my $mods = {}; foreach my $ibx (@ibxs) { # XXX: users can shoot themselves in the foot, with opt->{indexlevel} @@ -63,6 +73,14 @@ for my $ibx (@ibxs) { PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt); } $ibx->{-no_sync} = 1 if !$opt->{sync}; - PublicInbox::Admin::index_inbox($ibx, undef, $opt); + + my $ibx_opt = $opt; + if (defined(my $s = $ibx->{indexsequentialshard})) { + defined(my $v = $cfg->git_bool($s)) or die <<EOL; +publicInbox.$ibx->{name}.indexSequentialShard not boolean +EOL + $ibx_opt = { %$opt, sequentialshard => $v }; + } + PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt); PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt) if $compact_opt; } -- cgit v1.2.3-24-ge0c7