From 06a2418fd053c9a5b80217e74d1b47b8e1ca85e1 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 7 Aug 2020 01:14:04 +0000 Subject: index: v2: --sequential-shard option This gives better page cache utilization for Xapian indexing on slow storage by improving locality for random I/O activity on the Xapian DB. Instead of doing a single pass to index both SQLite and Xapian, this indexes them separately. The first pass is identical to indexlevel=basic: it indexes both over.sqlite3 and msgmap.sqlite3. Subsequent passes only operate on a single Xapian shard for documents belonging to that shard. Given enough shards, each individual shard can be made small enough to fit into the kernel page cache and avoid HDD seeks for read activity. Doing rough tests with a busy system with a 7200 RPM HDD with ext4, full indexing of LKML (9 epochs) goes from ~80 hours (-j0) to ~30 hours (-j8) with 16GB RAM with 7 shards configured and fsync(2) disabled (--no-sync) and `--batch-size=10m'. --- lib/PublicInbox/Config.pm | 9 ++++--- lib/PublicInbox/V2Writable.pm | 55 ++++++++++++++++++++++++++++++++++++++--- lib/PublicInbox/WatchMaildir.pm | 2 +- 3 files changed, 58 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 67199bb3..f9184bd2 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -369,8 +369,8 @@ sub _fill_code_repo { $git; } -sub _git_config_bool ($) { - my ($val) = @_; +sub git_bool { + my ($val) = $_[-1]; # $_[0] may be $self, or $val if ($val =~ /\A(?:false|no|off|[\-\+]?(?:0x)?0+)\z/i) { 0; } elsif ($val =~ /\A(?:true|yes|on|[\-\+]?(?:0x)?[0-9]+)\z/i) { @@ -386,7 +386,8 @@ sub _fill { foreach my $k (qw(inboxdir filter newsgroup watch httpbackendmax - replyto feedmax nntpserver indexlevel)) { + replyto feedmax nntpserver + indexlevel indexsequentialshard)) { my $v = $self->{"$pfx.$k"}; $ibx->{$k} = $v if defined $v; } @@ -400,7 +401,7 @@ sub _fill { foreach my $k (qw(obfuscate)) { my $v = 
$self->{"$pfx.$k"}; defined $v or next; - if (defined(my $bval = _git_config_bool($v))) { + if (defined(my $bval = git_bool($v))) { $ibx->{$k} = $bval; } else { warn "Ignoring $pfx.$k=$v in config, not boolean\n"; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index f98afa61..7bc24592 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -875,7 +875,8 @@ sub reindex_checkpoint ($$) { $self->{ibx}->git->cleanup; # *async_wait ${$sync->{need_checkpoint}} = 0; - $sync->{mm_tmp}->atfork_prepare; + my $mm_tmp = $sync->{mm_tmp}; + $mm_tmp->atfork_prepare if $mm_tmp; $self->done; # release lock if (my $pr = $sync->{-opt}->{-progress}) { @@ -884,7 +885,7 @@ sub reindex_checkpoint ($$) { # allow -watch or -mda to write... $self->idx_init; # reacquire lock - $sync->{mm_tmp}->atfork_parent; + $mm_tmp->atfork_parent if $mm_tmp; } sub index_oid { # cat_async callback @@ -1085,7 +1086,10 @@ sub sync_prepare ($$$) { } $all->cat_async_wait; } - return 0 if (!$regen_max && !keys(%{$self->{unindex_range}})); + if (!$regen_max && !keys(%{$self->{unindex_range}})) { + $sync->{-regen_fmt} = "%u/?\n"; + return 0; + } # reindex should NOT see new commits anymore, if we do, # it's a problem and we need to notice it via die() @@ -1177,6 +1181,36 @@ sub sync_ranges ($$$) { $ranges; } +sub index_xap_only { # git->cat_async callback + my ($bref, $oid, $type, $size, $smsg) = @_; + my $self = $smsg->{v2w}; + my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); + $idx->begin_txn_lazy; + $idx->add_message(PublicInbox::Eml->new($bref), $smsg); + $self->{transact_bytes} += $size; +} + +sub index_seq_shard ($$$) { + my ($self, $sync, $off) = @_; + my $ibx = $self->{ibx}; + my $max = $ibx->mm->max or return; + my $all = $ibx->git; + my $over = $ibx->over; + my $batch_bytes = $PublicInbox::SearchIdx::BATCH_BYTES; + if (my $pr = $sync->{-opt}->{-progress}) { + $pr->("Xapian indexlevel=$ibx->{indexlevel} % $off\n"); + } + for (my $num = 
$off; $num <= $max; $num += $self->{shards}) { + my $smsg = $over->get_art($num) or next; + $smsg->{v2w} = $self; + $all->cat_async($smsg->{blob}, \&index_xap_only, $smsg); + if ($self->{transact_bytes} >= $batch_bytes) { + ${$sync->{nr}} = $num; + reindex_checkpoint($self, $sync); + } + } +} + sub index_epoch ($$$) { my ($self, $sync, $i) = @_; @@ -1218,6 +1252,11 @@ sub index_sync { my $epoch_max; my $latest = git_dir_latest($self, \$epoch_max); return unless defined $latest; + + my $seq = $opt->{sequentialshard}; + my $idxlevel = $self->{ibx}->{indexlevel}; + local $self->{ibx}->{indexlevel} = 'basic' if $seq; + $self->idx_init($opt); # acquire lock fill_alternates($self, $epoch_max); $self->{over}->rethread_prepare($opt); @@ -1252,6 +1291,16 @@ sub index_sync { $pr->('all.git '.sprintf($sync->{-regen_fmt}, $$nr)) if $pr; } + if ($seq) { # deal with Xapian shards sequentially + my $end = $self->{shards} - 1; + $self->{ibx}->{indexlevel} = $idxlevel; + delete $sync->{mm_tmp}; + $self->idx_init($opt); # re-acquire lock + index_seq_shard($self, $sync, $_) for (0..$end); + $self->{ibx}->git->cat_async_wait; + $self->done; + } + # reindex does not pick up new changes, so we rerun w/o it: if ($opt->{reindex}) { my %again = %$opt; diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm index 142118bd..2ba10a9e 100644 --- a/lib/PublicInbox/WatchMaildir.pm +++ b/lib/PublicInbox/WatchMaildir.pm @@ -285,7 +285,7 @@ sub cfg_intvl ($$$) { sub cfg_bool ($$$) { my ($cfg, $key, $url) = @_; my $orig = $cfg->urlmatch($key, $url) // return; - my $bool = PublicInbox::Config::_git_config_bool($orig); + my $bool = $cfg->git_bool($orig); warn "W: $key=$orig for $url is not boolean\n" unless defined($bool); $bool; } -- cgit v1.2.3-24-ge0c7