about summary refs log tree commit homepage
diff options
context:
space:
mode:
-rw-r--r-- Documentation/public-inbox-config.pod | 6
-rw-r--r-- Documentation/public-inbox-index.pod | 53
-rw-r--r-- Documentation/public-inbox-v2-format.pod | 11
-rw-r--r-- lib/PublicInbox/Config.pm | 9
-rw-r--r-- lib/PublicInbox/V2Writable.pm | 55
-rw-r--r-- lib/PublicInbox/WatchMaildir.pm | 2
-rwxr-xr-x script/public-inbox-index | 22
-rw-r--r-- t/config.t | 6
-rw-r--r-- t/v2mirror.t | 14
9 files changed, 161 insertions, 17 deletions
diff --git a/Documentation/public-inbox-config.pod b/Documentation/public-inbox-config.pod
index e6108c35..05b84819 100644
--- a/Documentation/public-inbox-config.pod
+++ b/Documentation/public-inbox-config.pod
@@ -139,6 +139,10 @@ allow for searching for phrases using quoted text.
 
 Default: C<full>
 
+=item publicinbox.<name>.indexSequentialShard
+
+See L<public-inbox-index(1)/publicinbox.indexSequentialShard>
+
 =item publicinbox.<name>.httpbackendmax
 
 If a digit, the maximum number of parallel
@@ -291,6 +295,8 @@ or /usr/share/cgit/
 See L<public-inbox-edit(1)>
 
 =item publicinbox.indexMaxSize
+=item publicinbox.indexBatchSize
+=item publicinbox.indexSequentialShard
 
 See L<public-inbox-index(1)>
 
diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod
index aeb1b3a3..f525ba54 100644
--- a/Documentation/public-inbox-index.pod
+++ b/Documentation/public-inbox-index.pod
@@ -34,12 +34,16 @@ normal search functionality.
 
 =item --jobs=JOBS, -j
 
-Control the number of Xapian indexing jobs in a
+Influences the number of Xapian indexing shards in a
 (L<public-inbox-v2-format(5)>) inbox.
 
 C<--jobs=0> is accepted as of public-inbox 1.6.0 (PENDING)
 to disable parallel indexing.
 
+If the inbox has not been indexed, C<JOBS - 1> shards
+will be created (one job is always needed for indexing
+the overview and article number mapping).
+
 Default: the number of existing Xapian shards
 
 =item --compact / -c
@@ -120,6 +124,14 @@ and Xapian.  This is only effective with Xapian 1.4+.
 
 Available in public-inbox 1.6.0 (PENDING).
 
+=item --sequential-shard
+
+Sets or overrides L</publicinbox.indexSequentialShard> on a
+per-invocation basis.  See L</publicinbox.indexSequentialShard>
+below.
+
+Available in public-inbox 1.6.0 (PENDING).
+
 =back
 
 =head1 FILES
@@ -167,6 +179,45 @@ inbox with 3 shards will flush every 3 megabytes by default.
 
 Default: 1m (one megabyte)
 
+=item publicinbox.indexBatchSize
+
+Flushes changes to the filesystem and releases locks after
+indexing the given number of bytes.  The default value of C<1m>
+(one megabyte) is low to minimize memory use and reduce
+contention with parallel invocations of L<public-inbox-mda(1)>,
+L<public-inbox-learn(1)>, and L<public-inbox-watch(1)>.
+
+Increase this value on powerful systems to improve throughput at
+the expense of memory use.  The reduction of lock granularity
+may not be noticeable on fast systems.
+
+This option is available in public-inbox 1.6 or later.
+public-inbox 1.5 and earlier used the current default, C<1m>.
+
+For L<public-inbox-v2-format(5)> inboxes, this value is
+multiplied by the number of Xapian shards.  Thus a typical v2
+inbox with 3 shards will flush every 3 megabytes by default.
+
+Default: 1m (one megabyte)
+
+=item publicinbox.indexSequentialShard
+=item publicinbox.<inbox_name>.indexSequentialShard
+
+For L<public-inbox-v2-format(5)> inboxes, setting this to C<true>
+allows indexing Xapian shards in multiple passes.  This speeds up
+indexing on rotational storage with high seek latency by allowing
+individual shards to fit into the kernel page cache.
+
+Using a higher-than-normal number of C<--jobs> with
+L<public-inbox-init(1)> may be required to ensure individual
+shards are small enough to fit into cache.
+
+Available in public-inbox 1.6.0 (PENDING).
+
+This is ignored on L<public-inbox-v1-format(5)> inboxes.
+
+Default: false, shards are indexed in parallel
+
 =back
 
 =head1 ENVIRONMENT
diff --git a/Documentation/public-inbox-v2-format.pod b/Documentation/public-inbox-v2-format.pod
index 9e284a75..6876989c 100644
--- a/Documentation/public-inbox-v2-format.pod
+++ b/Documentation/public-inbox-v2-format.pod
@@ -113,9 +113,14 @@ improved with high-quality and high-quantity solid-state storage.
 Issuing TRIM commands with L<fstrim(8)> was necessary to maintain
 consistent performance while developing this feature.
 
-Rotational storage devices are NOT recommended for indexing of
-large mail archives; but are fine for backup and usable for
-small instances.
+Rotational storage devices perform significantly worse than
+solid-state storage for indexing of large mail archives, but are
+fine for backup and usable for small instances.
+
+As of public-inbox 1.6.0, the C<--sequential-shard> option of
+L<public-inbox-index(1)> may be used with a high shard count
+to ensure individual shards fit into page cache when the entire
+Xapian DB cannot.
 
 Our use of the L</OVERVIEW DB> requires Xapian document IDs to
 remain stable.  Using L<public-inbox-compact(1)> and
diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index 67199bb3..f9184bd2 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -369,8 +369,8 @@ sub _fill_code_repo {
         $git;
 }
 
-sub _git_config_bool ($) {
-        my ($val) = @_;
+sub git_bool {
+        my ($val) = $_[-1]; # $_[0] may be $self, or $val
         if ($val =~ /\A(?:false|no|off|[\-\+]?(?:0x)?0+)\z/i) {
                 0;
         } elsif ($val =~ /\A(?:true|yes|on|[\-\+]?(?:0x)?[0-9]+)\z/i) {
@@ -386,7 +386,8 @@ sub _fill {
 
         foreach my $k (qw(inboxdir filter newsgroup
                         watch httpbackendmax
-                        replyto feedmax nntpserver indexlevel)) {
+                        replyto feedmax nntpserver
+                        indexlevel indexsequentialshard)) {
                 my $v = $self->{"$pfx.$k"};
                 $ibx->{$k} = $v if defined $v;
         }
@@ -400,7 +401,7 @@ sub _fill {
         foreach my $k (qw(obfuscate)) {
                 my $v = $self->{"$pfx.$k"};
                 defined $v or next;
-                if (defined(my $bval = _git_config_bool($v))) {
+                if (defined(my $bval = git_bool($v))) {
                         $ibx->{$k} = $bval;
                 } else {
                         warn "Ignoring $pfx.$k=$v in config, not boolean\n";
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index f98afa61..7bc24592 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -875,7 +875,8 @@ sub reindex_checkpoint ($$) {
 
         $self->{ibx}->git->cleanup; # *async_wait
         ${$sync->{need_checkpoint}} = 0;
-        $sync->{mm_tmp}->atfork_prepare;
+        my $mm_tmp = $sync->{mm_tmp};
+        $mm_tmp->atfork_prepare if $mm_tmp;
         $self->done; # release lock
 
         if (my $pr = $sync->{-opt}->{-progress}) {
@@ -884,7 +885,7 @@ sub reindex_checkpoint ($$) {
 
         # allow -watch or -mda to write...
         $self->idx_init; # reacquire lock
-        $sync->{mm_tmp}->atfork_parent;
+        $mm_tmp->atfork_parent if $mm_tmp;
 }
 
 sub index_oid { # cat_async callback
@@ -1085,7 +1086,10 @@ sub sync_prepare ($$$) {
                 }
                 $all->cat_async_wait;
         }
-        return 0 if (!$regen_max && !keys(%{$self->{unindex_range}}));
+        if (!$regen_max && !keys(%{$self->{unindex_range}})) {
+                $sync->{-regen_fmt} = "%u/?\n";
+                return 0;
+        }
 
         # reindex should NOT see new commits anymore, if we do,
         # it's a problem and we need to notice it via die()
@@ -1177,6 +1181,36 @@ sub sync_ranges ($$$) {
         $ranges;
 }
 
+sub index_xap_only { # git->cat_async callback
+        my ($bref, $oid, $type, $size, $smsg) = @_;
+        my $self = $smsg->{v2w};
+        my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
+        $idx->begin_txn_lazy;
+        $idx->add_message(PublicInbox::Eml->new($bref), $smsg);
+        $self->{transact_bytes} += $size;
+}
+
+sub index_seq_shard ($$$) {
+        my ($self, $sync, $off) = @_;
+        my $ibx = $self->{ibx};
+        my $max = $ibx->mm->max or return;
+        my $all = $ibx->git;
+        my $over = $ibx->over;
+        my $batch_bytes = $PublicInbox::SearchIdx::BATCH_BYTES;
+        if (my $pr = $sync->{-opt}->{-progress}) {
+                $pr->("Xapian indexlevel=$ibx->{indexlevel} % $off\n");
+        }
+        for (my $num = $off; $num <= $max; $num += $self->{shards}) {
+                my $smsg = $over->get_art($num) or next;
+                $smsg->{v2w} = $self;
+                $all->cat_async($smsg->{blob}, \&index_xap_only, $smsg);
+                if ($self->{transact_bytes} >= $batch_bytes) {
+                        ${$sync->{nr}} = $num;
+                        reindex_checkpoint($self, $sync);
+                }
+        }
+}
+
 sub index_epoch ($$$) {
         my ($self, $sync, $i) = @_;
 
@@ -1218,6 +1252,11 @@ sub index_sync {
         my $epoch_max;
         my $latest = git_dir_latest($self, \$epoch_max);
         return unless defined $latest;
+
+        my $seq = $opt->{sequentialshard};
+        my $idxlevel = $self->{ibx}->{indexlevel};
+        local $self->{ibx}->{indexlevel} = 'basic' if $seq;
+
         $self->idx_init($opt); # acquire lock
         fill_alternates($self, $epoch_max);
         $self->{over}->rethread_prepare($opt);
@@ -1252,6 +1291,16 @@ sub index_sync {
                 $pr->('all.git '.sprintf($sync->{-regen_fmt}, $$nr)) if $pr;
         }
 
+        if ($seq) { # deal with Xapian shards sequentially
+                my $end = $self->{shards} - 1;
+                $self->{ibx}->{indexlevel} = $idxlevel;
+                delete $sync->{mm_tmp};
+                $self->idx_init($opt); # re-acquire lock
+                index_seq_shard($self, $sync, $_) for (0..$end);
+                $self->{ibx}->git->cat_async_wait;
+                $self->done;
+        }
+
         # reindex does not pick up new changes, so we rerun w/o it:
         if ($opt->{reindex}) {
                 my %again = %$opt;
diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index 142118bd..2ba10a9e 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -285,7 +285,7 @@ sub cfg_intvl ($$$) {
 sub cfg_bool ($$$) {
         my ($cfg, $key, $url) = @_;
         my $orig = $cfg->urlmatch($key, $url) // return;
-        my $bool = PublicInbox::Config::_git_config_bool($orig);
+        my $bool = $cfg->git_bool($orig);
         warn "W: $key=$orig for $url is not boolean\n" unless defined($bool);
         $bool;
 }
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 5a0ceab7..be518134 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -16,7 +16,8 @@ use PublicInbox::Xapcmd;
 my $compact_opt;
 my $opt = { quiet => -1, compact => 0, maxsize => undef, sync => 1 };
 GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune sync!
-                indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s))
+                indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s
+                sequentialshard|seq-shard|sequential-shard))
         or die "bad command-line args\n$usage";
 die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
 
@@ -46,6 +47,15 @@ if (my $bs = $opt->{batchsize} // $cfg->{lc('publicInbox.indexBatchSize')}) {
         $PublicInbox::SearchIdx::BATCH_BYTES = $bs;
 }
 
+my $s = $opt->{sequentialshard} //
+                        $cfg->{lc('publicInbox.indexSequentialShard')};
+if (defined $s) {
+        my $v = $cfg->git_bool($s);
+        defined($v) or
+                die "`publicInbox.indexSequentialShard=$s' not boolean\n";
+        $opt->{sequentialshard} = $v;
+}
+
 my $mods = {};
 foreach my $ibx (@ibxs) {
         # XXX: users can shoot themselves in the foot, with opt->{indexlevel}
@@ -63,6 +73,14 @@ for my $ibx (@ibxs) {
                 PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt);
         }
         $ibx->{-no_sync} = 1 if !$opt->{sync};
-        PublicInbox::Admin::index_inbox($ibx, undef, $opt);
+
+        my $ibx_opt = $opt;
+        if (defined(my $s = $ibx->{indexsequentialshard})) {
+                defined(my $v = $cfg->git_bool($s)) or die <<EOL;
+publicInbox.$ibx->{name}.indexSequentialShard not boolean
+EOL
+                $ibx_opt = { %$opt, sequentialshard => $v };
+        }
+        PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
         PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt) if $compact_opt;
 }
diff --git a/t/config.t b/t/config.t
index d7fd9446..ee51c6cc 100644
--- a/t/config.t
+++ b/t/config.t
@@ -220,18 +220,18 @@ EOF
 
 {
         for my $t (qw(TRUE true yes on 1 +1 -1 13 0x1 0x12 0X5)) {
-                is(PublicInbox::Config::_git_config_bool($t), 1, "$t is true");
+                is(PublicInbox::Config::git_bool($t), 1, "$t is true");
                 is(xqx([qw(git -c), "test.val=$t",
                         qw(config --bool test.val)]),
                         "true\n", "$t matches git-config behavior");
         }
         for my $f (qw(FALSE false no off 0 +0 +000 00 0x00 0X0)) {
-                is(PublicInbox::Config::_git_config_bool($f), 0, "$f is false");
+                is(PublicInbox::Config::git_bool($f), 0, "$f is false");
                 is(xqx([qw(git -c), "test.val=$f",
                         qw(config --bool test.val)]),
                         "false\n", "$f matches git-config behavior");
         }
-        is(PublicInbox::Config::_git_config_bool('bogus'), undef,
+        is(PublicInbox::Config::git_bool('bogus'), undef,
                 'bogus is undef');
 }
 
diff --git a/t/v2mirror.t b/t/v2mirror.t
index b24528fe..a4ac682d 100644
--- a/t/v2mirror.t
+++ b/t/v2mirror.t
@@ -4,6 +4,7 @@ use strict;
 use warnings;
 use Test::More;
 use PublicInbox::TestCommon;
+use File::Path qw(remove_tree);
 use Cwd qw(abs_path);
 require_git(2.6);
 local $ENV{HOME} = abs_path('t');
@@ -189,6 +190,19 @@ is($mibx->git->check($to_purge), undef, 'unindex+prune successful in mirror');
         is(scalar($mset->items), 0, '1@example.com no longer visible in mirror');
 }
 
+if ('sequential-shard') {
+        $mset = $mibx->search->query('m:15@example.com', {mset => 1});
+        is(scalar($mset->items), 1, 'large message not indexed');
+        remove_tree(glob("$tmpdir/m/xap*"), glob("$tmpdir/m/msgmap.*"));
+        my $cmd = [ qw(-index -j9 --sequential-shard), "$tmpdir/m" ];
+        ok(run_script($cmd), '--sequential-shard works');
+        my @shards = glob("$tmpdir/m/xap*/?");
+        is(scalar(@shards), 8, 'got expected shard count');
+        PublicInbox::InboxWritable::cleanup($mibx);
+        $mset = $mibx->search->query('m:15@example.com', {mset => 1});
+        is(scalar($mset->items), 1, 'search works after --sequential-shard');
+}
+
 if ('max size') {
         $mime->header_set('Message-ID', '<2big@a>');
         my $max = '2k';