From b0372059b451c01fba1fdfe8a1879fbd5c7ca53d Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 21 Jun 2020 00:21:31 +0000 Subject: init: add -j / --jobs parameter On a powerful (by my standards) machine with 16GB RAM and a 7200 RPM HDD marketed for "enterprise" use, indexing an 8.1G (in git) LKML snapshot from Sep 2019 did not finish after 7 days with the default number (3) of Xapian shards (`--jobs=4') and `--batch-size=10m'. Indexing started off fast, but got progressively slower as the contents of the inbox (including Xapian + SQLite DBs) could no longer be cached by the kernel. Once the on-disk size increased, HDD seek contention between the Xapian shard workers slowed the process down to a crawl. With a single shard, it still took around 3.5 days to index on the HDD. That's not good, but it's far better than not finishing after 7 days. So allow unfortunate HDD users to easily specify a single shard on public-inbox-init. For reference, a freshly TRIM-ed low-end TLC SSD on the SATA II bus on the same machine indexes that same snapshot of LKML in ~7 hours with 3 shards and the same 10m batch size. In the past, a higher-end consumer-grade MLC SSD on similar hardware indexed a similarly-sized data set in ~4 hours. --- t/v2mirror.t | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 't') diff --git a/t/v2mirror.t b/t/v2mirror.t index fc03c3d7..b24528fe 100644 --- a/t/v2mirror.t +++ b/t/v2mirror.t @@ -80,9 +80,11 @@ foreach my $i (0..$epoch_max) { ok(-d "$tmpdir/m/git/$i.git", "mirror $i OK"); } -@cmd = ("-init", '-V2', 'm', "$tmpdir/m", 'http://example.com/m', +@cmd = ("-init", '-j1', '-V2', 'm', "$tmpdir/m", 'http://example.com/m', 'alt@example.com'); ok(run_script(\@cmd), 'initialized public-inbox -V2'); +my @shards = glob("$tmpdir/m/xap*/?"); +is(scalar(@shards), 1, 'got a single shard on init'); ok(run_script([qw(-index -j0), "$tmpdir/m"]), 'indexed'); -- cgit v1.2.3-24-ge0c7