From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 3199C1F8C7 for ; Sat, 9 Oct 2021 12:04:41 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] extindex: support --reindex --fast Date: Sat, 9 Oct 2021 05:04:41 -0700 Message-Id: <20211009120441.24600-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This mode only checks history for missed/stale messages and doesn't attempt to reindex messages which are already indexed. --- Documentation/public-inbox-extindex.pod | 22 ++++++++++++++++++++++ lib/PublicInbox/ExtSearchIdx.pm | 6 +++--- script/public-inbox-extindex | 9 ++++++--- t/extsearch.t | 5 +++++ 4 files changed, 36 insertions(+), 6 deletions(-) diff --git a/Documentation/public-inbox-extindex.pod b/Documentation/public-inbox-extindex.pod index 2e2e6383b79b..a0fca83c6255 100644 --- a/Documentation/public-inbox-extindex.pod +++ b/Documentation/public-inbox-extindex.pod @@ -40,6 +40,28 @@ C set to C and their respective Xapian public-inboxes where cross-posting is common, this allows significant space savings on Xapian indices. +=item --gc + +Perform garbage collection instead of indexing. Use this if +inboxes are removed from the extindex, or if messages are +purged or removed from some inboxes. + +=item --reindex + +Forces a re-index of all messages in the extindex. This can be +used for in-place upgrades and bugfixes while read-only server +processes are utilizing the index. Keep in mind this roughly +doubles the size of the already-large Xapian database. + +The extindex locks will be released roughly every 10s to +allow L<public-inbox-mda(1)> and L<public-inbox-watch(1)> +processes to write to the extindex. 
+ +=item --fast + +Used with C<--reindex>, it will only look for new and stale +entries and not touch already-indexed messages. + =back =head1 FILES diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 7cc8dd952559..20c4cf7807ea 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -811,7 +811,7 @@ sub _reindex_check_unseen ($$$) { local $sync->{-regen_fmt} = "$ekey checking unseen %u/".$ibx->over->max."\n"; ${$sync->{nr}} = 0; - + my $fast = $sync->{-opt}->{fast}; while (scalar(@{$msgs = $ibx->over->query_xover($beg, $end)})) { ${$sync->{nr}} = $beg; $beg = $msgs->[-1]->{num} + 1; @@ -835,7 +835,7 @@ ibx_id = ? AND xnum = ? AND oidbin = ? # the first time around ASAP: if (scalar(@$docids) == 0) { reindex_unseen($self, $sync, $ibx, $xsmsg); - } else { # already seen, reindex later + } elsif (!$fast) { # already seen, reindex later for my $r (@$docids) { $self->{oidx}->eidxq_add($r->[0]); } @@ -853,7 +853,7 @@ sub _reindex_check_stale ($$$) { my $fetching; my $ekey = $ibx->eidx_key; local $sync->{-regen_fmt} = - "$ekey check stale/missing %u/".$ibx->over->max."\n"; + "$ekey checking stale/missing %u/".$ibx->over->max."\n"; ${$sync->{nr}} = 0; do { if (checkpoint_due($sync)) { diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex index 1572a1d23d82..c63f5dc26fd2 100755 --- a/script/public-inbox-extindex +++ b/script/public-inbox-extindex @@ -18,6 +18,8 @@ usage: public-inbox-extindex [options] [EXTINDEX_DIR] [INBOX_DIR...] --max-size=BYTES do not index messages larger than the given size --gc perform garbage collection instead of indexing --dedupe[=MSGID] fix prior deduplication errors (may be repeated) + --reindex index previously indexed inboxes + --fast only reindex unseen/stale messages --verbose | -v increase verbosity (may be repeated) --dry-run | -n dry-run on --dedupe @@ -26,7 +28,7 @@ See public-inbox-extindex(1) man page for full documentation. 
EOF my $opt = { quiet => -1, compact => 0, fsync => 1, scan => 1 }; GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i - fsync|sync! + fsync|sync! fast indexlevel|index-level|L=s max_size|max-size=s batch_size|batch-size=s dedupe:s@ gc commit-interval=i watch scan! dry-run|n @@ -59,9 +61,10 @@ if ($opt->{gc}) { } else { @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg); } -if ($opt->{'dry-run'} && !$opt->{dedupe}) { +$opt->{'dry-run'} && !$opt->{dedupe} and die "E: --dry-run only affects --dedupe\n"; -} +$opt->{fast} && !$opt->{reindex} and + die "E: --fast only affects --reindex\n"; PublicInbox::Admin::require_or_die(qw(-search)); PublicInbox::Config::json() or die "Cpanel::JSON::XS or similar missing\n"; diff --git a/t/extsearch.t b/t/extsearch.t index ca586f61c29f..896e270414bd 100644 --- a/t/extsearch.t +++ b/t/extsearch.t @@ -336,6 +336,11 @@ if ('reindex catches missed messages') { $es->{xdb}->reopen; $mset = $es->mset("mid:$new->{mid}"); is($mset->size, 0, 'stale mid gone Xapian'); + + ok(run_script([qw(-extindex --reindex --all --fast), "$home/extindex"], + undef, $opt), '--reindex w/ --fast'); + ok(!run_script([qw(-extindex --all --fast), "$home/extindex"], + undef, $opt), '--fast alone makes no sense'); } if ('reindex catches content bifurcation') {