From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 02/14] index: --sequential-shard works incrementally
Date: Mon, 10 Aug 2020 02:11:53 +0000 [thread overview]
Message-ID: <20200810021205.18909-3-e@yhbt.net> (raw)
In-Reply-To: <20200810021205.18909-1-e@yhbt.net>
We should never reindex all data in Xapian unless --reindex is
specified on the command line. This means users who put
publicInbox.indexSequentialShard in their config file won't have
to put up with a full reindex at every invocation, only when
they specify --reindex.
We'll also clean up the progress output to not emit nonsensical
ranges where the starting number is higher than the end.
---
lib/PublicInbox/V2Writable.pm | 36 ++++++++++++++++++++++-------------
1 file changed, 23 insertions(+), 13 deletions(-)
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index f7a318e5..0b527f18 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -1198,20 +1198,20 @@ sub index_xap_only { # git->cat_async callback
sub index_xap_step ($$$;$) {
my ($self, $sync, $beg, $step) = @_;
- my $ibx = $self->{ibx};
- my $all = $ibx->git;
- my $over = $ibx->over;
- my $batch_bytes = batch_bytes($self);
- $step //= $self->{shards};
my $end = $sync->{art_end};
+ return if $beg > $end; # nothing to do
+
+ $step //= $self->{shards};
+ my $ibx = $self->{ibx};
if (my $pr = $sync->{-opt}->{-progress}) {
$pr->("Xapian indexlevel=$ibx->{indexlevel} ".
"$beg..$end (% $step)\n");
}
+ my $batch_bytes = batch_bytes($self);
for (my $num = $beg; $num <= $end; $num += $step) {
- my $smsg = $over->get_art($num) or next;
+ my $smsg = $ibx->over->get_art($num) or next;
$smsg->{v2w} = $self;
- $all->cat_async($smsg->{blob}, \&index_xap_only, $smsg);
+ $ibx->git->cat_async($smsg->{blob}, \&index_xap_only, $smsg);
if ($self->{transact_bytes} >= $batch_bytes) {
${$sync->{nr}} = $num;
reindex_checkpoint($self, $sync);
@@ -1253,8 +1253,9 @@ sub index_epoch ($$$) {
}
sub xapian_only {
- my ($self, $opt, $sync) = @_;
+ my ($self, $opt, $sync, $art_beg) = @_;
my $seq = $opt->{sequentialshard};
+ $art_beg //= 0;
local $self->{parallel} = 0 if $seq;
$self->idx_init($opt); # acquire lock
if (my $art_end = $self->{ibx}->mm->max) {
@@ -1268,9 +1269,11 @@ sub xapian_only {
$sync->{art_end} = $art_end;
if ($seq || !$self->{parallel}) {
my $shard_end = $self->{shards} - 1;
- index_xap_step($self, $sync, $_) for (0..$shard_end);
+ for (0..$shard_end) {
+ index_xap_step($self, $sync, $art_beg + $_)
+ }
} else { # parallel (maybe)
- index_xap_step($self, $sync, 0, 1);
+ index_xap_step($self, $sync, $art_beg, 1);
}
}
$self->{ibx}->git->cat_async_wait;
@@ -1289,6 +1292,7 @@ sub index_sync {
return unless defined $latest;
my $seq = $opt->{sequentialshard};
+ my $art_beg; # the NNTP article number we start xapian_only at
my $idxlevel = $self->{ibx}->{indexlevel};
local $self->{ibx}->{indexlevel} = 'basic' if $seq;
@@ -1312,6 +1316,12 @@ sub index_sync {
$self->{mm}->{dbh}->begin_work;
$sync->{mm_tmp} =
$self->{mm}->tmp_clone($self->{ibx}->{inboxdir});
+
+ # xapian_only works incrementally w/o --reindex
+ if ($seq && !$opt->{reindex}) {
+ $art_beg = $sync->{mm_tmp}->max;
+ $art_beg++ if defined($art_beg);
+ }
}
if ($sync->{index_max_size} = $self->{ibx}->{index_max_size}) {
$sync->{index_oid} = \&index_oid;
@@ -1326,10 +1336,10 @@ sub index_sync {
$pr->('all.git '.sprintf($sync->{-regen_fmt}, $$nr)) if $pr;
}
- if ($seq) { # deal with Xapian shards sequentially
+ # deal with Xapian shards sequentially
+ if ($seq && delete($sync->{mm_tmp})) {
$self->{ibx}->{indexlevel} = $idxlevel;
- delete $sync->{mm_tmp};
- xapian_only($self, $opt, $sync);
+ xapian_only($self, $opt, $sync, $art_beg);
}
# reindex does not pick up new changes, so we rerun w/o it:
next prev parent reply other threads:[~2020-08-10 2:12 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-08-10 2:11 [PATCH 00/14] more indexing related improvements Eric Wong
2020-08-10 2:11 ` [PATCH 01/14] index: require --reindex when using --xapian-only Eric Wong
2020-08-10 2:11 ` Eric Wong [this message]
2020-08-10 2:11 ` [PATCH 03/14] doc: index: more notes about latest changes Eric Wong
2020-08-10 2:38 ` Kyle Meyer
2020-08-10 6:29 ` Eric Wong
2020-08-10 2:11 ` [PATCH 04/14] doc: add some notes around -xcpdb / -edit / -purge Eric Wong
2020-08-10 2:11 ` [PATCH 05/14] index+xcpdb: improve SIG{INT,TERM,HUP,PIPE} behavior Eric Wong
2020-08-10 2:11 ` [PATCH 06/14] msgmap: tmp_clone: simplify + meaningful filename Eric Wong
2020-08-10 2:11 ` [PATCH 07/14] avoid File::Temp::tempfile in more places Eric Wong
2020-08-10 2:11 ` [PATCH 08/14] admin: use a generic veriable name Eric Wong
2020-08-10 2:38 ` Kyle Meyer
2020-08-10 2:12 ` [PATCH 09/14] index: cleanup internal variables Eric Wong
2020-08-10 2:12 ` [PATCH 10/14] searchidx: use singular `$opt' for consistency with v2 Eric Wong
2020-08-10 2:12 ` [PATCH 11/14] convert: support new -index options Eric Wong
2020-08-10 2:12 ` [PATCH 12/14] convert: speed up --help Eric Wong
2020-08-10 2:12 ` [PATCH 13/14] convert: check ARGV more correctly Eric Wong
2020-08-10 2:12 ` [PATCH 14/14] convert: set No_COW on copied SQLite files Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200810021205.18909-3-e@yhbt.net \
--to=e@yhbt.net \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).