* [PATCH 6/7] index: support --xapian-only switch
2020-08-07 1:13 7% [PATCH 0/7] index: --sequential-shard and other stuff Eric Wong
@ 2020-08-07 1:14 6% ` Eric Wong
0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-08-07 1:14 UTC (permalink / raw)
To: meta
This is useful for speeding up indexing runs when only the Xapian
indexing rules change and SQLite indexing is unaffected. It mostly
implies `--reindex', but does NOT pick up new messages (because
SQLite indexing needs to occur for that).
I'm leaving this undocumented in the manpage for now since it's
mainly to speed up development and testing. Users upgrading to
1.6.0 will be advised to `--reindex --rethread', anyways, due to
the threading improvements since 1.1.0-pre1.
It may make sense to document this switch for 1.7+ when there are
Xapian-only indexing changes, though.
---
lib/PublicInbox/SearchIdxShard.pm | 10 ++++--
lib/PublicInbox/V2Writable.pm | 51 +++++++++++++++++++++++--------
script/public-inbox-index | 5 +--
3 files changed, 48 insertions(+), 18 deletions(-)
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index cb79f3dc..59b36087 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -89,16 +89,20 @@ sub shard_worker_loop ($$$$$) {
# called by V2Writable
sub index_raw {
- my ($self, $msgref, $mime, $smsg) = @_;
+ my ($self, $msgref, $eml, $smsg) = @_;
if (my $w = $self->{w}) {
# mid must be last, it can contain spaces (but not LF)
print $w join(' ', @$smsg{qw(raw_bytes bytes
num blob ds ts mid)}),
"\n", $$msgref or die "failed to write shard $!\n";
} else {
- $$msgref = undef;
+ if ($eml) {
+ $$msgref = undef;
+ } else { # --xapian-only + --sequential-shard:
+ $eml = PublicInbox::Eml->new($msgref);
+ }
$self->begin_txn_lazy;
- $self->add_message($mime, $smsg);
+ $self->add_message($eml, $smsg);
}
}
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 7bc24592..6b1effe5 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -1185,22 +1185,24 @@ sub index_xap_only { # git->cat_async callback
my ($bref, $oid, $type, $size, $smsg) = @_;
my $self = $smsg->{v2w};
my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
- $idx->begin_txn_lazy;
- $idx->add_message(PublicInbox::Eml->new($bref), $smsg);
+ $smsg->{raw_bytes} = $size;
+ $idx->index_raw($bref, undef, $smsg);
$self->{transact_bytes} += $size;
}
-sub index_seq_shard ($$$) {
- my ($self, $sync, $off) = @_;
+sub index_xap_step ($$$;$) {
+ my ($self, $sync, $beg, $step) = @_;
my $ibx = $self->{ibx};
- my $max = $ibx->mm->max or return;
my $all = $ibx->git;
my $over = $ibx->over;
my $batch_bytes = $PublicInbox::SearchIdx::BATCH_BYTES;
+ $step //= $self->{shards};
+ my $end = $sync->{art_end};
if (my $pr = $sync->{-opt}->{-progress}) {
- $pr->("Xapian indexlevel=$ibx->{indexlevel} % $off\n");
+ $pr->("Xapian indexlevel=$ibx->{indexlevel} ".
+ "$beg..$end (% $step)\n");
}
- for (my $num = $off; $num <= $max; $num += $self->{shards}) {
+ for (my $num = $beg; $num <= $end; $num += $step) {
my $smsg = $over->get_art($num) or next;
$smsg->{v2w} = $self;
$all->cat_async($smsg->{blob}, \&index_xap_only, $smsg);
@@ -1244,10 +1246,37 @@ sub index_epoch ($$$) {
update_last_commit($self, $git, $i, $stk->{latest_cmt});
}
+sub xapian_only {
+ my ($self, $opt, $sync) = @_;
+ my $seq = $opt->{sequentialshard};
+ local $self->{parallel} = 0 if $seq;
+ $self->idx_init($opt); # acquire lock
+ if (my $art_end = $self->{ibx}->mm->max) {
+ $sync //= {
+ need_checkpoint => \(my $bool = 0),
+ -opt => $opt,
+ v2w => $self,
+ nr => \(my $nr = 0),
+ -regen_fmt => "%u/?\n",
+ };
+ $sync->{art_end} = $art_end;
+ if ($seq || !$self->{parallel}) {
+ my $shard_end = $self->{shards} - 1;
+ index_xap_step($self, $sync, $_) for (0..$shard_end);
+ } else { # parallel (maybe)
+ index_xap_step($self, $sync, 0, 1);
+ }
+ }
+ $self->{ibx}->git->cat_async_wait;
+ $self->done;
+}
+
# public, called by public-inbox-index
sub index_sync {
my ($self, $opt) = @_;
- $opt ||= {};
+ $opt //= $_[1] //= {};
+ goto \&xapian_only if $opt->{xapianonly};
+
my $pr = $opt->{-progress};
my $epoch_max;
my $latest = git_dir_latest($self, \$epoch_max);
@@ -1292,13 +1321,9 @@ sub index_sync {
}
if ($seq) { # deal with Xapian shards sequentially
- my $end = $self->{shards} - 1;
$self->{ibx}->{indexlevel} = $idxlevel;
delete $sync->{mm_tmp};
- $self->idx_init($opt); # re-acquire lock
- index_seq_shard($self, $sync, $_) for (0..$end);
- $self->{ibx}->git->cat_async_wait;
- $self->done;
+ xapian_only($self, $opt, $sync);
}
# reindex does not pick up new changes, so we rerun w/o it:
diff --git a/script/public-inbox-index b/script/public-inbox-index
index be518134..a52fb1bf 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -16,6 +16,7 @@ use PublicInbox::Xapcmd;
my $compact_opt;
my $opt = { quiet => -1, compact => 0, maxsize => undef, sync => 1 };
GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune sync!
+ xapianonly|xapian-only
indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s
sequentialshard|seq-shard|sequential-shard))
or die "bad command-line args\n$usage";
@@ -59,8 +60,8 @@ if (defined $s) {
my $mods = {};
foreach my $ibx (@ibxs) {
# XXX: users can shoot themselves in the foot, with opt->{indexlevel}
- $ibx->{indexlevel} //= $opt->{indexlevel} //
- PublicInbox::Admin::detect_indexlevel($ibx);
+ $ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapianonly} ?
+ 'full' : PublicInbox::Admin::detect_indexlevel($ibx));
$ibx->{index_max_size} = $max_size;
PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
}
^ permalink raw reply related [relevance 6%]
* [PATCH 0/7] index: --sequential-shard and other stuff
@ 2020-08-07 1:13 7% Eric Wong
2020-08-07 1:14 6% ` [PATCH 6/7] index: support --xapian-only switch Eric Wong
0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-08-07 1:13 UTC (permalink / raw)
To: meta
1/7 is a minor usability fix (more on the way)
5/7 is a major improvement for HDDs
6/7 is useful to developers, and may be useful to users
a few months down the line
And the rest are minor fixes related to indexing...
Eric Wong (7):
xapcmd: quietly no-op on indexlevel=basic
xapcmd: remove redundant searchidx require
xapcmd: drop outdated comment
v2writable: fix rethread cleanup
index: v2: indexSequentialShard / --sequential-shard option
index: support --xapian-only switch
index+xcpdb: rename `--no-sync' to `--no-fsync'
Documentation/public-inbox-config.pod | 6 ++
Documentation/public-inbox-index.pod | 55 ++++++++++++++-
Documentation/public-inbox-v2-format.pod | 11 ++-
Documentation/public-inbox-xcpdb.pod | 2 +-
lib/PublicInbox/Config.pm | 9 +--
lib/PublicInbox/OverIdx.pm | 2 +-
lib/PublicInbox/SearchIdx.pm | 6 +-
lib/PublicInbox/SearchIdxShard.pm | 10 ++-
lib/PublicInbox/V2Writable.pm | 88 ++++++++++++++++++++++--
lib/PublicInbox/WatchMaildir.pm | 2 +-
lib/PublicInbox/Xapcmd.pm | 18 ++---
script/public-inbox-index | 33 +++++++--
t/config.t | 6 +-
t/indexlevels-mirror.t | 24 +++++--
t/v2mirror.t | 14 ++++
15 files changed, 235 insertions(+), 51 deletions(-)
^ permalink raw reply [relevance 7%]
Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-08-07 1:13 7% [PATCH 0/7] index: --sequential-shard and other stuff Eric Wong
2020-08-07 1:14 6% ` [PATCH 6/7] index: support --xapian-only switch Eric Wong
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).