user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH 0/2] index: support --compact / -c
@ 2020-03-28  0:56 Eric Wong
  2020-03-28  0:56 ` [PATCH 1/2] searchidxshard: ensure we set indexlevel on shard[0] Eric Wong
  2020-03-28  0:56 ` [PATCH 2/2] index: support --compact / -c on command-line Eric Wong
  0 siblings, 2 replies; 3+ messages in thread
From: Eric Wong @ 2020-03-28  0:56 UTC (permalink / raw)
  To: meta

It looks like HDDs and SSDs have gotten and will get even more
expensive due to manufacturing freezes from the pandemic.

Indexing (especially with --reindex to fixup old bugs) takes a
large amount of space, so support running compact immediately
after indexing to avoid users having to script a -compact
invocation for each inbox.  Compacting before indexing can be
triggered by using this switch twice, to further reduce space
overhead at a small time loss.

Note: I only found the bug fixed in 1/2 while testing 2/2.  It
took me a while to fix this bug because I've probably lost 10
IQ points from the stress of recent weeks :<

Eric Wong (2):
  searchidxshard: ensure we set indexlevel on shard[0]
  index: support --compact / -c on command-line

 Documentation/public-inbox-index.pod | 24 ++++++++++++++++++++----
 lib/PublicInbox/InboxWritable.pm     |  1 +
 lib/PublicInbox/SearchIdx.pm         | 26 +++++++++++++++++---------
 lib/PublicInbox/SearchIdxShard.pm    |  4 +++-
 lib/PublicInbox/Xapcmd.pm            |  4 +++-
 script/public-inbox-index            | 20 +++++++++++++++++---
 t/convert-compact.t                  | 13 +++++++++++++
 t/init.t                             |  7 ++++++-
 8 files changed, 80 insertions(+), 19 deletions(-)

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH 1/2] searchidxshard: ensure we set indexlevel on shard[0]
  2020-03-28  0:56 [PATCH 0/2] index: support --compact / -c Eric Wong
@ 2020-03-28  0:56 ` Eric Wong
  2020-03-28  0:56 ` [PATCH 2/2] index: support --compact / -c on command-line Eric Wong
  1 sibling, 0 replies; 3+ messages in thread
From: Eric Wong @ 2020-03-28  0:56 UTC (permalink / raw)
  To: meta

For sharded v2 repositories with few-enough messages, it is
possible for shard[0] to go unused and never trigger the
->commit_txn_lazy to set the indexlevel field in Xapian
metadata.

So set it immediately at initialization and avoid this case.
While we're at it, avoid triggering needless pwrite syscalls
from ->set_metadata by checking with ->get_metadata, first.
---
 lib/PublicInbox/SearchIdx.pm      | 26 +++++++++++++++++---------
 lib/PublicInbox/SearchIdxShard.pm |  4 +++-
 t/init.t                          |  7 ++++++-
 3 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 44b05813..7d089e7a 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -58,6 +58,7 @@ sub new {
 		ibx_ver => $version,
 		indexlevel => $indexlevel,
 	}, $class;
+	$self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium';
 	$ibx->umask_prepare;
 	if ($version == 1) {
 		$self->{lock_path} = "$inboxdir/ssoma.lock";
@@ -842,20 +843,27 @@ sub begin_txn_lazy {
 	});
 }
 
+# store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard)
+# This metadata is read by Admin::detect_indexlevel:
+sub set_indexlevel {
+	my ($self) = @_;
+
+	if (!$self->{shard} && # undef or 0, not >0
+			delete($self->{-set_indexlevel_once})) {
+		my $xdb = $self->{xdb};
+		my $level = $xdb->get_metadata('indexlevel');
+		if (!$level || $level ne 'medium') {
+			$xdb->set_metadata('indexlevel', 'medium');
+		}
+	}
+}
+
 sub commit_txn_lazy {
 	my ($self) = @_;
 	delete $self->{txn} or return;
 	$self->{-inbox}->with_umask(sub {
 		if (my $xdb = $self->{xdb}) {
-
-			# store 'indexlevel=medium' in v2 shard=0 and
-			# v1 (only one shard)
-			# This metadata is read by Admin::detect_indexlevel:
-			if (!$self->{shard} # undef or 0, not >0
-			    && $self->{indexlevel} eq 'medium') {
-				$xdb->set_metadata('indexlevel', 'medium');
-			}
-
+			set_indexlevel($self);
 			$xdb->commit_transaction;
 		}
 		$self->{over}->commit_lazy if $self->{over};
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index 2b48b1b4..1ea01095 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -11,9 +11,11 @@ use IO::Handle (); # autoflush
 
 sub new {
 	my ($class, $v2writable, $shard) = @_;
-	my $self = $class->SUPER::new($v2writable->{-inbox}, 1, $shard);
+	my $ibx = $v2writable->{-inbox};
+	my $self = $class->SUPER::new($ibx, 1, $shard);
 	# create the DB before forking:
 	$self->_xdb_acquire;
+	$self->set_indexlevel;
 	$self->_xdb_release;
 	$self->spawn_worker($v2writable, $shard) if $v2writable->{parallel};
 	$self;
diff --git a/t/init.t b/t/init.t
index e20ff006..a78c2fc8 100644
--- a/t/init.t
+++ b/t/init.t
@@ -5,6 +5,7 @@ use warnings;
 use Test::More;
 use PublicInbox::Config;
 use PublicInbox::TestCommon;
+use PublicInbox::Admin;
 use File::Basename;
 my ($tmpdir, $for_destroy) = tmpdir();
 sub quiet_fail {
@@ -72,11 +73,15 @@ SKIP: {
 	quiet_fail($cmd, 'initializing V2 as V1 fails');
 
 	foreach my $lvl (qw(medium basic)) {
+		my $dir = "$tmpdir/v2$lvl";
 		$cmd = [ '-init', "v2$lvl", '-V2', '-L', $lvl,
-			"$tmpdir/v2$lvl", "http://example.com/v2$lvl",
+			$dir, "http://example.com/v2$lvl",
 			"v2$lvl\@example.com" ];
 		ok(run_script($cmd), "-init -L $lvl");
 		is(read_indexlevel("v2$lvl"), $lvl, "indexlevel set to '$lvl'");
+		my $ibx = PublicInbox::Inbox->new({ inboxdir => $dir });
+		is(PublicInbox::Admin::detect_indexlevel($ibx), $lvl,
+			'detected expected level w/o config');
 	}
 
 	# loop for idempotency

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/2] index: support --compact / -c on command-line
  2020-03-28  0:56 [PATCH 0/2] index: support --compact / -c Eric Wong
  2020-03-28  0:56 ` [PATCH 1/2] searchidxshard: ensure we set indexlevel on shard[0] Eric Wong
@ 2020-03-28  0:56 ` Eric Wong
  1 sibling, 0 replies; 3+ messages in thread
From: Eric Wong @ 2020-03-28  0:56 UTC (permalink / raw)
  To: meta

It's more convenient to specify `-c' / `--compact' on the
command-line when reindexing than it is to invoke
public-inbox-compact(1) separately.

This is especially convenient in low-space situations when
public-inbox-index is operating on multiple inboxes
sequentially, as compaction can happen immediately after
indexing each inbox, instead of waiting until all inboxes are
indexed.
---
 Documentation/public-inbox-index.pod | 24 ++++++++++++++++++++----
 lib/PublicInbox/InboxWritable.pm     |  1 +
 lib/PublicInbox/Xapcmd.pm            |  4 +++-
 script/public-inbox-index            | 20 +++++++++++++++++---
 t/convert-compact.t                  | 13 +++++++++++++
 5 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod
index 14113ec8..dede5d2e 100644
--- a/Documentation/public-inbox-index.pod
+++ b/Documentation/public-inbox-index.pod
@@ -4,7 +4,7 @@ public-inbox-index - create and update search indices
 
 =head1 SYNOPSIS
 
-public-inbox-index [OPTIONS] INBOX_DIR
+public-inbox-index [OPTIONS] INBOX_DIR...
 
 =head1 DESCRIPTION
 
@@ -32,16 +32,32 @@ normal search functionality.
 
 =over
 
+=item --compact / -c
+
+Compacts the Xapian DBs after indexing.  This is recommended
+when using C<--reindex> to avoid running out of disk space
+while indexing multiple inboxes.
+
+While this option takes a negligible amount of time compared to
+C<--reindex>, it requires temporarily duplicating the entire
+contents of the Xapian DB.
+
+This switch may be specified twice, in which case compaction
+happens both before and after indexing to minimize the temporal
+footprint of the (re)indexing operation.
+
 =item --reindex
 
 Forces a re-index of all messages in the inbox.
 This can be used for in-place upgrades and bugfixes while
 NNTP/HTTP server processes are utilizing the index.  Keep in
 mind this roughly doubles the size of the already-large
-Xapian database.  Running L<public-inbox-compact(1)>
-afterwards is recommended to release free space.
+Xapian database.  Using this with C<--compact> or running
+L<public-inbox-compact(1)> afterwards is recommended to
+release free space.
 
-This does not touch the NNTP article number database.
+This does not touch the NNTP article number database or
+affect threading.
 
 =item --prune
 
diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
index e684f546..ce979ea2 100644
--- a/lib/PublicInbox/InboxWritable.pm
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -19,6 +19,7 @@ use constant {
 
 sub new {
 	my ($class, $ibx, $creat_opt) = @_;
+	return $ibx if ref($ibx) eq $class;
 	my $self = bless $ibx, $class;
 
 	# TODO: maybe stop supporting this
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 7414c9b6..8e2b9063 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -217,13 +217,15 @@ sub prepare_run {
 	($tmp, \@queue);
 }
 
+sub check_compact () { runnable_or_die($XAPIAN_COMPACT) }
+
 sub run {
 	my ($ibx, $task, $opt) = @_; # task = 'cpdb' or 'compact'
 	my $cb = \&${\"PublicInbox::Xapcmd::$task"};
 	PublicInbox::Admin::progress_prepare($opt ||= {});
 	defined(my $dir = $ibx->{inboxdir}) or die "no inboxdir defined\n";
 	-d $dir or die "inboxdir=$dir does not exist\n";
-	runnable_or_die($XAPIAN_COMPACT) if $opt->{compact};
+	check_compact() if $opt->{compact};
 	my $reindex; # v1:{ from => $x40 }, v2:{ from => [ $x40, $x40, .. ] } }
 
 	if (!$opt->{-coarse_lock}) {
diff --git a/script/public-inbox-index b/script/public-inbox-index
index c6910420..7def9964 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -11,12 +11,19 @@ use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
 my $usage = "public-inbox-index INBOX_DIR";
 use PublicInbox::Admin;
 PublicInbox::Admin::require_or_die('-index');
+use PublicInbox::Xapcmd;
 
-my $opt = { quiet => -1 };
-GetOptions($opt, qw(verbose|v+ reindex jobs|j=i prune indexlevel|L=s))
+my $compact_opt;
+my $opt = { quiet => -1, compact => 0 };
+GetOptions($opt, qw(verbose|v+ reindex compact|c+ jobs|j=i prune indexlevel|L=s))
 	or die "bad command-line args\n$usage";
 die "--jobs must be positive\n" if defined $opt->{jobs} && $opt->{jobs} <= 0;
 
+if ($opt->{compact}) {
+	require PublicInbox::Xapcmd;
+	PublicInbox::Xapcmd::check_compact();
+	$compact_opt = { -coarse_lock => 1, compact => 1 };
+}
 
 my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV);
 PublicInbox::Admin::require_or_die('-index');
@@ -31,4 +38,11 @@ foreach my $ibx (@ibxs) {
 
 PublicInbox::Admin::require_or_die(keys %$mods);
 PublicInbox::Admin::progress_prepare($opt);
-PublicInbox::Admin::index_inbox($_, undef, $opt) for @ibxs;
+for my $ibx (@ibxs) {
+	$ibx = PublicInbox::InboxWritable->new($ibx);
+	if ($opt->{compact} >= 2) {
+		PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt);
+	}
+	PublicInbox::Admin::index_inbox($ibx, undef, $opt);
+	PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt) if $compact_opt;
+}
diff --git a/t/convert-compact.t b/t/convert-compact.t
index 1671caad..70609c7d 100644
--- a/t/convert-compact.t
+++ b/t/convert-compact.t
@@ -115,4 +115,17 @@ my $msgs = $ibx->recent({limit => 1000});
 is($msgs->[0]->{mid}, 'a-mid@b', 'message exists in history');
 is(scalar @$msgs, 1, 'only one message in history');
 
+$ibx = undef;
+$err = '';
+$cmd = [ qw(-index --reindex -c), "$tmpdir/v2" ];
+ok(run_script($cmd, undef, $rdr), '--reindex -c');
+like($err, qr/xapian-compact/, 'xapian-compact ran (-c)');
+
+$rdr->{2} = \(my $err2 = '');
+$cmd = [ qw(-index --reindex -cc), "$tmpdir/v2" ];
+ok(run_script($cmd, undef, $rdr), '--reindex -c -c');
+like($err2, qr/xapian-compact/, 'xapian-compact ran (-c -c)');
+ok(scalar(split(/\n/, $err2)) > scalar(split(/\n/, $err)),
+	'-compacted twice');
+
 done_testing();

^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2020-03-28  0:56 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-28  0:56 [PATCH 0/2] index: support --compact / -c Eric Wong
2020-03-28  0:56 ` [PATCH 1/2] searchidxshard: ensure we set indexlevel on shard[0] Eric Wong
2020-03-28  0:56 ` [PATCH 2/2] index: support --compact / -c on command-line Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).