user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 09/14] index: cleanup internal variables
Date: Mon, 10 Aug 2020 02:12:00 +0000	[thread overview]
Message-ID: <20200810021205.18909-10-e@yhbt.net> (raw)
In-Reply-To: <20200810021205.18909-1-e@yhbt.net>

Move away from hard-to-read alllowercase naming and favor
snake_case or separated-by-dashes.

We'll keep `--indexlevel' as-is for now, since it's been around
for several releases; but we'll support `--index-level' in the
CLI and update our documentation in a few months.

We'll also clarify that publicInbox.indexMaxSize is only
intended for -index, and not -watch or -mda.
---
 Documentation/public-inbox-index.pod |  6 ++++-
 Documentation/public-inbox-init.pod  |  2 +-
 lib/PublicInbox/SearchIdx.pm         | 14 +++++------
 lib/PublicInbox/V2Writable.pm        | 26 +++++++++------------
 script/public-inbox-index            | 35 ++++++++++++++--------------
 script/public-inbox-init             |  8 ++-----
 6 files changed, 43 insertions(+), 48 deletions(-)

diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod
index 3ae3b008..7a97432e 100644
--- a/Documentation/public-inbox-index.pod
+++ b/Documentation/public-inbox-index.pod
@@ -158,7 +158,11 @@ value.  A single suffix modifier of C<k>, C<m> or C<g> is
 supported, thus the value of C<1m> to prevents indexing of
 messages larger than one megabyte.
 
-This is useful for avoiding memory exhaustion in mirrors.
+This is useful for avoiding memory exhaustion in mirrors
+via git.  It does not prevent L<public-inbox-mda(1)> or
+L<public-inbox-watch(1)> from importing (and indexing)
+a message.
+
 This option is only available in public-inbox 1.5 or later.
 
 Default: none
diff --git a/Documentation/public-inbox-init.pod b/Documentation/public-inbox-init.pod
index fd9fc637..d0c87563 100644
--- a/Documentation/public-inbox-init.pod
+++ b/Documentation/public-inbox-init.pod
@@ -31,7 +31,7 @@ L<DBD::SQLite>.
 
 Default: C<1>
 
-=item --indexlevel <basic|medium|full>
+=item -L, --indexlevel <basic|medium|full>
 
 Controls the indexing level for L<public-inbox-index(1)>
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 1cf3e66c..7f2447fe 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -67,7 +67,6 @@ sub new {
 		my $dir = $self->xdir;
 		$self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3");
 		$self->{over}->{-no_fsync} = 1 if $ibx->{-no_fsync};
-		$self->{index_max_size} = $ibx->{index_max_size};
 	} elsif ($version == 2) {
 		defined $shard or die "shard is required for v2\n";
 		# shard is a number
@@ -553,10 +552,10 @@ sub index_sync {
 sub check_size { # check_async cb for -index --max-size=...
 	my ($oid, $type, $size, $arg, $git) = @_;
 	(($type // '') eq 'blob') or die "E: bad $oid in $git->{git_dir}";
-	if ($size <= $arg->{index_max_size}) {
+	if ($size <= $arg->{max_size}) {
 		$git->cat_async($oid, $arg->{index_oid}, $arg);
 	} else {
-		warn "W: skipping $oid ($size > $arg->{index_max_size})\n";
+		warn "W: skipping $oid ($size > $arg->{max_size})\n";
 	}
 }
 
@@ -573,7 +572,7 @@ sub v1_checkpoint ($$;$) {
 			$self->{mm}->last_commit($newest);
 		}
 	} else {
-		${$sync->{max}} = $BATCH_BYTES;
+		${$sync->{max}} = $self->{batch_bytes};
 	}
 
 	$self->{mm}->{dbh}->commit;
@@ -603,7 +602,7 @@ sub v1_checkpoint ($$;$) {
 sub process_stack {
 	my ($self, $sync, $stk) = @_;
 	my $git = $self->{ibx}->git;
-	my $max = $BATCH_BYTES;
+	my $max = $self->{batch_bytes};
 	my $nr = 0;
 	$sync->{nr} = \$nr;
 	$sync->{max} = \$max;
@@ -617,13 +616,13 @@ sub process_stack {
 			$git->cat_async($oid, \&unindex_both, $self);
 		}
 	}
-	if ($sync->{index_max_size} = $self->{ibx}->{index_max_size}) {
+	if ($sync->{max_size} = $sync->{-opt}->{max_size}) {
 		$sync->{index_oid} = \&index_both;
 	}
 	while (my ($f, $at, $ct, $oid) = $stk->pop_rec) {
 		if ($f eq 'm') {
 			my $arg = { %$sync, autime => $at, cotime => $ct };
-			if ($sync->{index_max_size}) {
+			if ($sync->{max_size}) {
 				$git->check_async($oid, \&check_size, $arg);
 			} else {
 				$git->cat_async($oid, \&index_both, $arg);
@@ -749,6 +748,7 @@ sub _index_sync {
 	my ($self, $opts) = @_;
 	my $tip = $opts->{ref} || 'HEAD';
 	my $git = $self->{ibx}->git;
+	$self->{batch_bytes} = $opts->{batch_size} // $BATCH_BYTES;
 	$git->batch_prepare;
 	my $pr = $opts->{-progress};
 	my $sync = { reindex => $opts->{reindex}, -opt => $opts };
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 93646e57..8e36b92c 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -152,12 +152,6 @@ sub add {
 	$self->{ibx}->with_umask(\&_add, $self, $eml, $check_cb);
 }
 
-sub batch_bytes ($) {
-	my ($self) = @_;
-	($self->{parallel} ? $self->{shards} : 1) *
-		$PublicInbox::SearchIdx::BATCH_BYTES;
-}
-
 # indexes a message, returns true if checkpointing is needed
 sub do_idx ($$$$) {
 	my ($self, $msgref, $mime, $smsg) = @_;
@@ -166,7 +160,7 @@ sub do_idx ($$$$) {
 	my $idx = idx_shard($self, $smsg->{num} % $self->{shards});
 	$idx->index_raw($msgref, $mime, $smsg);
 	my $n = $self->{transact_bytes} += $smsg->{raw_bytes};
-	$n >= batch_bytes($self);
+	$n >= $self->{batch_bytes};
 }
 
 sub _add {
@@ -287,6 +281,9 @@ sub _idx_init { # with_umask callback
 	# xcpdb can change shard count while -watch is idle
 	my $nshards = count_shards($self);
 	$self->{shards} = $nshards if $nshards && $nshards != $self->{shards};
+	$self->{batch_bytes} = $opt->{batch_size} //
+				$PublicInbox::SearchIdx::BATCH_BYTES;
+	$self->{batch_bytes} *= $self->{shards} if $self->{parallel};
 
 	# need to create all shards before initializing msgmap FD
 	# idx_shards must be visible to all forked processes
@@ -891,7 +888,7 @@ sub reindex_checkpoint ($$) {
 	}
 
 	# allow -watch or -mda to write...
-	$self->idx_init; # reacquire lock
+	$self->idx_init($sync->{-opt}); # reacquire lock
 	$mm_tmp->atfork_parent if $mm_tmp;
 }
 
@@ -1208,12 +1205,11 @@ sub index_xap_step ($$$;$) {
 		$pr->("Xapian indexlevel=$ibx->{indexlevel} ".
 			"$beg..$end (% $step)\n");
 	}
-	my $batch_bytes = batch_bytes($self);
 	for (my $num = $beg; $num <= $end; $num += $step) {
 		my $smsg = $ibx->over->get_art($num) or next;
 		$smsg->{v2w} = $self;
 		$ibx->git->cat_async($smsg->{blob}, \&index_xap_only, $smsg);
-		if ($self->{transact_bytes} >= $batch_bytes) {
+		if ($self->{transact_bytes} >= $self->{batch_bytes}) {
 			${$sync->{nr}} = $num;
 			reindex_checkpoint($self, $sync);
 		}
@@ -1236,7 +1232,7 @@ sub index_epoch ($$$) {
 		$self->{current_info} = "$i.git $oid";
 		if ($f eq 'm') {
 			my $arg = { %$sync, autime => $at, cotime => $ct };
-			if ($sync->{index_max_size}) {
+			if ($sync->{max_size}) {
 				$all->check_async($oid, \&check_size, $arg);
 			} else {
 				$all->cat_async($oid, \&index_oid, $arg);
@@ -1255,7 +1251,7 @@ sub index_epoch ($$$) {
 
 sub xapian_only {
 	my ($self, $opt, $sync, $art_beg) = @_;
-	my $seq = $opt->{sequentialshard};
+	my $seq = $opt->{sequential_shard};
 	$art_beg //= 0;
 	local $self->{parallel} = 0 if $seq;
 	$self->idx_init($opt); # acquire lock
@@ -1285,14 +1281,14 @@ sub xapian_only {
 sub index_sync {
 	my ($self, $opt) = @_;
 	$opt //= $_[1] //= {};
-	goto \&xapian_only if $opt->{xapianonly};
+	goto \&xapian_only if $opt->{xapian_only};
 
 	my $pr = $opt->{-progress};
 	my $epoch_max;
 	my $latest = git_dir_latest($self, \$epoch_max);
 	return unless defined $latest;
 
-	my $seq = $opt->{sequentialshard};
+	my $seq = $opt->{sequential_shard};
 	my $art_beg; # the NNTP article number we start xapian_only at
 	my $idxlevel = $self->{ibx}->{indexlevel};
 	local $self->{ibx}->{indexlevel} = 'basic' if $seq;
@@ -1324,7 +1320,7 @@ sub index_sync {
 			$art_beg++ if defined($art_beg);
 		}
 	}
-	if ($sync->{index_max_size} = $self->{ibx}->{index_max_size}) {
+	if ($sync->{max_size} = $opt->{max_size}) {
 		$sync->{index_oid} = \&index_oid;
 	}
 	# work forwards through history
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 9e0907be..b1d29ec1 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -17,7 +17,7 @@ usage: $usage
 options:
 
   --no-fsync          speed up indexing, risk corruption on power outage
-  --indexlevel=LEVEL  `basic', 'medium', or `full' (default: full)
+  -L LEVEL            `basic', `medium', or `full' (default: full)
   --compact | -c      run public-inbox-compact(1) after indexing
   --sequential-shard  index Xapian shards sequentially for slow storage
   --jobs=NUM          set or disable parallelization (NUM=0)
@@ -33,16 +33,17 @@ BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes)
 See public-inbox-index(1) man page for full documentation.
 EOF
 my $compact_opt;
-my $opt = { quiet => -1, compact => 0, maxsize => undef, fsync => 1 };
+my $opt = { quiet => -1, compact => 0, max_size => undef, fsync => 1 };
 GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune
-		fsync|sync! xapianonly|xapian-only
-		indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s
-		sequentialshard|seq-shard|sequential-shard
+		fsync|sync! xapian_only|xapian-only
+		indexlevel|index-level|L=s max_size|max-size=s
+		batch_size|batch-size=s
+		sequential_shard|seq-shard|sequential-shard
 		help|?))
 	or die "bad command-line args\n$usage";
 if ($opt->{help}) { print $help; exit 0 };
 die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
-if ($opt->{xapianonly} && !$opt->{reindex}) {
+if ($opt->{xapian_only} && !$opt->{reindex}) {
 	die "--xapian-only requires --reindex\n";
 }
 
@@ -64,40 +65,38 @@ my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, undef, $cfg);
 PublicInbox::Admin::require_or_die('-index');
 unless (@ibxs) { print STDERR "Usage: $usage\n"; exit 1 }
 
-my $max_size = $opt->{maxsize} // $cfg->{lc('publicInbox.indexMaxSize')};
+my $max_size = $opt->{max_size} // $cfg->{lc('publicInbox.indexMaxSize')};
 if (defined $max_size) {
 	PublicInbox::Admin::parse_unsigned(\$max_size) or
 		die "`publicInbox.indexMaxSize=$max_size' not parsed\n";
+	$opt->{max_size} = $max_size;
 }
 
-my $bs = $opt->{batchsize} // $cfg->{lc('publicInbox.indexBatchSize')};
+my $bs = $opt->{batch_size} // $cfg->{lc('publicInbox.indexBatchSize')};
 if (defined $bs) {
 	PublicInbox::Admin::parse_unsigned(\$bs) or
 		die "`publicInbox.indexBatchSize=$bs' not parsed\n";
+	$opt->{batch_size} = $bs;
 }
-no warnings 'once';
-local $PublicInbox::SearchIdx::BATCH_BYTES = $bs if defined($bs);
-use warnings 'once';
 
 # out-of-the-box builds of Xapian 1.4.x are still limited to 32-bit
 # https://getting-started-with-xapian.readthedocs.io/en/latest/concepts/indexing/limitations.html
 local $ENV{XAPIAN_FLUSH_THRESHOLD} ||= '4294967295' if defined($bs);
 
-my $s = $opt->{sequentialshard} //
+my $s = $opt->{sequential_shard} //
 			$cfg->{lc('publicInbox.indexSequentialShard')};
 if (defined $s) {
 	my $v = $cfg->git_bool($s);
 	defined($v) or
 		die "`publicInbox.indexSequentialShard=$s' not boolean\n";
-	$opt->{sequentialshard} = $v;
+	$opt->{sequential_shard} = $v;
 }
 
 my $mods = {};
 foreach my $ibx (@ibxs) {
 	# XXX: users can shoot themselves in the foot, with opt->{indexlevel}
-	$ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapianonly} ?
+	$ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ?
 			'full' : PublicInbox::Admin::detect_indexlevel($ibx));
-	$ibx->{index_max_size} = $max_size;
 	PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
 }
 
@@ -112,15 +111,15 @@ for my $ibx (@ibxs) {
 	$ibx->{-no_fsync} = 1 if !$opt->{fsync};
 
 	my $ibx_opt = $opt;
-	if (defined(my $s = $ibx->{indexsequentialshard})) {
+	if (defined(my $s = $ibx->{lc('indexSequentialShard')})) {
 		defined(my $v = $cfg->git_bool($s)) or die <<EOL;
 publicInbox.$ibx->{name}.indexSequentialShard not boolean
 EOL
-		$ibx_opt = { %$opt, sequentialshard => $v };
+		$ibx_opt = { %$opt, sequential_shard => $v };
 	}
 	PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
 	if ($compact_opt) {
-		local $compact_opt->{jobs} = 0 if $ibx_opt->{sequentialshard};
+		local $compact_opt->{jobs} = 0 if $ibx_opt->{sequential_shard};
 		PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt);
 	}
 }
diff --git a/script/public-inbox-init b/script/public-inbox-init
index 6a959db7..1c8066df 100755
--- a/script/public-inbox-init
+++ b/script/public-inbox-init
@@ -27,7 +27,7 @@ use Cwd qw/abs_path/;
 my ($version, $indexlevel, $skip_epoch, $skip_artnum, $jobs);
 my %opts = (
 	'V|version=i' => \$version,
-	'L|indexlevel=s' => \$indexlevel,
+	'L|index-level|indexlevel=s' => \$indexlevel,
 	'S|skip|skip-epoch=i' => \$skip_epoch,
 	'N|skip-artnum=i' => \$skip_artnum,
 	'j|jobs=i' => \$jobs,
@@ -103,11 +103,7 @@ if (-e $pi_config) {
 	exit(1) if $conflict;
 
 	my $ibx = $cfg->lookup_name($name);
-	if ($ibx) {
-		if (!defined($indexlevel) && $ibx->{indexlevel}) {
-			$indexlevel = $ibx->{indexlevel};
-		}
-	}
+	$indexlevel //= $ibx->{indexlevel} if $ibx;
 }
 my $pi_config_tmp = $fh->filename;
 close($fh) or die "failed to close $pi_config_tmp: $!\n";

  parent reply	other threads:[~2020-08-10  2:12 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-08-10  2:11 [PATCH 00/14] more indexing related improvements Eric Wong
2020-08-10  2:11 ` [PATCH 01/14] index: require --reindex when using --xapian-only Eric Wong
2020-08-10  2:11 ` [PATCH 02/14] index: --sequential-shard works incrementally Eric Wong
2020-08-10  2:11 ` [PATCH 03/14] doc: index: more notes about latest changes Eric Wong
2020-08-10  2:38   ` Kyle Meyer
2020-08-10  6:29     ` Eric Wong
2020-08-10  2:11 ` [PATCH 04/14] doc: add some notes around -xcpdb / -edit / -purge Eric Wong
2020-08-10  2:11 ` [PATCH 05/14] index+xcpdb: improve SIG{INT,TERM,HUP,PIPE} behavior Eric Wong
2020-08-10  2:11 ` [PATCH 06/14] msgmap: tmp_clone: simplify + meaningful filename Eric Wong
2020-08-10  2:11 ` [PATCH 07/14] avoid File::Temp::tempfile in more places Eric Wong
2020-08-10  2:11 ` [PATCH 08/14] admin: use a generic veriable name Eric Wong
2020-08-10  2:38   ` Kyle Meyer
2020-08-10  2:12 ` Eric Wong [this message]
2020-08-10  2:12 ` [PATCH 10/14] searchidx: use singular `$opt' for consistency with v2 Eric Wong
2020-08-10  2:12 ` [PATCH 11/14] convert: support new -index options Eric Wong
2020-08-10  2:12 ` [PATCH 12/14] convert: speed up --help Eric Wong
2020-08-10  2:12 ` [PATCH 13/14] convert: check ARGV more correctly Eric Wong
2020-08-10  2:12 ` [PATCH 14/14] convert: set No_COW on copied SQLite files Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200810021205.18909-10-e@yhbt.net \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).