user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 0/6] cindex fixes, and some spring cleaning
@ 2023-04-07 12:40  7% Eric Wong
  2023-04-07 12:40  5% ` [PATCH 2/6] cindex: preserve indexlevel across invocations Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2023-04-07 12:40 UTC (permalink / raw)
  To: meta

cindex now preserves indexlevel like older mail-based indices.

We'll reuse umask code across inboxes, extindex, and now cindex
for code.  This also lets us simplify some of our old code by
eliminating some useless functions.

The vstring stuff should make the code more readable to new
contributors.

Eric Wong (6):
  cindex: improve progress display
  cindex: preserve indexlevel across invocations
  umask: hoist out of InboxWritable
  umask: rely on the OnDestroy-based call where applicable
  searchidx: use vstring to improve readability
  switch git version comparisons to vstrings, too

 MANIFEST                         |  1 +
 lib/PublicInbox/CodeSearchIdx.pm | 49 ++++++++++++----------
 lib/PublicInbox/ExtSearchIdx.pm  | 17 +++-----
 lib/PublicInbox/Git.pm           | 11 +++--
 lib/PublicInbox/InboxWritable.pm | 70 +-------------------------------
 lib/PublicInbox/LeiMirror.pm     |  2 +-
 lib/PublicInbox/SearchIdx.pm     | 37 ++++++++---------
 lib/PublicInbox/TestCommon.pm    | 16 +++-----
 lib/PublicInbox/Umask.pm         | 70 ++++++++++++++++++++++++++++++++
 lib/PublicInbox/V2Writable.pm    | 23 ++++-------
 lib/PublicInbox/Xapcmd.pm        | 35 ++++++++--------
 script/public-inbox-convert      |  9 ++--
 t/cindex.t                       | 45 ++++++++++++++++++++
 t/search.t                       |  7 ++--
 14 files changed, 212 insertions(+), 180 deletions(-)
 create mode 100644 lib/PublicInbox/Umask.pm


^ permalink raw reply	[relevance 7%]

* [PATCH 2/6] cindex: preserve indexlevel across invocations
  2023-04-07 12:40  7% [PATCH 0/6] cindex fixes, and some spring cleaning Eric Wong
@ 2023-04-07 12:40  5% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2023-04-07 12:40 UTC (permalink / raw)
  To: meta

This matches the behavior of mail indexers and ensures `medium'
indices don't grow unexpectedly to become `full' indices.
---
 lib/PublicInbox/CodeSearchIdx.pm | 15 +++++++++--
 lib/PublicInbox/SearchIdx.pm     |  2 +-
 t/cindex.t                       | 45 ++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 5f20325a..3a3fc03e 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -85,7 +85,6 @@ sub new {
 		xpfx => "$dir/cidx".  PublicInbox::CodeSearch::CIDX_SCHEMA_VER,
 		cidx_dir => $dir,
 		creat => 1, # TODO: get rid of this, should be implicit
-		indexlevel => $l,
 		transact_bytes => 0, # for checkpoint
 		total_bytes => 0, # for lock_release
 		current_info => '',
@@ -617,16 +616,28 @@ sub cidx_init ($) {
 	}
 	$self->lock_acquire;
 	my @shards;
+	my $l = $self->{indexlevel} //= $self->{-opt}->{indexlevel};
+
 	for my $n (0..($self->{nshard} - 1)) {
 		my $shard = bless { %$self, shard => $n }, ref($self);
 		delete @$shard{qw(lockfh lock_path)};
-		$shard->idx_acquire;
+		my $xdb = $shard->idx_acquire;
+		if (!$n) {
+			if (($l // '') eq 'medium') {
+				$xdb->set_metadata('indexlevel', $l);
+			} elsif (($l // '') eq 'full') {
+				$xdb->set_metadata('indexlevel', ''); # unset
+			}
+			$l ||= $xdb->get_metadata('indexlevel') || 'full';
+		}
+		$shard->{indexlevel} = $l;
 		$shard->idx_release;
 		$shard->wq_workers_start("cidx shard[$n]", 1, $SIGSET, {
 			siblings => \@shards, # for ipc_atfork_child
 		}, \&shard_done_wait, $self);
 		push @shards, $shard;
 	}
+	$self->{indexlevel} //= $l;
 	# this warning needs to happen after idx_acquire
 	state $once;
 	warn <<EOM if $PublicInbox::Search::X{CLOEXEC_UNSET} && !$once++;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index f36c8f97..699af432 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -90,7 +90,7 @@ sub new {
 	$self;
 }
 
-sub need_xapian ($) { $_[0]->{indexlevel} =~ $xapianlevels }
+sub need_xapian ($) { ($_[0]->{indexlevel} // 'full') =~ $xapianlevels }
 
 sub idx_release {
 	my ($self, $wake) = @_;
diff --git a/t/cindex.t b/t/cindex.t
index 9da0ba69..d40f73ff 100644
--- a/t/cindex.t
+++ b/t/cindex.t
@@ -4,11 +4,13 @@
 use v5.12;
 use PublicInbox::TestCommon;
 use Cwd qw(getcwd abs_path);
+use List::Util qw(sum);
 require_mods(qw(json Search::Xapian));
 use_ok 'PublicInbox::CodeSearchIdx';
 require PublicInbox::Import;
 my ($tmp, $for_destroy) = tmpdir();
 my $pwd = getcwd();
+my @unused_keys = qw(last_commit has_threadid skip_docdata);
 
 # I reworked CodeSearchIdx->shard_worker to handle empty trees
 # in the initial commit generated by cvs2svn for xapian.git
@@ -71,7 +73,48 @@ ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext", $zp, "$tmp/wt0"]),
 ok(-e "$tmp/ext/cidx.lock", 'external dir created');
 ok(!-d "$zp/.git/public-inbox-cindex", 'no cindex in original coderepo');
 
+ok(run_script([qw(-cindex -L medium --dangerous -q -d),
+	"$tmp/med", $zp, "$tmp/wt0"]), 'cindex external medium');
+
+my $no_metadata_set = sub {
+	my ($i, $extra, $xdb) = @_;
+	for my $xdb (@$xdb) {
+		for my $k (@unused_keys, @$extra) {
+			is($xdb->get_metadata($k) // '', '',
+				"metadata $k unset in shard #$i");
+		}
+		++$i;
+	}
+};
+
+{
+	my $mid_size = sum(map { -s $_ } glob("$tmp/med/cidx*/*/*"));
+	my $full_size = sum(map { -s $_ } glob("$tmp/ext/cidx*/*/*"));
+	ok($full_size > $mid_size, 'full size > mid size') or
+		diag "full=$full_size mid=$mid_size";
+	for my $l (qw(med ext)) {
+		ok(run_script([qw(-cindex -q --reindex -u -d), "$tmp/$l"]),
+			"reindex $l");
+	}
+	$mid_size = sum(map { -s $_ } glob("$tmp/med/cidx*/*/*"));
+	$full_size = sum(map { -s $_ } glob("$tmp/ext/cidx*/*/*"));
+	ok($full_size > $mid_size, 'full size > mid size after reindex') or
+		diag "full=$full_size mid=$mid_size";
+	my $csrch = PublicInbox::CodeSearch->new("$tmp/med");
+	my ($xdb0, @xdb) = $csrch->xdb_shards_flat;
+	$no_metadata_set->(0, [], [ $xdb0 ]);
+	is($xdb0->get_metadata('indexlevel'), 'medium',
+		'indexlevel set in shard #0');
+	$no_metadata_set->(1, ['indexlevel'], \@xdb);
+
+	ok(run_script([qw(-cindex -q -L full --reindex -u -d), "$tmp/med"]),
+		'reindex medium as full');
+	@xdb = $csrch->xdb_shards_flat;
+	$no_metadata_set->(0, ['indexlevel'], \@xdb);
+}
+
 use_ok 'PublicInbox::CodeSearch';
+
 if ('multi-repo search') {
 	my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
 	my $mset = $csrch->mset('NUL');
@@ -86,6 +129,8 @@ if ('multi-repo search') {
 	$mset = $csrch->mset('NUL', { git_dir => abs_path("$zp/.git") });
 	@have = sort(map { $_->get_document->get_data } $mset->items);
 	is_xdeeply(\@have, $exp, 'got expected subjects w/ GIT_DIR filter');
+	my @xdb = $csrch->xdb_shards_flat;
+	$no_metadata_set->(0, ['indexlevel'], \@xdb);
 }
 
 if ('--update') {

^ permalink raw reply related	[relevance 5%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2023-04-07 12:40  7% [PATCH 0/6] cindex fixes, and some spring cleaning Eric Wong
2023-04-07 12:40  5% ` [PATCH 2/6] cindex: preserve indexlevel across invocations Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).