user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 11/18] cindex: delay associate until prune+indexing finish
Date: Mon, 13 Nov 2023 13:15:44 +0000	[thread overview]
Message-ID: <20231113131551.843230-12-e@80x24.org> (raw)
In-Reply-To: <20231113131551.843230-1-e@80x24.org>

Prune can get rid of invalid commits while indexing can add new
candidates for association, so we don't dump coderepo roots for
association until both prune and indexing have finished.  However,
we can dump inbox info right away since we don't touch inboxes
while -cindex is running.
---
 lib/PublicInbox/CidxComm.pm      |  6 ++--
 lib/PublicInbox/CodeSearchIdx.pm | 48 +++++++++++++++++++++-----------
 2 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/lib/PublicInbox/CidxComm.pm b/lib/PublicInbox/CidxComm.pm
index c7ab3c10..80a235e9 100644
--- a/lib/PublicInbox/CidxComm.pm
+++ b/lib/PublicInbox/CidxComm.pm
@@ -13,8 +13,8 @@ use parent qw(PublicInbox::DS);
 use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT);
 
 sub new {
-	my ($cls, $rd, $cidx) = @_;
-	my $self = bless { cidx => $cidx }, $cls;
+	my ($cls, $rd, $cidx, $drs) = @_;
+	my $self = bless { cidx => $cidx, drs => $drs }, $cls;
 	$self->SUPER::new($rd, EPOLLIN|EPOLLONESHOT);
 }
 
@@ -22,7 +22,7 @@ sub event_step {
 	my ($self) = @_;
 	my $rd = $self->{sock} // return warn('BUG?: no {sock}');
 	$self->close; # EPOLL_CTL_DEL
-	delete($self->{cidx})->cidx_read_comm($rd);
+	delete($self->{cidx})->cidx_read_comm($rd, delete $self->{drs});
 }
 
 1;
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 0bd26af2..04c514fe 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -76,8 +76,8 @@ our (
 	$NPROC,
 	$XHC, # XapClient
 	$REPO_CTX, # current repo being indexed in shards
-	$IDX_TODO, # [ $git0, $root0, $git1, $root1, ...]
-	$GIT_TODO, # [ GIT_DIR0, GIT_DIR1, ...]
+	$IDX_TODO, # PublicInbox::Git object arrayref
+	$GIT_TODO, # PublicInbox::Git object arrayref
 	%ALT_FH, # hexlen => tmp IO for TMPDIR git alternates
 	$TMPDIR, # File::Temp->newdir object for prune
 	@PRUNE_QUEUE, # GIT_DIRs to prepare for pruning
@@ -337,8 +337,10 @@ sub prune_done { # called via prune_do completion
 	return if $DO_QUIT || !$PRUNE_DONE;
 	die "BUG: \$PRUNE_DONE->[$n] already defined" if $PRUNE_DONE->[$n];
 	$PRUNE_DONE->[$n] = 1;
-	grep(defined, @$PRUNE_DONE) == @IDX_SHARDS and
-		progress($self, 'prune done')
+	if (grep(defined, @$PRUNE_DONE) == @IDX_SHARDS) {
+		progress($self, 'prune done');
+		index_next($self); # may kick dump_roots_start
+	}
 }
 
 sub seen ($$) {
@@ -506,10 +508,15 @@ sub assoc_max_init ($) {
 	$max < 0 ? ((2 ** 31) - 1) : $max;
 }
 
+sub start_xhc () {
+	my ($xhc, $pid) = PublicInbox::XapClient::start_helper("-j$NPROC");
+	awaitpid($pid, \&cmd_done, ['xap_helper', "-j$NPROC"]);
+	$xhc;
+}
+
 sub dump_roots_start {
 	my ($self, $associate) = @_;
-	($XHC, my $pid) = PublicInbox::XapClient::start_helper("-j$NPROC");
-	awaitpid($pid, \&cmd_done, ['xap_helper', "-j$NPROC"]);
+	$XHC //= start_xhc;
 	$associate // die 'BUG: no $associate';
 	$TODO{associating} = 1; # keep shards_active() happy
 	progress($self, 'dumping IDs from coderepos');
@@ -559,6 +566,7 @@ EOM
 
 sub dump_ibx_start {
 	my ($self, $associate) = @_;
+	$XHC //= start_xhc;
 	my ($sort_opt, $fold_opt);
 	pipe(local $sort_opt->{0}, $DUMP_IBX_WPIPE);
 	pipe(local $fold_opt->{0}, local $sort_opt->{1});
@@ -581,11 +589,10 @@ sub index_next ($) {
 		fp_start($self, $git, $prep_repo);
 		ct_start($self, $git, $prep_repo);
 	} elsif ($TMPDIR) {
-		return if delete($TODO{dump_roots_start});
+		delete $TODO{dump_roots_start};
 		delete $TODO{dump_ibx_start}; # runs OnDestroy once
 		return dump_ibx($self, shift @IBXQ) if @IBXQ;
 		undef $DUMP_IBX_WPIPE; # done dumping inboxes
-		undef $XHC;
 		delete $TODO{associate};
 	}
 	# else: wait for shards_active (post_loop_do) callback
@@ -604,7 +611,7 @@ sub next_repos { # OnDestroy cb
 }
 
 sub index_done { # OnDestroy cb called when done indexing each code repo
-	my ($repo_ctx) = @_;
+	my ($repo_ctx, $drs) = @_;
 	my ($self, $repo, $active) = @$repo_ctx{qw(self repo active)};
 
 	return if $DO_QUIT;
@@ -615,6 +622,7 @@ sub index_done { # OnDestroy cb called when done indexing each code repo
 	$active->{$n} = undef;
 	my ($c, $p) = PublicInbox::PktOp->pair;
 	$c->{ops}->{repo_stored} = [ $self, $repo_ctx ];
+	$c->{-cidx_dump_roots_start} = $drs if $drs;
 	$IDX_SHARDS[$n]->wq_io_do('store_repo', [ $p->{op_p} ], $repo);
 	# repo_stored will fire once store_repo is done
 }
@@ -638,8 +646,9 @@ sub index_repo { # run_git cb
 	$repo->{git_dir} = $git->{git_dir};
 	my $repo_ctx = $REPO_CTX = { self => $self, repo => $repo };
 	delete $git->{-cidx_gits_fini}; # may fire gits_fini
+	my $drs = delete $git->{-cidx_dump_roots_start};
 	my $index_done = PublicInbox::OnDestroy->new($$, \&index_done,
-							$repo_ctx);
+							$repo_ctx, $drs);
 	my ($c, $p) = PublicInbox::PktOp->pair;
 	$c->{ops}->{shard_done} = [ $self, $repo_ctx, $index_done ];
 	for my $n (0..$#shard_in) {
@@ -738,6 +747,7 @@ EOM
 	@shards;
 }
 
+# called when all git coderepos are done
 sub gits_fini {
 	undef $GITS_NR;
 	PublicInbox::DS::enqueue_reap(); # kick @post_loop_do
@@ -749,6 +759,9 @@ sub scan_git_dirs ($) {
 	$GITS_NR = @$GIT_TODO;
 	my $gits_fini = PublicInbox::OnDestroy->new($$, \&gits_fini);
 	$_->{-cidx_gits_fini} = $gits_fini for @$GIT_TODO;
+	if (my $drs = $TODO{dump_roots_start}) {
+		$_->{-cidx_dump_roots_start} = $drs for @$GIT_TODO;
+	}
 	progress($self, "scanning $GITS_NR code repositories...");
 }
 
@@ -797,7 +810,7 @@ sub kill_shards { $_->wq_kill(@_) for (@IDX_SHARDS) }
 
 sub parent_quit {
 	$DO_QUIT = POSIX->can("SIG$_[0]")->();
-	$XHC = undef;
+	$XHC = 0; # stops the process
 	kill_shards(@_);
 	warn "# SIG$_[0] received, quitting...\n";
 }
@@ -870,6 +883,7 @@ sub cmd_done { # run_await cb for sort, xapian-delve, sed failures
 sub associate {
 	my ($self) = @_;
 	return if $DO_QUIT;
+	$XHC = 0; # should not be recreated again
 	@IDX_SHARDS or return warn("# aborting on no shards\n");
 	unlink("$TMPDIR/root2id");
 	my @pending = keys %{$self->{PENDING}};
@@ -949,7 +963,8 @@ sub init_prune ($) {
 	require_progs('prune', 'xapian-delve' => \@delve, sed => \@sed,
 			comm => \@COMM, awk => \@AWK);
 	for (0..$#IDX_SHARDS) { push @delve, "$self->{xpfx}/$_" }
-	my $run_prune = PublicInbox::OnDestroy->new($$, \&run_prune, $self);
+	my $run_prune = PublicInbox::OnDestroy->new($$, \&run_prune, $self,
+						$TODO{dump_roots_start});
 	my ($sort_opt, $sed_opt, $delve_opt);
 	pipe(local $sed_opt->{0}, local $delve_opt->{1});
 	pipe(local $sort_opt->{0}, local $sed_opt->{1});
@@ -975,7 +990,7 @@ sub dump_git_commits { # run_await cb
 }
 
 sub run_prune { # OnDestroy when `git config extensions.objectFormat' are done
-	my ($self) = @_;
+	my ($self, $drs) = @_;
 	return if $DO_QUIT;
 	# setup the following pipeline: (
 	#	git --git-dir=hexlen40.git cat-file \
@@ -991,7 +1006,7 @@ sub run_prune { # OnDestroy when `git config extensions.objectFormat' are done
 	run_await(\@AWK, $CMD_ENV, $awk_opt, \&cmd_done);
 	run_await([@SORT, '-u'], $CMD_ENV, $sort_opt, \&cmd_done);
 	my $comm_rd = popen_rd(\@COMM, $CMD_ENV, $comm_opt, \&cmd_done, \@COMM);
-	PublicInbox::CidxComm->new($comm_rd, $self); # calls cidx_read_comm
+	PublicInbox::CidxComm->new($comm_rd, $self, $drs); # ->cidx_read_comm
 	my $git_ver = PublicInbox::Git::git_version();
 	push @PRUNE_BATCH, '--buffer' if $git_ver ge v2.6;
 
@@ -1007,7 +1022,7 @@ EOM
 }
 
 sub cidx_read_comm { # via PublicInbox::CidxComm::event_step
-	my ($self, $comm_rd) = @_;
+	my ($self, $comm_rd, $drs) = @_;
 	return if $DO_QUIT;
 	$_->wq_do('prune_init') for @IDX_SHARDS;
 	while (defined(my $cmt = <$comm_rd>)) {
@@ -1022,6 +1037,7 @@ sub cidx_read_comm { # via PublicInbox::CidxComm::event_step
 	}
 	my ($c, $p) = PublicInbox::PktOp->pair;
 	$c->{ops}->{prune_done} = [ $self ];
+	$c->{-cidx_dump_roots_start} = $drs;
 	$_->wq_io_do('prune_commit', [ $p->{op_p} ]) for @IDX_SHARDS;
 }
 
@@ -1103,8 +1119,8 @@ sub show_roots { # for diagnostics
 
 sub do_inits { # called via PublicInbox::DS::add_timer
 	my ($self) = @_;
-	init_prune($self);
 	init_associate_postfork($self);
+	init_prune($self);
 	scan_git_dirs($self) if $self->{-opt}->{scan} // 1;
 	my $max = $TODO{associate} ? max($LIVE_JOBS, $NPROC) : $LIVE_JOBS;
 	index_next($self) for (1..$max);

  parent reply	other threads:[~2023-11-13 13:15 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-11-13 13:15 [PATCH 00/18] cindex: some --associate work Eric Wong
2023-11-13 13:15 ` [PATCH 01/18] cindex: check `say' errors w/ close or ->flush Eric Wong
2023-11-13 13:15 ` [PATCH 02/18] tmpfile: check `stat' errors, use autodie for unlink Eric Wong
2023-11-13 13:15 ` [PATCH 03/18] cindex: use `local' for pipes between processes Eric Wong
2023-11-13 13:15 ` [PATCH 04/18] xap_helper_cxx: use write_file helper Eric Wong
2023-11-13 13:15 ` [PATCH 05/18] xap_helper_cxx: make the build process ccache-friendly Eric Wong
2023-11-13 13:15 ` [PATCH 06/18] xap_helper_cxx: use -pipe by default in CXXFLAGS Eric Wong
2023-11-13 13:15 ` [PATCH 07/18] xap_client: spawn C++ xap_helper directly Eric Wong
2023-11-13 13:15 ` [PATCH 08/18] treewide: update read_all to avoid eof|close checks Eric Wong
2023-11-13 13:15 ` [PATCH 09/18] spawn: don't append to scalarrefs on stdout/stderr Eric Wong
2023-11-13 13:15 ` [PATCH 10/18] cindex: imply --all with --associate w/o -I/--only Eric Wong
2023-11-13 13:15 ` Eric Wong [this message]
2023-11-13 13:15 ` [PATCH 12/18] xap_helper: Perl dump_ibx respects `-m MAX' Eric Wong
2023-11-13 13:15 ` [PATCH 13/18] cidx_xap_helper_aux: complain about truncated inputs Eric Wong
2023-11-13 13:15 ` [PATCH 14/18] xap_helper: stricter and harsher error handling Eric Wong
2023-11-13 13:15 ` [PATCH 15/18] xap_helper: better variable naming for key buffer Eric Wong
2023-11-13 13:15 ` [PATCH 16/18] cindex: do not guess integer maximum for Xapian Eric Wong
2023-11-13 13:15 ` [PATCH 17/18] cindex: rename associate-max => window Eric Wong
2023-11-13 13:15 ` [PATCH 18/18] cindex: support --associate-aggressive shortcut Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231113131551.843230-12-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).