user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 24/28] cindex: add support for --prune
Date: Tue, 21 Mar 2023 23:07:39 +0000	[thread overview]
Message-ID: <20230321230743.3020032-24-e@80x24.org> (raw)
In-Reply-To: <20230321230743.3020032-1-e@80x24.org>

This gets rid of both inaccessible commits AND repositories.
Commits are only unindexed once git itself has pruned them,
so repos with auto GC disabled will need a manual GC first.
---
 lib/PublicInbox/CodeSearchIdx.pm | 86 ++++++++++++++++++++++++++++++--
 t/cindex.t                       | 16 ++++++
 2 files changed, 99 insertions(+), 3 deletions(-)

diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index e875b93e..095c153e 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -43,6 +43,7 @@ our (
 	@RDONLY_SHARDS, # Xapian::Database
 	@IDX_SHARDS, # clones of self
 	$MAX_SIZE,
+	$TMP_GIT, # PublicInbox::Git object for --reindex and --prune
 );
 
 # stop walking history if we see >$SEEN_MAX existing commits, this assumes
@@ -543,7 +544,7 @@ sub git { $_[0]->{git} }
 sub load_existing ($) { # for -u/--update
 	my ($self) = @_;
 	my $dirs = $self->{git_dirs} // [];
-	if ($self->{-opt}->{update}) {
+	if ($self->{-opt}->{update} || $self->{-opt}->{prune}) {
 		local $self->{xdb};
 		$self->xdb or
 			die "E: $self->{cidx_dir} non-existent for --update\n";
@@ -556,6 +557,7 @@ sub load_existing ($) { # for -u/--update
 				undef;
 			}
 		} $self->all_terms('P');
+		@missing = () if $self->{-opt}->{prune};
 		@missing and warn "W: the following repos no longer exist:\n",
 				(map { "W:\t$_\n" } @missing),
 				"W: use --prune to remove them from ",
@@ -612,6 +614,64 @@ sub scan_git_dirs ($) {
 	cidx_reap($self, 0);
 }
 
+sub prune_cb { # git->check_async callback, unindexes non-commit OIDs
+	my ($hex, $type, undef, $self_id) = @_; # $self_id: [ $self, $docid ]
+	if ($type ne 'commit') { # presumably pruned from git; TODO confirm
+		my ($self, $id) = @$self_id;
+		progress($self, "$hex $type");
+		++$self->{pruned}; # tally reported by shard_prune
+		$self->{xdb}->delete_document($id);
+	}
+}
+
+sub shard_prune { # via wq_io_do, dispatched once per shard by do_prune
+	my ($self, $n, $git_dir) = @_; # $n: shard number, used in messages
+	my $op_p = delete($self->{0}) // die 'BUG: no {0} op_p';
+	my $git = PublicInbox::Git->new($git_dir); # TMP_GIT copy
+	$self->begin_txn_lazy;
+	my $xdb = $self->{xdb};
+	my $cur = $xdb->postlist_begin('Tc'); # walk docs having term `Tc'
+	my $end = $xdb->postlist_end('Tc');
+	my ($id, @cmt, $oid);
+	local $self->{pruned} = 0; # incremented by prune_cb
+	for (; $cur != $end && !$DO_QUIT; $cur++) {
+		@cmt = xap_terms('Q', $xdb, $id = $cur->get_docid);
+		scalar(@cmt) == 1 or
+			warn "BUG? shard[$n] #$id has multiple commits: @cmt";
+		for $oid (@cmt) { # queue an object check for each OID
+			$git->check_async($oid, \&prune_cb, [ $self, $id ]);
+		}
+	}
+	$git->async_wait_all; # presumably runs any pending prune_cb calls
+	for my $d ($self->all_terms('P')) { # GIT_DIR paths
+		last if $DO_QUIT;
+		next if -d $d; # directory still exists, keep its docs
+		for $id (docids_by_postlist($self, 'P'.$d)) {
+			progress($self, "$d gone #$id");
+			$xdb->delete_document($id); # not counted in {pruned}
+		}
+	}
+	$self->commit_txn_lazy;
+	$self->{pruned} and
+		progress($self, "[$n] pruned $self->{pruned} commits");
+	send($op_p, "shard_done $n", MSG_EOR); # tell the parent we are done
+}
+
+sub do_prune ($) { # dispatches shard_prune to every shard in @IDX_SHARDS
+	my ($self) = @_;
+	my $consumers = {}; # shard number => PktOp consumer
+	my $git_dir = $TMP_GIT->{git_dir}; # $TMP_GIT: see init_tmp_git_dir
+	my $n = 0;
+	local $self->{-shard_ok} = {};
+	for my $s (@IDX_SHARDS) {
+		my ($c, $p) = PublicInbox::PktOp->pair;
+		$c->{ops}->{shard_done} = [ $self ]; # op sent by shard_prune
+		$s->wq_io_do('shard_prune', [ $p->{op_p} ], $n, $git_dir);
+		$consumers->{$n++} = $c;
+	}
+	wait_consumers($self, $TMP_GIT, $consumers);
+}
+
 sub shards_active { # post_loop_do
 	scalar(grep { $_->{-cidx_quit} } @IDX_SHARDS);
 }
@@ -625,6 +685,25 @@ sub parent_quit {
 	warn "# SIG$_[0] received, quitting...\n";
 }
 
+sub init_tmp_git_dir ($) { # sets $TMP_GIT for --prune and --reindex
+	my ($self) = @_;
+	return unless ($self->{-opt}->{prune} || $self->{-opt}->{reindex});
+	require File::Temp;
+	require PublicInbox::Import;
+	my $tmp = File::Temp->newdir('cidx-all-git-XXXX', TMPDIR => 1);
+	PublicInbox::Import::init_bare("$tmp", 'cidx-all');
+	my $f = "$tmp/objects/info/alternates"; # one objects dir per line
+	open my $fh, '>', $f or die "open($f): $!";
+	my $o;
+	for (@{$self->{git_dirs}}) { # TODO: sha256 check?
+		$o = $_.'/objects';
+		say $fh $o if -d $o; # skip GIT_DIRs lacking objects/
+	}
+	close $fh or die "close($f): $!";
+	$TMP_GIT = PublicInbox::Git->new("$tmp");
+	$TMP_GIT->{-tmp} = $tmp; # hold the File::Temp object with $TMP_GIT
+}
+
 sub cidx_run { # main entry point
 	my ($self) = @_;
 	local $self->{todo} = [];
@@ -634,6 +713,7 @@ sub cidx_run { # main entry point
 		\&PublicInbox::DS::sig_setmask, $SIGSET);
 	local $LIVE = {};
 	local $DO_QUIT;
+	local $TMP_GIT;
 	local @IDX_SHARDS = cidx_init($self);
 	local $self->{current_info} = '';
 	local $MY_SIG = {
@@ -671,8 +751,8 @@ sub cidx_run { # main entry point
 	local $LIVE_JOBS = $self->{-opt}->{jobs} ||
 			PublicInbox::IPC::detect_nproc() || 2;
 	local @RDONLY_SHARDS = $self->xdb_shards_flat;
-
-	# do_prune($self) if $self->{-opt}->{prune}; TODO
+	init_tmp_git_dir($self);
+	do_prune($self) if $self->{-opt}->{prune};
 	scan_git_dirs($self) if $self->{-opt}->{scan} // 1;
 
 	for my $s (@IDX_SHARDS) {
diff --git a/t/cindex.t b/t/cindex.t
index c93e4e4e..5d269217 100644
--- a/t/cindex.t
+++ b/t/cindex.t
@@ -95,4 +95,20 @@ EOM
 	is(scalar($mset->items), 1, 'got updated result');
 }
 
+if ('--prune') { # always-true string, labels this group of tests
+	my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
+	is(scalar($csrch->mset('s:hi')->items), 1, 'got hit');
+
+	rename("$tmp/wt0/.git", "$tmp/wt0/.giit") or xbail "rename $!";
+	ok(run_script([qw(-cindex -q --prune -d), "$tmp/ext"]), 'prune');
+	$csrch->reopen; # reopen before re-querying after -cindex ran
+	is(scalar($csrch->mset('s:hi')->items), 0, 'hit pruned');
+
+	rename("$tmp/wt0/.giit", "$tmp/wt0/.git") or xbail "rename $!";
+	ok(run_script([qw(-cindex -qu -d), "$tmp/ext"]), 'update');
+	$csrch->reopen; # reopen again after the -u run
+	is(scalar($csrch->mset('s:hi')->items), 0,
+		'hit stays pruned since GIT_DIR was previously pruned');
+}
+
 done_testing;

  parent reply	other threads:[~2023-03-21 23:07 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-03-21 23:07 [PATCH 00/28] cindex coderepo commit indexer Eric Wong
2023-03-21 23:07 ` [PATCH 01/28] ipc: move nproc_shards from v2writable Eric Wong
2023-03-21 23:07   ` [PATCH 02/28] search: relocate all_terms from lei_search Eric Wong
2023-03-21 23:07   ` [PATCH 03/28] admin: hoist out resolve_git_dir Eric Wong
2023-03-21 23:07   ` [PATCH 04/28] admin: ensure resolved GIT_DIR is absolute Eric Wong
2023-03-21 23:07   ` [PATCH 05/28] test_common: create_inbox: use `$!' properly on mkdir failure Eric Wong
2023-03-21 23:07   ` [PATCH 06/28] codesearch: initial cut w/ -cindex tool Eric Wong
2023-03-21 23:07   ` [PATCH 07/28] cindex: parallelize prep phases Eric Wong
2023-03-21 23:07   ` [PATCH 08/28] cindex: use read-only shards during " Eric Wong
2023-03-21 23:07   ` [PATCH 09/28] searchidxshard: improve comment wording Eric Wong
2023-03-21 23:07   ` [PATCH 10/28] cindex: use DS and workqueues for parallelism Eric Wong
2023-03-21 23:07   ` [PATCH 11/28] ds: @post_loop_do replaces SetPostLoopCallback Eric Wong
2023-03-21 23:07   ` [PATCH 12/28] cindex: implement --exclude= like -clone Eric Wong
2023-03-21 23:07   ` [PATCH 13/28] cindex: show shard number in progress message Eric Wong
2023-03-21 23:07   ` [PATCH 14/28] cindex: drop `unchanged' " Eric Wong
2023-03-21 23:07   ` [PATCH 15/28] cindex: handle graceful shutdown by default Eric Wong
2023-03-21 23:07   ` [PATCH 16/28] sigfd: pass signal name rather than number to callback Eric Wong
2023-03-21 23:07   ` [PATCH 17/28] cindex: implement --max-size=SIZE Eric Wong
2023-03-21 23:07   ` [PATCH 18/28] cindex: check for checkpoint before giant messages Eric Wong
2023-03-21 23:07   ` [PATCH 19/28] cindex: truncate or drop body for over-sized commits Eric Wong
2023-03-21 23:07   ` [PATCH 20/28] cindex: attempt to give oldest commits lowest docids Eric Wong
2023-03-21 23:07   ` [PATCH 21/28] cindex: improve granularity of quit checks Eric Wong
2023-03-21 23:07   ` [PATCH 22/28] spawn: show failing directory for chdir failures Eric Wong
2023-03-21 23:07   ` [PATCH 23/28] cindex: filter out non-existent git directories Eric Wong
2023-03-21 23:07   ` Eric Wong [this message]
2023-03-21 23:07   ` [PATCH 25/28] cindex: implement reindex Eric Wong
2023-03-21 23:07   ` [PATCH 26/28] cindex: squelch incompatible options Eric Wong
2023-03-21 23:07   ` [PATCH 27/28] cindex: respect existing permissions Eric Wong
2023-03-21 23:07   ` [PATCH 28/28] cindex: ignore SIGPIPE Eric Wong
2023-03-24 10:40     ` [PATCH 29/28] cindex: --prune checkpoints to avoid OOM Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230321230743.3020032-24-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).