user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 5/7] cindex: fix sorting and uniqueness
Date: Thu, 24 Aug 2023 01:22:34 +0000	[thread overview]
Message-ID: <20230824012236.3968030-6-e@80x24.org> (raw)
In-Reply-To: <20230824012236.3968030-1-e@80x24.org>

We can't rely on combining the `-u' and `-k1,1' switches of POSIX
sort(1) to do what we want.  So only rely on `sort -k1,1' while
introducing a small Perl helper to fold identical prefixes into
one line.  In other words, input such as:

  deadbeef 0
  deadbeef 1
  deadbeef 2

Was getting deduplicated into a single line:

  deadbeef 0

... with `sort -u -k1,1'
This makes puts the output into a more optimal form for eventual
(not-fully-implemented-yet) parsing:

  deadbeef 0,1,2

ORS is current the comma (`,') for inbox IDs, but it'll be a
space (` ') for coderepo root IDs.  This implementation also
combines identical IDs in the 2nd column.  Thus:

  deadbeef 0
  deadbeef 0

Becomes a single `deadbeef 0' line thanks to the use of
XS List::Util::uniq (which beats a pure Perl hash).

I attempted to implement this in awk but Perl is close enough to
gawk in performance while being shorter and easier-to-understand
due to List::Util::uniq.  mawk was faster, but still not enough
to matter as the bottleneck is from iterating through Xapian
MSets.
---
 lib/PublicInbox/CodeSearchIdx.pm | 63 +++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 18 deletions(-)

diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index b8afecd2..4a41b1da 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -21,12 +21,14 @@
 #
 # * to_root_id - each line is of the format:
 #
-#	$PFX $ROOT_COMMIT_OID_ID
+#	$PFX $ROOT_COMMIT_OID_IDS
 #
 # * to_ibx_id - each line is of the format:
 #
-#	$PFX $IBX_ID
+#	$PFX $IBX_IDS
 #
+# $IBX_IDS is a comma-delimited list of integers ($IBX_ID)
+# $ROOT_COMMIT_OID_IDS is space-delimited
 # In both cases, $PFX is typically the value of the patchid (XDFID) but it
 # can be configured to use any combination of patchid, dfpre, dfpost or
 # dfblob.
@@ -134,6 +136,20 @@ sub new {
 	$self;
 }
 
+# This is similar to uniq(1) on the first column, but combines the
+# contents of subsequent columns using $OFS.
+our @UNIQ_FOLD = ($^X, $^W ? ('-w') : (), qw(-MList::Util=uniq -ane), <<'EOM');
+BEGIN { $ofs = $ENV{OFS} // ','; $apfx = '' }
+if ($F[0] eq $apfx) {
+	shift @F;
+	push @ids, @F;
+} else {
+	print $apfx.' '.join($ofs, uniq(@ids))."\n" if @ids;
+	($apfx, @ids) = @F;
+}
+END { print $apfx.' '.join($ofs, uniq(@ids))."\n" if @ids }
+EOM
+
 # TODO: may be used for reshard/compact
 sub count_shards { scalar($_[0]->xdb_shards_flat) }
 
@@ -519,16 +535,21 @@ sub dump_roots_once {
 	@ID2ROOT = map { pack('H*', $_) } $self->all_terms('G');
 	my $id = 0;
 	my %root2id = map { $_ => $id++ } @ID2ROOT;
-	pipe(my ($r, $w)) or die "pipe: $!";
+	# dump_shard_roots | sort -k1,1 | OFS=' ' uniq_fold >to_root_id
+	pipe(my ($sort_r, $sort_w)) or die "pipe: $!";
+	pipe(my ($fold_r, $fold_w)) or die "pipe: $!";
 	my @sort = (@SORT, '-k1,1');
 	my $dst = "$TMPDIR/to_root_id";
 	open my $fh, '>', $dst or die "open($dst): $!";
-	my $sort_pid = spawn(\@sort, $CMD_ENV, { 0 => $r, 1 => $fh });
-	close $r or die "close: $!";
+	my $env = { %$CMD_ENV, OFS => ' ' };
+	my $sort_pid = spawn(\@sort, $CMD_ENV, { 0 => $sort_r, 1 => $fold_w });
+	my $fold_pid = spawn(\@UNIQ_FOLD, $env, { 0 => $fold_r, 1 => $fh });
 	awaitpid($sort_pid, \&cmd_done, \@sort, $associate);
+	awaitpid($fold_pid, \&cmd_done, [@UNIQ_FOLD, '(shards)'], $associate);
 	my ($c, $p) = PublicInbox::PktOp->pair;
 	$c->{ops}->{dump_shard_roots_done} = [ $self, $associate ];
-	my @arg = ('dump_shard_roots', [ $p->{op_p}, $w ], \%root2id, $QRY_STR);
+	my @arg = ('dump_shard_roots', [ $p->{op_p}, $sort_w ],
+			\%root2id, $QRY_STR);
 	$_->wq_io_do(@arg) for @IDX_SHARDS;
 	progress($self, 'waiting on dump_shard_roots sort');
 }
@@ -549,12 +570,15 @@ sub dump_ibx { # sends to xap_helper.h
 sub dump_ibx_start {
 	my ($self, $associate) = @_;
 	pipe(my $sort_r, $DUMP_IBX_WPIPE) or die "pipe: $!";
-	my @sort = (@SORT, '-k1,1');
+	pipe(my ($fold_r, $fold_w)) or die "pipe: $!";
+	my @sort = (@SORT, '-k1,1'); # sort only on ASSOC_PFX
+	# pipeline: dump_ibx | sort -k1,1 | uniq_fold >to_ibx_id
 	my $dst = "$TMPDIR/to_ibx_id";
 	open my $fh, '>', $dst or die "open($dst): $!";
-	my $sort_pid = spawn(\@sort, $CMD_ENV, { 0 => $sort_r, 1 => $fh });
-	close $sort_r or die "close: $!";
+	my $sort_pid = spawn(\@sort, $CMD_ENV, { 0 => $sort_r, 1 => $fold_w });
+	my $fold_pid = spawn(\@UNIQ_FOLD, $CMD_ENV, { 0 => $fold_r, 1 => $fh });
 	awaitpid($sort_pid, \&cmd_done, \@sort, $associate);
+	awaitpid($fold_pid, \&cmd_done, [@UNIQ_FOLD, '(ibx)'], $associate);
 	($XHC, $XH_PID) = PublicInbox::XapClient::start_helper("-j$NPROC");
 }
 
@@ -869,9 +893,11 @@ sub associate {
 	my @join = ('time', @JOIN, 'to_ibx_id', 'to_root_id');
 	my $rd = popen_rd(\@join, $CMD_ENV, { -C => "$TMPDIR" });
 	my %score;
-	while (<$rd>) { # PFX ibx_id root_id
-		my (undef, $ibx_id, @root_id) = split(/ /, $_);
-		++$score{"$ibx_id $_"} for @root_id;
+	while (<$rd>) { # PFX ibx_ids root_id
+		my (undef, $ibx_ids, @root_ids) = split(/ /, $_);
+		for my $ibx_id (split(/,/, $ibx_ids)) {
+			++$score{"$ibx_id $_"} for @root_ids;
+		}
 	}
 	close $rd or die "@join failed: $?=$?";
 	my $min = $self->{-opt}->{'assoc-min'} // 10;
@@ -939,9 +965,10 @@ sub init_prune ($) {
 	my $run_prune = PublicInbox::OnDestroy->new($$, \&run_prune, $self);
 	pipe(my ($sed_in, $delve_out)) or die "pipe: $!";
 	pipe(my ($sort_in, $sed_out)) or die "pipe: $!";
+	my @sort_u = (@SORT, '-u');
 	open(my $sort_out, '+>', "$TMPDIR/indexed_commits") or die "open: $!";
-	my $pid = spawn(\@SORT, $CMD_ENV, { 0 => $sort_in, 1 => $sort_out });
-	awaitpid($pid, \&cmd_done, \@SORT, $run_prune);
+	my $pid = spawn(\@sort_u, $CMD_ENV, { 0 => $sort_in, 1 => $sort_out });
+	awaitpid($pid, \&cmd_done, \@sort_u, $run_prune);
 	$pid = spawn(\@sed, $CMD_ENV, { 0 => $sed_in, 1 => $sed_out });
 	awaitpid($pid, \&cmd_done, \@sed, $run_prune);
 	$pid = spawn(\@delve, undef, { 1 => $delve_out });
@@ -971,12 +998,13 @@ sub run_prune { # OnDestroy when `git config extensions.objectFormat' are done
 	pipe(my ($sort_in, $awk_out)) or die "pipe: $!";
 	pipe(my ($comm_in, $sort_out)) or die "pipe: $!";
 	my $awk_pid = spawn(\@AWK, $CMD_ENV, { 0 => $awk_in, 1 => $awk_out });
-	my $sort_pid = spawn(\@SORT, $CMD_ENV,
+	my @sort_u = (@SORT, '-u');
+	my $sort_pid = spawn(\@sort_u, $CMD_ENV,
 				{ 0 => $sort_in, 1 => $sort_out });
 	my ($comm_rd, $comm_pid) = popen_rd(\@COMM, $CMD_ENV,
 				{ 0 => $comm_in, -C => "$TMPDIR" });
 	awaitpid($awk_pid, \&cmd_done, \@AWK);
-	awaitpid($sort_pid, \&cmd_done, \@SORT);
+	awaitpid($sort_pid, \&cmd_done, \@sort_u);
 	awaitpid($comm_pid, \&cmd_done, \@COMM);
 	PublicInbox::CidxComm->new($comm_rd, $self); # calls cidx_read_comm
 	my $git_ver = PublicInbox::Git::git_version();
@@ -1092,10 +1120,9 @@ sub cidx_run { # main entry point
 	local ($DO_QUIT, $REINDEX, $TXN_BYTES, @GIT_DIR_GONE, @PRUNE_QUEUE,
 		$REPO_CTX, %ALT_FH, $TMPDIR, @AWK, @COMM, $CMD_ENV,
 		%TODO, @IBXQ, @IBX, @JOIN, @ASSOC_PFX, $DUMP_IBX_WPIPE,
-		@ID2ROOT, @DUMP_SHARD_ROOTS_OK, $XH_PID, $XHC);
+		@ID2ROOT, @DUMP_SHARD_ROOTS_OK, $XH_PID, $XHC, @SORT);
 	local $BATCH_BYTES = $self->{-opt}->{batch_size} //
 				$PublicInbox::SearchIdx::BATCH_BYTES;
-	local @SORT = (undef, '-u');
 	local $self->{ASSOC_PFX} = \@ASSOC_PFX;
 	local $self->{PENDING} = {};
 	local $self->{-pi_cfg};

  parent reply	other threads:[~2023-08-24  1:22 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-08-24  1:22 [PATCH 0/7] cindex: optional C++ Xapian helper Eric Wong
2023-08-24  1:22 ` [PATCH 1/7] search: hoist out shards_dir for future use Eric Wong
2023-08-24  1:22 ` [PATCH 2/7] cindex: read-only association dump Eric Wong
2023-08-24  1:22 ` [PATCH 3/7] cindex: add --show-roots switch Eric Wong
2023-08-24  1:22 ` [PATCH 4/7] introduce optional C++ xap_helper Eric Wong
2023-08-24 11:23   ` Štěpán Němec
2023-08-24 11:49     ` Eric Wong
2023-08-24  1:22 ` Eric Wong [this message]
2023-08-24  1:22 ` [PATCH 6/7] cindex: implement dump_roots in C++ Eric Wong
2023-08-24  1:22 ` [PATCH 7/7] xap_helper: reopen+retry in MSetIterator loops Eric Wong
2023-08-24 12:30 ` [PATCH 8/7] drop unused CidxRecvIbx.pm Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230824012236.3968030-6-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).