From 0ab999b988c62b701bde26a8bad2b75da0a7ac43 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 5 Dec 2023 09:46:23 +0000 Subject: cindex: index full (40/64 char) hex blob OIDs This future proofs the index against git auto-abbreviation needing more characters as the repo grows. It'll be useful for joining against inboxes using dfpre. As with emails, we'll continue indexing abbreviated blob OIDs down to 7 hex characters so a SHA-1 git repo will have all abbreviations of the OID from 7-39 hex characters in addition to the 40 character unabbreviated form. --- lib/PublicInbox/CodeSearchIdx.pm | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'lib/PublicInbox') diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm index ec0fc6e3..20aac584 100644 --- a/lib/PublicInbox/CodeSearchIdx.pm +++ b/lib/PublicInbox/CodeSearchIdx.pm @@ -107,6 +107,8 @@ our ( $DUMP_IBX_WPIPE, # goes to sort(1) $ANY_SHARD, # shard round-robin for scan fingerprinting @OFF2ROOT, + $GIT_VER, + @NO_ABBREV, ); # stop walking history if we see >$SEEN_MAX existing commits, this assumes @@ -304,7 +306,7 @@ sub shard_index { # via wq_io_do in IDX_SHARDS my $in = delete($self->{0}) // die 'BUG: no {0} input'; my $op_p = delete($self->{1}) // die 'BUG: no {1} op_p'; sysseek($in, 0, SEEK_SET); - my $cmd = $git->cmd(@LOG_STDIN); + my $cmd = $git->cmd(@NO_ABBREV, @LOG_STDIN); my $rd = popen_rd($cmd, undef, { 0 => $in }, \&cidx_reap_log, $cmd, $self, $op_p); PublicInbox::CidxLogP->new($rd, $self, $git, $roots); @@ -1151,15 +1153,14 @@ sub run_prune { # OnDestroy when `git config extensions.objectFormat' are done run_await([@SORT, '-u'], $CMD_ENV, $sort_opt, \&cmd_done); my $comm_rd = popen_rd(\@COMM, $CMD_ENV, $comm_opt, \&cmd_done, \@COMM); PublicInbox::CidxComm->new($comm_rd, $self, $drs); # ->cidx_read_comm - my $git_ver = PublicInbox::Git::git_version(); - push @PRUNE_BATCH, '--buffer' if $git_ver ge v2.6; + push @PRUNE_BATCH, '--buffer' if $GIT_VER ge v2.6; # Yes, we pipe --unordered git output to sort(1) because sorting # inside git leads to orders-of-magnitude slowdowns on rotational # storage. GNU sort(1) also works well on larger-than-memory # datasets, and it's not worth eliding sort(1) for old git. - push @PRUNE_BATCH, '--unordered' if $git_ver ge v2.19; - warn(sprintf(<{-opt}->{batch_size} // $PublicInbox::SearchIdx::BATCH_BYTES; @@ -1289,6 +1290,8 @@ sub cidx_run { # main entry point local $self->{PENDING} = {}; # used by PublicInbox::CidxXapHelperAux my $cfg = $self->{-opt}->{-pi_cfg} // die 'BUG: -pi_cfg unset'; $self->{-cfg_f} = $cfg->{-f} = rel2abs_collapsed($cfg->{-f}); + local $GIT_VER = PublicInbox::Git::git_version(); + @NO_ABBREV = ('-c', 'core.abbrev='.($GIT_VER lt v2.31.0 ? 40 : 'no')); if (grep { $_ } @{$self->{-opt}}{qw(prune join)}) { require File::Temp; $TMPDIR = File::Temp->newdir('cidx-all-git-XXXX', TMPDIR => 1); -- cgit v1.2.3-24-ge0c7