diff options
author | Eric Wong <e@80x24.org> | 2023-04-19 21:54:48 +0000 |
---|---|---|
committer | Eric Wong <e@80x24.org> | 2023-04-20 18:22:55 +0000 |
commit | cbd13f06bf17357a9e160a6ae54a3f532e9eb068 (patch) | |
tree | b90607adbd26eccdafa656046c82d4d83767d9a4 | |
parent | c0d0d5b4131e5b18e0d5ae9877460204a8cd2f3c (diff) | |
download | public-inbox-cbd13f06bf17357a9e160a6ae54a3f532e9eb068.tar.gz |
This special support is only needed for --prune at the moment, since the indexing side works on a per-repo basis. There are no automated tests yet, but it seems to work well on my sha256 projects when sharing a cindex with sha1 projects.
-rw-r--r-- | lib/PublicInbox/CodeSearchIdx.pm | 100 | ||||
-rw-r--r-- | lib/PublicInbox/Git.pm | 4 | ||||
-rw-r--r-- | lib/PublicInbox/Import.pm | 10 |
3 files changed, 79 insertions, 35 deletions
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm index 78032c00..54dbf785 100644 --- a/lib/PublicInbox/CodeSearchIdx.pm +++ b/lib/PublicInbox/CodeSearchIdx.pm @@ -31,6 +31,7 @@ use PublicInbox::Config qw(glob2re); use PublicInbox::Spawn qw(spawn popen_rd); use PublicInbox::OnDestroy; use PublicInbox::CidxLogP; +use PublicInbox::Git qw(%OFMT2HEXLEN); use Socket qw(MSG_EOR); use Carp (); our ( @@ -44,7 +45,6 @@ our ( @RDONLY_XDB, # Xapian::Database @IDX_SHARDS, # clones of self $MAX_SIZE, - $TMP_GIT, # PublicInbox::Git object for --prune $REINDEX, # PublicInbox::SharedKV @GIT_DIR_GONE, # [ git_dir1, git_dir2 ] %TO_PRUNE, # (docid => docid) mapping (hash in case of retry_reopen) @@ -58,6 +58,9 @@ our ( %ACTIVE_GIT_DIR, # GIT_DIR => undef mapping for prune $IDX_TODO, # [ $git0, $root0, $git1, $root1, ...] $GIT_TODO, # [ GIT_DIR0, GIT_DIR1, ...] + %HEXLEN2TMPGIT, # ((40|64) => PublicInbox::Git for prune) + %ALT_FH, # '', or 'sha256' => tmp IO for TMPGIT alternates + $TMPDIR, # File::Temp->newdir object ); # stop walking history if we see >$SEEN_MAX existing commits, this assumes @@ -701,28 +704,33 @@ sub event_step { # may be requeued via DS scalar(@cmt) == 1 or warn "BUG? 
shard[$self->{shard}] #$PRUNE_CUR has multiple commits: @cmt"; for my $o (@cmt) { - $TMP_GIT->check_async($o, \&prune_cb, - [$self, $PRUNE_CUR]) + $HEXLEN2TMPGIT{length($o)}->check_async($o, + \&prune_cb, [$self, $PRUNE_CUR]) } } } - $TMP_GIT->async_wait_all; + $_->async_wait_all for (values %HEXLEN2TMPGIT); cidx_ckpoint($self); return PublicInbox::DS::requeue($self) if $PRUNE_CUR <= $PRUNE_MAX; send($PRUNE_OP_P, "prune_done $self->{shard}", MSG_EOR); $PRUNE_NR //= 0; progress($self, "prune [$self->{shard}] $PRUNE_NR done"); - $TMP_GIT->cleanup; - $TMP_GIT = $PRUNE_OP_P = $PRUNE_CUR = $PRUNE_MAX = undef; - %ACTIVE_GIT_DIR = (); + $_->cleanup for (values %HEXLEN2TMPGIT); + $PRUNE_OP_P = $PRUNE_CUR = $PRUNE_MAX = undef; + undef %ACTIVE_GIT_DIR; + undef %HEXLEN2TMPGIT; } sub prune_start { # via wq_io_do in IDX_SHARDS - my ($self, $git_dir, @active_git_dir) = @_; + my ($self, $tmpdir, @active_git_dir) = @_; $PRUNE_CUR = 1; $PRUNE_OP_P = delete $self->{0} // die 'BUG: no {0} op_p'; %ACTIVE_GIT_DIR = map { $_ => undef } @active_git_dir; - $TMP_GIT = PublicInbox::Git->new($git_dir); # TMP_GIT copy + for my $git_dir (<$tmpdir/*.git>) { + my ($hexlen) = ($git_dir =~ m!/hexlen([0-9]+)\.git\z!); + $hexlen or die "BUG: no hexlen in $git_dir"; + $HEXLEN2TMPGIT{$hexlen} = PublicInbox::Git->new($git_dir); + } $self->begin_txn_lazy; $PRUNE_MAX = $self->{xdb}->get_lastdocid // 1; event_step($self); @@ -750,24 +758,6 @@ sub parent_quit { warn "# SIG$_[0] received, quitting...\n"; } -sub init_tmp_git_dir ($) { - my ($self) = @_; - require File::Temp; - require PublicInbox::Import; - my $tmp = File::Temp->newdir('cidx-all-git-XXXX', TMPDIR => 1); - PublicInbox::Import::init_bare("$tmp", 'cidx-all'); - my $f = "$tmp/objects/info/alternates"; - open my $fh, '>', $f or die "open($f): $!"; - my $o; - for (@{$self->{git_dirs}}) { # TODO: sha256 check? 
- $o = $_.'/objects'; - say $fh $o if -d $o; - } - close $fh or die "close($f): $!"; - $TMP_GIT = PublicInbox::Git->new("$tmp"); - $TMP_GIT->{-tmp} = $tmp; -} - sub prep_umask ($) { my ($self) = @_; if ($self->{-cidx_internal}) { # respect core.sharedRepository @@ -789,16 +779,60 @@ sub prep_umask ($) { } } -sub start_prune ($) { +sub prep_alternate { # awaitpid callback for config extensions.objectFormat + my ($pid, $objdir, $out, $send_prune) = @_; + my $status = $? >> 8; + my $fmt; + if ($status == 1) { # unset, default is '' (SHA-1) + $fmt = 'sha1'; + } elsif ($status == 0) { + seek($out, 0, SEEK_SET) or die "seek: $!"; + chomp($fmt = <$out> // 'sha1'); + } else { + return warn("git config \$?=$? for objdir=$objdir"); + } + my $hexlen = $OFMT2HEXLEN{$fmt} // return warn <<EOM; +E: ignoring objdir=$objdir, unknown extensions.objectFormat=$fmt +EOM + unless ($ALT_FH{$fmt}) { + my $git_dir = "$TMPDIR/hexlen$hexlen.git"; + PublicInbox::Import::init_bare($git_dir, 'cidx-all', $fmt); + my $f = "$git_dir/objects/info/alternates"; + open $ALT_FH{$fmt}, '>', $f or die "open($f): $!"; + } + say { $ALT_FH{$fmt} } $out or die "say: $!"; + # send_prune fires on the last one +} + +sub init_prune ($) { my ($self) = @_; return (@$PRUNE_DONE = map { 1 } @IDX_SHARDS) if !$self->{-opt}->{prune}; - init_tmp_git_dir($self); + + require File::Temp; + require PublicInbox::Import; + $TMPDIR = File::Temp->newdir('cidx-all-git-XXXX', TMPDIR => 1); + my $send_prune = PublicInbox::OnDestroy->new($$, \&send_prune, $self); + my $cmd = [ 'git', undef, 'config', 'extensions.objectFormat' ]; + for (@{$self->{git_dirs}}) { + my $o = $_.'/objects'; + next if !-d $o; + $cmd->[1] = "--git-dir=$_"; + open my $out, '+>', undef or die "open(tmp): $!"; + my $pid = spawn($cmd, undef, { 1 => $out }); + awaitpid($pid, \&prep_alternate, $o, $out, $send_prune); + } +} + +sub send_prune { # OnDestroy when `git config extensions.objectFormat' are done + my ($self) = @_; + for (values %ALT_FH) { close $_ or 
die "close: $!" } + %ALT_FH = (); my @active_git_dir = (@{$self->{git_dirs}}, @GIT_DIR_GONE); my ($c, $p) = PublicInbox::PktOp->pair; $c->{ops}->{prune_done} = [ $self ]; for my $s (@IDX_SHARDS) { $s->wq_io_do('prune_start', [ $p->{op_p} ], - $TMP_GIT->{git_dir}, @active_git_dir) + "$TMPDIR", @active_git_dir) } } @@ -812,8 +846,8 @@ sub cidx_run { # main entry point local $LIVE = {}; local $PRUNE_DONE = []; local $IDX_TODO = []; - local ($DO_QUIT, $TMP_GIT, $REINDEX, $TXN_BYTES, @GIT_DIR_GONE, - $GIT_TODO, $REPO_CTX); + local ($DO_QUIT, $REINDEX, $TXN_BYTES, @GIT_DIR_GONE, + $GIT_TODO, $REPO_CTX, %ALT_FH, $TMPDIR, %HEXLEN2TMPGIT); local $BATCH_BYTES = $self->{-opt}->{batch_size} // $PublicInbox::SearchIdx::BATCH_BYTES; local @IDX_SHARDS = cidx_init($self); @@ -859,7 +893,7 @@ sub cidx_run { # main entry point local $LIVE_JOBS = $self->{-opt}->{jobs} || PublicInbox::IPC::detect_nproc() || 2; local @RDONLY_XDB = $self->xdb_shards_flat; - start_prune($self); + init_prune($self); scan_git_dirs($self) if $self->{-opt}->{scan} // 1; local @PublicInbox::DS::post_loop_do = (\&shards_active); diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index 3108ed85..61ba8aa1 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -22,7 +22,9 @@ use IO::Poll qw(POLLIN); use Carp qw(croak carp); use PublicInbox::SHA (); use PublicInbox::DS qw(awaitpid); -our @EXPORT_OK = qw(git_unquote git_quote); +our %HEXLEN2SHA = (40 => 1, 64 => 256); +our %OFMT2HEXLEN = (sha1 => 40, sha256 => 64); +our @EXPORT_OK = qw(git_unquote git_quote %HEXLEN2SHA %OFMT2HEXLEN); our $PIPE_BUFSIZ = 65536; # Linux default our $in_cleanup; our $RDTIMEO = 60_000; # milliseconds diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 39719bcb..59462e9a 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -461,13 +461,21 @@ my @INIT_FILES = ('HEAD' => undef, # filled in at runtime EOC sub init_bare { - my ($dir, $head) = @_; # or self + my ($dir, $head, 
$fmt) = @_; # or self $dir = $dir->{git}->{git_dir} if ref($dir); require File::Path; File::Path::make_path(map { $dir.$_ } qw(/objects/info /refs/heads)); $INIT_FILES[1] //= 'ref: '.default_branch."\n"; my @fn_contents = @INIT_FILES; $fn_contents[1] = "ref: refs/heads/$head\n" if defined $head; + $fn_contents[3] = <<EOM if defined($fmt) && $fmt ne 'sha1'; +[core] + repositoryFormatVersion = 1 + filemode = true + bare = true +[extensions] + objectFormat = $fmt +EOM while (my ($fn, $contents) = splice(@fn_contents, 0, 2)) { my $f = $dir.'/'.$fn; next if -f $f; |