From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-3.6 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id D470B1FAF0 for ; Mon, 28 Nov 2022 05:32:32 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1669613552; bh=AuTx/HwXdRqJq7zyeVOD07Z2M/p0++Y80MxUtRjxBSM=; h=From:To:Subject:Date:In-Reply-To:References:From; b=Nnqr9E43c6iH8rxL7f28KiyipvO99bAXQAU/19AF8zI1f4unJQn+bmVTYAGF+/Tb9 K1IlCFV6wzLSgvg2hcw8jh6uCAJ+1OBDmUHeRHDZFxjJE0GldmAWMgsZ9j3T2/subZ 9DKeb5IZdlbzH70K0bYBfofKWt8tVfZLhyfvRask= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 02/95] clone: support --include and --exclude with multi-clone Date: Mon, 28 Nov 2022 05:30:59 +0000 Message-Id: <20221128053232.291618-3-e@80x24.org> In-Reply-To: <20221128053232.291618-1-e@80x24.org> References: <20221128053232.291618-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: These will be handy when someone is interested in a subset of inboxes on a large hosting site. --- Documentation/public-inbox-clone.pod | 14 +++++++++++++ lib/PublicInbox/LeiMirror.pm | 31 +++++++++++++++++++++++++--- script/public-inbox-clone | 2 +- t/www_listing.t | 22 ++++++++++++++++++++ 4 files changed, 65 insertions(+), 4 deletions(-) diff --git a/Documentation/public-inbox-clone.pod b/Documentation/public-inbox-clone.pod index c80c3c5f..7e95146e 100644 --- a/Documentation/public-inbox-clone.pod +++ b/Documentation/public-inbox-clone.pod @@ -51,6 +51,20 @@ C<--epoch=~2..> clones the three latest epochs. 
Default: C<0..~0> or C<0..> or C<..~0> (all epochs, all three examples are equivalent) +=item -I PATTERN + +=item --include=PATTERN + +When cloning a top-level with multiple inboxes, only clone inboxes and +repositories matching a given wildcard pattern (using C<*?> and C<[]> is +supported). + +=item --exclude=PATTERN + +When cloning a top-level with multiple inboxes, ignore inboxes and +repositories matching the given wildcard pattern. Supports the same +wildcards as L</--include> + =item -q =item --quiet diff --git a/lib/PublicInbox/LeiMirror.pm b/lib/PublicInbox/LeiMirror.pm index e356b5c5..d5017642 100644 --- a/lib/PublicInbox/LeiMirror.pm +++ b/lib/PublicInbox/LeiMirror.pm @@ -347,6 +347,8 @@ sub decode_manifest ($$$) { sub multi_inbox ($$$) { my ($self, $path, $m) = @_; + my $incl = $self->{lei}->{opt}->{include}; + my $excl = $self->{lei}->{opt}->{exclude}; # assuming everything not v2 is v1, for now my @v1 = sort grep(!m!.+/git/[0-9]+\.git\z!, keys %$m); @@ -354,13 +356,35 @@ sub multi_inbox ($$$) { my $v2 = {}; for (@v2_epochs) { - m!\A/(.+)/git/[0-9]+\.git\z! or die "BUG: $_"; + m!\A(/.+)/git/[0-9]+\.git\z! or die "BUG: $_"; push @{$v2->{$1}}, $_; } my $n = scalar(keys %$v2) + scalar(@v1); - my $ret; # { v1 => [ ... ], v2 => { $inbox_name => [ epochs ] }} + my @orig = defined($incl // $excl) ? 
(keys %$v2, @v1) : (); + if (defined $incl) { + my $re = '(?:'.join('|', map { + $self->{lei}->glob2re($_) // qr/\A\Q$_\E\z/ + } @$incl).')'; + my @gone = delete @$v2{grep(!/$re/, keys %$v2)}; + delete @$m{map { @$_ } @gone} and $self->{-culled_manifest} = 1; + delete @$m{grep(!/$re/, @v1)} and $self->{-culled_manifest} = 1; + @v1 = grep(/$re/, @v1); + } + if (defined $excl) { + my $re = '(?:'.join('|', map { + $self->{lei}->glob2re($_) // qr/\A\Q$_\E\z/ + } @$excl).')'; + my @gone = delete @$v2{grep(/$re/, keys %$v2)}; + delete @$m{map { @$_ } @gone} and $self->{-culled_manifest} = 1; + delete @$m{grep(/$re/, @v1)} and $self->{-culled_manifest} = 1; + @v1 = grep(!/$re/, @v1); + } + my $ret; # { v1 => [ ... ], v2 => { "/$inbox_name" => [ epochs ] }} $ret->{v1} = \@v1 if @v1; $ret->{v2} = $v2 if keys %$v2; + $ret //= @orig ? "Nothing to clone, available repositories:\n\t". + join("\n\t", sort @orig) + : "Nothing available to clone\n"; my $path_pfx = ''; # PSGI mount prefixes and manifest.js.gz prefixes don't always align... 
@@ -407,6 +431,7 @@ sub try_manifest { return try_scrape($self); } my ($path_pfx, $n, $multi) = multi_inbox($self, \$path, $m); + return $lei->child_error(1, $multi) if !ref($multi); if (my $v2 = delete $multi->{v2}) { for my $name (sort keys %$v2) { my $epochs = delete $v2->{$name}; @@ -449,7 +474,7 @@ EOM clone_v1($self, 1); } } - if (delete $self->{-culled_manifest}) { # set by clone_v2 + if (delete $self->{-culled_manifest}) { # set by clone_v2/-I/--exclude # write the smaller manifest if epochs were skipped so # users won't have to delete manifest if they +w an # epoch they no longer want to skip diff --git a/script/public-inbox-clone b/script/public-inbox-clone index 54059d03..4244e0c8 100755 --- a/script/public-inbox-clone +++ b/script/public-inbox-clone @@ -21,7 +21,7 @@ options: --quiet | -q increase verbosity (may be repeated) -C DIR chdir to specified directory EOF -GetOptions($opt, qw(help|h quiet|q verbose|v+ C=s@ c=s@ +GetOptions($opt, qw(help|h quiet|q verbose|v+ C=s@ c=s@ include|I=s@ exclude=s@ no-torsocks torsocks=s epoch=s)) or die $help; if ($opt->{help}) { print $help; exit }; require PublicInbox::Admin; # loads Config diff --git a/t/www_listing.t b/t/www_listing.t index e88bfbc5..e6bb1bda 100644 --- a/t/www_listing.t +++ b/t/www_listing.t @@ -135,8 +135,29 @@ EOM my $opt = { 2 => \(my $clone_err = '') }; ok(run_script(['-clone', "http://$host:$port/pfx", "$tmpdir/pfx" ], undef, $opt), 'pfx clone w/pfx') or diag "clone_err=$clone_err"; + + open my $mh, '<', "$tmpdir/pfx/manifest.js.gz" or xbail "open: $!"; + gunzip(\(do { local $/; <$mh> }) => \(my $mjs = '')); + my $mf = $json->decode($mjs); + is_deeply([sort keys %$mf], [ qw(/alt /bare /v2/git/0.git + /v2/git/1.git /v2/git/2.git) ], + 'manifest saved'); + for (keys %$mf) { ok(-d "$tmpdir/pfx$_", "pfx/$_ cloned") } + + $clone_err = ''; + ok(run_script(['-clone', '--include=*/alt', + "http://$host:$port/pfx", "$tmpdir/incl" ], + undef, $opt), 'clone w/include') or diag "clone_err=$clone_err"; + 
ok(-d "$tmpdir/incl/alt", 'alt cloned'); + ok(!-d "$tmpdir/incl/v2" && !-d "$tmpdir/incl/bare", 'only alt cloned'); + undef $td; + open $mh, '<', "$tmpdir/incl/manifest.js.gz" or xbail "open: $!"; + gunzip(\(do { local $/; <$mh> }) => \($mjs = '')); + $mf = $json->decode($mjs); + is_deeply([keys %$mf], [ '/alt' ], 'excluded keys skipped in manifest'); + $td = start_script($cmd, $env, { 3 => $sock }); # default publicinboxGrokManifest match=domain default @@ -146,6 +167,7 @@ EOM $clone_err = ''; ok(run_script(['-clone', "http://$host:$port/", "$tmpdir/full" ], undef, $opt), 'full clone') or diag "clone_err=$clone_err"; + ok(-d "$tmpdir/full/$_", "$_ cloned") for qw(alt v2 bare); undef $td;