# Recovered from a whitespace-collapsed copy of the following patch:
#
#   commit cfc2f64069e245a700b60113705be477857c51e5
#   From: Eric Wong
#   Date: Sat, 6 Feb 2021 12:18:40 +0000
#   Subject: lei: add-external --mirror support
#
#   This can be useful for users who want to clone and mirror an
#   existing public-inbox.  This doesn't have update support, yet,
#   so users will need to run "git fetch && public-inbox-index" for now.
#
#   lib/PublicInbox/LeiMirror.pm | 288 +++ (new file)
#
# Copyright (C) 2021 all contributors
# License: AGPL-3.0+

# "lei add-external --mirror" support
package PublicInbox::LeiMirror;
use strict;
use v5.10.1;
use parent qw(PublicInbox::IPC);
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use PublicInbox::Spawn qw(popen_rd spawn);
use PublicInbox::PktOp;

# EOF callback for the main daemon (registered under the '' op in start()).
# Detaches the mirror object from $lei, waits on its worker if one was
# stored, then registers the destination as an external and closes $lei.
sub mirror_done { # EOF callback for main daemon
	my ($lei) = @_;
	my $mrr = delete $lei->{mrr};
	$mrr->wq_wait_old($lei) if $mrr;
	# FIXME: check $? before finish
	$lei->add_external_finish($mrr->{dst});
	$lei->dclose;
}

# for old installations without manifest.js.gz
# Fetches the HTML at $self->{src} with curl and looks for
# "git clone --mirror URL" instructions to decide between v1 and v2.
# Dies when the page yields nothing usable or ambiguous results.
sub try_scrape {
	my ($self) = @_;
	my $uri = URI->new($self->{src});
	my $lei = $self->{lei};
	my $curl = $self->{curl} //= PublicInbox::LeiCurl->new($lei) or return;
	my $cmd = $curl->for_uri($lei, $uri);
	my $opt = { 0 => $lei->{0}, 2 => $lei->{2} };
	my $fh = popen_rd($cmd, $lei->{env}, $opt);
	my $html = do { local $/; <$fh> } // die "read(curl $uri): $!";
	close($fh) or return $lei->child_error($?, "@$cmd failed");

	# we grep with URL below, we don't want Subject/From headers
	# making us clone random URLs
	my @urls = ($html =~ m!\bgit clone --mirror ([a-z\+]+://\S+)!g);
	my $url = $uri->as_string;
	chop($url) eq '/' or die "BUG: $uri not canonicalized";

	# since this is for old instances w/o manifest.js.gz, try v1 first
	return clone_v1($self) if grep(m!\A\Q$url\E/*\z!, @urls);
	if (my @v2_urls = grep(m!\A\Q$url\E/[0-9]+\z!, @urls)) {
		my %v2_uris = map { $_ => URI->new($_) } @v2_urls; # uniq
		return clone_v2($self, [ values %v2_uris ]);
	}

	# filter out common URLs served by WWW (e.g /$MSGID/T/)
	if (@urls && $url =~ s!/+[^/]+\@[^/]+/.*\z!! &&
			grep(m!\A\Q$url\E/*\z!, @urls)) {
		die <<"";
E: confused by scraping <$uri>, did you mean <$url>?

	}
	@urls and die <<"";
E: confused by scraping <$uri>, got ambiguous results:
@urls

	die "E: scraping <$uri> revealed nothing\n";
}

# Returns the "git [-c KEY=VAL]... clone --mirror [-q] [-v]" argv prefix
# (as a list) built from the options stored in $lei->{opt}.
sub clone_cmd {
	my ($lei) = @_;
	my @cmd = qw(git);
	# we support "-c $key=$val" for arbitrary git config options
	# e.g.: git -c http.proxy=socks5h://127.0.0.1:9050
	push(@cmd, '-c', $_) for @{$lei->{opt}->{c} // []};
	push @cmd, qw(clone --mirror);
	push @cmd, '-q' if $lei->{opt}->{quiet};
	push @cmd, '-v' if $lei->{opt}->{verbose};
	# XXX any other options to support?
	# --reference is tricky with multiple epochs...
	@cmd;
}

# tries the relatively new /$INBOX/_/text/config/raw endpoint
# On success the downloaded config is saved as
# "$dst/inbox.config.example" and its address/newsgroup/nntpmirror
# values are stashed in $self->{ibx} for index_cloned_inbox().
# Every failure here is non-fatal: it is reported via $lei->err and
# the caller carries on without $self->{ibx}.
sub _try_config {
	my ($self) = @_;
	my $dst = $self->{dst};
	unless (-d $dst) {
		require File::Path;
		File::Path::mkpath($dst);
		-d $dst or die "mkpath($dst): $!\n";
	}
	my $uri = URI->new($self->{src});
	my $lei = $self->{lei};
	my $path = $uri->path;
	chop($path) eq '/' or die "BUG: $uri not canonicalized";
	$uri->path($path . '/_/text/config/raw');
	my $cmd = $self->{curl}->for_uri($lei, $uri);
	push @$cmd, '--compressed'; # curl decompresses for us
	my $ce = "$dst/inbox.config.example";
	my $f = "$ce-$$.tmp"; # PID-suffixed temporary, renamed into place
	open(my $fh, '+>', $f) or return $lei->err("open $f: $! (non-fatal)");
	my $opt = { 0 => $lei->{0}, 1 => $fh, 2 => $lei->{2} };
	$lei->qerr("# @$cmd");
	my $pid = spawn($cmd, $lei->{env}, $opt);
	waitpid($pid, 0) == $pid or return $lei->err("waitpid @$cmd: $!");
	if (($? >> 8) == 22) { # 404 missing
		unlink($f) if -s $fh == 0;
		return;
	}
	if ($?) {
		unlink($f); # do not leave the temporary behind
		return $lei->err("# @$cmd failed (non-fatal)");
	}
	unless (rename($f, $ce)) {
		my $err = $!; # save before unlink() can clobber it
		unlink($f);
		return $lei->err("rename($f, $ce): $err (non-fatal)");
	}
	# $f no longer exists after the rename above, so parse $ce
	my $cfg = PublicInbox::Config::git_config_dump($ce);
	my $ibx = $self->{ibx} = {};
	for my $sec (grep(/\Apublicinbox\./, @{$cfg->{-section_order}})) {
		for (qw(address newsgroup nntpmirror)) {
			$ibx->{$_} = $cfg->{"$sec.$_"};
		}
	}
}

# Indexes the freshly cloned inbox at $self->{dst}.
# $iv is the inbox version (1 or 2) the caller just cloned.  Uses the
# attributes gathered by _try_config() when present, otherwise falls
# back to a placeholder address.
sub index_cloned_inbox {
	my ($self, $iv) = @_;
	my $ibx = delete($self->{ibx}) // {
		address => [ 'lei@example.com' ],
		version => $iv,
	};
	# _try_config() never records a version, the caller knows it
	$ibx->{version} //= $iv;
	$ibx->{inboxdir} = $self->{dst};
	# return values are unused; presumably both constructors bless
	# $ibx in place -- TODO confirm
	PublicInbox::Inbox->new($ibx);
	PublicInbox::InboxWritable->new($ibx);
	my $opt = {};
	my $lei = $self->{lei};
	for my $sw ($lei->index_opt) {
		my ($k) = ($sw =~ /\A([\w-]+)/);
		$opt->{$k} = $lei->{opt}->{$k};
	}
	# force synchronous dwaitpid for v2:
	local $PublicInbox::DS::in_loop = 0;
	my $cfg = PublicInbox::Config->new;
	my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
	local %ENV = (%ENV, %$env) if $env;
	PublicInbox::Admin::progress_prepare($opt, $lei->{2});
	PublicInbox::Admin::index_inbox($ibx, undef, $opt);
}

# Clones $self->{src} as a single (v1) git repository into
# $self->{dst}, then fetches the example config and indexes.
sub clone_v1 {
	my ($self) = @_;
	my $lei = $self->{lei};
	my $curl = $self->{curl} //= PublicInbox::LeiCurl->new($lei) or return;
	my $uri = URI->new($self->{src});
	my $pfx = $curl->torsocks($lei, $uri) or return;
	my $cmd = [ @$pfx, clone_cmd($lei), $uri->as_string, $self->{dst} ];
	$lei->qerr("# @$cmd");
	my $pid = spawn($cmd, $lei->{env}, $lei);
	waitpid($pid, 0) == $pid or die "BUG: waitpid @$cmd: $!";
	$? == 0 or return $lei->child_error($?, "@$cmd failed");
	_try_config($self);
	index_cloned_inbox($self, 1);
}

# Clones every epoch in $v2_uris (arrayref of URI objects) into
# "$dst/git/$N.git" while holding "$dst/inbox.lock", then indexes
# the result as a v2 inbox.
sub clone_v2 {
	my ($self, $v2_uris) = @_;
	my $lei = $self->{lei};
	my $curl = $self->{curl} //= PublicInbox::LeiCurl->new($lei) or return;
	my $pfx = $curl->torsocks($lei, $v2_uris->[0]) or return;
	my $dst = $self->{dst};
	my @src_edst; # [ source URL, epoch destination ] pairs
	for my $uri (@$v2_uris) {
		my $src = $uri->as_string;
		my $edst = $dst;
		$src =~ m!/([0-9]+)(?:\.git)?\z! or die <<"";
failed to extract epoch number from $src

		my $nr = $1 + 0;
		$edst .= "/git/$nr.git";
		push @src_edst, [ $src, $edst ];
	}
	my $lk = bless { lock_path => "$dst/inbox.lock" }, 'PublicInbox::Lock';
	_try_config($self); # creates $dst if needed, before locking in it
	my $on_destroy = $lk->lock_for_scope($$);
	my @cmd = clone_cmd($lei);
	while (my $pair = shift(@src_edst)) {
		my $cmd = [ @$pfx, @cmd, @$pair ];
		$lei->qerr("# @$cmd");
		my $pid = spawn($cmd, $lei->{env}, $lei);
		waitpid($pid, 0) == $pid or die "BUG: waitpid @$cmd: $!";
		$? == 0 or return $lei->child_error($?, "@$cmd failed");
	}
	undef $on_destroy; # unlock
	index_cloned_inbox($self, 2);
}

# Downloads and decodes "$src/manifest.js.gz" to decide whether the
# remote is a v1 or v2 inbox, then clones accordingly.  Falls back to
# try_scrape() when curl exits with status 22 (404 missing).
sub try_manifest {
	my ($self) = @_;
	my $uri = URI->new($self->{src});
	my $lei = $self->{lei};
	my $curl = $self->{curl} //= PublicInbox::LeiCurl->new($lei) or return;
	my $path = $uri->path;
	chop($path) eq '/' or die "BUG: $uri not canonicalized";
	$uri->path($path . '/manifest.js.gz');
	my $cmd = $curl->for_uri($lei, $uri);
	$lei->qerr("# @$cmd");
	my $opt = { 0 => $lei->{0}, 2 => $lei->{2} };
	my $fh = popen_rd($cmd, $lei->{env}, $opt);
	my $gz = do { local $/; <$fh> } // die "read(curl $uri): $!";
	unless (close $fh) {
		return try_scrape($self) if ($? >> 8) == 22; # 404 missing
		return $lei->child_error($?, "@$cmd failed");
	}
	my $js;
	gunzip(\$gz => \$js, MultiStream => 1) or
		die "gunzip($uri): $GunzipError";
	my $m = eval { PublicInbox::Config->json->decode($js) };
	die "$uri: error decoding `$js': $@" if $@;
	ref($m) eq 'HASH' or die "$uri unknown type: ".ref($m);

	my $v1_bare = $m->{$path};
	my @v2_epochs = grep(m!\A\Q$path\E/git/[0-9]+\.git\z!, keys %$m);
	if (@v2_epochs) {
		# It may be possible to have v1 + v2 in parallel someday:
		# NOTE(review): the text of this warning was lost when the
		# patch was extracted; the wording below is a reconstruction
		$lei->err(<<EOM) if defined $v1_bare;
# `$path' appears to be a v1 inbox while v2 epochs exist:
# @v2_epochs
# ignoring $path (use --inbox-version=1 to force v1 instead)
EOM
		# turn manifest keys (paths) into full URIs on the same host
		@v2_epochs = map { $uri->path($_); $uri->clone } @v2_epochs;
		clone_v2($self, \@v2_epochs);
	} elsif ($v1_bare) {
		clone_v1($self);
	} elsif (my @maybe = grep(m!\Q$path\E!, keys %$m)) {
		die "E: confused by <$uri>, possible matches:\n@maybe";
	} else {
		die "E: confused by <$uri>";
	}
}

# Dispatches on the URL scheme of $self->{src}; only HTTP(S) is
# supported so far, anything else dies.
sub start_clone_url {
	my ($self) = @_;
	return try_manifest($self) if $self->{src} =~ m!\Ahttps?://!;
	die "TODO: non-HTTP/HTTPS clone of $self->{src} not supported, yet";
}

# Worker entry point.  Honors --inbox-version when given, otherwise
# autodetects via start_clone_url().  Exceptions are reported through
# $lei->fail instead of propagating.
sub do_mirror { # via wq_do
	my ($self) = @_;
	my $lei = $self->{lei};
	eval {
		my $iv = $lei->{opt}->{'inbox-version'};
		if (defined $iv) {
			return clone_v1($self) if $iv == 1;
			return try_scrape($self) if $iv == 2;
			die "bad --inbox-version=$iv\n";
		}
		return start_clone_url($self) if $self->{src} =~ m!://!;
		die "TODO: cloning local directories not supported, yet";
	};
	return $lei->fail($@) if $@;
	$lei->qerr("# mirrored $self->{src} => $self->{dst}");
}

# Class method: creates the mirror object for $src => $dst, stores it
# in $lei->{mrr}, starts one 'lei_mirror' worker and queues do_mirror()
# on it.  mirror_done() runs in this process on the EOF ('') op.
sub start {
	my ($cls, $lei, $src, $dst) = @_;
	my $self = bless { lei => $lei, src => $src, dst => $dst }, $cls;
	$lei->{mrr} = $self;
	# do_mirror() reaches URI->new and PublicInbox::LeiCurl->new for
	# any URL as well as whenever --inbox-version is forced, so load
	# both on every such path, not only when $src mentions http(s)
	if ($src =~ m!://! || defined($lei->{opt}->{'inbox-version'})) {
		require URI;
		require PublicInbox::LeiCurl;
	}
	require PublicInbox::Lock;
	require PublicInbox::Inbox;
	require PublicInbox::Admin;
	require PublicInbox::InboxWritable;
	my $ops = {
		'!' => [ $lei->can('fail_handler'), $lei ],
		'x_it' => [ $lei->can('x_it'), $lei ],
		'child_error' => [ $lei->can('child_error'), $lei ],
		'' => [ \&mirror_done, $lei ],
	};
	($lei->{pkt_op_c}, $lei->{pkt_op_p}) = PublicInbox::PktOp->pair($ops);
	$self->wq_workers_start('lei_mirror', 1, $lei->oldset, {lei => $lei});
	my $op = delete $lei->{pkt_op_c};
	delete $lei->{pkt_op_p};
	$self->wq_do('do_mirror', []);
	$self->wq_close(1);
	$lei->event_step_init; # wait for shutdowns
	if ($lei->{oneshot}) {
		while ($op->{sock}) { $op->event_step }
	}
}

# Post-fork hook for the worker: lets $lei do its own child setup
# before delegating to the parent class.
sub ipc_atfork_child {
	my ($self) = @_;
	$self->{lei}->lei_atfork_child;
	$self->SUPER::ipc_atfork_child;
}

1;