about summary refs log tree commit homepage
path: root/lib/PublicInbox
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2021-09-12 07:47:12 +0000
committerEric Wong <e@80x24.org>2021-09-12 07:48:56 +0000
commitb45a1dffa647f6427d0c900fcc55753db7a1994c (patch)
treeeafb5e666662bc65afd1aad33aeb203134b1efd7 /lib/PublicInbox
parent02a0f3959b2e74f7217fcdca848822e7230acd6b (diff)
downloadpublic-inbox-b45a1dffa647f6427d0c900fcc55753db7a1994c.tar.gz
Setting up and maintaining git-only mirrors of v2 inboxes is
complex since multiple commands are required to clone and fetch
into epochs.

Unlike grokmirror, these commands do not require any
configuration.  Instead, they rely on existing git config files
and work like "git clone --mirror" and "git fetch",
respectively.

Like grokmirror, they use manifest.js.gz, but only on a
per-inbox basis so users won't have to clone every inbox of a
large instance nor edit config files to include/exclude inboxes
they're interested in.
Diffstat (limited to 'lib/PublicInbox')
-rw-r--r--lib/PublicInbox/Admin.pm8
-rw-r--r--lib/PublicInbox/Fetch.pm145
-rw-r--r--lib/PublicInbox/LEI.pm6
-rw-r--r--lib/PublicInbox/LeiMirror.pm95
4 files changed, 214 insertions, 40 deletions
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index 2534958b..9ff59bca 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -372,4 +372,12 @@ sub index_prepare ($$) {
         $env;
 }
 
+sub do_chdir ($) {
+        my $chdir = $_[0] // return;
+        for my $d (@$chdir) {
+                next if $d eq ''; # same as git(1)
+                chdir $d or die "cd $d: $!";
+        }
+}
+
 1;
diff --git a/lib/PublicInbox/Fetch.pm b/lib/PublicInbox/Fetch.pm
new file mode 100644
index 00000000..d795731c
--- /dev/null
+++ b/lib/PublicInbox/Fetch.pm
@@ -0,0 +1,145 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+# Wrapper to "git fetch" remote public-inboxes
+package PublicInbox::Fetch;
+use strict;
+use v5.10.1;
+use parent qw(PublicInbox::IPC);
+use URI ();
+use PublicInbox::Spawn qw(popen_rd);
+use PublicInbox::Admin;
+use PublicInbox::LEI;
+use PublicInbox::LeiCurl;
+use PublicInbox::LeiMirror;
+use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
+use File::Temp ();
+
+sub new { bless {}, __PACKAGE__ }
+
+sub fetch_cmd ($$) {
+        my ($lei, $opt) = @_;
+        my @cmd = qw(git);
+        $opt->{$_} = $lei->{$_} for (0..2);
+        # we support "-c $key=$val" for arbitrary git config options
+        # e.g.: git -c http.proxy=socks5h://127.0.0.1:9050
+        push(@cmd, '-c', $_) for @{$lei->{opt}->{c} // []};
+        push @cmd, 'fetch';
+        push @cmd, '-q' if $lei->{opt}->{quiet};
+        push @cmd, '-v' if $lei->{opt}->{verbose};
+        @cmd;
+}
+
+sub remote_url ($$) {
+        my ($lei, $dir) = @_; # TODO: support non-"origin"?
+        my $cmd = [ qw(git config remote.origin.url) ];
+        my $fh = popen_rd($cmd, undef, { -C => $dir, 2 => $lei->{2} });
+        my $url = <$fh>;
+        close $fh or return;
+        chomp $url;
+        $url;
+}
+
+sub do_fetch {
+        my ($cls, $lei, $cd) = @_;
+        my $ibx_ver;
+        my $curl = PublicInbox::LeiCurl->new($lei) or return;
+        my $dir = PublicInbox::Admin::resolve_inboxdir($cd, \$ibx_ver);
+        if ($ibx_ver == 1) {
+                my $url = remote_url($lei, $dir) //
+                        die "E: $dir missing remote.origin.url\n";
+                my $uri = URI->new($url);
+                my $torsocks = $curl->torsocks($lei, $uri);
+                my $opt = { -C => $dir };
+                my $cmd = [ @$torsocks, fetch_cmd($lei, $opt) ];
+                my $cerr = PublicInbox::LeiMirror::run_reap($lei, $cmd, $opt);
+                $lei->child_error($cerr, "@$cmd failed") if $cerr;
+                return;
+        }
+        # v2:
+        opendir my $dh, "$dir/git" or die "opendir $dir/git: $!";
+        my @epochs = sort { $b <=> $a } map { substr($_, 0, -4) + 0 }
+                                grep(/\A[0-9]+\.git\z/, readdir($dh));
+        my ($git_url, $epoch);
+        for my $nr (@epochs) { # try newest epoch, first
+                my $edir = "$dir/git/$nr.git";
+                if (defined(my $url = remote_url($lei, $edir))) {
+                        $git_url = $url;
+                        $epoch = $nr;
+                        last;
+                } else {
+                        warn "W: $edir missing remote.origin.url\n";
+                }
+        }
+        $git_url or die "Unable to determine git URL\n";
+        my $inbox_url = $git_url;
+        $inbox_url =~ s!/git/$epoch(?:\.git)?/?\z!! or
+                $inbox_url =~ s!/$epoch(?:\.git)?/?\z!! or die <<EOM;
+Unable to infer inbox URL from <$git_url>
+EOM
+        $lei->qerr("# inbox URL: $inbox_url/");
+        my $muri = URI->new("$inbox_url/manifest.js.gz");
+        my $ft = File::Temp->new(TEMPLATE => 'manifest-XXXX',
+                                UNLINK => 1, DIR => $dir);
+        my $fn = $ft->filename;
+        my @opt = (qw(-R -o), $fn);
+        my $mf = "$dir/manifest.js.gz";
+        my $m0; # current manifest.js.gz contents
+        if (open my $fh, '<', $mf) {
+                $m0 = eval {
+                        PublicInbox::LeiMirror::decode_manifest($fh, $mf, $mf)
+                };
+                $lei->err($@) if $@;
+                push @opt, '-z', $mf if defined($m0);
+        }
+        my $curl_cmd = $curl->for_uri($lei, $muri, @opt);
+        my $opt = {};
+        $opt->{$_} = $lei->{$_} for (0..2);
+        my $cerr = PublicInbox::LeiMirror::run_reap($lei, $curl_cmd, $opt);
+        return $lei->child_error($cerr, "@$curl_cmd failed") if $cerr;
+        return if !-s $ft; # 304 Not Modified via curl -z
+
+        my $m1 = PublicInbox::LeiMirror::decode_manifest($ft, $fn, $muri);
+        my $mdiff = { %$m1 };
+
+        # filter out unchanged entries
+        while (my ($k, $v0) = each %{$m0 // {}}) {
+                my $cur = $m1->{$k} // next;
+                my $f0 = $v0->{fingerprint} // next;
+                my $f1 = $cur->{fingerprint} // next;
+                my $t0 = $v0->{modified} // next;
+                my $t1 = $cur->{modified} // next;
+                delete($mdiff->{$k}) if $f0 eq $f1 && $t0 == $t1;
+        }
+        my $ibx_uri = URI->new("$inbox_url/");
+        my ($path_pfx, $v1_bare, @v2_epochs) =
+                PublicInbox::LeiMirror::deduce_epochs($mdiff, $ibx_uri->path);
+        defined($v1_bare) and die <<EOM;
+E: got v1 `$v1_bare' when expecting v2 epoch(s) in <$muri>, WTF?
+EOM
+        my @epoch_nr = sort { $a <=> $b }
+                map { my ($nr) = (m!/([0-9]+)\.git\z!g) } @v2_epochs;
+
+        # n.b. this expects all epochs are from the same host
+        my $torsocks = $curl->torsocks($lei, $muri);
+        for my $nr (@epoch_nr) {
+                my $dir = "$dir/git/$nr.git";
+                my $cmd;
+                my $opt = {};
+                if (-d $dir) {
+                        $opt->{-C} = $dir;
+                        $cmd = [ @$torsocks, fetch_cmd($lei, $opt) ];
+                } else {
+                        my $e_uri = $ibx_uri->clone;
+                        $e_uri->path($ibx_uri->path."git/$nr.git");
+                        $cmd = [ @$torsocks,
+                                PublicInbox::LeiMirror::clone_cmd($lei, $opt),
+                                $$e_uri, $dir ];
+                }
+                my $cerr = PublicInbox::LeiMirror::run_reap($lei, $cmd, $opt);
+                return $lei->child_error($cerr, "@$cmd failed") if $cerr;
+        }
+        rename($fn, $mf) or die "E: rename($fn, $mf): $!\n";
+        $ft->unlink_on_destroy(0);
+}
+
+1;
diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm
index aff2bf19..6d5d3c03 100644
--- a/lib/PublicInbox/LEI.pm
+++ b/lib/PublicInbox/LEI.pm
@@ -468,6 +468,8 @@ sub x_it ($$) {
                 $self->{pkt_op_p}->pkt_do('x_it', $code);
         } elsif ($self->{sock}) { # to lei(1) client
                 send($self->{sock}, "x_it $code", MSG_EOR);
+        } elsif ($quit == \&CORE::exit) { # an admin command
+                exit($code >> 8);
         } # else ignore if client disconnected
 }
 
@@ -511,7 +513,7 @@ sub fail ($$;$) {
         my ($self, $buf, $exit_code) = @_;
         $self->{failed}++;
         err($self, $buf) if defined $buf;
-        # calls fail_handler:
+        # calls fail_handler
         $self->{pkt_op_p}->pkt_do('!') if $self->{pkt_op_p};
         x_it($self, ($exit_code // 1) << 8);
         undef;
@@ -536,6 +538,8 @@ sub child_error { # passes non-fatal curl exit codes to user
                 $self->{pkt_op_p}->pkt_do('child_error', $child_error);
         } elsif ($self->{sock}) { # to lei(1) client
                 send($self->{sock}, "child_error $child_error", MSG_EOR);
+        } else { # non-lei admin command
+                $self->{child_error} ||= $child_error;
         } # else noop if client disconnected
 }
 
diff --git a/lib/PublicInbox/LeiMirror.pm b/lib/PublicInbox/LeiMirror.pm
index 355813bd..c128d13d 100644
--- a/lib/PublicInbox/LeiMirror.pm
+++ b/lib/PublicInbox/LeiMirror.pm
@@ -8,6 +8,7 @@ use v5.10.1;
 use parent qw(PublicInbox::IPC);
 use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
 use PublicInbox::Spawn qw(popen_rd spawn);
+use File::Temp ();
 
 sub do_finish_mirror { # dwaitpid callback
         my ($arg, $pid) = @_;
@@ -18,7 +19,9 @@ sub do_finish_mirror { # dwaitpid callback
         } elsif (!unlink($f)) {
                 $lei->err("unlink($f): $!") unless $!{ENOENT};
         } else {
-                $lei->add_external_finish($mrr->{dst});
+                if ($lei->{cmd} ne 'public-inbox-clone') {
+                        $lei->add_external_finish($mrr->{dst});
+                }
                 $lei->qerr("# mirrored $mrr->{src} => $mrr->{dst}");
         }
         $lei->dclose;
@@ -121,33 +124,38 @@ sub _try_config {
 
 sub index_cloned_inbox {
         my ($self, $iv) = @_;
-        my $ibx = delete($self->{ibx}) // {
-                address => [ 'lei@example.com' ],
-                version => $iv,
-        };
-        $ibx->{inboxdir} = $self->{dst};
-        PublicInbox::Inbox->new($ibx);
-        PublicInbox::InboxWritable->new($ibx);
-        my $opt = {};
         my $lei = $self->{lei};
-        for my $sw ($lei->index_opt) {
-                my ($k) = ($sw =~ /\A([\w-]+)/);
-                $opt->{$k} = $lei->{opt}->{$k};
+
+        # n.b. public-inbox-clone works w/o (SQLite || Xapian)
+        # lei is useless without Xapian + SQLite
+        if ($lei->{cmd} ne 'public-inbox-clone') {
+                my $ibx = delete($self->{ibx}) // {
+                        address => [ 'lei@example.com' ],
+                        version => $iv,
+                };
+                $ibx->{inboxdir} = $self->{dst};
+                PublicInbox::Inbox->new($ibx);
+                PublicInbox::InboxWritable->new($ibx);
+                my $opt = {};
+                for my $sw ($lei->index_opt) {
+                        my ($k) = ($sw =~ /\A([\w-]+)/);
+                        $opt->{$k} = $lei->{opt}->{$k};
+                }
+                # force synchronous dwaitpid for v2:
+                local $PublicInbox::DS::in_loop = 0;
+                my $cfg = PublicInbox::Config->new(undef, $lei->{2});
+                my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
+                local %ENV = (%ENV, %$env) if $env;
+                PublicInbox::Admin::progress_prepare($opt, $lei->{2});
+                PublicInbox::Admin::index_inbox($ibx, undef, $opt);
         }
-        # force synchronous dwaitpid for v2:
-        local $PublicInbox::DS::in_loop = 0;
-        my $cfg = PublicInbox::Config->new(undef, $lei->{2});
-        my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
-        local %ENV = (%ENV, %$env) if $env;
-        PublicInbox::Admin::progress_prepare($opt, $lei->{2});
-        PublicInbox::Admin::index_inbox($ibx, undef, $opt);
         open my $x, '>', "$self->{dst}/mirror.done"; # for do_finish_mirror
 }
 
 sub run_reap {
         my ($lei, $cmd, $opt) = @_;
         $lei->qerr("# @$cmd");
-        $opt->{pgid} = 0;
+        $opt->{pgid} = 0 if $lei->{sock};
         my $pid = spawn($cmd, undef, $opt);
         my $reap = PublicInbox::OnDestroy->new($lei->can('sigint_reap'), $pid);
         waitpid($pid, 0) == $pid or die "waitpid @$cmd: $!";
@@ -205,6 +213,7 @@ sub deduce_epochs ($$) {
         my ($m, $path) = @_;
         my ($v1_bare, @v2_epochs);
         my $path_pfx = '';
+        $path =~ s!/+\z!!;
         do {
                 $v1_bare = $m->{$path};
                 @v2_epochs = grep(m!\A\Q$path\E/git/[0-9]+\.git\z!, keys %$m);
@@ -213,6 +222,18 @@ sub deduce_epochs ($$) {
         ($path_pfx, $v1_bare, @v2_epochs);
 }
 
+sub decode_manifest ($$$) {
+        my ($fh, $fn, $uri) = @_;
+        my $js;
+        my $gz = do { local $/; <$fh> } // die "slurp($fn): $!";
+        gunzip(\$gz => \$js, MultiStream => 1) or
+                die "gunzip($uri): $GunzipError\n";
+        my $m = eval { PublicInbox::Config->json->decode($js) };
+        die "$uri: error decoding `$js': $@\n" if $@;
+        ref($m) eq 'HASH' or die "$uri unknown type: ".ref($m);
+        $m;
+}
+
 sub try_manifest {
         my ($self) = @_;
         my $uri = URI->new($self->{src});
@@ -221,26 +242,19 @@ sub try_manifest {
         my $path = $uri->path;
         chop($path) eq '/' or die "BUG: $uri not canonicalized";
         $uri->path($path . '/manifest.js.gz');
-        my $cmd = $curl->for_uri($lei, $uri);
-        $lei->qerr("# @$cmd");
-        my $opt = { 0 => $lei->{0}, 2 => $lei->{2} };
-        my ($fh, $pid) = popen_rd($cmd, undef, $opt);
-        my $reap = PublicInbox::OnDestroy->new($lei->can('sigint_reap'), $pid);
-        my $gz = do { local $/; <$fh> } // die "read(curl $uri): $!";
-        close $fh;
-        waitpid($pid, 0) == $pid or die "waitpid @$cmd: $!";
-        @$reap = ();
-        if ($?) {
-                return try_scrape($self) if ($? >> 8) == 22; # 404 missing
-                return $lei->child_error($?, "@$cmd failed");
+        my $pdir = $lei->rel2abs($self->{dst});
+        $pdir =~ s!/[^/]+/?\z!!;
+        my $ft = File::Temp->new(TEMPLATE => 'manifest-XXXX',
+                                UNLINK => 1, DIR => $pdir);
+        my $fn = $ft->filename;
+        my $cmd = $curl->for_uri($lei, $uri, '-R', '-o', $fn);
+        my $opt = { 0 => $lei->{0}, 1 => $lei->{1}, 2 => $lei->{2} };
+        my $cerr = run_reap($lei, $cmd, $opt);
+        if ($cerr) {
+                return try_scrape($self) if ($cerr >> 8) == 22; # 404 missing
+                return $lei->child_error($cerr, "@$cmd failed");
         }
-        my $js;
-        gunzip(\$gz => \$js, MultiStream => 1) or
-                die "gunzip($uri): $GunzipError";
-        my $m = eval { PublicInbox::Config->json->decode($js) };
-        die "$uri: error decoding `$js': $@" if $@;
-        ref($m) eq 'HASH' or die "$uri unknown type: ".ref($m);
-
+        my $m = decode_manifest($ft, $fn, $uri);
         my ($path_pfx, $v1_bare, @v2_epochs) = deduce_epochs($m, $path);
         if (@v2_epochs) {
                 # It may be possible to have v1 + v2 in parallel someday:
@@ -254,6 +268,9 @@ EOM
                         $uri->clone
                 } @v2_epochs;
                 clone_v2($self, \@v2_epochs);
+                my $fin = "$self->{dst}/manifest.js.gz";
+                rename($fn, $fin) or die "E: rename($fn, $fin): $!";
+                $ft->unlink_on_destroy(0);
         } elsif (defined $v1_bare) {
                 clone_v1($self);
         } else {