diff options
Diffstat (limited to 'lib/PublicInbox/Import.pm')
-rw-r--r-- | lib/PublicInbox/Import.pm | 234 |
1 files changed, 120 insertions, 114 deletions
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 17adfabd..ed34d548 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # git fast-import-based ssoma-mda MDA replacement @@ -6,10 +6,9 @@ # and public-inbox-watch. Not the WWW or NNTP code which only # requires read-only access. package PublicInbox::Import; -use strict; +use v5.12; use parent qw(PublicInbox::Lock); -use v5.10.1; -use PublicInbox::Spawn qw(run_die popen_rd); +use PublicInbox::Spawn qw(run_die run_qx spawn); use PublicInbox::MID qw(mids mid2path); use PublicInbox::Address; use PublicInbox::Smsg; @@ -18,13 +17,16 @@ use PublicInbox::ContentHash qw(content_digest); use PublicInbox::MDA; use PublicInbox::Eml; use POSIX qw(strftime); +use autodie qw(socketpair); +use Carp qw(croak); +use Socket qw(AF_UNIX SOCK_STREAM); +use PublicInbox::IO qw(read_all); sub default_branch () { state $default_branch = do { - my $r = popen_rd([qw(git config --global init.defaultBranch)], + my $h = run_qx([qw(git config --global init.defaultBranch)], { GIT_CONFIG => undef }); - chomp(my $h = <$r> // ''); - close $r; + chomp $h; $h eq '' ? 'refs/heads/master' : "refs/heads/$h"; } } @@ -55,11 +57,10 @@ sub new { # idempotent start function sub gfi_start { my ($self) = @_; - - return ($self->{in}, $self->{out}) if $self->{in}; - - my ($in_r, $out_r, $out_w); - pipe($out_r, $out_w) or die "pipe failed: $!"; + my $io = $self->{io}; + return $io if $io; + socketpair($io, my $s2, AF_UNIX, SOCK_STREAM, 0); + $io->autoflush(1); $self->lock_acquire; eval { @@ -72,21 +73,20 @@ sub gfi_start { die "fatal: ls-tree -r -z --name-only $ref: \$?=$?" if $?; $self->{-tree} = { map { $_ => 1 } split(/\0/, $t) }; } - $in_r = $self->{in} = $git->popen(qw(fast-import - --quiet --done --date-format=raw), - undef, { 0 => $out_r }); - $out_w->autoflush(1); - $self->{out} = $out_w; + my $gfi = [ 'git', "--git-dir=$git->{git_dir}", qw(fast-import + --quiet --done --date-format=raw) ]; + my $pid = spawn($gfi, undef, { 0 => $s2, 1 => $s2 }); $self->{nchg} = 0; + $self->{io} = PublicInbox::IO::attach_pid($io, $pid); }; if ($@) { $self->lock_release; die $@; } - ($in_r, $out_w); + $self->{io}; } -sub wfail () { die "write to fast-import failed: $!" } +sub wfail () { croak "write to fast-import failed: $!" } sub now_raw () { time . ' +0000' } @@ -98,60 +98,43 @@ sub norm_body ($) { } # only used for v1 (ssoma) inboxes -sub _check_path ($$$$) { - my ($r, $w, $tip, $path) = @_; +sub _check_path ($$$) { + my ($io, $tip, $path) = @_; return if $tip eq ''; - print $w "ls $tip $path\n" or wfail; + print $io "ls $tip $path\n" or wfail; local $/ = "\n"; - defined(my $info = <$r>) or die "EOF from fast-import: $!"; + my $info = <$io> // die "EOF from fast-import: $!"; $info =~ /\Amissing / ? undef : $info; } -sub _cat_blob ($$$) { - my ($r, $w, $oid) = @_; - print $w "cat-blob $oid\n" or wfail; +sub _cat_blob ($$) { + my ($io, $oid) = @_; + print $io "cat-blob $oid\n" or wfail; local $/ = "\n"; - my $info = <$r>; - defined $info or die "EOF from fast-import / cat-blob: $!"; + my $info = <$io> // die "EOF from fast-import / cat-blob: $!"; $info =~ /\A[a-f0-9]{40,} blob ([0-9]+)\n\z/ or return; - my $left = $1; - my $offset = 0; - my $buf = ''; - my $n; - while ($left > 0) { - $n = read($r, $buf, $left, $offset); - defined($n) or die "read cat-blob failed: $!"; - $n == 0 and die 'fast-export (cat-blob) died'; - $left -= $n; - $offset += $n; - } - $n = read($r, my $lf, 1); - defined($n) or die "read final byte of cat-blob failed: $!"; - die "bad read on final byte: <$lf>" if $lf ne "\n"; - - # fixup some bugginess in old versions: - $buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + my $buf = read_all($io, my $len = $1 + 1); + my $lf = chop $buf; + croak "bad read on final byte: <$lf>" if $lf ne "\n"; \$buf; } sub cat_blob { my ($self, $oid) = @_; - my ($r, $w) = $self->gfi_start; - _cat_blob($r, $w, $oid); + _cat_blob($self->{io} // return, $oid); } sub check_remove_v1 { - my ($r, $w, $tip, $path, $mime) = @_; + my ($io, $tip, $path, $mime) = @_; - my $info = _check_path($r, $w, $tip, $path) or return ('MISSING',undef); + my $info = _check_path($io, $tip, $path) or return ('MISSING',undef); $info =~ m!\A100644 blob ([a-f0-9]{40,})\t!s or die "not blob: $info"; my $oid = $1; - my $msg = _cat_blob($r, $w, $oid) or die "BUG: cat-blob $1 failed"; - my $cur = PublicInbox::Eml->new($msg); - my $cur_s = $cur->header('Subject'); - $cur_s = '' unless defined $cur_s; - my $cur_m = $mime->header('Subject'); - $cur_m = '' unless defined $cur_m; + my $bref = _cat_blob($io, $oid) or die "BUG: cat-blob $1 failed"; + PublicInbox::Eml::strip_from($$bref); + my $cur = PublicInbox::Eml->new($bref); + my $cur_s = $cur->header('Subject') // ''; + my $cur_m = $mime->header('Subject') // ''; if ($cur_s ne $cur_m || norm_body($cur) ne norm_body($mime)) { return ('MISMATCH', $cur); } @@ -160,16 +143,15 @@ sub check_remove_v1 { sub checkpoint { my ($self) = @_; - return unless $self->{in}; - print { $self->{out} } "checkpoint\n" or wfail; + print { $self->{io} // return } "checkpoint\n" or wfail; undef; } sub progress { my ($self, $msg) = @_; - return unless $self->{in}; - print { $self->{out} } "progress $msg\n" or wfail; - readline($self->{in}) eq "progress $msg\n" or die + my $io = $self->{io} or return; + print $io "progress $msg\n" or wfail; + readline($io) eq "progress $msg\n" or die "progress $msg not received\n"; undef; } @@ -185,8 +167,8 @@ sub _update_git_info ($$) { my $env = { GIT_INDEX_FILE => $index }; run_die([@cmd, qw(read-tree -m -v -i), $self->{ref}], $env); } - eval { run_die([@cmd, 'update-server-info']) }; my $ibx = $self->{ibx}; + eval { run_die([@cmd, 'update-server-info']) } if $ibx; if ($ibx && $ibx->version == 1 && -d "$ibx->{inboxdir}/public-inbox" && eval { require PublicInbox::SearchIdx }) { eval { @@ -195,7 +177,10 @@ sub _update_git_info ($$) { }; warn "$ibx->{inboxdir} index failed: $@\n" if $@; } - eval { run_die([@cmd, qw(gc --auto)]) } if $do_gc; + if ($do_gc) { + my @quiet = (-t STDERR ? () : '-q'); + eval { run_die([@cmd, qw(gc --auto), @quiet]) } + } } sub barrier { @@ -216,10 +201,9 @@ sub barrier { # used for v2 sub get_mark { my ($self, $mark) = @_; - die "not active\n" unless $self->{in}; - my ($r, $w) = $self->gfi_start; - print $w "get-mark $mark\n" or wfail; - defined(my $oid = <$r>) or die "get-mark failed, need git 2.6.0+\n"; + my $io = $self->{io} or croak "not active\n"; + print $io "get-mark $mark\n" or wfail; + my $oid = <$io> // die "get-mark failed, need git 2.6.0+\n"; chomp($oid); $oid; } @@ -236,11 +220,11 @@ sub remove { my $path_type = $self->{path_type}; my ($path, $err, $cur, $blob); - my ($r, $w) = $self->gfi_start; + my $io = gfi_start($self); my $tip = $self->{tip}; if ($path_type eq '2/38') { $path = mid2path(v1_mid0($mime)); - ($err, $cur) = check_remove_v1($r, $w, $tip, $path, $mime); + ($err, $cur) = check_remove_v1($io, $tip, $path, $mime); return ($err, $cur) if $err; } else { my $sref; @@ -252,7 +236,7 @@ sub remove { } my $len = length($$sref); $blob = $self->{mark}++; - print $w "blob\nmark :$blob\ndata $len\n", + print $io "blob\nmark :$blob\ndata $len\n", $$sref, "\n" or wfail; } @@ -260,22 +244,22 @@ sub remove { my $commit = $self->{mark}++; my $parent = $tip =~ /\A:/ ? $tip : undef; unless ($parent) { - print $w "reset $ref\n" or wfail; + print $io "reset $ref\n" or wfail; } my $ident = $self->{ident}; my $now = now_raw(); $msg //= 'rm'; my $len = length($msg) + 1; - print $w "commit $ref\nmark :$commit\n", + print $io "commit $ref\nmark :$commit\n", "author $ident $now\n", "committer $ident $now\n", "data $len\n$msg\n\n", 'from ', ($parent ? $parent : $tip), "\n" or wfail; if (defined $path) { - print $w "D $path\n\n" or wfail; + print $io "D $path\n\n" or wfail; } else { - clean_tree_v2($self, $w, 'd'); - print $w "M 100644 :$blob d\n\n" or wfail; + clean_tree_v2($self, $io, 'd'); + print $io "M 100644 :$blob d\n\n" or wfail; } $self->{nchg}++; (($self->{tip} = ":$commit"), $cur); @@ -337,11 +321,38 @@ sub extract_cmt_info ($;$) { # kill potentially confusing/misleading headers our @UNWANTED_HEADERS = (qw(Bytes Lines Content-Length), qw(Status X-Status)); +our $DROP_UNIQUE_UNSUB; sub drop_unwanted_headers ($) { my ($eml) = @_; for (@UNWANTED_HEADERS, @PublicInbox::MDA::BAD_HEADERS) { $eml->header_set($_); } + + # We don't want public-inbox readers to be able to unsubcribe the + # address which does archiving. WARNING: this breaks DKIM if the + # mailing list sender follows RFC 8058, section 4; but breaking DKIM + # (or have senders ignore RFC 8058 sec. 4) is preferable to having + # saboteurs unsubscribing independent archivists: + if ($DROP_UNIQUE_UNSUB && grep(/\AList-Unsubscribe=One-Click\z/, + $eml->header_raw('List-Unsubscribe-Post'))) { + for (qw(List-Unsubscribe-Post List-Unsubscribe)) { + $eml->header_set($_) + } + } +} + +sub load_config ($;$) { + my ($cfg, $do_exit) = @_; + my $v = $cfg->{lc 'publicinboxImport.dropUniqueUnsubscribe'}; + if (defined $v) { + $DROP_UNIQUE_UNSUB = $cfg->git_bool($v) // do { + warn <<EOM; +E: publicinboxImport.dropUniqueUnsubscribe=$v in $cfg->{-f} is not boolean +EOM + $do_exit //= \&CORE::exit; + $do_exit->(78); # EX_CONFIG + }; + } } # used by V2Writable, too @@ -365,11 +376,11 @@ sub v1_mid0 ($) { $mids->[0]; } sub clean_tree_v2 ($$$) { - my ($self, $w, $keep) = @_; + my ($self, $io, $keep) = @_; my $tree = $self->{-tree} or return; #v2 only delete $tree->{$keep}; foreach (keys %$tree) { - print $w "D $_\n" or wfail; + print $io "D $_\n" or wfail; } %$tree = ($keep => 1); } @@ -388,10 +399,10 @@ sub add { $path = 'm'; } - my ($r, $w) = $self->gfi_start; + my $io = gfi_start($self); my $tip = $self->{tip}; if ($path_type eq '2/38') { - _check_path($r, $w, $tip, $path) and return; + _check_path($io, $tip, $path) and return; } drop_unwanted_headers($mime); @@ -405,8 +416,7 @@ sub add { my $raw_email = $mime->{-public_inbox_raw} // $mime->as_string; my $n = length($raw_email); $self->{bytes_added} += $n; - print $w "blob\nmark :$blob\ndata ", $n, "\n" or wfail; - print $w $raw_email, "\n" or wfail; + print $io "blob\nmark :$blob\ndata $n\n", $raw_email, "\n" or wfail; # v2: we need this for Xapian if ($smsg) { @@ -423,7 +433,7 @@ sub add { $x->dbh; $x; }; - return if !$u->set_maybe(pack('H*', $smsg->{blob}), 1); + return if !$u->set_maybe($smsg->oidbin, 1); return if (!$oidx->vivify_xvmd($smsg) && $eidx_git->check($smsg->{blob})); } @@ -433,19 +443,19 @@ sub add { my $parent = $tip =~ /\A:/ ? $tip : undef; unless ($parent) { - print $w "reset $ref\n" or wfail; + print $io "reset $ref\n" or wfail; } - print $w "commit $ref\nmark :$commit\n", + print $io "commit $ref\nmark :$commit\n", "author $author $at\n", - "committer $self->{ident} $ct\n" or wfail; - print $w "data ", (length($subject) + 1), "\n", + "committer $self->{ident} $ct\n", + "data ", (length($subject) + 1), "\n", $subject, "\n\n" or wfail; if ($tip ne '') { - print $w 'from ', ($parent ? $parent : $tip), "\n" or wfail; + print $io 'from ', ($parent ? $parent : $tip), "\n" or wfail; } - clean_tree_v2($self, $w, $path); - print $w "M 100644 :$blob $path\n\n" or wfail; + clean_tree_v2($self, $io, $path); + print $io "M 100644 :$blob $path\n\n" or wfail; $self->{nchg}++; $self->{tip} = ":$commit"; } @@ -461,32 +471,37 @@ my @INIT_FILES = ('HEAD' => undef, # filled in at runtime EOC sub init_bare { - my ($dir, $head) = @_; # or self + my ($dir, $head, $fmt) = @_; # or self $dir = $dir->{git}->{git_dir} if ref($dir); require File::Path; - File::Path::mkpath([ map { "$dir/$_" } qw(objects/info refs/heads) ]); + File::Path::make_path(map { $dir.$_ } qw(/objects/info /refs/heads)); $INIT_FILES[1] //= 'ref: '.default_branch."\n"; my @fn_contents = @INIT_FILES; $fn_contents[1] = "ref: refs/heads/$head\n" if defined $head; + $fn_contents[3] = <<EOM if defined($fmt) && $fmt ne 'sha1'; +[core] + repositoryFormatVersion = 1 + filemode = true + bare = true +[extensions] + objectFormat = $fmt +EOM while (my ($fn, $contents) = splice(@fn_contents, 0, 2)) { my $f = $dir.'/'.$fn; next if -f $f; - open my $fh, '>', $f or die "open $f: $!"; - print $fh $contents or die "print $f: $!"; - close $fh or die "close $f: $!"; + PublicInbox::IO::write_file '>', $f, $contents; } } # true if locked and active -sub active { !!$_[0]->{out} } +sub active { !!$_[0]->{io} } sub done { my ($self) = @_; - my $w = delete $self->{out} or return; + my $io = delete $self->{io} or return; eval { - my $r = delete $self->{in} or die 'BUG: missing {in} when done'; - print $w "done\n" or wfail; - close $r or die "fast-import failed: $?"; # ProcessPipe::CLOSE + print $io "done\n" or wfail; + $io->close or croak "close fast-import \$?=$?"; # reaps }; my $wait_err = $@; my $nchg = delete $self->{nchg}; @@ -499,13 +514,7 @@ sub done { die $wait_err if $wait_err; } -sub atfork_child { - my ($self) = @_; - foreach my $f (qw(in out)) { - next unless defined($self->{$f}); - close $self->{$f} or die "failed to close import[$f]: $!\n"; - } -} +sub atfork_child { (delete($_[0]->{io}) // return)->close } sub digest2mid ($$;$) { my ($dig, $hdr, $fallback_time) = @_; @@ -558,7 +567,7 @@ sub replace_oids { my $git = $self->{git}; my @export = (qw(fast-export --no-data --use-done-feature), $old); my $rd = $git->popen(@export); - my ($r, $w) = $self->gfi_start; + my $io = gfi_start($self); my @buf; my $nreplace = 0; my @oids; @@ -569,17 +578,14 @@ sub replace_oids { push @buf, "reset $tmp\n"; } elsif (/^commit (?:.+)/) { if (@buf) { - print $w @buf or wfail; + print $io @buf or wfail; @buf = (); } push @buf, "commit $tmp\n"; } elsif (/^data ([0-9]+)/) { # only commit message, so $len is small: - my $len = $1; # + 1 for trailing "\n" push @buf, $_; - my $n = read($rd, my $buf, $len) or die "read: $!"; - $len == $n or die "short read ($n < $len)"; - push @buf, $buf; + push @buf, read_all($rd, my $len = $1); } elsif (/^M 100644 ([a-f0-9]+) (\w+)/) { my ($oid, $path) = ($1, $2); $tree->{$path} = 1; @@ -606,7 +612,7 @@ sub replace_oids { rewrite_commit($self, \@oids, \@buf, $mime); $nreplace++; } - print $w @buf, "\n" or wfail; + print $io @buf, "\n" or wfail; @buf = (); } elsif ($_ eq "done\n") { $done = 1; @@ -617,9 +623,9 @@ sub replace_oids { push @buf, $_; } } - close $rd or die "close fast-export failed: $?"; + $rd->close or die "E: git @export (\$?=$?)"; if (@buf) { - print $w @buf or wfail; + print $io @buf or wfail; } die 'done\n not seen from fast-export' unless $done; chomp(my $cmt = $self->get_mark(":$mark")) if $nreplace; |