about summary refs log tree commit homepage
diff options
context:
space:
mode:
authorEric Wong (Contractor, The Linux Foundation) <e@80x24.org>2019-06-09 02:51:40 +0000
committerEric Wong <e@80x24.org>2019-06-09 04:33:16 +0000
commit60321ef7c5e8c2abc395bebadadf77d789a06945 (patch)
tree8847cb8fa5be942619d8ebcaf1956c10a8b298b8
parentcfff674918e8fc3e0763ac4818c07e0ba7fa716e (diff)
downloadpublic-inbox-60321ef7c5e8c2abc395bebadadf77d789a06945.tar.gz
v2writable: implement ->replace call
Much of the existing purge code is repurposed to a general
"replace" functionality.

->purge is simpler because it can just drop the information.
Unlike ->purge, ->replace needs to edit existing git commits (in
case of From: and Subject: headers) and reindex the modified
message.

We currently disallow editing of References:, In-Reply-To: and
Message-ID headers because it can cause bad side effects with
our threading (and our lack of rethreading support to deal with
excessive matching from incorrect/invalid References).
-rw-r--r--MANIFEST1
-rw-r--r--lib/PublicInbox/Import.pm44
-rw-r--r--lib/PublicInbox/V2Writable.pm171
-rw-r--r--t/replace.t199
4 files changed, 372 insertions, 43 deletions
diff --git a/MANIFEST b/MANIFEST
index 5085bff8..321da033 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -236,6 +236,7 @@ t/psgi_text.t
 t/psgi_v2.t
 t/purge.t
 t/qspawn.t
+t/replace.t
 t/reply.t
 t/search-thr-index.t
 t/search.t
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 2c8fe848..137b2b78 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -484,26 +484,38 @@ sub digest2mid ($$) {
         "$dt.$b64" . '@z';
 }
 
-sub clean_purge_buffer {
-        my ($oids, $buf) = @_;
-        my $cmt_msg = 'purged '.join(' ',@$oids)."\n";
+sub rewrite_commit ($$$$) {
+        my ($self, $oids, $buf, $mime) = @_;
+        my ($name, $email, $at, $ct, $subject);
+        if ($mime) {
+                ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime);
+        } else {
+                $name = $email = '';
+                $subject = 'purged '.join(' ', @$oids);
+        }
         @$oids = ();
-
+        $subject .= "\n";
         foreach my $i (0..$#$buf) {
                 my $l = $buf->[$i];
                 if ($l =~ /^author .* ([0-9]+ [\+-]?[0-9]+)$/) {
-                        $buf->[$i] = "author <> $1\n";
+                        $at //= $1;
+                        $buf->[$i] = "author $name <$email> $at\n";
+                } elsif ($l =~ /^committer .* ([0-9]+ [\+-]?[0-9]+)$/) {
+                        $ct //= $1;
+                        $buf->[$i] = "committer $self->{ident} $ct\n";
                 } elsif ($l =~ /^data ([0-9]+)/) {
-                        $buf->[$i++] = "data " . length($cmt_msg) . "\n";
-                        $buf->[$i] = $cmt_msg;
+                        $buf->[$i++] = "data " . length($subject) . "\n";
+                        $buf->[$i] = $subject;
                         last;
                 }
         }
 }
 
+# returns the new commit OID if a replacement was done
+# returns undef if nothing was done
 sub replace_oids {
-        my ($self, $replace) = @_; # oid => raw string
-        my $tmp = "refs/heads/replace-".((keys %$replace)[0]);
+        my ($self, $mime, $replace_map) = @_; # oid => raw string
+        my $tmp = "refs/heads/replace-".((keys %$replace_map)[0]);
         my $old = $self->{'ref'};
         my $git = $self->{git};
         my @export = (qw(fast-export --no-data --use-done-feature), $old);
@@ -533,7 +545,7 @@ sub replace_oids {
                 } elsif (/^M 100644 ([a-f0-9]+) (\w+)/) {
                         my ($oid, $path) = ($1, $2);
                         $tree->{$path} = 1;
-                        my $sref = $replace->{$oid};
+                        my $sref = $replace_map->{$oid};
                         if (defined $sref) {
                                 push @oids, $oid;
                                 my $n = length($$sref);
@@ -548,10 +560,12 @@ sub replace_oids {
                         push @buf, $_ if $tree->{$path};
                 } elsif ($_ eq "\n") {
                         if (@oids) {
-                                my $out = join('', @buf);
-                                $out =~ s/^/# /sgm;
-                                warn "purge rewriting\n", $out, "\n";
-                                clean_purge_buffer(\@oids, \@buf);
+                                if (!$mime) {
+                                        my $out = join('', @buf);
+                                        $out =~ s/^/# /sgm;
+                                        warn "purge rewriting\n", $out, "\n";
+                                }
+                                rewrite_commit($self, \@oids, \@buf, $mime);
                                 $nreplace++;
                         }
                         $w->print(@buf, "\n") or wfail;
@@ -585,7 +599,7 @@ sub replace_oids {
 
         # check that old OIDs are gone
         my $err = 0;
-        foreach my $oid (keys %$replace) {
+        foreach my $oid (keys %$replace_map) {
                 my @info = $git->check($oid);
                 if (@info) {
                         warn "$oid not replaced\n";
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index d6f72b07..3484807b 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -11,7 +11,7 @@ use PublicInbox::SearchIdxPart;
 use PublicInbox::MIME;
 use PublicInbox::Git;
 use PublicInbox::Import;
-use PublicInbox::MID qw(mids);
+use PublicInbox::MID qw(mids references);
 use PublicInbox::ContentId qw(content_id content_digest);
 use PublicInbox::Inbox;
 use PublicInbox::OverIdx;
@@ -297,26 +297,30 @@ sub idx_init {
         });
 }
 
-sub purge_oids ($$) {
-        my ($self, $purge) = @_; # $purge = { $object_id => \'', ... }
+# returns an array mapping [ epoch => latest_commit ]
+# latest_commit may be undef if nothing was done to that epoch
+# $replace_map = { $object_id => $strref, ... }
+sub _replace_oids ($$$) {
+        my ($self, $mime, $replace_map) = @_;
         $self->done;
         my $pfx = "$self->{-inbox}->{mainrepo}/git";
-        my $purges = [];
+        my $rewrites = []; # epoch => commit
         my $max = $self->{epoch_max};
 
         unless (defined($max)) {
                 defined(my $latest = git_dir_latest($self, \$max)) or return;
                 $self->{epoch_max} = $max;
         }
+
         foreach my $i (0..$max) {
                 my $git_dir = "$pfx/$i.git";
                 -d $git_dir or next;
                 my $git = PublicInbox::Git->new($git_dir);
                 my $im = $self->import_init($git, 0, 1);
-                $purges->[$i] = $im->replace_oids($purge);
+                $rewrites->[$i] = $im->replace_oids($mime, $replace_map);
                 $im->done;
         }
-        $purges;
+        $rewrites;
 }
 
 sub content_ids ($) {
@@ -339,25 +343,31 @@ sub content_matches ($$) {
         0
 }
 
-sub remove_internal ($$$$) {
-        my ($self, $mime, $cmt_msg, $purge) = @_;
+# used for removing or replacing (purging)
+sub rewrite_internal ($$;$$$) {
+        my ($self, $old_mime, $cmt_msg, $new_mime, $sref) = @_;
         $self->idx_init;
-        my $im = $self->importer unless $purge;
+        my ($im, $need_reindex, $replace_map);
+        if ($sref) {
+                $replace_map = {}; # oid => sref
+                $need_reindex = [] if $new_mime;
+        } else {
+                $im = $self->importer;
+        }
         my $over = $self->{over};
-        my $cids = content_ids($mime);
+        my $cids = content_ids($old_mime);
         my $parts = $self->{idx_parts};
-        my $mm = $self->{mm};
         my $removed;
-        my $mids = mids($mime->header_obj);
+        my $mids = mids($old_mime->header_obj);
 
         # We avoid introducing new blobs into git since the raw content
         # can be slightly different, so we do not need the user-supplied
         # message now that we have the mids and content_id
-        $mime = undef;
+        $old_mime = undef;
         my $mark;
 
         foreach my $mid (@$mids) {
-                my %gone;
+                my %gone; # num => [ smsg, raw ]
                 my ($id, $prev);
                 while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
                         my $msg = get_blob($self, $smsg);
@@ -380,17 +390,21 @@ sub remove_internal ($$$$) {
                 }
                 foreach my $num (keys %gone) {
                         my ($smsg, $orig) = @{$gone{$num}};
-                        $mm->num_delete($num);
                         # $removed should only be set once assuming
                         # no bugs in our deduplication code:
                         $removed = $smsg;
                         my $oid = $smsg->{blob};
-                        if ($purge) {
-                                $purge->{$oid} = \'';
+                        if ($replace_map) {
+                                $replace_map->{$oid} = $sref;
                         } else {
                                 ($mark, undef) = $im->remove($orig, $cmt_msg);
                         }
                         $orig = undef;
+                        if ($need_reindex) { # ->replace
+                                push @$need_reindex, $smsg;
+                        } else { # ->purge or ->remove
+                                $self->{mm}->num_delete($num);
+                        }
                         unindex_oid_remote($self, $oid, $mid);
                 }
         }
@@ -399,8 +413,9 @@ sub remove_internal ($$$$) {
                 my $cmt = $im->get_mark($mark);
                 $self->{last_commit}->[$self->{epoch_max}] = $cmt;
         }
-        if ($purge && scalar keys %$purge) {
-                return purge_oids($self, $purge);
+        if ($replace_map && scalar keys %$replace_map) {
+                my $rewrites = _replace_oids($self, $new_mime, $replace_map);
+                return { rewrites => $rewrites, need_reindex => $need_reindex };
         }
         $removed;
 }
@@ -409,22 +424,122 @@ sub remove_internal ($$$$) {
 sub remove {
         my ($self, $mime, $cmt_msg) = @_;
         $self->{-inbox}->with_umask(sub {
-                remove_internal($self, $mime, $cmt_msg, undef);
+                rewrite_internal($self, $mime, $cmt_msg);
         });
 }
 
+sub _replace ($$;$$) {
+        my ($self, $old_mime, $new_mime, $sref) = @_;
+        my $rewritten = $self->{-inbox}->with_umask(sub {
+                rewrite_internal($self, $old_mime, undef, $new_mime, $sref);
+        }) or return;
+
+        my $rewrites = $rewritten->{rewrites};
+        # ->done is called if there are rewrites since we gc+prune from git
+        $self->idx_init if @$rewrites;
+
+        for my $i (0..$#$rewrites) {
+                defined(my $cmt = $rewrites->[$i]) or next;
+                $self->{last_commit}->[$i] = $cmt;
+        }
+        $rewritten;
+}
+
 # public
 sub purge {
         my ($self, $mime) = @_;
-        my $purges = $self->{-inbox}->with_umask(sub {
-                remove_internal($self, $mime, undef, {});
-        }) or return;
-        $self->idx_init if @$purges; # ->done is called on purges
-        for my $i (0..$#$purges) {
-                defined(my $cmt = $purges->[$i]) or next;
-                $self->{last_commit}->[$i] = $cmt;
+        my $rewritten = _replace($self, $mime, undef, \'') or return;
+        $rewritten->{rewrites}
+}
+
+# returns the git object_id of $fh, does not write the object to FS
+sub git_hash_raw ($$) {
+        my ($self, $raw) = @_;
+        # grab the expected OID we have to reindex:
+        open my $tmp_fh, '+>', undef or die "failed to open tmp: $!";
+        $tmp_fh->autoflush(1);
+        print $tmp_fh $$raw or die "print \$tmp_fh: $!";
+        sysseek($tmp_fh, 0, 0) or die "seek failed: $!";
+
+        my ($r, $w);
+        pipe($r, $w) or die "failed to create pipe: $!";
+        my $rdr = { 0 => fileno($tmp_fh), 1 => fileno($w) };
+        my $git_dir = $self->{-inbox}->git->{git_dir};
+        my $cmd = ['git', "--git-dir=$git_dir", qw(hash-object --stdin)];
+        my $pid = spawn($cmd, undef, $rdr);
+        close $w;
+        local $/ = "\n";
+        chomp(my $oid = <$r>);
+        waitpid($pid, 0) == $pid or die "git hash-object did not finish";
+        die "git hash-object failed: $?" if $?;
+        $oid =~ /\A[a-f0-9]{40}\z/ or die "OID not expected: $oid";
+        $oid;
+}
+
+sub _check_mids_match ($$$) {
+        my ($old_list, $new_list, $hdrs) = @_;
+        my %old_mids = map { $_ => 1 } @$old_list;
+        my %new_mids = map { $_ => 1 } @$new_list;
+        my @old = keys %old_mids;
+        my @new = keys %new_mids;
+        my $err = "$hdrs may not be changed when replacing\n";
+        die $err if scalar(@old) != scalar(@new);
+        delete @new_mids{@old};
+        delete @old_mids{@new};
+        die $err if (scalar(keys %old_mids) || scalar(keys %new_mids));
+}
+
+# Changing Message-IDs or References with ->replace isn't supported.
+# The rules for dealing with messages with multiple or conflicting
+# Message-IDs are pretty complex and rethreading hasn't been fully
+# implemented, yet.
+sub check_mids_match ($$) {
+        my ($old_mime, $new_mime) = @_;
+        my $old = $old_mime->header_obj;
+        my $new = $new_mime->header_obj;
+        _check_mids_match(mids($old), mids($new), 'Message-ID(s)');
+        _check_mids_match(references($old), references($new),
+                        'References/In-Reply-To');
+}
+
+# public
+sub replace ($$$) {
+        my ($self, $old_mime, $new_mime) = @_;
+
+        check_mids_match($old_mime, $new_mime);
+
+        # mutt will always add Content-Length:, Status:, Lines: when editing
+        PublicInbox::Import::drop_unwanted_headers($new_mime);
+
+        my $raw = $new_mime->as_string;
+        my $expect_oid = git_hash_raw($self, \$raw);
+        my $rewritten = _replace($self, $old_mime, $new_mime, \$raw) or return;
+        my $need_reindex = $rewritten->{need_reindex};
+
+        # just in case we have bugs in deduplication code:
+        my $n = scalar(@$need_reindex);
+        if ($n > 1) {
+                my $list = join(', ', map {
+                                        "$_->{num}: <$_->{mid}>"
+                                } @$need_reindex);
+                warn <<"";
+W: rewritten $n messages matching content of original message (expected: 1).
+W: possible bug in public-inbox, NNTP article IDs and Message-IDs follow:
+W: $list
+
+        }
+
+        # make sure we really got the OID:
+        my ($oid, $type, $len) = $self->{-inbox}->git->check($expect_oid);
+        $oid eq $expect_oid or die "BUG: $expect_oid not found after replace";
+
+        # reindex modified messages:
+        for my $smsg (@$need_reindex) {
+                my $num = $smsg->{num};
+                my $mid0 = $smsg->{mid};
+                do_idx($self, \$raw, $new_mime, $len, $num, $oid, $mid0);
         }
-        $purges;
+        $rewritten->{rewrites};
 }
 
 sub last_commit_part ($$;$) {
diff --git a/t/replace.t b/t/replace.t
new file mode 100644
index 00000000..6fae5511
--- /dev/null
+++ b/t/replace.t
@@ -0,0 +1,199 @@
+# Copyright (C) 2019 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use warnings;
+use Test::More;
+use PublicInbox::MIME;
+use PublicInbox::InboxWritable;
+use File::Temp qw/tempdir/;
+require './t/common.perl';
+require_git(2.6); # replace is v2 only, for now...
+foreach my $mod (qw(DBD::SQLite)) {
+        eval "require $mod";
+        plan skip_all => "$mod missing for $0" if $@;
+}
+
+sub test_replace ($$$) {
+        my ($v, $level, $opt) = @_;
+        diag "v$v $level replace";
+        my $this = "pi-$v-$level-replace";
+        my $tmpdir = tempdir("$this-tmp-XXXXXX", TMPDIR => 1, CLEANUP => 1);
+        my $ibx = PublicInbox::Inbox->new({
+                mainrepo => "$tmpdir/testbox",
+                name => $this,
+                version => $v,
+                -primary_address => 'test@example.com',
+                indexlevel => $level,
+        });
+
+        my $orig = PublicInbox::MIME->new(<<'EOF');
+From: Barbra Streisand <effect@example.com>
+To: test@example.com
+Subject: confidential
+Message-ID: <replace@example.com>
+Date: Fri, 02 Oct 1993 00:00:00 +0000
+
+Top secret info about my house in Malibu...
+EOF
+        my $im = PublicInbox::InboxWritable->new($ibx, {nproc=>1})->importer;
+        # fake a bunch of epochs
+        $im->{rotate_bytes} = $opt->{rotate_bytes} if $opt->{rotate_bytes};
+
+        if ($opt->{pre}) {
+                $opt->{pre}->($im, 1, 2);
+                $orig->header_set('References', '<1@example.com>');
+        }
+        ok($im->add($orig), 'add message to be replaced');
+        if ($opt->{post}) {
+                $opt->{post}->($im, 3, { 4 => 'replace@example.com' });
+        }
+        $im->done;
+        my $thread_a = $ibx->over->get_thread('replace@example.com');
+
+        my %before = map {; delete($_->{blob}) => $_ } @{$ibx->recent};
+        my $reject = PublicInbox::MIME->new($orig->as_string);
+        foreach my $mid (['<replace@example.com>', '<extra@example.com>'],
+                                [], ['<replaced@example.com>']) {
+                $reject->header_set('Message-ID', @$mid);
+                my $ok = eval { $im->replace($orig, $reject) };
+                like($@, qr/Message-ID.*may not be changed/,
+                        '->replace died on Message-ID change');
+                ok(!$ok, 'no replacement happened');
+        }
+
+        # prepare the replacement
+        my $expect = "Move along, nothing to see here\n";
+        my $repl = PublicInbox::MIME->new($orig->as_string);
+        $repl->header_set('From', '<redactor@example.com>');
+        $repl->header_set('Subject', 'redacted');
+        $repl->header_set('Date', 'Sat, 02 Oct 2010 00:00:00 +0000');
+        $repl->body_str_set($expect);
+
+        my @warn;
+        local $SIG{__WARN__} = sub { push @warn, @_ };
+        ok(my $cmts = $im->replace($orig, $repl), 'replaced message');
+        my $changed_epochs = 0;
+        for my $tip (@$cmts) {
+                next if !defined $tip;
+                $changed_epochs++;
+                like($tip, qr/\A[a-f0-9]{40}\z/,
+                        'replace returned current commit');
+        }
+        is($changed_epochs, 1, 'only one epoch changed');
+
+        $im->done;
+        my $m = PublicInbox::MIME->new($ibx->msg_by_mid('replace@example.com'));
+        is($m->body, $expect, 'replaced message');
+        is_deeply(\@warn, [], 'no warnings on noop');
+
+        my @cat = qw(cat-file --buffer --batch --batch-all-objects);
+        my $git = $ibx->git;
+        my @all = $git->qx(@cat);
+        is_deeply([grep(/confidential/, @all)], [], 'nothing confidential');
+        is_deeply([grep(/Streisand/, @all)], [], 'Streisand who?');
+        is_deeply([grep(/\bOct 1993\b/, @all)], [], 'nothing from Oct 1993');
+        my $t19931002 = qr/ 749520000 /;
+        is_deeply([grep(/$t19931002/, @all)], [], "nothing matches $t19931002");
+
+        for my $dir (glob("$ibx->{mainrepo}/git/*.git")) {
+                my ($bn) = ($dir =~ m!([^/]+)\z!);
+                is(system(qw(git --git-dir), $dir, qw(fsck --strict)), 0,
+                        "git fsck is clean in epoch $bn");
+        }
+
+        my $thread_b = $ibx->over->get_thread('replace@example.com');
+        is_deeply([sort map { $_->{mid} } @$thread_b],
+                [sort map { $_->{mid} } @$thread_a], 'threading preserved');
+
+        if (my $srch = $ibx->search) {
+                for my $q ('f:streisand', 's:confidential', 'malibu') {
+                        my $msgs = $srch->query($q);
+                        is_deeply($msgs, [], "no match for $q");
+                }
+                my @ok = ('f:redactor', 's:redacted', 'nothing to see');
+                if ($opt->{pre}) {
+                        push @ok, 'm:1@example.com', 'm:2@example.com',
+                                's:message2', 's:message1';
+                }
+                if ($opt->{post}) {
+                        push @ok, 'm:3@example.com', 'm:4@example.com',
+                                's:message3', 's:message4';
+                }
+                for my $q (@ok) {
+                        my $msgs = $srch->query($q);
+                        ok($msgs->[0], "got match for $q");
+                }
+        }
+
+        # check overview matches:
+        my %after = map {; delete($_->{blob}) => $_ } @{$ibx->recent};
+        my @before_blobs = keys %before;
+        foreach my $blob (@before_blobs) {
+                delete $before{$blob} if delete $after{$blob};
+        }
+
+        is(scalar keys %before, 1, 'one unique blob from before left');
+        is(scalar keys %after, 1, 'one unique blob from after left');
+        foreach my $blob (keys %before) {
+                is($git->check($blob), undef, 'old blob not found');
+                my $smsg = $before{$blob};
+                is($smsg->{subject}, 'confidential', 'before subject');
+                is($smsg->{mid}, 'replace@example.com', 'before MID');
+        }
+        foreach my $blob (keys %after) {
+                ok($git->check($blob), 'new blob found');
+                my $smsg = $after{$blob};
+                is($smsg->{subject}, 'redacted', 'after subject');
+                is($smsg->{mid}, 'replace@example.com', 'before MID');
+        }
+        @warn = ();
+        is($im->replace($orig, $repl), undef, 'no-op replace returns undef');
+        is($im->purge($orig), undef, 'no-op purge returns undef');
+        is_deeply(\@warn, [], 'no warnings on noop');
+}
+
+sub pad_msgs {
+        my ($im, @range) = @_;
+        for my $i (@range) {
+                my $irt;
+                if (ref($i) eq 'HASH') {
+                        ($i, $irt) = each %$i;
+                }
+                my $sec = sprintf('%0d', $i);
+                my $mime = PublicInbox::MIME->new(<<EOF);
+From: foo\@example.com
+To: test\@example.com
+Message-ID: <$i\@example.com>
+Date: Fri, 02, Jan 1970 00:00:$sec +0000
+Subject: message$i
+
+message number$i
+EOF
+
+                if (defined($irt)) {
+                        $mime->header_set('References', "<$irt>");
+                }
+
+                $im->add($mime);
+        }
+}
+
+my $opt = { pre => *pad_msgs };
+test_replace(2, 'basic', {});
+test_replace(2, 'basic', $opt);
+test_replace(2, 'basic', $opt = { %$opt, post => *pad_msgs });
+test_replace(2, 'basic', $opt = { %$opt, rotate_bytes => 1 });
+
+SKIP: if ('test xapian') {
+        require PublicInbox::Search;
+        PublicInbox::Search::load_xapian() or skip 'Search::Xapian missing', 8;
+        for my $l (qw(medium)) {
+                test_replace(2, $l, {});
+                $opt = { pre => *pad_msgs };
+                test_replace(2, $l, $opt);
+                test_replace(2, $l, $opt = { %$opt, post => *pad_msgs });
+                test_replace(2, $l, $opt = { %$opt, rotate_bytes => 1 });
+        }
+};
+
+done_testing();