From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 4D8E31F97F for ; Sun, 9 Jun 2019 02:51:48 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Subject: [PATCH 04/11] v2writable: implement ->replace call Date: Sun, 9 Jun 2019 02:51:40 +0000 Message-Id: <20190609025147.24966-5-e@80x24.org> In-Reply-To: <20190609025147.24966-1-e@80x24.org> References: <20190609025147.24966-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Much of the existing purge code is repurposed to a general "replace" functionality. ->purge is simpler because it can just drop the information. Unlike ->purge, ->replace needs to edit existing git commits (in case of From: and Subject: headers) and reindex the modified message. We currently disallow editing of References:, In-Reply-To: and Message-ID headers because it can cause bad side effects with our threading (and our lack of rethreading support to deal with excessive matching from incorrect/invalid References). --- MANIFEST | 1 + lib/PublicInbox/Import.pm | 44 +++++--- lib/PublicInbox/V2Writable.pm | 171 ++++++++++++++++++++++++----- t/replace.t | 199 ++++++++++++++++++++++++++++++++++ 4 files changed, 372 insertions(+), 43 deletions(-) create mode 100644 t/replace.t diff --git a/MANIFEST b/MANIFEST index 5085bff..321da03 100644 --- a/MANIFEST +++ b/MANIFEST @@ -236,6 +236,7 @@ t/psgi_text.t t/psgi_v2.t t/purge.t t/qspawn.t +t/replace.t t/reply.t t/search-thr-index.t t/search.t diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 2c8fe84..137b2b7 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -484,26 +484,38 @@ sub digest2mid ($$) { "$dt.$b64" . '@z'; } -sub clean_purge_buffer { - my ($oids, $buf) = @_; - my $cmt_msg = 'purged '.join(' ',@$oids)."\n"; +sub rewrite_commit ($$$$) { + my ($self, $oids, $buf, $mime) = @_; + my ($name, $email, $at, $ct, $subject); + if ($mime) { + ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime); + } else { + $name = $email = ''; + $subject = 'purged '.join(' ', @$oids); + } @$oids = (); - + $subject .= "\n"; foreach my $i (0..$#$buf) { my $l = $buf->[$i]; if ($l =~ /^author .* ([0-9]+ [\+-]?[0-9]+)$/) { - $buf->[$i] = "author <> $1\n"; + $at //= $1; + $buf->[$i] = "author $name <$email> $at\n"; + } elsif ($l =~ /^committer .* ([0-9]+ [\+-]?[0-9]+)$/) { + $ct //= $1; + $buf->[$i] = "committer $self->{ident} $ct\n"; } elsif ($l =~ /^data ([0-9]+)/) { - $buf->[$i++] = "data " . length($cmt_msg) . "\n"; - $buf->[$i] = $cmt_msg; + $buf->[$i++] = "data " . length($subject) . "\n"; + $buf->[$i] = $subject; last; } } } +# returns the new commit OID if a replacement was done +# returns undef if nothing was done sub replace_oids { - my ($self, $replace) = @_; # oid => raw string - my $tmp = "refs/heads/replace-".((keys %$replace)[0]); + my ($self, $mime, $replace_map) = @_; # oid => raw string + my $tmp = "refs/heads/replace-".((keys %$replace_map)[0]); my $old = $self->{'ref'}; my $git = $self->{git}; my @export = (qw(fast-export --no-data --use-done-feature), $old); @@ -533,7 +545,7 @@ sub replace_oids { } elsif (/^M 100644 ([a-f0-9]+) (\w+)/) { my ($oid, $path) = ($1, $2); $tree->{$path} = 1; - my $sref = $replace->{$oid}; + my $sref = $replace_map->{$oid}; if (defined $sref) { push @oids, $oid; my $n = length($$sref); @@ -548,10 +560,12 @@ sub replace_oids { push @buf, $_ if $tree->{$path}; } elsif ($_ eq "\n") { if (@oids) { - my $out = join('', @buf); - $out =~ s/^/# /sgm; - warn "purge rewriting\n", $out, "\n"; - clean_purge_buffer(\@oids, \@buf); + if (!$mime) { + my $out = join('', @buf); + $out =~ s/^/# /sgm; + warn "purge rewriting\n", $out, "\n"; + } + rewrite_commit($self, \@oids, \@buf, $mime); $nreplace++; } $w->print(@buf, "\n") or wfail; @@ -585,7 +599,7 @@ sub replace_oids { # check that old OIDs are gone my $err = 0; - foreach my $oid (keys %$replace) { + foreach my $oid (keys %$replace_map) { my @info = $git->check($oid); if (@info) { warn "$oid not replaced\n"; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index d6f72b0..3484807 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -11,7 +11,7 @@ use PublicInbox::SearchIdxPart; use PublicInbox::MIME; use PublicInbox::Git; use PublicInbox::Import; -use PublicInbox::MID qw(mids); +use PublicInbox::MID qw(mids references); use PublicInbox::ContentId qw(content_id content_digest); use PublicInbox::Inbox; use PublicInbox::OverIdx; @@ -297,26 +297,30 @@ sub idx_init { }); } -sub purge_oids ($$) { - my ($self, $purge) = @_; # $purge = { $object_id => \'', ... } +# returns an array mapping [ epoch => latest_commit ] +# latest_commit may be undef if nothing was done to that epoch +# $replace_map = { $object_id => $strref, ... } +sub _replace_oids ($$$) { + my ($self, $mime, $replace_map) = @_; $self->done; my $pfx = "$self->{-inbox}->{mainrepo}/git"; - my $purges = []; + my $rewrites = []; # epoch => commit my $max = $self->{epoch_max}; unless (defined($max)) { defined(my $latest = git_dir_latest($self, \$max)) or return; $self->{epoch_max} = $max; } + foreach my $i (0..$max) { my $git_dir = "$pfx/$i.git"; -d $git_dir or next; my $git = PublicInbox::Git->new($git_dir); my $im = $self->import_init($git, 0, 1); - $purges->[$i] = $im->replace_oids($purge); + $rewrites->[$i] = $im->replace_oids($mime, $replace_map); $im->done; } - $purges; + $rewrites; } sub content_ids ($) { @@ -339,25 +343,31 @@ sub content_matches ($$) { 0 } -sub remove_internal ($$$$) { - my ($self, $mime, $cmt_msg, $purge) = @_; +# used for removing or replacing (purging) +sub rewrite_internal ($$;$$$) { + my ($self, $old_mime, $cmt_msg, $new_mime, $sref) = @_; $self->idx_init; - my $im = $self->importer unless $purge; + my ($im, $need_reindex, $replace_map); + if ($sref) { + $replace_map = {}; # oid => sref + $need_reindex = [] if $new_mime; + } else { + $im = $self->importer; + } my $over = $self->{over}; - my $cids = content_ids($mime); + my $cids = content_ids($old_mime); my $parts = $self->{idx_parts}; - my $mm = $self->{mm}; my $removed; - my $mids = mids($mime->header_obj); + my $mids = mids($old_mime->header_obj); # We avoid introducing new blobs into git since the raw content # can be slightly different, so we do not need the user-supplied # message now that we have the mids and content_id - $mime = undef; + $old_mime = undef; my $mark; foreach my $mid (@$mids) { - my %gone; + my %gone; # num => [ smsg, raw ] my ($id, $prev); while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) { my $msg = get_blob($self, $smsg); @@ -380,17 +390,21 @@ sub remove_internal ($$$$) { } foreach my $num (keys %gone) { my ($smsg, $orig) = @{$gone{$num}}; - $mm->num_delete($num); # $removed should only be set once assuming # no bugs in our deduplication code: $removed = $smsg; my $oid = $smsg->{blob}; - if ($purge) { - $purge->{$oid} = \''; + if ($replace_map) { + $replace_map->{$oid} = $sref; } else { ($mark, undef) = $im->remove($orig, $cmt_msg); } $orig = undef; + if ($need_reindex) { # ->replace + push @$need_reindex, $smsg; + } else { # ->purge or ->remove + $self->{mm}->num_delete($num); + } unindex_oid_remote($self, $oid, $mid); } } @@ -399,8 +413,9 @@ sub remove_internal ($$$$) { my $cmt = $im->get_mark($mark); $self->{last_commit}->[$self->{epoch_max}] = $cmt; } - if ($purge && scalar keys %$purge) { - return purge_oids($self, $purge); + if ($replace_map && scalar keys %$replace_map) { + my $rewrites = _replace_oids($self, $new_mime, $replace_map); + return { rewrites => $rewrites, need_reindex => $need_reindex }; } $removed; } @@ -409,22 +424,122 @@ sub remove_internal ($$$$) { sub remove { my ($self, $mime, $cmt_msg) = @_; $self->{-inbox}->with_umask(sub { - remove_internal($self, $mime, $cmt_msg, undef); + rewrite_internal($self, $mime, $cmt_msg); }); } +sub _replace ($$;$$) { + my ($self, $old_mime, $new_mime, $sref) = @_; + my $rewritten = $self->{-inbox}->with_umask(sub { + rewrite_internal($self, $old_mime, undef, $new_mime, $sref); + }) or return; + + my $rewrites = $rewritten->{rewrites}; + # ->done is called if there are rewrites since we gc+prune from git + $self->idx_init if @$rewrites; + + for my $i (0..$#$rewrites) { + defined(my $cmt = $rewrites->[$i]) or next; + $self->{last_commit}->[$i] = $cmt; + } + $rewritten; +} + # public sub purge { my ($self, $mime) = @_; - my $purges = $self->{-inbox}->with_umask(sub { - remove_internal($self, $mime, undef, {}); - }) or return; - $self->idx_init if @$purges; # ->done is called on purges - for my $i (0..$#$purges) { - defined(my $cmt = $purges->[$i]) or next; - $self->{last_commit}->[$i] = $cmt; + my $rewritten = _replace($self, $mime, undef, \'') or return; + $rewritten->{rewrites} +} + +# returns the git object_id of $fh, does not write the object to FS +sub git_hash_raw ($$) { + my ($self, $raw) = @_; + # grab the expected OID we have to reindex: + open my $tmp_fh, '+>', undef or die "failed to open tmp: $!"; + $tmp_fh->autoflush(1); + print $tmp_fh $$raw or die "print \$tmp_fh: $!"; + sysseek($tmp_fh, 0, 0) or die "seek failed: $!"; + + my ($r, $w); + pipe($r, $w) or die "failed to create pipe: $!"; + my $rdr = { 0 => fileno($tmp_fh), 1 => fileno($w) }; + my $git_dir = $self->{-inbox}->git->{git_dir}; + my $cmd = ['git', "--git-dir=$git_dir", qw(hash-object --stdin)]; + my $pid = spawn($cmd, undef, $rdr); + close $w; + local $/ = "\n"; + chomp(my $oid = <$r>); + waitpid($pid, 0) == $pid or die "git hash-object did not finish"; + die "git hash-object failed: $?" if $?; + $oid =~ /\A[a-f0-9]{40}\z/ or die "OID not expected: $oid"; + $oid; +} + +sub _check_mids_match ($$$) { + my ($old_list, $new_list, $hdrs) = @_; + my %old_mids = map { $_ => 1 } @$old_list; + my %new_mids = map { $_ => 1 } @$new_list; + my @old = keys %old_mids; + my @new = keys %new_mids; + my $err = "$hdrs may not be changed when replacing\n"; + die $err if scalar(@old) != scalar(@new); + delete @new_mids{@old}; + delete @old_mids{@new}; + die $err if (scalar(keys %old_mids) || scalar(keys %new_mids)); +} + +# Changing Message-IDs or References with ->replace isn't supported. +# The rules for dealing with messages with multiple or conflicting +# Message-IDs are pretty complex and rethreading hasn't been fully +# implemented, yet. +sub check_mids_match ($$) { + my ($old_mime, $new_mime) = @_; + my $old = $old_mime->header_obj; + my $new = $new_mime->header_obj; + _check_mids_match(mids($old), mids($new), 'Message-ID(s)'); + _check_mids_match(references($old), references($new), + 'References/In-Reply-To'); +} + +# public +sub replace ($$$) { + my ($self, $old_mime, $new_mime) = @_; + + check_mids_match($old_mime, $new_mime); + + # mutt will always add Content-Length:, Status:, Lines: when editing + PublicInbox::Import::drop_unwanted_headers($new_mime); + + my $raw = $new_mime->as_string; + my $expect_oid = git_hash_raw($self, \$raw); + my $rewritten = _replace($self, $old_mime, $new_mime, \$raw) or return; + my $need_reindex = $rewritten->{need_reindex}; + + # just in case we have bugs in deduplication code: + my $n = scalar(@$need_reindex); + if ($n > 1) { + my $list = join(', ', map { + "$_->{num}: <$_->{mid}>" + } @$need_reindex); + warn <<""; +W: rewritten $n messages matching content of original message (expected: 1). +W: possible bug in public-inbox, NNTP article IDs and Message-IDs follow: +W: $list + + } + + # make sure we really got the OID: + my ($oid, $type, $len) = $self->{-inbox}->git->check($expect_oid); + $oid eq $expect_oid or die "BUG: $expect_oid not found after replace"; + + # reindex modified messages: + for my $smsg (@$need_reindex) { + my $num = $smsg->{num}; + my $mid0 = $smsg->{mid}; + do_idx($self, \$raw, $new_mime, $len, $num, $oid, $mid0); } - $purges; + $rewritten->{rewrites}; } sub last_commit_part ($$;$) { diff --git a/t/replace.t b/t/replace.t new file mode 100644 index 0000000..6fae551 --- /dev/null +++ b/t/replace.t @@ -0,0 +1,199 @@ +# Copyright (C) 2019 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use PublicInbox::MIME; +use PublicInbox::InboxWritable; +use File::Temp qw/tempdir/; +require './t/common.perl'; +require_git(2.6); # replace is v2 only, for now... +foreach my $mod (qw(DBD::SQLite)) { + eval "require $mod"; + plan skip_all => "$mod missing for $0" if $@; +} + +sub test_replace ($$$) { + my ($v, $level, $opt) = @_; + diag "v$v $level replace"; + my $this = "pi-$v-$level-replace"; + my $tmpdir = tempdir("$this-tmp-XXXXXX", TMPDIR => 1, CLEANUP => 1); + my $ibx = PublicInbox::Inbox->new({ + mainrepo => "$tmpdir/testbox", + name => $this, + version => $v, + -primary_address => 'test@example.com', + indexlevel => $level, + }); + + my $orig = PublicInbox::MIME->new(<<'EOF'); +From: Barbra Streisand +To: test@example.com +Subject: confidential +Message-ID: +Date: Fri, 02 Oct 1993 00:00:00 +0000 + +Top secret info about my house in Malibu... +EOF + my $im = PublicInbox::InboxWritable->new($ibx, {nproc=>1})->importer; + # fake a bunch of epochs + $im->{rotate_bytes} = $opt->{rotate_bytes} if $opt->{rotate_bytes}; + + if ($opt->{pre}) { + $opt->{pre}->($im, 1, 2); + $orig->header_set('References', '<1@example.com>'); + } + ok($im->add($orig), 'add message to be replaced'); + if ($opt->{post}) { + $opt->{post}->($im, 3, { 4 => 'replace@example.com' }); + } + $im->done; + my $thread_a = $ibx->over->get_thread('replace@example.com'); + + my %before = map {; delete($_->{blob}) => $_ } @{$ibx->recent}; + my $reject = PublicInbox::MIME->new($orig->as_string); + foreach my $mid (['', ''], + [], ['']) { + $reject->header_set('Message-ID', @$mid); + my $ok = eval { $im->replace($orig, $reject) }; + like($@, qr/Message-ID.*may not be changed/, + '->replace died on Message-ID change'); + ok(!$ok, 'no replacement happened'); + } + + # prepare the replacement + my $expect = "Move along, nothing to see here\n"; + my $repl = PublicInbox::MIME->new($orig->as_string); + $repl->header_set('From', ''); + $repl->header_set('Subject', 'redacted'); + $repl->header_set('Date', 'Sat, 02 Oct 2010 00:00:00 +0000'); + $repl->body_str_set($expect); + + my @warn; + local $SIG{__WARN__} = sub { push @warn, @_ }; + ok(my $cmts = $im->replace($orig, $repl), 'replaced message'); + my $changed_epochs = 0; + for my $tip (@$cmts) { + next if !defined $tip; + $changed_epochs++; + like($tip, qr/\A[a-f0-9]{40}\z/, + 'replace returned current commit'); + } + is($changed_epochs, 1, 'only one epoch changed'); + + $im->done; + my $m = PublicInbox::MIME->new($ibx->msg_by_mid('replace@example.com')); + is($m->body, $expect, 'replaced message'); + is_deeply(\@warn, [], 'no warnings on noop'); + + my @cat = qw(cat-file --buffer --batch --batch-all-objects); + my $git = $ibx->git; + my @all = $git->qx(@cat); + is_deeply([grep(/confidential/, @all)], [], 'nothing confidential'); + is_deeply([grep(/Streisand/, @all)], [], 'Streisand who?'); + is_deeply([grep(/\bOct 1993\b/, @all)], [], 'nothing from Oct 1993'); + my $t19931002 = qr/ 749520000 /; + is_deeply([grep(/$t19931002/, @all)], [], "nothing matches $t19931002"); + + for my $dir (glob("$ibx->{mainrepo}/git/*.git")) { + my ($bn) = ($dir =~ m!([^/]+)\z!); + is(system(qw(git --git-dir), $dir, qw(fsck --strict)), 0, + "git fsck is clean in epoch $bn"); + } + + my $thread_b = $ibx->over->get_thread('replace@example.com'); + is_deeply([sort map { $_->{mid} } @$thread_b], + [sort map { $_->{mid} } @$thread_a], 'threading preserved'); + + if (my $srch = $ibx->search) { + for my $q ('f:streisand', 's:confidential', 'malibu') { + my $msgs = $srch->query($q); + is_deeply($msgs, [], "no match for $q"); + } + my @ok = ('f:redactor', 's:redacted', 'nothing to see'); + if ($opt->{pre}) { + push @ok, 'm:1@example.com', 'm:2@example.com', + 's:message2', 's:message1'; + } + if ($opt->{post}) { + push @ok, 'm:3@example.com', 'm:4@example.com', + 's:message3', 's:message4'; + } + for my $q (@ok) { + my $msgs = $srch->query($q); + ok($msgs->[0], "got match for $q"); + } + } + + # check overview matches: + my %after = map {; delete($_->{blob}) => $_ } @{$ibx->recent}; + my @before_blobs = keys %before; + foreach my $blob (@before_blobs) { + delete $before{$blob} if delete $after{$blob}; + } + + is(scalar keys %before, 1, 'one unique blob from before left'); + is(scalar keys %after, 1, 'one unique blob from after left'); + foreach my $blob (keys %before) { + is($git->check($blob), undef, 'old blob not found'); + my $smsg = $before{$blob}; + is($smsg->{subject}, 'confidential', 'before subject'); + is($smsg->{mid}, 'replace@example.com', 'before MID'); + } + foreach my $blob (keys %after) { + ok($git->check($blob), 'new blob found'); + my $smsg = $after{$blob}; + is($smsg->{subject}, 'redacted', 'after subject'); + is($smsg->{mid}, 'replace@example.com', 'before MID'); + } + @warn = (); + is($im->replace($orig, $repl), undef, 'no-op replace returns undef'); + is($im->purge($orig), undef, 'no-op purge returns undef'); + is_deeply(\@warn, [], 'no warnings on noop'); +} + +sub pad_msgs { + my ($im, @range) = @_; + for my $i (@range) { + my $irt; + if (ref($i) eq 'HASH') { + ($i, $irt) = each %$i; + } + my $sec = sprintf('%0d', $i); + my $mime = PublicInbox::MIME->new(< +Date: Fri, 02, Jan 1970 00:00:$sec +0000 +Subject: message$i + +message number$i +EOF + + if (defined($irt)) { + $mime->header_set('References', "<$irt>"); + } + + $im->add($mime); + } +} + +my $opt = { pre => *pad_msgs }; +test_replace(2, 'basic', {}); +test_replace(2, 'basic', $opt); +test_replace(2, 'basic', $opt = { %$opt, post => *pad_msgs }); +test_replace(2, 'basic', $opt = { %$opt, rotate_bytes => 1 }); + +SKIP: if ('test xapian') { + require PublicInbox::Search; + PublicInbox::Search::load_xapian() or skip 'Search::Xapian missing', 8; + for my $l (qw(medium)) { + test_replace(2, $l, {}); + $opt = { pre => *pad_msgs }; + test_replace(2, $l, $opt); + test_replace(2, $l, $opt = { %$opt, post => *pad_msgs }); + test_replace(2, $l, $opt = { %$opt, rotate_bytes => 1 }); + } +}; + +done_testing(); -- EW