about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r-- lib/PublicInbox/Admin.pm | 75
-rw-r--r-- lib/PublicInbox/AdminEdit.pm | 67
-rw-r--r-- lib/PublicInbox/Git.pm | 43
-rw-r--r-- lib/PublicInbox/Import.pm | 101
-rw-r--r-- lib/PublicInbox/NNTP.pm | 27
-rw-r--r-- lib/PublicInbox/SearchIdx.pm | 6
-rw-r--r-- lib/PublicInbox/SearchMsg.pm | 6
-rw-r--r-- lib/PublicInbox/V2Writable.pm | 203
8 files changed, 384 insertions, 144 deletions
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index 4a862c6d..8a2f2043 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -9,6 +9,8 @@ use warnings;
 use Cwd 'abs_path';
 use base qw(Exporter);
 our @EXPORT_OK = qw(resolve_repo_dir);
+my $CFG; # all the admin stuff is a singleton
+require PublicInbox::Config;
 
 sub resolve_repo_dir {
         my ($cd, $ver) = @_;
@@ -66,36 +68,65 @@ $ibx->{mainrepo} has unexpected indexlevel in Xapian: $m
         $l;
 }
 
-sub resolve_inboxes {
-        my ($argv, $warn_on_unconfigured) = @_;
-        require PublicInbox::Config;
+sub unconfigured_ibx ($$) {
+        my ($dir, $i) = @_;
+        my $name = "unconfigured-$i";
+        PublicInbox::Inbox->new({
+                name => $name,
+                address => [ "$name\@example.com" ],
+                mainrepo => $dir,
+                # TODO: consumers may want to warn on this:
+                #-unconfigured => 1,
+        });
+}
+
+sub config () { $CFG //= eval { PublicInbox::Config->new } }
+
+sub resolve_inboxes ($;$) {
+        my ($argv, $opt) = @_;
         require PublicInbox::Inbox;
+        $opt ||= {};
 
-        my @ibxs = map { resolve_repo_dir($_) } @$argv;
-        push(@ibxs, resolve_repo_dir()) unless @ibxs;
+        my $cfg = config();
+        if ($opt->{all}) {
+                my $cfgfile = PublicInbox::Config::default_file();
+                $cfg or die "--all specified, but $cfgfile not readable\n";
+                @$argv and die "--all specified, but directories specified\n";
+        }
 
+        my $min_ver = $opt->{-min_inbox_version} || 0;
+        my (@old, @ibxs);
         my %dir2ibx;
-        if (my $config = eval { PublicInbox::Config->new }) {
-                $config->each_inbox(sub {
+        if ($cfg) {
+                $cfg->each_inbox(sub {
                         my ($ibx) = @_;
+                        $ibx->{version} ||= 1;
                         $dir2ibx{abs_path($ibx->{mainrepo})} = $ibx;
                 });
-        } elsif ($warn_on_unconfigured) {
-                # do we really care about this?  It's annoying...
-                warn $warn_on_unconfigured, "\n";
         }
-        for my $i (0..$#ibxs) {
-                my $dir = $ibxs[$i];
-                $ibxs[$i] = $dir2ibx{$dir} ||= do {
-                        my $name = "unconfigured-$i";
-                        PublicInbox::Inbox->new({
-                                name => $name,
-                                address => [ "$name\@example.com" ],
-                                mainrepo => $dir,
-                                # TODO: consumers may want to warn on this:
-                                #-unconfigured => 1,
-                        });
-                };
+        if ($opt->{all}) {
+                my @all = values %dir2ibx;
+                @all = grep { $_->{version} >= $min_ver } @all;
+                push @ibxs, @all;
+        } else { # directories specified on the command-line
+                my $i = 0;
+                my @dirs = @$argv;
+                push @dirs, '.' unless @dirs;
+                foreach (@dirs) {
+                        my $v;
+                        my $dir = resolve_repo_dir($_, \$v);
+                        if ($v < $min_ver) {
+                                push @old, $dir;
+                                next;
+                        }
+                        my $ibx = $dir2ibx{$dir} ||= unconfigured_ibx($dir, $i);
+                        $i++;
+                        push @ibxs, $ibx;
+                }
+        }
+        if (@old) {
+                die "inboxes $min_ver inboxes not supported by $0\n\t",
+                    join("\n\t", @old), "\n";
         }
         @ibxs;
 }
diff --git a/lib/PublicInbox/AdminEdit.pm b/lib/PublicInbox/AdminEdit.pm
new file mode 100644
index 00000000..169feba0
--- /dev/null
+++ b/lib/PublicInbox/AdminEdit.pm
@@ -0,0 +1,67 @@
+# Copyright (C) 2019 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# common stuff between -edit, -purge (and maybe -learn in the future)
+package PublicInbox::AdminEdit;
+use strict;
+use warnings;
+use PublicInbox::Admin;
+our @OPT = qw(all force|f verbose|v!);
+
+sub check_editable ($) {
+        my ($ibxs) = @_;
+
+        foreach my $ibx (@$ibxs) {
+                my $lvl = $ibx->{indexlevel};
+                if (defined $lvl) {
+                        PublicInbox::Admin::indexlevel_ok_or_die($lvl);
+                        next;
+                }
+
+                # Undefined indexlevel, so `full'...
+                # Search::Xapian exists and the DB can be read, at least, fine
+                $ibx->search and next;
+
+                # it's possible for a Xapian directory to exist,
+                # but Search::Xapian to go missing/broken.
+                # Make sure it's purged in that case:
+                $ibx->over or die "no over.sqlite3 in $ibx->{mainrepo}\n";
+
+                # $ibx->{search} is populated by $ibx->over call
+                my $xdir_ro = $ibx->{search}->xdir(1);
+                my $npart = 0;
+                foreach my $part (<$xdir_ro/*>) {
+                        if (-d $part && $part =~ m!/[0-9]+\z!) {
+                                my $bytes = 0;
+                                $bytes += -s $_ foreach glob("$part/*");
+                                $npart++ if $bytes;
+                        }
+                }
+                if ($npart) {
+                        PublicInbox::Admin::require_or_die('-search');
+                } else {
+                        # somebody could "rm -r" all the Xapian directories;
+                        # let them purge the overview, at least
+                        $ibx->{indexlevel} ||= 'basic';
+                }
+        }
+}
+
+# takes the output of V2Writable::purge or V2Writable::replace
+# $rewrites = [ array of commit OIDs indexed by epoch; undef if unchanged ]
+sub show_rewrites ($$$) {
+        my ($fh, $ibx, $rewrites) = @_;
+        print $fh "$ibx->{mainrepo}:";
+        if (scalar @$rewrites) {
+                my $epoch = -1;
+                my @out = map {;
+                        ++$epoch;
+                        "$epoch.git: ".(defined($_) ? $_ : '(unchanged)')
+                } @$rewrites;
+                print $fh join("\n\t", '', @out), "\n";
+        } else {
+                print $fh " NONE\n";
+        }
+}
+
+1;
diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm
index 82510b99..f5c7a95c 100644
--- a/lib/PublicInbox/Git.pm
+++ b/lib/PublicInbox/Git.pm
@@ -145,41 +145,24 @@ again:
                 fail($self, "Unexpected result from git cat-file: $head");
 
         my $size = $1;
-        my $ref_type = $ref ? ref($ref) : '';
-
         my $rv;
         my $left = $size;
-        $$ref = $size if ($ref_type eq 'SCALAR');
-        my $cb_err;
-
-        if ($ref_type eq 'CODE') {
-                $rv = eval { $ref->($in, \$left) };
-                $cb_err = $@;
-                # drain the rest
-                my $max = 8192;
-                while ($left > 0) {
-                        my $r = read($in, my $x, $left > $max ? $max : $left);
-                        defined($r) or fail($self, "read failed: $!");
-                        $r == 0 and fail($self, 'exited unexpectedly');
-                        $left -= $r;
-                }
-        } else {
-                my $offset = 0;
-                my $buf = '';
-                while ($left > 0) {
-                        my $r = read($in, $buf, $left, $offset);
-                        defined($r) or fail($self, "read failed: $!");
-                        $r == 0 and fail($self, 'exited unexpectedly');
-                        $left -= $r;
-                        $offset += $r;
-                }
-                $rv = \$buf;
+        $$ref = $size if $ref;
+
+        my $offset = 0;
+        my $buf = '';
+        while ($left > 0) {
+                my $r = read($in, $buf, $left, $offset);
+                defined($r) or fail($self, "read failed: $!");
+                $r == 0 and fail($self, 'exited unexpectedly');
+                $left -= $r;
+                $offset += $r;
         }
+        $rv = \$buf;
 
-        my $r = read($in, my $buf, 1);
+        my $r = read($in, my $lf, 1);
         defined($r) or fail($self, "read failed: $!");
-        fail($self, 'newline missing after blob') if ($r != 1 || $buf ne "\n");
-        die $cb_err if $cb_err;
+        fail($self, 'newline missing after blob') if ($r != 1 || $lf ne "\n");
 
         $rv;
 }
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 2c4bad92..137b2b78 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -277,7 +277,7 @@ sub git_timestamp {
         "$ts $zone";
 }
 
-sub extract_author_info ($) {
+sub extract_cmt_info ($) {
         my ($mime) = @_;
 
         my $sender = '';
@@ -314,7 +314,17 @@ sub extract_author_info ($) {
                 $name = '';
                 warn "no name in From: $from or Sender: $sender\n";
         }
-        ($name, $email);
+
+        my $hdr = $mime->header_obj;
+
+        my $subject = $hdr->header('Subject');
+        $subject = '(no subject)' unless defined $subject;
+        # MIME decoding can create NULs; replace them with spaces to protect git
+        $subject =~ tr/\0/ /;
+        utf8::encode($subject);
+        my $at = git_timestamp(my @at = msg_datestamp($hdr));
+        my $ct = git_timestamp(my @ct = msg_timestamp($hdr));
+        ($name, $email, $at, $ct, $subject);
 }
 
 # kill potentially confusing/misleading headers
@@ -361,19 +371,7 @@ sub clean_tree_v2 ($$$) {
 sub add {
         my ($self, $mime, $check_cb) = @_; # mime = Email::MIME
 
-        my ($name, $email) = extract_author_info($mime);
-        my $hdr = $mime->header_obj;
-        my @at = msg_datestamp($hdr);
-        my @ct = msg_timestamp($hdr);
-        my $author_time_raw = git_timestamp(@at);
-        my $commit_time_raw = git_timestamp(@ct);
-
-        my $subject = $mime->header('Subject');
-        $subject = '(no subject)' unless defined $subject;
-        # Mime decoding can create nulls replace them with spaces to protect git
-        $subject =~ tr/\0/ /;
-        utf8::encode($subject);
-
+        my ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime);
         my $path_type = $self->{path_type};
         my $path;
         if ($path_type eq '2/38') {
@@ -416,8 +414,8 @@ sub add {
         }
 
         print $w "commit $ref\nmark :$commit\n",
-                "author $name <$email> $author_time_raw\n",
-                "committer $self->{ident} $commit_time_raw\n" or wfail;
+                "author $name <$email> $at\n",
+                "committer $self->{ident} $ct\n" or wfail;
         print $w "data ", (length($subject) + 1), "\n",
                 $subject, "\n\n" or wfail;
         if ($tip ne '') {
@@ -486,33 +484,45 @@ sub digest2mid ($$) {
         "$dt.$b64" . '@z';
 }
 
-sub clean_purge_buffer {
-        my ($oids, $buf) = @_;
-        my $cmt_msg = 'purged '.join(' ',@$oids)."\n";
+sub rewrite_commit ($$$$) {
+        my ($self, $oids, $buf, $mime) = @_;
+        my ($name, $email, $at, $ct, $subject);
+        if ($mime) {
+                ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime);
+        } else {
+                $name = $email = '';
+                $subject = 'purged '.join(' ', @$oids);
+        }
         @$oids = ();
-
+        $subject .= "\n";
         foreach my $i (0..$#$buf) {
                 my $l = $buf->[$i];
                 if ($l =~ /^author .* ([0-9]+ [\+-]?[0-9]+)$/) {
-                        $buf->[$i] = "author <> $1\n";
+                        $at //= $1;
+                        $buf->[$i] = "author $name <$email> $at\n";
+                } elsif ($l =~ /^committer .* ([0-9]+ [\+-]?[0-9]+)$/) {
+                        $ct //= $1;
+                        $buf->[$i] = "committer $self->{ident} $ct\n";
                 } elsif ($l =~ /^data ([0-9]+)/) {
-                        $buf->[$i++] = "data " . length($cmt_msg) . "\n";
-                        $buf->[$i] = $cmt_msg;
+                        $buf->[$i++] = "data " . length($subject) . "\n";
+                        $buf->[$i] = $subject;
                         last;
                 }
         }
 }
 
-sub purge_oids {
-        my ($self, $purge) = @_;
-        my $tmp = "refs/heads/purge-".((keys %$purge)[0]);
+# returns the new commit OID if a replacement was done
+# returns undef if nothing was done
+sub replace_oids {
+        my ($self, $mime, $replace_map) = @_; # oid => raw string
+        my $tmp = "refs/heads/replace-".((keys %$replace_map)[0]);
         my $old = $self->{'ref'};
         my $git = $self->{git};
         my @export = (qw(fast-export --no-data --use-done-feature), $old);
         my $rd = $git->popen(@export);
         my ($r, $w) = $self->gfi_start;
         my @buf;
-        my $npurge = 0;
+        my $nreplace = 0;
         my @oids;
         my ($done, $mark);
         my $tree = $self->{-tree};
@@ -535,10 +545,13 @@ sub purge_oids {
                 } elsif (/^M 100644 ([a-f0-9]+) (\w+)/) {
                         my ($oid, $path) = ($1, $2);
                         $tree->{$path} = 1;
-                        if ($purge->{$oid}) {
+                        my $sref = $replace_map->{$oid};
+                        if (defined $sref) {
                                 push @oids, $oid;
-                                my $cmd = "M 100644 inline $path\ndata 0\n\n";
-                                push @buf, $cmd;
+                                my $n = length($$sref);
+                                push @buf, "M 100644 inline $path\ndata $n\n";
+                                push @buf, $$sref; # hope CoW works...
+                                push @buf, "\n";
                         } else {
                                 push @buf, $_;
                         }
@@ -547,11 +560,13 @@ sub purge_oids {
                         push @buf, $_ if $tree->{$path};
                 } elsif ($_ eq "\n") {
                         if (@oids) {
-                                my $out = join('', @buf);
-                                $out =~ s/^/# /sgm;
-                                warn "purge rewriting\n", $out, "\n";
-                                clean_purge_buffer(\@oids, \@buf);
-                                $npurge++;
+                                if (!$mime) {
+                                        my $out = join('', @buf);
+                                        $out =~ s/^/# /sgm;
+                                        warn "purge rewriting\n", $out, "\n";
+                                }
+                                rewrite_commit($self, \@oids, \@buf, $mime);
+                                $nreplace++;
                         }
                         $w->print(@buf, "\n") or wfail;
                         @buf = ();
@@ -569,28 +584,30 @@ sub purge_oids {
                 $w->print(@buf) or wfail;
         }
         die 'done\n not seen from fast-export' unless $done;
-        chomp(my $cmt = $self->get_mark(":$mark")) if $npurge;
+        chomp(my $cmt = $self->get_mark(":$mark")) if $nreplace;
         $self->{nchg} = 0; # prevent _update_git_info until update-ref:
         $self->done;
         my @git = ('git', "--git-dir=$git->{git_dir}");
 
-        run_die([@git, qw(update-ref), $old, $tmp]) if $npurge;
+        run_die([@git, qw(update-ref), $old, $tmp]) if $nreplace;
 
         run_die([@git, qw(update-ref -d), $tmp]);
 
-        return if $npurge == 0;
+        return if $nreplace == 0;
 
         run_die([@git, qw(-c gc.reflogExpire=now gc --prune=all)]);
+
+        # check that old OIDs are gone
         my $err = 0;
-        foreach my $oid (keys %$purge) {
+        foreach my $oid (keys %$replace_map) {
                 my @info = $git->check($oid);
                 if (@info) {
-                        warn "$oid not purged\n";
+                        warn "$oid not replaced\n";
                         $err++;
                 }
         }
         _update_git_info($self, 0);
-        die "Failed to purge $err object(s)\n" if $err;
+        die "Failed to replace $err object(s)\n" if $err;
         $cmt;
 }
 
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index be80560f..8a31b910 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -434,6 +434,26 @@ sub xref ($$$$) {
 sub set_nntp_headers ($$$$$) {
         my ($self, $hdr, $ng, $n, $mid) = @_;
 
+        # why? leafnode requires a Path: header for some inexplicable
+        # reason.  We'll fake the shortest one possible.
+        $hdr->header_set('Path', 'y');
+
+        # leafnode (and maybe other NNTP clients) have trouble dealing
+        # with v2 messages which have multiple Message-IDs (either due
+        # to our own content-based dedupe or buggy git-send-email versions).
+        my @mids = $hdr->header('Message-ID');
+        if (scalar(@mids) > 1) {
+                my $mid0 = "<$mid>";
+                $hdr->header_set('Message-ID', $mid0);
+                my @alt = $hdr->header('X-Alt-Message-ID');
+                my %seen = map { $_ => 1 } (@alt, $mid0);
+                foreach my $m (@mids) {
+                        next if $seen{$m}++;
+                        push @alt, $m;
+                }
+                $hdr->header_set('X-Alt-Message-ID', @alt);
+        }
+
         # clobber some
         my $xref = xref($self, $ng, $n, $mid);
         $hdr->header_set('Xref', $xref);
@@ -515,6 +535,13 @@ sub _header ($) {
         my $hdr = $_[0]->header_obj->as_string;
         utf8::encode($hdr);
         $hdr =~ s/(?<!\r)\n/\r\n/sg;
+
+        # for leafnode compatibility, we need to ensure Message-ID headers
+        # are only a single line.  We can't subclass Email::Simple::Header
+        # and override _default_fold_at in here, either; since that won't
+        # affect messages already in the archive.
+        $hdr =~ s/^(Message-ID:)[ \t]*\r\n[ \t]+([^\r]+)\r\n/$1 $2\r\n/igsm;
+
         $hdr
 }
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 99856286..7cd67f12 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -117,7 +117,11 @@ sub _xdb_acquire {
                 }
         }
         return unless defined $flag;
-        $self->{xdb} = Search::Xapian::WritableDatabase->new($dir, $flag);
+        my $xdb = eval { Search::Xapian::WritableDatabase->new($dir, $flag) };
+        if ($@) {
+                die "Failed opening $dir: ", $@;
+        }
+        $self->{xdb} = $xdb;
 }
 
 sub add_val ($$$) {
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 5f3c8af8..96a26b15 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -25,12 +25,6 @@ sub wrap {
         bless { mid => $mid }, $class;
 }
 
-sub get {
-        my ($class, $head, $db, $mid) = @_;
-        my $doc_id = $head->get_docid;
-        load_expand(wrap($class, $mid), $db->get_document($doc_id));
-}
-
 sub get_val ($$) {
         my ($doc, $col) = @_;
         Search::Xapian::sortable_unserialise($doc->get_value($col));
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index a8c33ef4..09ed4e7b 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -11,7 +11,7 @@ use PublicInbox::SearchIdxPart;
 use PublicInbox::MIME;
 use PublicInbox::Git;
 use PublicInbox::Import;
-use PublicInbox::MID qw(mids);
+use PublicInbox::MID qw(mids references);
 use PublicInbox::ContentId qw(content_id content_digest);
 use PublicInbox::Inbox;
 use PublicInbox::OverIdx;
@@ -116,6 +116,18 @@ sub add {
         });
 }
 
+# indexes a message, returns true if checkpointing is needed
+sub do_idx ($$$$$$$) {
+        my ($self, $msgref, $mime, $len, $num, $oid, $mid0) = @_;
+        $self->{over}->add_overview($mime, $len, $num, $oid, $mid0);
+        my $npart = $self->{partitions};
+        my $part = $num % $npart;
+        my $idx = idx_part($self, $part);
+        $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime);
+        my $n = $self->{transact_bytes} += $len;
+        $n >= (PublicInbox::SearchIdx::BATCH_BYTES * $npart);
+}
+
 sub _add {
         my ($self, $mime, $check_cb) = @_;
 
@@ -141,13 +153,7 @@ sub _add {
         $self->{last_commit}->[$self->{epoch_max}] = $cmt;
 
         my ($oid, $len, $msgref) = @{$im->{last_object}};
-        $self->{over}->add_overview($mime, $len, $num, $oid, $mid0);
-        my $nparts = $self->{partitions};
-        my $part = $num % $nparts;
-        my $idx = $self->idx_part($part);
-        $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime);
-        my $n = $self->{transact_bytes} += $len;
-        if ($n > (PublicInbox::SearchIdx::BATCH_BYTES * $nparts)) {
+        if (do_idx($self, $msgref, $mime, $len, $num, $oid, $mid0)) {
                 $self->checkpoint;
         }
 
@@ -291,26 +297,30 @@ sub idx_init {
         });
 }
 
-sub purge_oids ($$) {
-        my ($self, $purge) = @_; # $purge = { $object_id => 1, ... }
+# returns an array ref indexed by epoch: [ epoch => latest_commit ]
+# latest_commit may be undef if nothing was done to that epoch
+# $replace_map = { $object_id => $strref, ... }
+sub _replace_oids ($$$) {
+        my ($self, $mime, $replace_map) = @_;
         $self->done;
         my $pfx = "$self->{-inbox}->{mainrepo}/git";
-        my $purges = [];
+        my $rewrites = []; # epoch => commit
         my $max = $self->{epoch_max};
 
         unless (defined($max)) {
                 defined(my $latest = git_dir_latest($self, \$max)) or return;
                 $self->{epoch_max} = $max;
         }
+
         foreach my $i (0..$max) {
                 my $git_dir = "$pfx/$i.git";
                 -d $git_dir or next;
                 my $git = PublicInbox::Git->new($git_dir);
                 my $im = $self->import_init($git, 0, 1);
-                $purges->[$i] = $im->purge_oids($purge);
+                $rewrites->[$i] = $im->replace_oids($mime, $replace_map);
                 $im->done;
         }
-        $purges;
+        $rewrites;
 }
 
 sub content_ids ($) {
@@ -333,25 +343,31 @@ sub content_matches ($$) {
         0
 }
 
-sub remove_internal ($$$$) {
-        my ($self, $mime, $cmt_msg, $purge) = @_;
+# used for removing or replacing (purging)
+sub rewrite_internal ($$;$$$) {
+        my ($self, $old_mime, $cmt_msg, $new_mime, $sref) = @_;
         $self->idx_init;
-        my $im = $self->importer unless $purge;
+        my ($im, $need_reindex, $replace_map);
+        if ($sref) {
+                $replace_map = {}; # oid => sref
+                $need_reindex = [] if $new_mime;
+        } else {
+                $im = $self->importer;
+        }
         my $over = $self->{over};
-        my $cids = content_ids($mime);
+        my $cids = content_ids($old_mime);
         my $parts = $self->{idx_parts};
-        my $mm = $self->{mm};
         my $removed;
-        my $mids = mids($mime->header_obj);
+        my $mids = mids($old_mime->header_obj);
 
         # We avoid introducing new blobs into git since the raw content
         # can be slightly different, so we do not need the user-supplied
         # message now that we have the mids and content_id
-        $mime = undef;
+        $old_mime = undef;
         my $mark;
 
         foreach my $mid (@$mids) {
-                my %gone;
+                my %gone; # num => [ smsg, raw ]
                 my ($id, $prev);
                 while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) {
                         my $msg = get_blob($self, $smsg);
@@ -374,17 +390,21 @@ sub remove_internal ($$$$) {
                 }
                 foreach my $num (keys %gone) {
                         my ($smsg, $orig) = @{$gone{$num}};
-                        $mm->num_delete($num);
                         # $removed should only be set once assuming
                         # no bugs in our deduplication code:
                         $removed = $smsg;
                         my $oid = $smsg->{blob};
-                        if ($purge) {
-                                $purge->{$oid} = 1;
+                        if ($replace_map) {
+                                $replace_map->{$oid} = $sref;
                         } else {
                                 ($mark, undef) = $im->remove($orig, $cmt_msg);
                         }
                         $orig = undef;
+                        if ($need_reindex) { # ->replace
+                                push @$need_reindex, $smsg;
+                        } else { # ->purge or ->remove
+                                $self->{mm}->num_delete($num);
+                        }
                         unindex_oid_remote($self, $oid, $mid);
                 }
         }
@@ -393,8 +413,9 @@ sub remove_internal ($$$$) {
                 my $cmt = $im->get_mark($mark);
                 $self->{last_commit}->[$self->{epoch_max}] = $cmt;
         }
-        if ($purge && scalar keys %$purge) {
-                return purge_oids($self, $purge);
+        if ($replace_map && scalar keys %$replace_map) {
+                my $rewrites = _replace_oids($self, $new_mime, $replace_map);
+                return { rewrites => $rewrites, need_reindex => $need_reindex };
         }
         $removed;
 }
@@ -403,22 +424,125 @@ sub remove_internal ($$$$) {
 sub remove {
         my ($self, $mime, $cmt_msg) = @_;
         $self->{-inbox}->with_umask(sub {
-                remove_internal($self, $mime, $cmt_msg, undef);
+                rewrite_internal($self, $mime, $cmt_msg);
         });
 }
 
+sub _replace ($$;$$) {
+        my ($self, $old_mime, $new_mime, $sref) = @_;
+        my $rewritten = $self->{-inbox}->with_umask(sub {
+                rewrite_internal($self, $old_mime, undef, $new_mime, $sref);
+        }) or return;
+
+        my $rewrites = $rewritten->{rewrites};
+        # ->done is called if there are rewrites since we gc+prune from git
+        $self->idx_init if @$rewrites;
+
+        for my $i (0..$#$rewrites) {
+                defined(my $cmt = $rewrites->[$i]) or next;
+                $self->{last_commit}->[$i] = $cmt;
+        }
+        $rewritten;
+}
+
 # public
 sub purge {
         my ($self, $mime) = @_;
-        my $purges = $self->{-inbox}->with_umask(sub {
-                remove_internal($self, $mime, undef, {});
-        }) or return;
-        $self->idx_init if @$purges; # ->done is called on purges
-        for my $i (0..$#$purges) {
-                defined(my $cmt = $purges->[$i]) or next;
-                $self->{last_commit}->[$i] = $cmt;
+        my $rewritten = _replace($self, $mime, undef, \'') or return;
+        $rewritten->{rewrites}
+}
+
+# returns the git object_id of $$raw, does not write the object to FS
+sub git_hash_raw ($$) {
+        my ($self, $raw) = @_;
+        # grab the expected OID we have to reindex:
+        open my $tmp_fh, '+>', undef or die "failed to open tmp: $!";
+        $tmp_fh->autoflush(1);
+        print $tmp_fh $$raw or die "print \$tmp_fh: $!";
+        sysseek($tmp_fh, 0, 0) or die "seek failed: $!";
+
+        my ($r, $w);
+        pipe($r, $w) or die "failed to create pipe: $!";
+        my $rdr = { 0 => fileno($tmp_fh), 1 => fileno($w) };
+        my $git_dir = $self->{-inbox}->git->{git_dir};
+        my $cmd = ['git', "--git-dir=$git_dir", qw(hash-object --stdin)];
+        my $pid = spawn($cmd, undef, $rdr);
+        close $w;
+        local $/ = "\n";
+        chomp(my $oid = <$r>);
+        waitpid($pid, 0) == $pid or die "git hash-object did not finish";
+        die "git hash-object failed: $?" if $?;
+        $oid =~ /\A[a-f0-9]{40}\z/ or die "OID not expected: $oid";
+        $oid;
+}
+
+sub _check_mids_match ($$$) {
+        my ($old_list, $new_list, $hdrs) = @_;
+        my %old_mids = map { $_ => 1 } @$old_list;
+        my %new_mids = map { $_ => 1 } @$new_list;
+        my @old = keys %old_mids;
+        my @new = keys %new_mids;
+        my $err = "$hdrs may not be changed when replacing\n";
+        die $err if scalar(@old) != scalar(@new);
+        delete @new_mids{@old};
+        delete @old_mids{@new};
+        die $err if (scalar(keys %old_mids) || scalar(keys %new_mids));
+}
+
+# Changing Message-IDs or References with ->replace isn't supported.
+# The rules for dealing with messages with multiple or conflicting
+# Message-IDs are pretty complex and rethreading hasn't been fully
+# implemented, yet.
+sub check_mids_match ($$) {
+        my ($old_mime, $new_mime) = @_;
+        my $old = $old_mime->header_obj;
+        my $new = $new_mime->header_obj;
+        _check_mids_match(mids($old), mids($new), 'Message-ID(s)');
+        _check_mids_match(references($old), references($new),
+                        'References/In-Reply-To');
+}
+
+# public
+sub replace ($$$) {
+        my ($self, $old_mime, $new_mime) = @_;
+
+        check_mids_match($old_mime, $new_mime);
+
+        # mutt will always add Content-Length:, Status:, Lines: when editing
+        PublicInbox::Import::drop_unwanted_headers($new_mime);
+
+        my $raw = $new_mime->as_string;
+        my $expect_oid = git_hash_raw($self, \$raw);
+        my $rewritten = _replace($self, $old_mime, $new_mime, \$raw) or return;
+        my $need_reindex = $rewritten->{need_reindex};
+
+        # just in case we have bugs in deduplication code:
+        my $n = scalar(@$need_reindex);
+        if ($n > 1) {
+                my $list = join(', ', map {
+                                        "$_->{num}: <$_->{mid}>"
+                                } @$need_reindex);
+                warn <<"";
+W: rewritten $n messages matching content of original message (expected: 1).
+W: possible bug in public-inbox, NNTP article IDs and Message-IDs follow:
+W: $list
+
+        }
+
+        # make sure we really got the OID:
+        my ($oid, $type, $len) = $self->{-inbox}->git->check($expect_oid);
+        $oid eq $expect_oid or die "BUG: $expect_oid not found after replace";
+
+        # don't leak FDs to Xapian:
+        $self->{-inbox}->git->cleanup;
+
+        # reindex modified messages:
+        for my $smsg (@$need_reindex) {
+                my $num = $smsg->{num};
+                my $mid0 = $smsg->{mid};
+                do_idx($self, \$raw, $new_mime, $len, $num, $oid, $mid0);
         }
-        $purges;
+        $rewritten->{rewrites};
 }
 
 sub last_commit_part ($$;$) {
@@ -772,15 +896,8 @@ sub reindex_oid ($$$$) {
         }
         $sync->{mm_tmp}->mid_delete($mid0) or
                 die "failed to delete <$mid0> for article #$num\n";
-
-        $self->{over}->add_overview($mime, $len, $num, $oid, $mid0);
-        my $nparts = $self->{partitions};
-        my $part = $num % $nparts;
-        my $idx = $self->idx_part($part);
-        $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime);
-        my $n = $self->{transact_bytes} += $len;
         $sync->{nr}++;
-        if ($n > (PublicInbox::SearchIdx::BATCH_BYTES * $nparts)) {
+        if (do_idx($self, $msgref, $mime, $len, $num, $oid, $mid0)) {
                 $git->cleanup;
                 $sync->{mm_tmp}->atfork_prepare;
                 $self->done; # release lock