diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/PublicInbox/Admin.pm | 75 | ||||
-rw-r--r-- | lib/PublicInbox/AdminEdit.pm | 67 | ||||
-rw-r--r-- | lib/PublicInbox/Git.pm | 43 | ||||
-rw-r--r-- | lib/PublicInbox/Import.pm | 101 | ||||
-rw-r--r-- | lib/PublicInbox/NNTP.pm | 27 | ||||
-rw-r--r-- | lib/PublicInbox/SearchIdx.pm | 6 | ||||
-rw-r--r-- | lib/PublicInbox/SearchMsg.pm | 6 | ||||
-rw-r--r-- | lib/PublicInbox/V2Writable.pm | 203 |
8 files changed, 384 insertions, 144 deletions
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index 4a862c6d..8a2f2043 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -9,6 +9,8 @@ use warnings; use Cwd 'abs_path'; use base qw(Exporter); our @EXPORT_OK = qw(resolve_repo_dir); +my $CFG; # all the admin stuff is a singleton +require PublicInbox::Config; sub resolve_repo_dir { my ($cd, $ver) = @_; @@ -66,36 +68,65 @@ $ibx->{mainrepo} has unexpected indexlevel in Xapian: $m $l; } -sub resolve_inboxes { - my ($argv, $warn_on_unconfigured) = @_; - require PublicInbox::Config; +sub unconfigured_ibx ($$) { + my ($dir, $i) = @_; + my $name = "unconfigured-$i"; + PublicInbox::Inbox->new({ + name => $name, + address => [ "$name\@example.com" ], + mainrepo => $dir, + # TODO: consumers may want to warn on this: + #-unconfigured => 1, + }); +} + +sub config () { $CFG //= eval { PublicInbox::Config->new } } + +sub resolve_inboxes ($;$) { + my ($argv, $opt) = @_; require PublicInbox::Inbox; + $opt ||= {}; - my @ibxs = map { resolve_repo_dir($_) } @$argv; - push(@ibxs, resolve_repo_dir()) unless @ibxs; + my $cfg = config(); + if ($opt->{all}) { + my $cfgfile = PublicInbox::Config::default_file(); + $cfg or die "--all specified, but $cfgfile not readable\n"; + @$argv and die "--all specified, but directories specified\n"; + } + my $min_ver = $opt->{-min_inbox_version} || 0; + my (@old, @ibxs); my %dir2ibx; - if (my $config = eval { PublicInbox::Config->new }) { - $config->each_inbox(sub { + if ($cfg) { + $cfg->each_inbox(sub { my ($ibx) = @_; + $ibx->{version} ||= 1; $dir2ibx{abs_path($ibx->{mainrepo})} = $ibx; }); - } elsif ($warn_on_unconfigured) { - # do we really care about this? It's annoying... - warn $warn_on_unconfigured, "\n"; } - for my $i (0..$#ibxs) { - my $dir = $ibxs[$i]; - $ibxs[$i] = $dir2ibx{$dir} ||= do { - my $name = "unconfigured-$i"; - PublicInbox::Inbox->new({ - name => $name, - address => [ "$name\@example.com" ], - mainrepo => $dir, - # TODO: consumers may want to warn on this: - #-unconfigured => 1, - }); - }; + if ($opt->{all}) { + my @all = values %dir2ibx; + @all = grep { $_->{version} >= $min_ver } @all; + push @ibxs, @all; + } else { # directories specified on the command-line + my $i = 0; + my @dirs = @$argv; + push @dirs, '.' unless @dirs; + foreach (@dirs) { + my $v; + my $dir = resolve_repo_dir($_, \$v); + if ($v < $min_ver) { + push @old, $dir; + next; + } + my $ibx = $dir2ibx{$dir} ||= unconfigured_ibx($dir, $i); + $i++; + push @ibxs, $ibx; + } + } + if (@old) { + die "inboxes $min_ver inboxes not supported by $0\n\t", + join("\n\t", @old), "\n"; } @ibxs; } diff --git a/lib/PublicInbox/AdminEdit.pm b/lib/PublicInbox/AdminEdit.pm new file mode 100644 index 00000000..169feba0 --- /dev/null +++ b/lib/PublicInbox/AdminEdit.pm @@ -0,0 +1,67 @@ +# Copyright (C) 2019 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# common stuff between -edit, -purge (and maybe -learn in the future) +package PublicInbox::AdminEdit; +use strict; +use warnings; +use PublicInbox::Admin; +our @OPT = qw(all force|f verbose|v!); + +sub check_editable ($) { + my ($ibxs) = @_; + + foreach my $ibx (@$ibxs) { + my $lvl = $ibx->{indexlevel}; + if (defined $lvl) { + PublicInbox::Admin::indexlevel_ok_or_die($lvl); + next; + } + + # Undefined indexlevel, so `full'... + # Search::Xapian exists and the DB can be read, at least, fine + $ibx->search and next; + + # it's possible for a Xapian directory to exist, + # but Search::Xapian to go missing/broken. + # Make sure it's purged in that case: + $ibx->over or die "no over.sqlite3 in $ibx->{mainrepo}\n"; + + # $ibx->{search} is populated by $ibx->over call + my $xdir_ro = $ibx->{search}->xdir(1); + my $npart = 0; + foreach my $part (<$xdir_ro/*>) { + if (-d $part && $part =~ m!/[0-9]+\z!) { + my $bytes = 0; + $bytes += -s $_ foreach glob("$part/*"); + $npart++ if $bytes; + } + } + if ($npart) { + PublicInbox::Admin::require_or_die('-search'); + } else { + # somebody could "rm -r" all the Xapian directories; + # let them purge the overview, at least + $ibx->{indexlevel} ||= 'basic'; + } + } +} + +# takes the output of V2Writable::purge and V2Writable::replace +# $rewrites = [ array commits keyed by epoch ] +sub show_rewrites ($$$) { + my ($fh, $ibx, $rewrites) = @_; + print $fh "$ibx->{mainrepo}:"; + if (scalar @$rewrites) { + my $epoch = -1; + my @out = map {; + ++$epoch; + "$epoch.git: ".(defined($_) ? $_ : '(unchanged)') + } @$rewrites; + print $fh join("\n\t", '', @out), "\n"; + } else { + print $fh " NONE\n"; + } +} + +1; diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index 82510b99..f5c7a95c 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -145,41 +145,24 @@ again: fail($self, "Unexpected result from git cat-file: $head"); my $size = $1; - my $ref_type = $ref ? ref($ref) : ''; - my $rv; my $left = $size; - $$ref = $size if ($ref_type eq 'SCALAR'); - my $cb_err; - - if ($ref_type eq 'CODE') { - $rv = eval { $ref->($in, \$left) }; - $cb_err = $@; - # drain the rest - my $max = 8192; - while ($left > 0) { - my $r = read($in, my $x, $left > $max ? $max : $left); - defined($r) or fail($self, "read failed: $!"); - $r == 0 and fail($self, 'exited unexpectedly'); - $left -= $r; - } - } else { - my $offset = 0; - my $buf = ''; - while ($left > 0) { - my $r = read($in, $buf, $left, $offset); - defined($r) or fail($self, "read failed: $!"); - $r == 0 and fail($self, 'exited unexpectedly'); - $left -= $r; - $offset += $r; - } - $rv = \$buf; + $$ref = $size if $ref; + + my $offset = 0; + my $buf = ''; + while ($left > 0) { + my $r = read($in, $buf, $left, $offset); + defined($r) or fail($self, "read failed: $!"); + $r == 0 and fail($self, 'exited unexpectedly'); + $left -= $r; + $offset += $r; } + $rv = \$buf; - my $r = read($in, my $buf, 1); + my $r = read($in, my $lf, 1); defined($r) or fail($self, "read failed: $!"); - fail($self, 'newline missing after blob') if ($r != 1 || $buf ne "\n"); - die $cb_err if $cb_err; + fail($self, 'newline missing after blob') if ($r != 1 || $lf ne "\n"); $rv; } diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 2c4bad92..137b2b78 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -277,7 +277,7 @@ sub git_timestamp { "$ts $zone"; } -sub extract_author_info ($) { +sub extract_cmt_info ($) { my ($mime) = @_; my $sender = ''; @@ -314,7 +314,17 @@ sub extract_author_info ($) { $name = ''; warn "no name in From: $from or Sender: $sender\n"; } - ($name, $email); + + my $hdr = $mime->header_obj; + + my $subject = $hdr->header('Subject'); + $subject = '(no subject)' unless defined $subject; + # Mime decoding can create nulls replace them with spaces to protect git + $subject =~ tr/\0/ /; + utf8::encode($subject); + my $at = git_timestamp(my @at = msg_datestamp($hdr)); + my $ct = git_timestamp(my @ct = msg_timestamp($hdr)); + ($name, $email, $at, $ct, $subject); } # kill potentially confusing/misleading headers @@ -361,19 +371,7 @@ sub clean_tree_v2 ($$$) { sub add { my ($self, $mime, $check_cb) = @_; # mime = Email::MIME - my ($name, $email) = extract_author_info($mime); - my $hdr = $mime->header_obj; - my @at = msg_datestamp($hdr); - my @ct = msg_timestamp($hdr); - my $author_time_raw = git_timestamp(@at); - my $commit_time_raw = git_timestamp(@ct); - - my $subject = $mime->header('Subject'); - $subject = '(no subject)' unless defined $subject; - # Mime decoding can create nulls replace them with spaces to protect git - $subject =~ tr/\0/ /; - utf8::encode($subject); - + my ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime); my $path_type = $self->{path_type}; my $path; if ($path_type eq '2/38') { @@ -416,8 +414,8 @@ sub add { } print $w "commit $ref\nmark :$commit\n", - "author $name <$email> $author_time_raw\n", - "committer $self->{ident} $commit_time_raw\n" or wfail; + "author $name <$email> $at\n", + "committer $self->{ident} $ct\n" or wfail; print $w "data ", (length($subject) + 1), "\n", $subject, "\n\n" or wfail; if ($tip ne '') { @@ -486,33 +484,45 @@ sub digest2mid ($$) { "$dt.$b64" . '@z'; } -sub clean_purge_buffer { - my ($oids, $buf) = @_; - my $cmt_msg = 'purged '.join(' ',@$oids)."\n"; +sub rewrite_commit ($$$$) { + my ($self, $oids, $buf, $mime) = @_; + my ($name, $email, $at, $ct, $subject); + if ($mime) { + ($name, $email, $at, $ct, $subject) = extract_cmt_info($mime); + } else { + $name = $email = ''; + $subject = 'purged '.join(' ', @$oids); + } @$oids = (); - + $subject .= "\n"; foreach my $i (0..$#$buf) { my $l = $buf->[$i]; if ($l =~ /^author .* ([0-9]+ [\+-]?[0-9]+)$/) { - $buf->[$i] = "author <> $1\n"; + $at //= $1; + $buf->[$i] = "author $name <$email> $at\n"; + } elsif ($l =~ /^committer .* ([0-9]+ [\+-]?[0-9]+)$/) { + $ct //= $1; + $buf->[$i] = "committer $self->{ident} $ct\n"; } elsif ($l =~ /^data ([0-9]+)/) { - $buf->[$i++] = "data " . length($cmt_msg) . "\n"; - $buf->[$i] = $cmt_msg; + $buf->[$i++] = "data " . length($subject) . "\n"; + $buf->[$i] = $subject; last; } } } -sub purge_oids { - my ($self, $purge) = @_; - my $tmp = "refs/heads/purge-".((keys %$purge)[0]); +# returns the new commit OID if a replacement was done +# returns undef if nothing was done +sub replace_oids { + my ($self, $mime, $replace_map) = @_; # oid => raw string + my $tmp = "refs/heads/replace-".((keys %$replace_map)[0]); my $old = $self->{'ref'}; my $git = $self->{git}; my @export = (qw(fast-export --no-data --use-done-feature), $old); my $rd = $git->popen(@export); my ($r, $w) = $self->gfi_start; my @buf; - my $npurge = 0; + my $nreplace = 0; my @oids; my ($done, $mark); my $tree = $self->{-tree}; @@ -535,10 +545,13 @@ sub purge_oids { } elsif (/^M 100644 ([a-f0-9]+) (\w+)/) { my ($oid, $path) = ($1, $2); $tree->{$path} = 1; - if ($purge->{$oid}) { + my $sref = $replace_map->{$oid}; + if (defined $sref) { push @oids, $oid; - my $cmd = "M 100644 inline $path\ndata 0\n\n"; - push @buf, $cmd; + my $n = length($$sref); + push @buf, "M 100644 inline $path\ndata $n\n"; + push @buf, $$sref; # hope CoW works... + push @buf, "\n"; } else { push @buf, $_; } @@ -547,11 +560,13 @@ sub purge_oids { push @buf, $_ if $tree->{$path}; } elsif ($_ eq "\n") { if (@oids) { - my $out = join('', @buf); - $out =~ s/^/# /sgm; - warn "purge rewriting\n", $out, "\n"; - clean_purge_buffer(\@oids, \@buf); - $npurge++; + if (!$mime) { + my $out = join('', @buf); + $out =~ s/^/# /sgm; + warn "purge rewriting\n", $out, "\n"; + } + rewrite_commit($self, \@oids, \@buf, $mime); + $nreplace++; } $w->print(@buf, "\n") or wfail; @buf = (); @@ -569,28 +584,30 @@ sub purge_oids { $w->print(@buf) or wfail; } die 'done\n not seen from fast-export' unless $done; - chomp(my $cmt = $self->get_mark(":$mark")) if $npurge; + chomp(my $cmt = $self->get_mark(":$mark")) if $nreplace; $self->{nchg} = 0; # prevent _update_git_info until update-ref: $self->done; my @git = ('git', "--git-dir=$git->{git_dir}"); - run_die([@git, qw(update-ref), $old, $tmp]) if $npurge; + run_die([@git, qw(update-ref), $old, $tmp]) if $nreplace; run_die([@git, qw(update-ref -d), $tmp]); - return if $npurge == 0; + return if $nreplace == 0; run_die([@git, qw(-c gc.reflogExpire=now gc --prune=all)]); + + # check that old OIDs are gone my $err = 0; - foreach my $oid (keys %$purge) { + foreach my $oid (keys %$replace_map) { my @info = $git->check($oid); if (@info) { - warn "$oid not purged\n"; + warn "$oid not replaced\n"; $err++; } } _update_git_info($self, 0); - die "Failed to purge $err object(s)\n" if $err; + die "Failed to replace $err object(s)\n" if $err; $cmt; } diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index be80560f..8a31b910 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -434,6 +434,26 @@ sub xref ($$$$) { sub set_nntp_headers ($$$$$) { my ($self, $hdr, $ng, $n, $mid) = @_; + # why? leafnode requires a Path: header for some inexplicable + # reason. We'll fake the shortest one possible. + $hdr->header_set('Path', 'y'); + + # leafnode (and maybe other NNTP clients) have trouble dealing + # with v2 messages which have multiple Message-IDs (either due + # to our own content-based dedupe or buggy git-send-email versions). + my @mids = $hdr->header('Message-ID'); + if (scalar(@mids) > 1) { + my $mid0 = "<$mid>"; + $hdr->header_set('Message-ID', $mid0); + my @alt = $hdr->header('X-Alt-Message-ID'); + my %seen = map { $_ => 1 } (@alt, $mid0); + foreach my $m (@mids) { + next if $seen{$m}++; + push @alt, $m; + } + $hdr->header_set('X-Alt-Message-ID', @alt); + } + # clobber some my $xref = xref($self, $ng, $n, $mid); $hdr->header_set('Xref', $xref); @@ -515,6 +535,13 @@ sub _header ($) { my $hdr = $_[0]->header_obj->as_string; utf8::encode($hdr); $hdr =~ s/(?<!\r)\n/\r\n/sg; + + # for leafnode compatibility, we need to ensure Message-ID headers + # are only a single line. We can't subclass Email::Simple::Header + # and override _default_fold_at in here, either; since that won't + # affect messages already in the archive. + $hdr =~ s/^(Message-ID:)[ \t]*\r\n[ \t]+([^\r]+)\r\n/$1 $2\r\n/igsm; + $hdr } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 99856286..7cd67f12 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -117,7 +117,11 @@ sub _xdb_acquire { } } return unless defined $flag; - $self->{xdb} = Search::Xapian::WritableDatabase->new($dir, $flag); + my $xdb = eval { Search::Xapian::WritableDatabase->new($dir, $flag) }; + if ($@) { + die "Failed opening $dir: ", $@; + } + $self->{xdb} = $xdb; } sub add_val ($$$) { diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 5f3c8af8..96a26b15 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -25,12 +25,6 @@ sub wrap { bless { mid => $mid }, $class; } -sub get { - my ($class, $head, $db, $mid) = @_; - my $doc_id = $head->get_docid; - load_expand(wrap($class, $mid), $db->get_document($doc_id)); -} - sub get_val ($$) { my ($doc, $col) = @_; Search::Xapian::sortable_unserialise($doc->get_value($col)); diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index a8c33ef4..09ed4e7b 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -11,7 +11,7 @@ use PublicInbox::SearchIdxPart; use PublicInbox::MIME; use PublicInbox::Git; use PublicInbox::Import; -use PublicInbox::MID qw(mids); +use PublicInbox::MID qw(mids references); use PublicInbox::ContentId qw(content_id content_digest); use PublicInbox::Inbox; use PublicInbox::OverIdx; @@ -116,6 +116,18 @@ sub add { }); } +# indexes a message, returns true if checkpointing is needed +sub do_idx ($$$$$$$) { + my ($self, $msgref, $mime, $len, $num, $oid, $mid0) = @_; + $self->{over}->add_overview($mime, $len, $num, $oid, $mid0); + my $npart = $self->{partitions}; + my $part = $num % $npart; + my $idx = idx_part($self, $part); + $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime); + my $n = $self->{transact_bytes} += $len; + $n >= (PublicInbox::SearchIdx::BATCH_BYTES * $npart); +} + sub _add { my ($self, $mime, $check_cb) = @_; @@ -141,13 +153,7 @@ sub _add { $self->{last_commit}->[$self->{epoch_max}] = $cmt; my ($oid, $len, $msgref) = @{$im->{last_object}}; - $self->{over}->add_overview($mime, $len, $num, $oid, $mid0); - my $nparts = $self->{partitions}; - my $part = $num % $nparts; - my $idx = $self->idx_part($part); - $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime); - my $n = $self->{transact_bytes} += $len; - if ($n > (PublicInbox::SearchIdx::BATCH_BYTES * $nparts)) { + if (do_idx($self, $msgref, $mime, $len, $num, $oid, $mid0)) { $self->checkpoint; } @@ -291,26 +297,30 @@ sub idx_init { }); } -sub purge_oids ($$) { - my ($self, $purge) = @_; # $purge = { $object_id => 1, ... } +# returns an array mapping [ epoch => latest_commit ] +# latest_commit may be undef if nothing was done to that epoch +# $replace_map = { $object_id => $strref, ... } +sub _replace_oids ($$$) { + my ($self, $mime, $replace_map) = @_; $self->done; my $pfx = "$self->{-inbox}->{mainrepo}/git"; - my $purges = []; + my $rewrites = []; # epoch => commit my $max = $self->{epoch_max}; unless (defined($max)) { defined(my $latest = git_dir_latest($self, \$max)) or return; $self->{epoch_max} = $max; } + foreach my $i (0..$max) { my $git_dir = "$pfx/$i.git"; -d $git_dir or next; my $git = PublicInbox::Git->new($git_dir); my $im = $self->import_init($git, 0, 1); - $purges->[$i] = $im->purge_oids($purge); + $rewrites->[$i] = $im->replace_oids($mime, $replace_map); $im->done; } - $purges; + $rewrites; } sub content_ids ($) { @@ -333,25 +343,31 @@ sub content_matches ($$) { 0 } -sub remove_internal ($$$$) { - my ($self, $mime, $cmt_msg, $purge) = @_; +# used for removing or replacing (purging) +sub rewrite_internal ($$;$$$) { + my ($self, $old_mime, $cmt_msg, $new_mime, $sref) = @_; $self->idx_init; - my $im = $self->importer unless $purge; + my ($im, $need_reindex, $replace_map); + if ($sref) { + $replace_map = {}; # oid => sref + $need_reindex = [] if $new_mime; + } else { + $im = $self->importer; + } my $over = $self->{over}; - my $cids = content_ids($mime); + my $cids = content_ids($old_mime); my $parts = $self->{idx_parts}; - my $mm = $self->{mm}; my $removed; - my $mids = mids($mime->header_obj); + my $mids = mids($old_mime->header_obj); # We avoid introducing new blobs into git since the raw content # can be slightly different, so we do not need the user-supplied # message now that we have the mids and content_id - $mime = undef; + $old_mime = undef; my $mark; foreach my $mid (@$mids) { - my %gone; + my %gone; # num => [ smsg, raw ] my ($id, $prev); while (my $smsg = $over->next_by_mid($mid, \$id, \$prev)) { my $msg = get_blob($self, $smsg); @@ -374,17 +390,21 @@ sub remove_internal ($$$$) { } foreach my $num (keys %gone) { my ($smsg, $orig) = @{$gone{$num}}; - $mm->num_delete($num); # $removed should only be set once assuming # no bugs in our deduplication code: $removed = $smsg; my $oid = $smsg->{blob}; - if ($purge) { - $purge->{$oid} = 1; + if ($replace_map) { + $replace_map->{$oid} = $sref; } else { ($mark, undef) = $im->remove($orig, $cmt_msg); } $orig = undef; + if ($need_reindex) { # ->replace + push @$need_reindex, $smsg; + } else { # ->purge or ->remove + $self->{mm}->num_delete($num); + } unindex_oid_remote($self, $oid, $mid); } } @@ -393,8 +413,9 @@ sub remove_internal ($$$$) { my $cmt = $im->get_mark($mark); $self->{last_commit}->[$self->{epoch_max}] = $cmt; } - if ($purge && scalar keys %$purge) { - return purge_oids($self, $purge); + if ($replace_map && scalar keys %$replace_map) { + my $rewrites = _replace_oids($self, $new_mime, $replace_map); + return { rewrites => $rewrites, need_reindex => $need_reindex }; } $removed; } @@ -403,22 +424,125 @@ sub remove_internal ($$$$) { sub remove { my ($self, $mime, $cmt_msg) = @_; $self->{-inbox}->with_umask(sub { - remove_internal($self, $mime, $cmt_msg, undef); + rewrite_internal($self, $mime, $cmt_msg); }); } +sub _replace ($$;$$) { + my ($self, $old_mime, $new_mime, $sref) = @_; + my $rewritten = $self->{-inbox}->with_umask(sub { + rewrite_internal($self, $old_mime, undef, $new_mime, $sref); + }) or return; + + my $rewrites = $rewritten->{rewrites}; + # ->done is called if there are rewrites since we gc+prune from git + $self->idx_init if @$rewrites; + + for my $i (0..$#$rewrites) { + defined(my $cmt = $rewrites->[$i]) or next; + $self->{last_commit}->[$i] = $cmt; + } + $rewritten; +} + # public sub purge { my ($self, $mime) = @_; - my $purges = $self->{-inbox}->with_umask(sub { - remove_internal($self, $mime, undef, {}); - }) or return; - $self->idx_init if @$purges; # ->done is called on purges - for my $i (0..$#$purges) { - defined(my $cmt = $purges->[$i]) or next; - $self->{last_commit}->[$i] = $cmt; + my $rewritten = _replace($self, $mime, undef, \'') or return; + $rewritten->{rewrites} +} + +# returns the git object_id of $fh, does not write the object to FS +sub git_hash_raw ($$) { + my ($self, $raw) = @_; + # grab the expected OID we have to reindex: + open my $tmp_fh, '+>', undef or die "failed to open tmp: $!"; + $tmp_fh->autoflush(1); + print $tmp_fh $$raw or die "print \$tmp_fh: $!"; + sysseek($tmp_fh, 0, 0) or die "seek failed: $!"; + + my ($r, $w); + pipe($r, $w) or die "failed to create pipe: $!"; + my $rdr = { 0 => fileno($tmp_fh), 1 => fileno($w) }; + my $git_dir = $self->{-inbox}->git->{git_dir}; + my $cmd = ['git', "--git-dir=$git_dir", qw(hash-object --stdin)]; + my $pid = spawn($cmd, undef, $rdr); + close $w; + local $/ = "\n"; + chomp(my $oid = <$r>); + waitpid($pid, 0) == $pid or die "git hash-object did not finish"; + die "git hash-object failed: $?" if $?; + $oid =~ /\A[a-f0-9]{40}\z/ or die "OID not expected: $oid"; + $oid; +} + +sub _check_mids_match ($$$) { + my ($old_list, $new_list, $hdrs) = @_; + my %old_mids = map { $_ => 1 } @$old_list; + my %new_mids = map { $_ => 1 } @$new_list; + my @old = keys %old_mids; + my @new = keys %new_mids; + my $err = "$hdrs may not be changed when replacing\n"; + die $err if scalar(@old) != scalar(@new); + delete @new_mids{@old}; + delete @old_mids{@new}; + die $err if (scalar(keys %old_mids) || scalar(keys %new_mids)); +} + +# Changing Message-IDs or References with ->replace isn't supported. +# The rules for dealing with messages with multiple or conflicting +# Message-IDs are pretty complex and rethreading hasn't been fully +# implemented, yet. +sub check_mids_match ($$) { + my ($old_mime, $new_mime) = @_; + my $old = $old_mime->header_obj; + my $new = $new_mime->header_obj; + _check_mids_match(mids($old), mids($new), 'Message-ID(s)'); + _check_mids_match(references($old), references($new), + 'References/In-Reply-To'); +} + +# public +sub replace ($$$) { + my ($self, $old_mime, $new_mime) = @_; + + check_mids_match($old_mime, $new_mime); + + # mutt will always add Content-Length:, Status:, Lines: when editing + PublicInbox::Import::drop_unwanted_headers($new_mime); + + my $raw = $new_mime->as_string; + my $expect_oid = git_hash_raw($self, \$raw); + my $rewritten = _replace($self, $old_mime, $new_mime, \$raw) or return; + my $need_reindex = $rewritten->{need_reindex}; + + # just in case we have bugs in deduplication code: + my $n = scalar(@$need_reindex); + if ($n > 1) { + my $list = join(', ', map { + "$_->{num}: <$_->{mid}>" + } @$need_reindex); + warn <<""; +W: rewritten $n messages matching content of original message (expected: 1). +W: possible bug in public-inbox, NNTP article IDs and Message-IDs follow: +W: $list + + } + + # make sure we really got the OID: + my ($oid, $type, $len) = $self->{-inbox}->git->check($expect_oid); + $oid eq $expect_oid or die "BUG: $expect_oid not found after replace"; + + # don't leak FDs to Xapian: + $self->{-inbox}->git->cleanup; + + # reindex modified messages: + for my $smsg (@$need_reindex) { + my $num = $smsg->{num}; + my $mid0 = $smsg->{mid}; + do_idx($self, \$raw, $new_mime, $len, $num, $oid, $mid0); } - $purges; + $rewritten->{rewrites}; } sub last_commit_part ($$;$) { @@ -772,15 +896,8 @@ sub reindex_oid ($$$$) { } $sync->{mm_tmp}->mid_delete($mid0) or die "failed to delete <$mid0> for article #$num\n"; - - $self->{over}->add_overview($mime, $len, $num, $oid, $mid0); - my $nparts = $self->{partitions}; - my $part = $num % $nparts; - my $idx = $self->idx_part($part); - $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime); - my $n = $self->{transact_bytes} += $len; $sync->{nr}++; - if ($n > (PublicInbox::SearchIdx::BATCH_BYTES * $nparts)) { + if (do_idx($self, $msgref, $mime, $len, $num, $oid, $mid0)) { $git->cleanup; $sync->{mm_tmp}->atfork_prepare; $self->done; # release lock |