about summary refs log tree commit homepage
diff options
context:
space:
mode:
authorEric Wong (Contractor, The Linux Foundation) <e@80x24.org>2018-04-18 09:13:13 +0000
committerEric Wong (Contractor, The Linux Foundation) <e@80x24.org>2018-04-18 09:14:17 +0000
commit017fed7bc4d33ac474a19356994be5bd0bfe68ba (patch)
treedd1fe9a007eed3506532f88875e4bf455a5e3404
parent0bf744b91df2d40580ff5776311d57dcede850b0 (diff)
downloadpublic-inbox-017fed7bc4d33ac474a19356994be5bd0bfe68ba.tar.gz
Some messages to git@vger went missing from Msgmap from old bugs
and became inaccessible via NNTP.  Forcing NNTP article numbers
when the overview DB came about made the problem more visible when
reindexing old (v1) repositories as all removed spam messages
took up AUTOINCREMENT numbers again before they were removed.

Having large gaps in NNTP article numbers is not good since it
throws off NNTP clients.  This does NOT prevent NNTP clients from
seeing some messages twice, but is better than having them
miss several messages entirely.

We also avoid depending on --reverse in git-log, as
git requires storing an entire commit list in memory for
--reverse, so it's cheaper to store only deleted blobs in the %D
hash since they do not live long.
-rw-r--r--MANIFEST1
-rw-r--r--lib/PublicInbox/Msgmap.pm8
-rw-r--r--lib/PublicInbox/SearchIdx.pm186
-rw-r--r--lib/PublicInbox/V2Writable.pm15
-rw-r--r--t/v1-add-remove-add.t45
5 files changed, 171 insertions, 84 deletions
diff --git a/MANIFEST b/MANIFEST
index 00a0970b..8038ad48 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -192,6 +192,7 @@ t/spawn.t
 t/thread-cycle.t
 t/time.t
 t/utf8.mbox
+t/v1-add-remove-add.t
 t/v2-add-remove-add.t
 t/v2mda.t
 t/v2mirror.t
diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index feef8ba7..3237a5ed 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -108,10 +108,10 @@ sub created_at {
 sub mid_insert {
         my ($self, $mid) = @_;
         my $dbh = $self->{dbh};
-        my $sql = 'INSERT OR IGNORE INTO msgmap (mid) VALUES (?)';
-        my $sth = $self->{mid_insert} ||= $dbh->prepare($sql);
-        $sth->bind_param(1, $mid);
-        return if $sth->execute == 0;
+        my $sth = $dbh->prepare_cached(<<'');
+INSERT OR IGNORE INTO msgmap (mid) VALUES (?)
+
+        return if $sth->execute($mid) == 0;
         $dbh->last_insert_id(undef, undef, 'msgmap', 'num');
 }
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 0b1dc219..2239c902 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -17,6 +17,7 @@ use PublicInbox::MsgIter;
 use Carp qw(croak);
 use POSIX qw(strftime);
 use PublicInbox::OverIdx;
+use PublicInbox::Spawn qw(spawn);
 require PublicInbox::Git;
 use Compress::Zlib qw(compress);
 
@@ -266,8 +267,8 @@ sub add_message {
         my $mids = mids($mime->header_obj);
         $mid0 = $mids->[0] unless defined $mid0; # v1 compatibility
         unless (defined $num) { # v1
-                my $mm = $self->_msgmap_init;
-                $num = $mm->mid_insert($mid0) || $mm->num_for($mid0);
+                $self->_msgmap_init;
+                $num = index_mm($self, $mime);
         }
         eval {
                 my $smsg = PublicInbox::SearchMsg->new($mime);
@@ -464,8 +465,28 @@ sub index_mm {
         my ($self, $mime) = @_;
         my $mid = mid_clean(mid_mime($mime));
         my $mm = $self->{mm};
-        my $num = $mm->mid_insert($mid);
-        return $num if defined $num;
+        my $num;
+
+        if (defined $self->{regen_down}) {
+                $num = $mm->num_for($mid) and return $num;
+
+                while (($num = $self->{regen_down}--) > 0) {
+                        if ($mm->mid_set($num, $mid) != 0) {
+                                return $num;
+                        }
+                }
+        } elsif (defined $self->{regen_up}) {
+                $num = $mm->num_for($mid) and return $num;
+
+                # this is to fixup old bugs due to add-remove-add
+                while (($num = ++$self->{regen_up})) {
+                        if ($mm->mid_set($num, $mid) != 0) {
+                                return $num;
+                        }
+                }
+        }
+
+        $num = $mm->mid_insert($mid) and return $num;
 
         # fallback to num_for since filters like RubyLang set the number
         $mm->num_for($mid);
@@ -476,18 +497,6 @@ sub unindex_mm {
         $self->{mm}->mid_delete(mid_clean(mid_mime($mime)));
 }
 
-sub index_mm2 {
-        my ($self, $mime, $bytes, $blob) = @_;
-        my $num = $self->{mm}->num_for(mid_clean(mid_mime($mime)));
-        add_message($self, $mime, $bytes, $num, $blob);
-}
-
-sub unindex_mm2 {
-        my ($self, $mime) = @_;
-        $self->{mm}->mid_delete(mid_clean(mid_mime($mime)));
-        unindex_blob($self, $mime);
-}
-
 sub index_both {
         my ($self, $mime, $bytes, $blob) = @_;
         my $num = index_mm($self, $mime);
@@ -521,12 +530,12 @@ sub batch_adjust ($$$$) {
         $$max -= $bytes;
         if ($$max <= 0) {
                 $$max = BATCH_BYTES;
-                $batch_cb->($latest, 1);
+                $batch_cb->($latest);
         }
 }
 
 # only for v1
-sub rlog {
+sub read_log {
         my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_;
         my $hex = '[a-f0-9]';
         my $h40 = $hex .'{40}';
@@ -537,23 +546,39 @@ sub rlog {
         my $bytes;
         my $max = BATCH_BYTES;
         local $/ = "\n";
+        my %D;
         my $line;
+        my $newest;
+        my $mid = '20170114215743.5igbjup6qpsh3jfg@genre.crustytoothpaste.net';
         while (defined($line = <$log>)) {
                 if ($line =~ /$addmsg/o) {
                         my $blob = $1;
+                        delete $D{$blob} and next;
                         my $mime = do_cat_mail($git, $blob, \$bytes) or next;
+                        my $mids = mids($mime->header_obj);
+                        foreach (@$mids) {
+                                warn "ADD $mid\n" if ($_ eq $mid);
+                        }
                         batch_adjust(\$max, $bytes, $batch_cb, $latest);
                         $add_cb->($self, $mime, $bytes, $blob);
                 } elsif ($line =~ /$delmsg/o) {
                         my $blob = $1;
-                        my $mime = do_cat_mail($git, $blob, \$bytes) or next;
-                        batch_adjust(\$max, $bytes, $batch_cb, $latest);
-                        $del_cb->($self, $mime);
+                        $D{$blob} = 1;
                 } elsif ($line =~ /^commit ($h40)/o) {
                         $latest = $1;
+                        $newest ||= $latest;
+                }
+        }
+        # get the leftovers
+        foreach my $blob (keys %D) {
+                my $mime = do_cat_mail($git, $blob, \$bytes) or next;
+                my $mids = mids($mime->header_obj);
+                foreach (@$mids) {
+                        warn "DEL $mid\n" if ($_ eq $mid);
                 }
+                $del_cb->($self, $mime);
         }
-        $batch_cb->($latest, 0);
+        $batch_cb->($latest, $newest);
 }
 
 sub _msgmap_init {
@@ -567,18 +592,62 @@ sub _msgmap_init {
 
 sub _git_log {
         my ($self, $range) = @_;
-        $self->{git}->popen(qw/log --reverse --no-notes --no-color
+        my $git = $self->{git};
+
+        if (index($range, '..') < 0) {
+                my $regen_max = 0;
+                # can't use 'rev-list --count' if we use --diff-filter
+                my $fh = $git->popen(qw(log --pretty=tformat:%h
+                                --no-notes --no-color --no-renames
+                                --diff-filter=AM), $range);
+                ++$regen_max while <$fh>;
+                my (undef, $max) = $self->{mm}->minmax;
+
+                if ($max && $max == $regen_max) {
+                        # fix up old bugs in full indexes which caused messages to
+                        # not appear in Msgmap
+                        $self->{regen_up} = $max;
+                } else {
+                        # normal regen is for for fresh data
+                        $self->{regen_down} = $regen_max;
+                }
+        }
+
+        $git->popen(qw/log --no-notes --no-color --no-renames
                                 --raw -r --no-abbrev/, $range);
 }
 
+sub is_ancestor ($$$) {
+        my ($git, $cur, $tip) = @_;
+        return 0 unless $git->check($cur);
+        my $cmd = [ 'git', "--git-dir=$git->{git_dir}",
+                qw(merge-base --is-ancestor), $cur, $tip ];
+        my $pid = spawn($cmd);
+        defined $pid or die "spawning ".join(' ', @$cmd)." failed: $!";
+        waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish';
+        $? == 0;
+}
+
+sub need_update ($$$) {
+        my ($self, $cur, $new) = @_;
+        my $git = $self->{git};
+        return 1 if $cur && !is_ancestor($git, $cur, $new);
+        my $range = $cur eq '' ? $new : "$cur..$new";
+        chomp(my $n = $git->qx(qw(rev-list --count), $range));
+        ($n eq '' || $n > 0);
+}
+
 # indexes all unindexed messages (v1 only)
 sub _index_sync {
         my ($self, $opts) = @_;
         my $tip = $opts->{ref} || 'HEAD';
         my $reindex = $opts->{reindex};
         my ($mkey, $last_commit, $lx, $xlog);
-        $self->{git}->batch_prepare;
+        my $git = $self->{git};
+        $git->batch_prepare;
+
         my $xdb = $self->begin_txn_lazy;
+        my $mm = _msgmap_init($self);
         do {
                 $xlog = undef;
                 $mkey = 'last_commit';
@@ -588,73 +657,54 @@ sub _index_sync {
                         $lx = '';
                         $mkey = undef if $last_commit ne '';
                 }
+
+                # use last_commit from msgmap if it is older or unset
+                my $lm = $mm->last_commit || '';
+                if (!$lm || ($lm && $lx && is_ancestor($git, $lm, $lx))) {
+                        $lx = $lm;
+                }
+
                 $self->{over}->rollback_lazy;
                 $self->{over}->disconnect;
                 delete $self->{txn};
                 $xdb->cancel_transaction;
                 $xdb = _xdb_release($self);
 
-                # ensure we leak no FDs to "git log"
+                # ensure we leak no FDs to "git log" with Xapian <= 1.2
                 my $range = $lx eq '' ? $tip : "$lx..$tip";
                 $xlog = _git_log($self, $range);
 
                 $xdb = $self->begin_txn_lazy;
         } while ($xdb->get_metadata('last_commit') ne $last_commit);
 
-        my $mm = _msgmap_init($self);
         my $dbh = $mm->{dbh} if $mm;
-        my $mm_only;
         my $cb = sub {
-                my ($commit, $more) = @_;
+                my ($commit, $newest) = @_;
                 if ($dbh) {
-                        $mm->last_commit($commit) if $commit;
+                        if ($newest) {
+                                my $cur = $mm->last_commit || '';
+                                if (need_update($self, $cur, $newest)) {
+                                        $mm->last_commit($newest);
+                                }
+                        }
                         $dbh->commit;
                 }
-                if (!$mm_only) {
-                        $xdb->set_metadata($mkey, $commit) if $mkey && $commit;
-                        $self->commit_txn_lazy;
+                if ($mkey && $newest) {
+                        my $cur = $xdb->get_metadata($mkey);
+                        if (need_update($self, $cur, $newest)) {
+                                $xdb->set_metadata($mkey, $newest);
+                        }
                 }
+                $self->commit_txn_lazy;
                 # let another process do some work... <
-                if ($more) {
-                        if (!$mm_only) {
-                                $xdb = $self->begin_txn_lazy;
-                        }
+                if (!$newest) {
+                        $xdb = $self->begin_txn_lazy;
                         $dbh->begin_work if $dbh;
                 }
         };
 
-        if ($mm) {
-                $dbh->begin_work;
-                my $lm = $mm->last_commit || '';
-                if ($lm eq $lx) {
-                        # Common case is the indexes are synced,
-                        # we only need to run git-log once:
-                        rlog($self, $xlog, *index_both, *unindex_both, $cb);
-                } else {
-                        # Uncommon case, msgmap and xapian are out-of-sync
-                        # do not care for performance (but git is fast :>)
-                        # This happens if we have to reindex Xapian since
-                        # msgmap is a frozen format and our Xapian format
-                        # is evolving.
-                        my $r = $lm eq '' ? $tip : "$lm..$tip";
-
-                        # first, ensure msgmap is up-to-date:
-                        my $mkey_prev = $mkey;
-                        $mkey = undef; # ignore xapian, for now
-                        my $mlog = _git_log($self, $r);
-                        $mm_only = 1;
-                        rlog($self, $mlog, *index_mm, *unindex_mm, $cb);
-                        $mm_only = $mlog = undef;
-
-                        # now deal with Xapian
-                        $mkey = $mkey_prev;
-                        $dbh = undef;
-                        rlog($self, $xlog, *index_mm2, *unindex_mm2, $cb);
-                }
-        } else {
-                # user didn't install DBD::SQLite and DBI
-                rlog($self, $xlog, *add_message, *unindex_blob, $cb);
-        }
+        $dbh->begin_work;
+        read_log($self, $xlog, *index_both, *unindex_both, $cb);
 }
 
 sub DESTROY {
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index e9fd502e..2cc87305 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -15,7 +15,8 @@ use PublicInbox::ContentId qw(content_id content_digest);
 use PublicInbox::Inbox;
 use PublicInbox::OverIdx;
 use PublicInbox::Msgmap;
-use PublicInbox::Spawn;
+use PublicInbox::Spawn qw(spawn);
+use PublicInbox::SearchIdx;
 use IO::Handle;
 
 # an estimate of the post-packed size to the raw uncompressed size
@@ -561,7 +562,6 @@ sub import_init {
 sub diff ($$$) {
         my ($mid, $cur, $new) = @_;
         use File::Temp qw(tempfile);
-        use PublicInbox::Spawn qw(spawn);
 
         my ($ah, $an) = tempfile('email-cur-XXXXXXXX', TMPDIR => 1);
         print $ah $cur->as_string or die "print: $!";
@@ -738,16 +738,7 @@ sub last_commits {
         $heads;
 }
 
-sub is_ancestor ($$$) {
-        my ($git, $cur, $tip) = @_;
-        return 0 unless $git->check($cur);
-        my $cmd = [ 'git', "--git-dir=$git->{git_dir}",
-                qw(merge-base --is-ancestor), $cur, $tip ];
-        my $pid = spawn($cmd);
-        defined $pid or die "spawning ".join(' ', @$cmd)." failed: $!";
-        waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish';
-        $? == 0;
-}
+*is_ancestor = *PublicInbox::SearchIdx::is_ancestor;
 
 sub index_prepare {
         my ($self, $opts, $epoch_max, $ranges) = @_;
diff --git a/t/v1-add-remove-add.t b/t/v1-add-remove-add.t
new file mode 100644
index 00000000..cd6e2811
--- /dev/null
+++ b/t/v1-add-remove-add.t
@@ -0,0 +1,45 @@
+# Copyright (C) 2018 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use warnings;
+use Test::More;
+use PublicInbox::MIME;
+use PublicInbox::Import;
+use PublicInbox::SearchIdx;
+use File::Temp qw/tempdir/;
+
+foreach my $mod (qw(DBD::SQLite Search::Xapian)) {
+        eval "require $mod";
+        plan skip_all => "$mod missing for v1-add-remove-add.t" if $@;
+}
+my $mainrepo = tempdir('pi-add-remove-add-XXXXXX', TMPDIR => 1, CLEANUP => 1);
+is(system(qw(git init --bare), $mainrepo), 0);
+my $ibx = {
+        mainrepo => $mainrepo,
+        name => 'test-add-remove-add',
+        -primary_address => 'test@example.com',
+};
+$ibx = PublicInbox::Inbox->new($ibx);
+my $mime = PublicInbox::MIME->create(
+        header => [
+                From => 'a@example.com',
+                To => 'test@example.com',
+                Subject => 'this is a subject',
+                Date => 'Fri, 02 Oct 1993 00:00:00 +0000',
+                'Message-ID' => '<a-mid@b>',
+        ],
+        body => "hello world\n",
+);
+my $im = PublicInbox::Import->new($ibx->git, undef, undef, $ibx);
+ok($im->add($mime), 'message added');
+ok($im->remove($mime), 'message added');
+ok($im->add($mime), 'message added again');
+$im->done;
+my $rw = PublicInbox::SearchIdx->new($ibx, 1);
+$rw->index_sync;
+my $msgs = $ibx->recent({limit => 10});
+is($msgs->[0]->{mid}, 'a-mid@b', 'message exists in history');
+is(scalar @$msgs, 1, 'only one message in history');
+is($ibx->mm->num_for('a-mid@b'), 2, 'exists with second article number');
+
+done_testing();