about summary refs log tree commit homepage
diff options
context:
space:
mode:
author: Eric Wong <e@yhbt.net> 2020-03-20 08:18:21 +0000
committer: Eric Wong <e@yhbt.net> 2020-03-22 09:00:23 +0000
commit: c7acdfe78bda5bf36660a699e882e0e3c431a351 (patch)
tree: b014171ac2b5bd4a37864d54edeed045bd79de58
parent: b5bc3576af3d0ef0fa884ed32a674c7a703a19b2 (diff)
download: public-inbox-c7acdfe78bda5bf36660a699e882e0e3c431a351.tar.gz
This lets us store author and committer times for messages with
ambiguous Message-IDs whose indexing is deferred.  This allows
us to reproducibly reindex messages with the git commit
and author times when a rare message lacks Received and/or
Date headers while having ambiguous Message-IDs.
-rw-r--r-- MANIFEST 1
-rw-r--r-- lib/PublicInbox/MultiMidQueue.pm 57
-rw-r--r-- lib/PublicInbox/V2Writable.pm 23
-rw-r--r-- t/multi-mid.t 27
4 files changed, 95 insertions, 13 deletions
diff --git a/MANIFEST b/MANIFEST
index ec80c90f..f077d722 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -130,6 +130,7 @@ lib/PublicInbox/MboxGz.pm
 lib/PublicInbox/MsgIter.pm
 lib/PublicInbox/MsgTime.pm
 lib/PublicInbox/Msgmap.pm
+lib/PublicInbox/MultiMidQueue.pm
 lib/PublicInbox/NNTP.pm
 lib/PublicInbox/NNTPD.pm
 lib/PublicInbox/NNTPdeflate.pm
diff --git a/lib/PublicInbox/MultiMidQueue.pm b/lib/PublicInbox/MultiMidQueue.pm
new file mode 100644
index 00000000..3c28ebbc
--- /dev/null
+++ b/lib/PublicInbox/MultiMidQueue.pm
@@ -0,0 +1,57 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# temporary queue for public-inbox-index to support multi-Message-ID
+# messages on mirrors of v2 inboxes
+package PublicInbox::MultiMidQueue;
+use strict;
+use SDBM_File; # part of Perl standard library
+use Fcntl qw(O_RDWR O_CREAT);
+use File::Temp 0.19 (); # 0.19 for ->newdir
+my %e = (freebsd => 0x100000, linux => 0x80000, openbsd => 0x10000);
+my $O_CLOEXEC = $e{$^O} // 0; # per-OS flag from %e; 0 (no-op) if $^O is not listed
+
+sub new { # constructor: empty queue tied to an SDBM file in a new tmpdir
+        my ($class) = @_;
+        my $tmpdir = File::Temp->newdir('multi-mid-q-XXXXXX', TMPDIR => 1);
+        my $base = $tmpdir->dirname . '/q'; # base path handed to SDBM_File
+        my %sdbm;
+        my $flags = O_RDWR|O_CREAT;
+        if (!tie(%sdbm, 'SDBM_File', $base, $flags|$O_CLOEXEC, 0600)) {
+                if (!tie(%sdbm, 'SDBM_File', $base, $flags, 0600)) {
+                        die "could not tie ($base): $!";
+                }
+                $O_CLOEXEC = 0; # tie only worked without the flag; stop passing it
+        }
+
+        bless {
+                cur => 1, # next index push_oid will assign
+                min => 1, # lowered by set_oid when a smaller index is stored
+                max => 0, # raised by set_oid when a larger index is stored
+                sdbm => \%sdbm, # index => "oid\tautime\tcotime" records
+                tmpdir => $tmpdir, # keeps the File::Temp dir object alive with the queue
+        }, $class;
+}
+
+sub set_oid { # store $oid plus $v2w's {autime}/{cotime} under index $i
+        my ($self, $i, $oid, $v2w) = @_;
+        $self->{max} = $i if $i > $self->{max}; # widen the tracked index range
+        $self->{min} = $i if $i < $self->{min};
+        $self->{sdbm}->{$i} = "$oid\t$v2w->{autime}\t$v2w->{cotime}"; # tab-separated
+}
+
+sub get_oid { # returns the oid stored at index $i, or nothing if there is none
+        my ($self, $i, $v2w) = @_;
+        my $rec = $self->{sdbm}->{$i} or return; # no (or false) record at $i
+        my ($oid, $autime, $cotime) = split(/\t/, $rec); # format written by set_oid
+        $v2w->{autime} = $autime; # side effect: overwrites the caller's times
+        $v2w->{cotime} = $cotime;
+        $oid
+}
+
+sub push_oid { # store $oid at the current {cur} index, then advance {cur}
+        my ($self, $oid, $v2w) = @_;
+        set_oid($self, $self->{cur}++, $oid, $v2w);
+}
+
+1;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index b45d2722..1c78ef24 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -20,6 +20,7 @@ use PublicInbox::Msgmap;
 use PublicInbox::Spawn qw(spawn popen_rd);
 use PublicInbox::SearchIdx;
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+use PublicInbox::MultiMidQueue;
 use IO::Handle; # ->autoflush
 use File::Temp qw(tempfile);
 
@@ -991,15 +992,15 @@ sub multi_mid_q_new () {
         $multi_mid
 }
 
-sub multi_mid_q_push ($$) {
-        my ($sync, $oid) = @_;
-        my $multi_mid = $sync->{multi_mid} //= multi_mid_q_new();
+sub multi_mid_q_push ($$$) { # queue $oid in $sync->{multi_mid} for later handling
+        my ($self, $sync, $oid) = @_; # $self supplies {autime}/{cotime} to the queue
+        my $multi_mid = $sync->{multi_mid} //= PublicInbox::MultiMidQueue->new;
         if ($sync->{reindex}) { # no regen on reindex
-                $multi_mid->mid_insert($oid);
+                $multi_mid->push_oid($oid, $self); # queue assigns the next index
         } else {
                 my $num = $sync->{regen}--;
                 die "BUG: ran out of article numbers" if $num <= 0;
-                $multi_mid->mid_set($num, $oid);
+                $multi_mid->set_oid($num, $oid, $self); # keyed by article number
         }
 }
 
@@ -1051,7 +1052,7 @@ sub reindex_oid ($$$$) {
                         # do not delete from {mm_tmp}, since another
                         # single-MID message may use it.
                 } else { # handle them at the end:
-                        multi_mid_q_push($sync, $oid);
+                        multi_mid_q_push($self, $sync, $oid);
                 }
                 return;
         }
@@ -1352,19 +1353,21 @@ sub index_sync {
         }
         if (my $multi_mid = delete $sync->{multi_mid}) {
                 $git //= $self->{-inbox}->git;
-                my ($min, $max) = $multi_mid->minmax;
+                my $min = $multi_mid->{min};
+                my $max = $multi_mid->{max};
                 if ($sync->{reindex}) {
                         # we may need to create new Message-IDs if mirrors
                         # were initially indexed with old versions
                         for (my $i = $max; $i >= $min; $i--) {
-                                my $oid = $multi_mid->mid_for($i);
+                                my $oid;
+                                $oid = $multi_mid->get_oid($i, $self) or next;
                                 next unless defined $oid;
                                 reindex_oid_m($self, $sync, $git, $oid);
                         }
                 } else { # regen on initial index
                         for my $num ($min..$max) {
-                                my $oid = $multi_mid->mid_for($num);
-                                next unless defined $oid;
+                                my $oid;
+                                $oid = $multi_mid->get_oid($num, $self) or next;
                                 reindex_oid_m($self, $sync, $git, $oid, $num);
                         }
                 }
diff --git a/t/multi-mid.t b/t/multi-mid.t
index df865efb..87240c2c 100644
--- a/t/multi-mid.t
+++ b/t/multi-mid.t
@@ -1,5 +1,6 @@
 # Copyright (C) 2020 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
 use Test::More;
 use PublicInbox::MIME;
 use PublicInbox::TestCommon;
@@ -7,6 +8,7 @@ use PublicInbox::InboxWritable;
 require_git(2.6);
 require_mods(qw(DBD::SQLite));
 require PublicInbox::SearchIdx;
+my $delay = $ENV{TEST_DELAY_CONVERT};
 
 my $addr = 'test@example.com';
 my $bad = PublicInbox::MIME->new(<<EOF);
@@ -14,14 +16,12 @@ Message-ID: <a\@example.com>
 Message-ID: <b\@example.com>
 From: a\@example.com
 To: $addr
-Date: Fri, 02 Oct 1993 00:00:00 +0000
 Subject: bad
 
 EOF
 
 my $good = PublicInbox::MIME->new(<<EOF);
 Message-ID: <b\@example.com>
-Date: Fri, 02 Oct 1993 00:00:00 +0000
 From: b\@example.com
 To: $addr
 Subject: good
@@ -37,13 +37,18 @@ for my $order ([$bad, $good], [$good, $bad]) {
                 indexlevel => 'basic',
                 -primary_address => $addr,
         }, my $creat_opt = {});
+        my @old;
         if ('setup v1 inbox') {
                 my $im = $ibx->importer(0);
-                ok($im->add($_), 'added '.$_->header('Subject')) for @$order;
+                for (@$order) {
+                        ok($im->add($_), 'added '.$_->header('Subject'));
+                        sleep($delay) if $delay;
+                }
                 $im->done;
                 my $s = PublicInbox::SearchIdx->new($ibx, 1);
                 $s->index_sync;
                 $before = [ $ibx->mm->minmax ];
+                @old = ($ibx->over->get_art(1), $ibx->over->get_art(2));
                 $ibx->cleanup;
         }
         my $rdr = { 1 => \(my $out = ''), 2 => \(my $err = '') };
@@ -56,6 +61,22 @@ for my $order ([$bad, $good], [$good, $bad]) {
         $ibx->{inboxdir} = "$tmpdir/v2";
         is_deeply([$ibx->mm->minmax], $before,
                 'min, max article numbers unchanged');
+
+        my @v2 = ($ibx->over->get_art(1), $ibx->over->get_art(2));
+        is_deeply(\@v2, \@old, 'v2 conversion times match');
+
+        system(qw(git clone -sq --mirror), "$tmpdir/v2/git/0.git",
+                "$tmpdir/v2-clone/git/0.git") == 0 or die "clone: $?";
+        $cmd = [ '-init', '-V2', 'v2c', "$tmpdir/v2-clone",
+                'http://example.com/v2c', 'v2c@example.com' ];
+        ok(run_script($cmd, $env), 'init clone');
+        $cmd = [ '-index', "$tmpdir/v2-clone" ];
+        sleep($delay) if $delay;
+        ok(run_script($cmd, $env), 'index the clone');
+        $ibx->cleanup;
+        $ibx->{inboxdir} = "$tmpdir/v2-clone";
+        my @v2c = ($ibx->over->get_art(1), $ibx->over->get_art(2));
+        is_deeply(\@v2c, \@old, 'v2 clone times match');
 }
 
 done_testing();