From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 52C451F624 for ; Fri, 20 Mar 2020 08:18:23 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 9/9] v2: SDBM-based multi Message-ID queue Date: Fri, 20 Mar 2020 08:18:21 +0000 Message-Id: <20200320081821.21715-10-e@yhbt.net> In-Reply-To: <20200320081821.21715-1-e@yhbt.net> References: <20200320081821.21715-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This lets us store author and committer times when deferring the indexing of messages with ambiguous Message-IDs. This allows us to reproducibly reindex messages with the git commit and author times when a rare message lacks Received and/or Date headers while having ambiguous Message-IDs. 
--- MANIFEST | 1 + lib/PublicInbox/MultiMidQueue.pm | 57 ++++++++++++++++++++++++++++++++ lib/PublicInbox/V2Writable.pm | 23 +++++++------ t/multi-mid.t | 27 +++++++++++++-- 4 files changed, 95 insertions(+), 13 deletions(-) create mode 100644 lib/PublicInbox/MultiMidQueue.pm diff --git a/MANIFEST b/MANIFEST index ec80c90f..f077d722 100644 --- a/MANIFEST +++ b/MANIFEST @@ -130,6 +130,7 @@ lib/PublicInbox/MboxGz.pm lib/PublicInbox/MsgIter.pm lib/PublicInbox/MsgTime.pm lib/PublicInbox/Msgmap.pm +lib/PublicInbox/MultiMidQueue.pm lib/PublicInbox/NNTP.pm lib/PublicInbox/NNTPD.pm lib/PublicInbox/NNTPdeflate.pm diff --git a/lib/PublicInbox/MultiMidQueue.pm b/lib/PublicInbox/MultiMidQueue.pm new file mode 100644 index 00000000..3c28ebbc --- /dev/null +++ b/lib/PublicInbox/MultiMidQueue.pm @@ -0,0 +1,57 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ + +# temporary queue for public-inbox-index to support multi-Message-ID +# messages on mirrors of v2 inboxes +package PublicInbox::MultiMidQueue; +use strict; +use SDBM_File; # part of Perl standard library +use Fcntl qw(O_RDWR O_CREAT); +use File::Temp 0.19 (); # 0.19 for ->newdir +my %e = (freebsd => 0x100000, linux => 0x80000, openbsd => 0x10000); +my $O_CLOEXEC = $e{$^O} // 0; + +sub new { + my ($class) = @_; + my $tmpdir = File::Temp->newdir('multi-mid-q-XXXXXX', TMPDIR => 1); + my $base = $tmpdir->dirname . 
'/q'; + my %sdbm; + my $flags = O_RDWR|O_CREAT; + if (!tie(%sdbm, 'SDBM_File', $base, $flags|$O_CLOEXEC, 0600)) { + if (!tie(%sdbm, 'SDBM_File', $base, $flags, 0600)) { + die "could not tie ($base): $!"; + } + $O_CLOEXEC = 0; + } + + bless { + cur => 1, + min => 1, + max => 0, + sdbm => \%sdbm, + tmpdir => $tmpdir, + }, $class; +} + +sub set_oid { + my ($self, $i, $oid, $v2w) = @_; + $self->{max} = $i if $i > $self->{max}; + $self->{min} = $i if $i < $self->{min}; + $self->{sdbm}->{$i} = "$oid\t$v2w->{autime}\t$v2w->{cotime}"; +} + +sub get_oid { + my ($self, $i, $v2w) = @_; + my $rec = $self->{sdbm}->{$i} or return; + my ($oid, $autime, $cotime) = split(/\t/, $rec); + $v2w->{autime} = $autime; + $v2w->{cotime} = $cotime; + $oid +} + +sub push_oid { + my ($self, $oid, $v2w) = @_; + set_oid($self, $self->{cur}++, $oid, $v2w); +} + +1; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index b45d2722..1c78ef24 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -20,6 +20,7 @@ use PublicInbox::Msgmap; use PublicInbox::Spawn qw(spawn popen_rd); use PublicInbox::SearchIdx; use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); +use PublicInbox::MultiMidQueue; use IO::Handle; # ->autoflush use File::Temp qw(tempfile); @@ -991,15 +992,15 @@ sub multi_mid_q_new () { $multi_mid } -sub multi_mid_q_push ($$) { - my ($sync, $oid) = @_; - my $multi_mid = $sync->{multi_mid} //= multi_mid_q_new(); +sub multi_mid_q_push ($$$) { + my ($self, $sync, $oid) = @_; + my $multi_mid = $sync->{multi_mid} //= PublicInbox::MultiMidQueue->new; if ($sync->{reindex}) { # no regen on reindex - $multi_mid->mid_insert($oid); + $multi_mid->push_oid($oid, $self); } else { my $num = $sync->{regen}--; die "BUG: ran out of article numbers" if $num <= 0; - $multi_mid->mid_set($num, $oid); + $multi_mid->set_oid($num, $oid, $self); } } @@ -1051,7 +1052,7 @@ sub reindex_oid ($$$$) { # do not delete from {mm_tmp}, since another # single-MID message 
may use it. } else { # handle them at the end: - multi_mid_q_push($sync, $oid); + multi_mid_q_push($self, $sync, $oid); } return; } @@ -1352,19 +1353,21 @@ sub index_sync { } if (my $multi_mid = delete $sync->{multi_mid}) { $git //= $self->{-inbox}->git; - my ($min, $max) = $multi_mid->minmax; + my $min = $multi_mid->{min}; + my $max = $multi_mid->{max}; if ($sync->{reindex}) { # we may need to create new Message-IDs if mirrors # were initially indexed with old versions for (my $i = $max; $i >= $min; $i--) { - my $oid = $multi_mid->mid_for($i); + my $oid; + $oid = $multi_mid->get_oid($i, $self) or next; next unless defined $oid; reindex_oid_m($self, $sync, $git, $oid); } } else { # regen on initial index for my $num ($min..$max) { - my $oid = $multi_mid->mid_for($num); - next unless defined $oid; + my $oid; + $oid = $multi_mid->get_oid($num, $self) or next; reindex_oid_m($self, $sync, $git, $oid, $num); } } diff --git a/t/multi-mid.t b/t/multi-mid.t index df865efb..87240c2c 100644 --- a/t/multi-mid.t +++ b/t/multi-mid.t @@ -1,5 +1,6 @@ # Copyright (C) 2020 all contributors # License: AGPL-3.0+ +use strict; use Test::More; use PublicInbox::MIME; use PublicInbox::TestCommon; @@ -7,6 +8,7 @@ use PublicInbox::InboxWritable; require_git(2.6); require_mods(qw(DBD::SQLite)); require PublicInbox::SearchIdx; +my $delay = $ENV{TEST_DELAY_CONVERT}; my $addr = 'test@example.com'; my $bad = PublicInbox::MIME->new(< Message-ID: From: a\@example.com To: $addr -Date: Fri, 02 Oct 1993 00:00:00 +0000 Subject: bad EOF my $good = PublicInbox::MIME->new(< -Date: Fri, 02 Oct 1993 00:00:00 +0000 From: b\@example.com To: $addr Subject: good @@ -37,13 +37,18 @@ for my $order ([$bad, $good], [$good, $bad]) { indexlevel => 'basic', -primary_address => $addr, }, my $creat_opt = {}); + my @old; if ('setup v1 inbox') { my $im = $ibx->importer(0); - ok($im->add($_), 'added '.$_->header('Subject')) for @$order; + for (@$order) { + ok($im->add($_), 'added '.$_->header('Subject')); + sleep($delay) 
if $delay; + } $im->done; my $s = PublicInbox::SearchIdx->new($ibx, 1); $s->index_sync; $before = [ $ibx->mm->minmax ]; + @old = ($ibx->over->get_art(1), $ibx->over->get_art(2)); $ibx->cleanup; } my $rdr = { 1 => \(my $out = ''), 2 => \(my $err = '') }; @@ -56,6 +61,22 @@ for my $order ([$bad, $good], [$good, $bad]) { $ibx->{inboxdir} = "$tmpdir/v2"; is_deeply([$ibx->mm->minmax], $before, 'min, max article numbers unchanged'); + + my @v2 = ($ibx->over->get_art(1), $ibx->over->get_art(2)); + is_deeply(\@v2, \@old, 'v2 conversion times match'); + + system(qw(git clone -sq --mirror), "$tmpdir/v2/git/0.git", + "$tmpdir/v2-clone/git/0.git") == 0 or die "clone: $?"; + $cmd = [ '-init', '-V2', 'v2c', "$tmpdir/v2-clone", + 'http://example.com/v2c', 'v2c@example.com' ]; + ok(run_script($cmd, $env), 'init clone'); + $cmd = [ '-index', "$tmpdir/v2-clone" ]; + sleep($delay) if $delay; + ok(run_script($cmd, $env), 'index the clone'); + $ibx->cleanup; + $ibx->{inboxdir} = "$tmpdir/v2-clone"; + my @v2c = ($ibx->over->get_art(1), $ibx->over->get_art(2)); + is_deeply(\@v2c, \@old, 'v2 clone times match'); } done_testing();