From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 9/9] v2: SDBM-based multi Message-ID queue
Date: Fri, 20 Mar 2020 08:18:21 +0000 [thread overview]
Message-ID: <20200320081821.21715-10-e@yhbt.net> (raw)
In-Reply-To: <20200320081821.21715-1-e@yhbt.net>
This lets us store author and committer times for messages
whose indexing is deferred due to ambiguous Message-IDs. This
allows us to reproducibly reindex such messages with the git
commit and author times in the rare case where a message lacks
Received and/or Date headers while also having ambiguous Message-IDs.
---
MANIFEST | 1 +
lib/PublicInbox/MultiMidQueue.pm | 57 ++++++++++++++++++++++++++++++++
lib/PublicInbox/V2Writable.pm | 23 +++++++------
t/multi-mid.t | 27 +++++++++++++--
4 files changed, 95 insertions(+), 13 deletions(-)
create mode 100644 lib/PublicInbox/MultiMidQueue.pm
diff --git a/MANIFEST b/MANIFEST
index ec80c90f..f077d722 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -130,6 +130,7 @@ lib/PublicInbox/MboxGz.pm
lib/PublicInbox/MsgIter.pm
lib/PublicInbox/MsgTime.pm
lib/PublicInbox/Msgmap.pm
+lib/PublicInbox/MultiMidQueue.pm
lib/PublicInbox/NNTP.pm
lib/PublicInbox/NNTPD.pm
lib/PublicInbox/NNTPdeflate.pm
diff --git a/lib/PublicInbox/MultiMidQueue.pm b/lib/PublicInbox/MultiMidQueue.pm
new file mode 100644
index 00000000..3c28ebbc
--- /dev/null
+++ b/lib/PublicInbox/MultiMidQueue.pm
@@ -0,0 +1,57 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# temporary queue for public-inbox-index to support multi-Message-ID
+# messages on mirrors of v2 inboxes
+package PublicInbox::MultiMidQueue;
+use strict;
+use SDBM_File; # part of Perl standard library
+use Fcntl qw(O_RDWR O_CREAT);
+use File::Temp 0.19 (); # 0.19 for ->newdir
+my %e = (freebsd => 0x100000, linux => 0x80000, openbsd => 0x10000); # per-OS flag values, looked up by $^O below
+my $O_CLOEXEC = $e{$^O} // 0; # 0 on any OS not listed in %e
+
+sub new { # constructor: empty queue backed by an SDBM file inside a fresh temporary directory
+ my ($class) = @_;
+ my $tmpdir = File::Temp->newdir('multi-mid-q-XXXXXX', TMPDIR => 1); # stored in the object below
+ my $base = $tmpdir->dirname . '/q'; # base path handed to SDBM_File
+ my %sdbm;
+ my $flags = O_RDWR|O_CREAT;
+ if (!tie(%sdbm, 'SDBM_File', $base, $flags|$O_CLOEXEC, 0600)) { # first attempt includes $O_CLOEXEC
+ if (!tie(%sdbm, 'SDBM_File', $base, $flags, 0600)) { # second attempt omits it
+ die "could not tie ($base): $!";
+ }
+ $O_CLOEXEC = 0; # tie only succeeded without the flag, so later calls stop passing it
+ }
+
+ bless {
+ cur => 1, # index that the next push_oid call will use
+ min => 1, # lowest index tracked; set_oid only ever lowers it
+ max => 0, # highest index tracked; set_oid only ever raises it
+ sdbm => \%sdbm, # tied hash mapping index to a tab-separated oid, autime, cotime record
+ tmpdir => $tmpdir, # File::Temp directory object that contains the SDBM files
+ }, $class;
+}
+
+sub set_oid { # store $oid together with the {autime} and {cotime} fields of $v2w under index $i
+ my ($self, $i, $oid, $v2w) = @_;
+ $self->{max} = $i if $i > $self->{max}; # widen the tracked index range upward
+ $self->{min} = $i if $i < $self->{min}; # and downward
+ $self->{sdbm}->{$i} = "$oid\t$v2w->{autime}\t$v2w->{cotime}"; # tab-separated record; get_oid splits it again
+}
+
+sub get_oid { # look up index $i; on a hit, copy the stored times into $v2w and return the oid
+ my ($self, $i, $v2w) = @_;
+ my $rec = $self->{sdbm}->{$i} or return; # no (or false) record for $i: return nothing, $v2w untouched
+ my ($oid, $autime, $cotime) = split(/\t/, $rec); # reverses the tab-join performed in set_oid
+ $v2w->{autime} = $autime; # overwrite the times on $v2w with the stored ones
+ $v2w->{cotime} = $cotime;
+ $oid
+}
+
+sub push_oid { # append: store $oid at the current {cur} index, then advance {cur} by one
+ my ($self, $oid, $v2w) = @_;
+ set_oid($self, $self->{cur}++, $oid, $v2w); # post-increment, so the first push lands at index 1
+}
+
+1;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index b45d2722..1c78ef24 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -20,6 +20,7 @@ use PublicInbox::Msgmap;
use PublicInbox::Spawn qw(spawn popen_rd);
use PublicInbox::SearchIdx;
use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+use PublicInbox::MultiMidQueue;
use IO::Handle; # ->autoflush
use File::Temp qw(tempfile);
@@ -991,15 +992,15 @@ sub multi_mid_q_new () {
$multi_mid
}
-sub multi_mid_q_push ($$) {
- my ($sync, $oid) = @_;
- my $multi_mid = $sync->{multi_mid} //= multi_mid_q_new();
+sub multi_mid_q_push ($$$) {
+ my ($self, $sync, $oid) = @_;
+ my $multi_mid = $sync->{multi_mid} //= PublicInbox::MultiMidQueue->new;
if ($sync->{reindex}) { # no regen on reindex
- $multi_mid->mid_insert($oid);
+ $multi_mid->push_oid($oid, $self);
} else {
my $num = $sync->{regen}--;
die "BUG: ran out of article numbers" if $num <= 0;
- $multi_mid->mid_set($num, $oid);
+ $multi_mid->set_oid($num, $oid, $self);
}
}
@@ -1051,7 +1052,7 @@ sub reindex_oid ($$$$) {
# do not delete from {mm_tmp}, since another
# single-MID message may use it.
} else { # handle them at the end:
- multi_mid_q_push($sync, $oid);
+ multi_mid_q_push($self, $sync, $oid);
}
return;
}
@@ -1352,19 +1353,21 @@ sub index_sync {
}
if (my $multi_mid = delete $sync->{multi_mid}) {
$git //= $self->{-inbox}->git;
- my ($min, $max) = $multi_mid->minmax;
+ my $min = $multi_mid->{min};
+ my $max = $multi_mid->{max};
if ($sync->{reindex}) {
# we may need to create new Message-IDs if mirrors
# were initially indexed with old versions
for (my $i = $max; $i >= $min; $i--) {
- my $oid = $multi_mid->mid_for($i);
+ my $oid;
+ $oid = $multi_mid->get_oid($i, $self) or next;
next unless defined $oid;
reindex_oid_m($self, $sync, $git, $oid);
}
} else { # regen on initial index
for my $num ($min..$max) {
- my $oid = $multi_mid->mid_for($num);
- next unless defined $oid;
+ my $oid;
+ $oid = $multi_mid->get_oid($num, $self) or next;
reindex_oid_m($self, $sync, $git, $oid, $num);
}
}
diff --git a/t/multi-mid.t b/t/multi-mid.t
index df865efb..87240c2c 100644
--- a/t/multi-mid.t
+++ b/t/multi-mid.t
@@ -1,5 +1,6 @@
# Copyright (C) 2020 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
use Test::More;
use PublicInbox::MIME;
use PublicInbox::TestCommon;
@@ -7,6 +8,7 @@ use PublicInbox::InboxWritable;
require_git(2.6);
require_mods(qw(DBD::SQLite));
require PublicInbox::SearchIdx;
+my $delay = $ENV{TEST_DELAY_CONVERT};
my $addr = 'test@example.com';
my $bad = PublicInbox::MIME->new(<<EOF);
@@ -14,14 +16,12 @@ Message-ID: <a\@example.com>
Message-ID: <b\@example.com>
From: a\@example.com
To: $addr
-Date: Fri, 02 Oct 1993 00:00:00 +0000
Subject: bad
EOF
my $good = PublicInbox::MIME->new(<<EOF);
Message-ID: <b\@example.com>
-Date: Fri, 02 Oct 1993 00:00:00 +0000
From: b\@example.com
To: $addr
Subject: good
@@ -37,13 +37,18 @@ for my $order ([$bad, $good], [$good, $bad]) {
indexlevel => 'basic',
-primary_address => $addr,
}, my $creat_opt = {});
+ my @old;
if ('setup v1 inbox') {
my $im = $ibx->importer(0);
- ok($im->add($_), 'added '.$_->header('Subject')) for @$order;
+ for (@$order) {
+ ok($im->add($_), 'added '.$_->header('Subject'));
+ sleep($delay) if $delay;
+ }
$im->done;
my $s = PublicInbox::SearchIdx->new($ibx, 1);
$s->index_sync;
$before = [ $ibx->mm->minmax ];
+ @old = ($ibx->over->get_art(1), $ibx->over->get_art(2));
$ibx->cleanup;
}
my $rdr = { 1 => \(my $out = ''), 2 => \(my $err = '') };
@@ -56,6 +61,22 @@ for my $order ([$bad, $good], [$good, $bad]) {
$ibx->{inboxdir} = "$tmpdir/v2";
is_deeply([$ibx->mm->minmax], $before,
'min, max article numbers unchanged');
+
+ my @v2 = ($ibx->over->get_art(1), $ibx->over->get_art(2));
+ is_deeply(\@v2, \@old, 'v2 conversion times match');
+
+ system(qw(git clone -sq --mirror), "$tmpdir/v2/git/0.git",
+ "$tmpdir/v2-clone/git/0.git") == 0 or die "clone: $?";
+ $cmd = [ '-init', '-V2', 'v2c', "$tmpdir/v2-clone",
+ 'http://example.com/v2c', 'v2c@example.com' ];
+ ok(run_script($cmd, $env), 'init clone');
+ $cmd = [ '-index', "$tmpdir/v2-clone" ];
+ sleep($delay) if $delay;
+ ok(run_script($cmd, $env), 'index the clone');
+ $ibx->cleanup;
+ $ibx->{inboxdir} = "$tmpdir/v2-clone";
+ my @v2c = ($ibx->over->get_art(1), $ibx->over->get_art(2));
+ is_deeply(\@v2c, \@old, 'v2 clone times match');
}
done_testing();
prev parent reply other threads:[~2020-03-20 8:18 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-03-05 3:23 [PATCH] index: use git commit times on missing Date/Received Eric Wong
2020-03-05 5:13 ` Eric Wong
2020-03-20 8:18 ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
2020-03-20 8:18 ` [PATCH 1/9] index: use git commit times on missing Date/Received Eric Wong
2020-03-20 8:18 ` [PATCH 2/9] v2writable: preserve timestamps from import Eric Wong
2020-03-20 8:18 ` [PATCH 3/9] rename PublicInbox::SearchMsg => PublicInbox::Smsg Eric Wong
2020-03-20 8:18 ` [PATCH 4/9] smsg: to_doc_data: use existing fields Eric Wong
2020-03-20 8:18 ` [PATCH 5/9] overidx: parse_references: less error-prone args Eric Wong
2020-03-20 8:18 ` [PATCH 6/9] *idx: pass $smsg in more places instead of many args Eric Wong
2020-03-20 8:18 ` [PATCH 7/9] v2: pass smsg in more places Eric Wong
2020-03-20 8:18 ` [PATCH 8/9] *idx: pass smsg in even " Eric Wong
2020-03-20 8:18 ` Eric Wong [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200320081821.21715-10-e@yhbt.net \
--to=e@yhbt.net \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).