user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 9/9] v2: SDBM-based multi Message-ID queue
Date: Fri, 20 Mar 2020 08:18:21 +0000	[thread overview]
Message-ID: <20200320081821.21715-10-e@yhbt.net> (raw)
In-Reply-To: <20200320081821.21715-1-e@yhbt.net>

This lets us store author and committer times for messages
whose indexing is deferred because of ambiguous Message-IDs.
As a result, we can reproducibly reindex such messages using
the git author and commit times in the rare case where a
message lacks Received and/or Date headers while also having
ambiguous Message-IDs.
---
 MANIFEST                         |  1 +
 lib/PublicInbox/MultiMidQueue.pm | 57 ++++++++++++++++++++++++++++++++
 lib/PublicInbox/V2Writable.pm    | 23 +++++++------
 t/multi-mid.t                    | 27 +++++++++++++--
 4 files changed, 95 insertions(+), 13 deletions(-)
 create mode 100644 lib/PublicInbox/MultiMidQueue.pm

diff --git a/MANIFEST b/MANIFEST
index ec80c90f..f077d722 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -130,6 +130,7 @@ lib/PublicInbox/MboxGz.pm
 lib/PublicInbox/MsgIter.pm
 lib/PublicInbox/MsgTime.pm
 lib/PublicInbox/Msgmap.pm
+lib/PublicInbox/MultiMidQueue.pm
 lib/PublicInbox/NNTP.pm
 lib/PublicInbox/NNTPD.pm
 lib/PublicInbox/NNTPdeflate.pm
diff --git a/lib/PublicInbox/MultiMidQueue.pm b/lib/PublicInbox/MultiMidQueue.pm
new file mode 100644
index 00000000..3c28ebbc
--- /dev/null
+++ b/lib/PublicInbox/MultiMidQueue.pm
@@ -0,0 +1,57 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# temporary queue for public-inbox-index to support multi-Message-ID
+# messages on mirrors of v2 inboxes
+package PublicInbox::MultiMidQueue;
+use strict;
+use SDBM_File; # part of Perl standard library
+use Fcntl qw(O_RDWR O_CREAT);
+use File::Temp 0.19 (); # 0.19 for ->newdir
+my %e = (freebsd => 0x100000, linux => 0x80000, openbsd => 0x10000);
+my $O_CLOEXEC = $e{$^O} // 0;
+
+sub new {
+	my ($class) = @_;
+	my $tmpdir = File::Temp->newdir('multi-mid-q-XXXXXX', TMPDIR => 1);
+	my $base = $tmpdir->dirname . '/q';
+	my %sdbm;
+	my $flags = O_RDWR|O_CREAT;
+	if (!tie(%sdbm, 'SDBM_File', $base, $flags|$O_CLOEXEC, 0600)) {
+		if (!tie(%sdbm, 'SDBM_File', $base, $flags, 0600)) {
+			die "could not tie ($base): $!";
+		}
+		$O_CLOEXEC = 0;
+	}
+
+	bless {
+		cur => 1,
+		min => 1,
+		max => 0,
+		sdbm => \%sdbm,
+		tmpdir => $tmpdir,
+	}, $class;
+}
+
+sub set_oid {
+	my ($self, $i, $oid, $v2w) = @_;
+	$self->{max} = $i if $i > $self->{max};
+	$self->{min} = $i if $i < $self->{min};
+	$self->{sdbm}->{$i} = "$oid\t$v2w->{autime}\t$v2w->{cotime}";
+}
+
+sub get_oid {
+	my ($self, $i, $v2w) = @_;
+	my $rec = $self->{sdbm}->{$i} or return;
+	my ($oid, $autime, $cotime) = split(/\t/, $rec);
+	$v2w->{autime} = $autime;
+	$v2w->{cotime} = $cotime;
+	$oid
+}
+
+sub push_oid {
+	my ($self, $oid, $v2w) = @_;
+	set_oid($self, $self->{cur}++, $oid, $v2w);
+}
+
+1;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index b45d2722..1c78ef24 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -20,6 +20,7 @@ use PublicInbox::Msgmap;
 use PublicInbox::Spawn qw(spawn popen_rd);
 use PublicInbox::SearchIdx;
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+use PublicInbox::MultiMidQueue;
 use IO::Handle; # ->autoflush
 use File::Temp qw(tempfile);
 
@@ -991,15 +992,15 @@ sub multi_mid_q_new () {
 	$multi_mid
 }
 
-sub multi_mid_q_push ($$) {
-	my ($sync, $oid) = @_;
-	my $multi_mid = $sync->{multi_mid} //= multi_mid_q_new();
+sub multi_mid_q_push ($$$) {
+	my ($self, $sync, $oid) = @_;
+	my $multi_mid = $sync->{multi_mid} //= PublicInbox::MultiMidQueue->new;
 	if ($sync->{reindex}) { # no regen on reindex
-		$multi_mid->mid_insert($oid);
+		$multi_mid->push_oid($oid, $self);
 	} else {
 		my $num = $sync->{regen}--;
 		die "BUG: ran out of article numbers" if $num <= 0;
-		$multi_mid->mid_set($num, $oid);
+		$multi_mid->set_oid($num, $oid, $self);
 	}
 }
 
@@ -1051,7 +1052,7 @@ sub reindex_oid ($$$$) {
 			# do not delete from {mm_tmp}, since another
 			# single-MID message may use it.
 		} else { # handle them at the end:
-			multi_mid_q_push($sync, $oid);
+			multi_mid_q_push($self, $sync, $oid);
 		}
 		return;
 	}
@@ -1352,19 +1353,21 @@ sub index_sync {
 	}
 	if (my $multi_mid = delete $sync->{multi_mid}) {
 		$git //= $self->{-inbox}->git;
-		my ($min, $max) = $multi_mid->minmax;
+		my $min = $multi_mid->{min};
+		my $max = $multi_mid->{max};
 		if ($sync->{reindex}) {
 			# we may need to create new Message-IDs if mirrors
 			# were initially indexed with old versions
 			for (my $i = $max; $i >= $min; $i--) {
-				my $oid = $multi_mid->mid_for($i);
+				my $oid;
+				$oid = $multi_mid->get_oid($i, $self) or next;
 				next unless defined $oid;
 				reindex_oid_m($self, $sync, $git, $oid);
 			}
 		} else { # regen on initial index
 			for my $num ($min..$max) {
-				my $oid = $multi_mid->mid_for($num);
-				next unless defined $oid;
+				my $oid;
+				$oid = $multi_mid->get_oid($num, $self) or next;
 				reindex_oid_m($self, $sync, $git, $oid, $num);
 			}
 		}
diff --git a/t/multi-mid.t b/t/multi-mid.t
index df865efb..87240c2c 100644
--- a/t/multi-mid.t
+++ b/t/multi-mid.t
@@ -1,5 +1,6 @@
 # Copyright (C) 2020 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
 use Test::More;
 use PublicInbox::MIME;
 use PublicInbox::TestCommon;
@@ -7,6 +8,7 @@ use PublicInbox::InboxWritable;
 require_git(2.6);
 require_mods(qw(DBD::SQLite));
 require PublicInbox::SearchIdx;
+my $delay = $ENV{TEST_DELAY_CONVERT};
 
 my $addr = 'test@example.com';
 my $bad = PublicInbox::MIME->new(<<EOF);
@@ -14,14 +16,12 @@ Message-ID: <a\@example.com>
 Message-ID: <b\@example.com>
 From: a\@example.com
 To: $addr
-Date: Fri, 02 Oct 1993 00:00:00 +0000
 Subject: bad
 
 EOF
 
 my $good = PublicInbox::MIME->new(<<EOF);
 Message-ID: <b\@example.com>
-Date: Fri, 02 Oct 1993 00:00:00 +0000
 From: b\@example.com
 To: $addr
 Subject: good
@@ -37,13 +37,18 @@ for my $order ([$bad, $good], [$good, $bad]) {
 		indexlevel => 'basic',
 		-primary_address => $addr,
 	}, my $creat_opt = {});
+	my @old;
 	if ('setup v1 inbox') {
 		my $im = $ibx->importer(0);
-		ok($im->add($_), 'added '.$_->header('Subject')) for @$order;
+		for (@$order) {
+			ok($im->add($_), 'added '.$_->header('Subject'));
+			sleep($delay) if $delay;
+		}
 		$im->done;
 		my $s = PublicInbox::SearchIdx->new($ibx, 1);
 		$s->index_sync;
 		$before = [ $ibx->mm->minmax ];
+		@old = ($ibx->over->get_art(1), $ibx->over->get_art(2));
 		$ibx->cleanup;
 	}
 	my $rdr = { 1 => \(my $out = ''), 2 => \(my $err = '') };
@@ -56,6 +61,22 @@ for my $order ([$bad, $good], [$good, $bad]) {
 	$ibx->{inboxdir} = "$tmpdir/v2";
 	is_deeply([$ibx->mm->minmax], $before,
 		'min, max article numbers unchanged');
+
+	my @v2 = ($ibx->over->get_art(1), $ibx->over->get_art(2));
+	is_deeply(\@v2, \@old, 'v2 conversion times match');
+
+	system(qw(git clone -sq --mirror), "$tmpdir/v2/git/0.git",
+		"$tmpdir/v2-clone/git/0.git") == 0 or die "clone: $?";
+	$cmd = [ '-init', '-V2', 'v2c', "$tmpdir/v2-clone",
+		'http://example.com/v2c', 'v2c@example.com' ];
+	ok(run_script($cmd, $env), 'init clone');
+	$cmd = [ '-index', "$tmpdir/v2-clone" ];
+	sleep($delay) if $delay;
+	ok(run_script($cmd, $env), 'index the clone');
+	$ibx->cleanup;
+	$ibx->{inboxdir} = "$tmpdir/v2-clone";
+	my @v2c = ($ibx->over->get_art(1), $ibx->over->get_art(2));
+	is_deeply(\@v2c, \@old, 'v2 clone times match');
 }
 
 done_testing();

      parent reply	other threads:[~2020-03-20  8:18 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-03-05  3:23 [PATCH] index: use git commit times on missing Date/Received Eric Wong
2020-03-05  5:13 ` Eric Wong
2020-03-20  8:18   ` [PATCH 0/9] preserve time and date of initial commit Eric Wong
2020-03-20  8:18     ` [PATCH 1/9] index: use git commit times on missing Date/Received Eric Wong
2020-03-20  8:18     ` [PATCH 2/9] v2writable: preserve timestamps from import Eric Wong
2020-03-20  8:18     ` [PATCH 3/9] rename PublicInbox::SearchMsg => PublicInbox::Smsg Eric Wong
2020-03-20  8:18     ` [PATCH 4/9] smsg: to_doc_data: use existing fields Eric Wong
2020-03-20  8:18     ` [PATCH 5/9] overidx: parse_references: less error-prone args Eric Wong
2020-03-20  8:18     ` [PATCH 6/9] *idx: pass $smsg in more places instead of many args Eric Wong
2020-03-20  8:18     ` [PATCH 7/9] v2: pass smsg in more places Eric Wong
2020-03-20  8:18     ` [PATCH 8/9] *idx: pass smsg in even " Eric Wong
2020-03-20  8:18     ` Eric Wong [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200320081821.21715-10-e@yhbt.net \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).