From c7acdfe78bda5bf36660a699e882e0e3c431a351 Mon Sep 17 00:00:00 2001
From: Eric Wong
Date: Fri, 20 Mar 2020 08:18:21 +0000
Subject: v2: SDBM-based multi Message-ID queue

This lets us store author and committer times for deferred
indexing messages with ambiguous Message-IDs. This allows us to
reproducibly reindex messages with the git commit and author
times when a rare message lacks Received and/or Date headers
while having ambiguous Message-IDs.
---
 lib/PublicInbox/MultiMidQueue.pm | 57 ++++++++++++++++++++++++++++++++++++++++
 lib/PublicInbox/V2Writable.pm    | 23 +++++++++-------
 2 files changed, 70 insertions(+), 10 deletions(-)
 create mode 100644 lib/PublicInbox/MultiMidQueue.pm

(limited to 'lib')

diff --git a/lib/PublicInbox/MultiMidQueue.pm b/lib/PublicInbox/MultiMidQueue.pm
new file mode 100644
index 00000000..3c28ebbc
--- /dev/null
+++ b/lib/PublicInbox/MultiMidQueue.pm
@@ -0,0 +1,57 @@
+# Copyright (C) 2020 all contributors
+# License: AGPL-3.0+
+
+# temporary queue for public-inbox-index to support multi-Message-ID
+# messages on mirrors of v2 inboxes
+package PublicInbox::MultiMidQueue;
+use strict;
+use SDBM_File; # part of Perl standard library
+use Fcntl qw(O_RDWR O_CREAT);
+use File::Temp 0.19 (); # 0.19 for ->newdir
+my %e = (freebsd => 0x100000, linux => 0x80000, openbsd => 0x10000);
+my $O_CLOEXEC = $e{$^O} // 0;
+
+sub new {
+	my ($class) = @_;
+	my $tmpdir = File::Temp->newdir('multi-mid-q-XXXXXX', TMPDIR => 1);
+	my $base = $tmpdir->dirname . '/q';
+	my %sdbm;
+	my $flags = O_RDWR|O_CREAT;
+	if (!tie(%sdbm, 'SDBM_File', $base, $flags|$O_CLOEXEC, 0600)) {
+		if (!tie(%sdbm, 'SDBM_File', $base, $flags, 0600)) {
+			die "could not tie ($base): $!";
+		}
+		$O_CLOEXEC = 0;
+	}
+
+	bless {
+		cur => 1,
+		min => 1,
+		max => 0,
+		sdbm => \%sdbm,
+		tmpdir => $tmpdir,
+	}, $class;
+}
+
+sub set_oid {
+	my ($self, $i, $oid, $v2w) = @_;
+	$self->{max} = $i if $i > $self->{max};
+	$self->{min} = $i if $i < $self->{min};
+	$self->{sdbm}->{$i} = "$oid\t$v2w->{autime}\t$v2w->{cotime}";
+}
+
+sub get_oid {
+	my ($self, $i, $v2w) = @_;
+	my $rec = $self->{sdbm}->{$i} or return;
+	my ($oid, $autime, $cotime) = split(/\t/, $rec);
+	$v2w->{autime} = $autime;
+	$v2w->{cotime} = $cotime;
+	$oid
+}
+
+sub push_oid {
+	my ($self, $oid, $v2w) = @_;
+	set_oid($self, $self->{cur}++, $oid, $v2w);
+}
+
+1;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index b45d2722..1c78ef24 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -20,6 +20,7 @@ use PublicInbox::Msgmap;
 use PublicInbox::Spawn qw(spawn popen_rd);
 use PublicInbox::SearchIdx;
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
+use PublicInbox::MultiMidQueue;
 use IO::Handle; # ->autoflush
 use File::Temp qw(tempfile);
 
@@ -991,15 +992,15 @@ sub multi_mid_q_new () {
 	$multi_mid
 }
 
-sub multi_mid_q_push ($$) {
-	my ($sync, $oid) = @_;
-	my $multi_mid = $sync->{multi_mid} //= multi_mid_q_new();
+sub multi_mid_q_push ($$$) {
+	my ($self, $sync, $oid) = @_;
+	my $multi_mid = $sync->{multi_mid} //= PublicInbox::MultiMidQueue->new;
 	if ($sync->{reindex}) { # no regen on reindex
-		$multi_mid->mid_insert($oid);
+		$multi_mid->push_oid($oid, $self);
 	} else {
 		my $num = $sync->{regen}--;
 		die "BUG: ran out of article numbers" if $num <= 0;
-		$multi_mid->mid_set($num, $oid);
+		$multi_mid->set_oid($num, $oid, $self);
 	}
 }
 
@@ -1051,7 +1052,7 @@ sub reindex_oid ($$$$) {
 			# do not delete from {mm_tmp}, since another
 			# single-MID message may use it.
 		} else { # handle them at the end:
-			multi_mid_q_push($sync, $oid);
+			multi_mid_q_push($self, $sync, $oid);
 		}
 		return;
 	}
@@ -1352,19 +1353,21 @@ sub index_sync {
 	}
 	if (my $multi_mid = delete $sync->{multi_mid}) {
 		$git //= $self->{-inbox}->git;
-		my ($min, $max) = $multi_mid->minmax;
+		my $min = $multi_mid->{min};
+		my $max = $multi_mid->{max};
 		if ($sync->{reindex}) {
 			# we may need to create new Message-IDs if mirrors
 			# were initially indexed with old versions
 			for (my $i = $max; $i >= $min; $i--) {
-				my $oid = $multi_mid->mid_for($i);
+				my $oid;
+				$oid = $multi_mid->get_oid($i, $self) or next;
 				next unless defined $oid;
 				reindex_oid_m($self, $sync, $git, $oid);
 			}
 		} else { # regen on initial index
 			for my $num ($min..$max) {
-				my $oid = $multi_mid->mid_for($num);
-				next unless defined $oid;
+				my $oid;
+				$oid = $multi_mid->get_oid($num, $self) or next;
 				reindex_oid_m($self, $sync, $git, $oid, $num);
 			}
 		}
-- 
cgit v1.2.3-24-ge0c7