user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 5/3] v2writable: use msgmap as multi_mid queue
Date: Tue, 22 Oct 2019 01:29:27 +0000	[thread overview]
Message-ID: <20191022012927.GB7175@dcvr> (raw)
In-Reply-To: <20191022012841.GA7175@dcvr>

Instead of storing Message-IDs in the Msgmap object, we can
store the blob OID.

For initial indexing of mirrors, this lets us preserve
$sync->{regen} by storing the intended article number in
the queue.

On --reindex, the article number we store in Msgmap is ignored
but only used for ordering purposes.

This also allows us to avoid ENOMEM errors if somebody abuses
our system by reusing Message-IDs; but we now risk ENOSPC
instead (but systems tend to have more FS storage than RAM).
---
 lib/PublicInbox/V2Writable.pm | 120 ++++++++++++++++++++++------------
 1 file changed, 79 insertions(+), 41 deletions(-)

diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 33c0038d..ad2e8e62 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -19,6 +19,7 @@ use PublicInbox::Msgmap;
 use PublicInbox::Spawn qw(spawn);
 use PublicInbox::SearchIdx;
 use IO::Handle;
+use File::Temp qw(tempfile);
 
 # an estimate of the post-packed size to the raw uncompressed size
 my $PACKING_FACTOR = 0.4;
@@ -763,7 +764,6 @@ sub import_init {
 # XXX experimental
 sub diff ($$$) {
 	my ($mid, $cur, $new) = @_;
-	use File::Temp qw(tempfile);
 
 	my ($ah, $an) = tempfile('email-cur-XXXXXXXX', TMPDIR => 1);
 	print $ah $cur->as_string or die "print: $!";
@@ -859,8 +859,9 @@ sub reindex_checkpoint ($$$) {
 }
 
 # only for a few odd messages with multiple Message-IDs
-sub reindex_oid_m ($$$$) {
-	my ($self, $sync, $git, $oid) = @_;
+sub reindex_oid_m ($$$$;$) {
+	my ($self, $sync, $git, $oid, $regen_num) = @_;
+	$self->{current_info} = "multi_mid $oid";
 	my ($num, $mid0, $len);
 	my $msgref = $git->cat_file($oid, \$len);
 	my $mime = PublicInbox::MIME->new($$msgref);
@@ -872,49 +873,51 @@ sub reindex_oid_m ($$$$) {
 		delete($sync->{D}->{"$mid\0$cid"}) and
 			die "BUG: reindex_oid should handle <$mid> delete";
 	}
+	my $over = $self->{over};
 	for my $mid (reverse @$mids) {
-		($num, $mid0) = $self->{over}->num_mid0_for_oid($oid, $mid);
-		last if defined $num;
+		($num, $mid0) = $over->num_mid0_for_oid($oid, $mid);
+		next unless defined $num;
+		if (defined($regen_num) && $regen_num != $num) {
+			die "BUG: regen(#$regen_num) != over(#$num)";
+		}
 	}
 	unless (defined($num)) {
 		for my $mid (reverse @$mids) {
 			# is this a number we got before?
-			$num = $sync->{mm_tmp}->num_for($mid);
-			next unless defined $num;
-			$mid0 = $mid;
+			my $n = $sync->{mm_tmp}->num_for($mid);
+			next unless defined $n;
+			next if defined($regen_num) && $regen_num != $n;
+			($num, $mid0) = ($n, $mid);
 			last;
 		}
 	}
 	if (defined($num)) {
 		$sync->{mm_tmp}->num_delete($num);
-	} else {
-		$num = $sync->{regen}--;
-		if ($num <= 0) {
-			# fixup bugs in old mirrors on reindex
-			for my $mid (reverse @$mids) {
-				$num = $self->{mm}->mid_insert($mid);
-				next unless defined $num;
-				$mid0 = $mid;
-				last;
-			}
-			if (defined $mid0) {
-				if ($sync->{reindex}) {
-					warn "reindex added #$num <$mid0>\n";
-				}
-			} else {
-				warn "E: cannot find article #\n";
-				return;
-			}
-		} else { # $num > 0, use the new article number
-			for my $mid (reverse @$mids) {
-				$self->{mm}->mid_set($num, $mid) == 1 or next;
-				$mid0 = $mid;
-				last;
-			}
-			unless (defined $mid0) {
-				warn "E: cannot regen #$num\n";
-				return;
+	} elsif (defined $regen_num) {
+		$num = $regen_num;
+		for my $mid (reverse @$mids) {
+			$self->{mm}->mid_set($num, $mid) == 1 or next;
+			$mid0 = $mid;
+			last;
+		}
+		unless (defined $mid0) {
+			warn "E: cannot regen #$num\n";
+			return;
+		}
+	} else { # fixup bugs in old mirrors on reindex
+		for my $mid (reverse @$mids) {
+			$num = $self->{mm}->mid_insert($mid);
+			next unless defined $num;
+			$mid0 = $mid;
+			last;
+		}
+		if (defined $mid0) {
+			if ($sync->{reindex}) {
+				warn "reindex added #$num <$mid0>\n";
 			}
+		} else {
+			warn "E: cannot find article #\n";
+			return;
 		}
 	}
 	$sync->{nr}++;
@@ -935,6 +938,30 @@ sub check_unindexed ($$$) {
 	}
 }
 
+# reuse Msgmap to store num => oid mapping (rather than num => mid)
+sub multi_mid_q_new () {
+	my ($fh, $fn) = tempfile('multi_mid-XXXXXXX', EXLOCK => 0, TMPDIR => 1);
+	my $multi_mid = PublicInbox::Msgmap->new_file($fn, 1);
+	$multi_mid->{dbh}->do('PRAGMA synchronous = OFF');
+	# for Msgmap->DESTROY:
+	$multi_mid->{tmp_name} = $fn;
+	$multi_mid->{pid} = $$;
+	close $fh or die "failed to close $fn: $!";
+	$multi_mid
+}
+
+sub multi_mid_q_push ($$) {
+	my ($sync, $oid) = @_;
+	my $multi_mid = $sync->{multi_mid} //= multi_mid_q_new();
+	if ($sync->{reindex}) { # no regen on reindex
+		$multi_mid->mid_insert($oid);
+	} else {
+		my $num = $sync->{regen}--;
+		die "BUG: ran out of article numbers" if $num <= 0;
+		$multi_mid->mid_set($num, $oid);
+	}
+}
+
 sub reindex_oid ($$$$) {
 	my ($self, $sync, $git, $oid) = @_;
 	my ($num, $mid0, $len);
@@ -966,7 +993,7 @@ sub reindex_oid ($$$$) {
 			check_unindexed($self, $num, $mid0);
 		} else {
 			$num = $sync->{regen}--;
-			die "BUG: ran out of article numbers\n" if $num <= 0;
+			die "BUG: ran out of article numbers" if $num <= 0;
 			if ($self->{mm}->mid_set($num, $mid) != 1) {
 				warn "E: unable to assign $num => <$mid>\n";
 				return;
@@ -983,7 +1010,7 @@ sub reindex_oid ($$$$) {
 			# do not delete from {mm_tmp}, since another
 			# single-MID message may use it.
 		} else { # handle them at the end:
-			push @{$sync->{multi_mid} //= []}, $oid;
+			multi_mid_q_push($sync, $oid);
 		}
 		return;
 	}
@@ -1275,10 +1302,21 @@ sub index_sync {
 	}
 	if (my $multi_mid = delete $sync->{multi_mid}) {
 		$git //= $self->{-inbox}->git;
-
-		while (defined(my $oid = pop(@$multi_mid))) {
-			$self->{current_info} = "multi_mid $oid";
-			reindex_oid_m($self, $sync, $git, $oid);
+		my ($min, $max) = $multi_mid->minmax;
+		if ($sync->{reindex}) {
+			# we may need to create new Message-IDs if mirrors
+			# were initially indexed with old versions
+			for (my $i = $max; $i >= $min; $i--) {
+				my $oid = $multi_mid->mid_for($i);
+				next unless defined $oid;
+				reindex_oid_m($self, $sync, $git, $oid);
+			}
+		} else { # regen on initial index
+			for my $num ($min..$max) {
+				my $oid = $multi_mid->mid_for($num);
+				next unless defined $oid;
+				reindex_oid_m($self, $sync, $git, $oid, $num);
+			}
 		}
 	}
 	$git->cleanup if $git;

  reply	other threads:[~2019-10-22  1:29 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-10-16 21:14 BUG: --reindex broken on multiple Message-IDs reuse Eric Wong
2019-10-17 11:22 ` [RFC] v2writable: reindex handles 3-headed monsters [was: BUG: --reindex broken on multiple Message-IDs reuse] Eric Wong
2019-10-22  8:09   ` [RFC/HELP] search: multiple From/To/Cc/Subject (what about Date?) Eric Wong
2019-10-21 11:02 ` [PATCH 0/3] fix reindex on multiple + overlapping Message-IDs Eric Wong
2019-10-21 11:02   ` [PATCH 1/3] v2writable: set unindexed article number Eric Wong
2019-10-21 11:02   ` [PATCH 2/3] v2writable: improve "num_for" API and disambiguate Eric Wong
2019-10-21 11:02   ` [PATCH 3/3] v2writable: reindex handles 3-headered monsters Eric Wong
2019-10-21 11:34     ` Eric Wong
2019-10-22  1:28     ` [PATCH 4/3] v2writable: move git->cleanup to the correct place Eric Wong
2019-10-22  1:29       ` Eric Wong [this message]
2019-10-23 18:19   ` [PUSHED] fix reindex on multiple + overlapping Message-IDs Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20191022012927.GB7175@dcvr \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).