user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PUSHED] fix reindex on multiple + overlapping Message-IDs
    @ 2019-10-23 18:19  7%   ` Eric Wong
  1 sibling, 0 replies; 2+ results
From: Eric Wong @ 2019-10-23 18:19 UTC (permalink / raw)
  To: meta

Eric Wong <e@80x24.org> wrote:
> I'm still testing this with various repos, but the
> linux-renesas-soc mirror from lore seems good when doing initial
> indexing with an old version of this and using --reindex with
> this patch, as all previously-missed messages are now indexed.

Seems fine and pushed all 5 patches.  However, display issues
still need to be fixed in the HTML...

> Eric Wong (3):
>   v2writable: set unindexed article number
>   v2writable: improve "num_for" API and disambiguate
>   v2writable: reindex handles 3-headered monsters

      v2writable: move git->cleanup to the correct place
      v2writable: use msgmap as multi_mid queue

^ permalink raw reply	[relevance 7%]

* [PATCH 5/3] v2writable: use msgmap as multi_mid queue
  @ 2019-10-22  1:29  6%       ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2019-10-22  1:29 UTC (permalink / raw)
  To: meta

Instead of storing Message-IDs in the Msgmap object, we can
store the blob OID.

For initial indexing of mirrors, this lets us preserve
$sync->{regen} by storing the intended article number in
the queue.

On --reindex, the article number we store in Msgmap is ignored
but only used for ordering purposes.

This also allows us to avoid ENOMEM errors if somebody abuses
our system by reusing Message-IDs; but we now risk ENOSPC
instead (but systems tend to have more FS storage than RAM).
---
 lib/PublicInbox/V2Writable.pm | 120 ++++++++++++++++++++++------------
 1 file changed, 79 insertions(+), 41 deletions(-)

diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 33c0038d..ad2e8e62 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -19,6 +19,7 @@ use PublicInbox::Msgmap;
 use PublicInbox::Spawn qw(spawn);
 use PublicInbox::SearchIdx;
 use IO::Handle;
+use File::Temp qw(tempfile);
 
 # an estimate of the post-packed size to the raw uncompressed size
 my $PACKING_FACTOR = 0.4;
@@ -763,7 +764,6 @@ sub import_init {
 # XXX experimental
 sub diff ($$$) {
 	my ($mid, $cur, $new) = @_;
-	use File::Temp qw(tempfile);
 
 	my ($ah, $an) = tempfile('email-cur-XXXXXXXX', TMPDIR => 1);
 	print $ah $cur->as_string or die "print: $!";
@@ -859,8 +859,9 @@ sub reindex_checkpoint ($$$) {
 }
 
 # only for a few odd messages with multiple Message-IDs
-sub reindex_oid_m ($$$$) {
-	my ($self, $sync, $git, $oid) = @_;
+sub reindex_oid_m ($$$$;$) {
+	my ($self, $sync, $git, $oid, $regen_num) = @_;
+	$self->{current_info} = "multi_mid $oid";
 	my ($num, $mid0, $len);
 	my $msgref = $git->cat_file($oid, \$len);
 	my $mime = PublicInbox::MIME->new($$msgref);
@@ -872,49 +873,51 @@ sub reindex_oid_m ($$$$) {
 		delete($sync->{D}->{"$mid\0$cid"}) and
 			die "BUG: reindex_oid should handle <$mid> delete";
 	}
+	my $over = $self->{over};
 	for my $mid (reverse @$mids) {
-		($num, $mid0) = $self->{over}->num_mid0_for_oid($oid, $mid);
-		last if defined $num;
+		($num, $mid0) = $over->num_mid0_for_oid($oid, $mid);
+		next unless defined $num;
+		if (defined($regen_num) && $regen_num != $num) {
+			die "BUG: regen(#$regen_num) != over(#$num)";
+		}
 	}
 	unless (defined($num)) {
 		for my $mid (reverse @$mids) {
 			# is this a number we got before?
-			$num = $sync->{mm_tmp}->num_for($mid);
-			next unless defined $num;
-			$mid0 = $mid;
+			my $n = $sync->{mm_tmp}->num_for($mid);
+			next unless defined $n;
+			next if defined($regen_num) && $regen_num != $n;
+			($num, $mid0) = ($n, $mid);
 			last;
 		}
 	}
 	if (defined($num)) {
 		$sync->{mm_tmp}->num_delete($num);
-	} else {
-		$num = $sync->{regen}--;
-		if ($num <= 0) {
-			# fixup bugs in old mirrors on reindex
-			for my $mid (reverse @$mids) {
-				$num = $self->{mm}->mid_insert($mid);
-				next unless defined $num;
-				$mid0 = $mid;
-				last;
-			}
-			if (defined $mid0) {
-				if ($sync->{reindex}) {
-					warn "reindex added #$num <$mid0>\n";
-				}
-			} else {
-				warn "E: cannot find article #\n";
-				return;
-			}
-		} else { # $num > 0, use the new article number
-			for my $mid (reverse @$mids) {
-				$self->{mm}->mid_set($num, $mid) == 1 or next;
-				$mid0 = $mid;
-				last;
-			}
-			unless (defined $mid0) {
-				warn "E: cannot regen #$num\n";
-				return;
+	} elsif (defined $regen_num) {
+		$num = $regen_num;
+		for my $mid (reverse @$mids) {
+			$self->{mm}->mid_set($num, $mid) == 1 or next;
+			$mid0 = $mid;
+			last;
+		}
+		unless (defined $mid0) {
+			warn "E: cannot regen #$num\n";
+			return;
+		}
+	} else { # fixup bugs in old mirrors on reindex
+		for my $mid (reverse @$mids) {
+			$num = $self->{mm}->mid_insert($mid);
+			next unless defined $num;
+			$mid0 = $mid;
+			last;
+		}
+		if (defined $mid0) {
+			if ($sync->{reindex}) {
+				warn "reindex added #$num <$mid0>\n";
 			}
+		} else {
+			warn "E: cannot find article #\n";
+			return;
 		}
 	}
 	$sync->{nr}++;
@@ -935,6 +938,30 @@ sub check_unindexed ($$$) {
 	}
 }
 
+# reuse Msgmap to store num => oid mapping (rather than num => mid)
+sub multi_mid_q_new () {
+	my ($fh, $fn) = tempfile('multi_mid-XXXXXXX', EXLOCK => 0, TMPDIR => 1);
+	my $multi_mid = PublicInbox::Msgmap->new_file($fn, 1);
+	$multi_mid->{dbh}->do('PRAGMA synchronous = OFF');
+	# for Msgmap->DESTROY:
+	$multi_mid->{tmp_name} = $fn;
+	$multi_mid->{pid} = $$;
+	close $fh or die "failed to close $fn: $!";
+	$multi_mid
+}
+
+sub multi_mid_q_push ($$) {
+	my ($sync, $oid) = @_;
+	my $multi_mid = $sync->{multi_mid} //= multi_mid_q_new();
+	if ($sync->{reindex}) { # no regen on reindex
+		$multi_mid->mid_insert($oid);
+	} else {
+		my $num = $sync->{regen}--;
+		die "BUG: ran out of article numbers" if $num <= 0;
+		$multi_mid->mid_set($num, $oid);
+	}
+}
+
 sub reindex_oid ($$$$) {
 	my ($self, $sync, $git, $oid) = @_;
 	my ($num, $mid0, $len);
@@ -966,7 +993,7 @@ sub reindex_oid ($$$$) {
 			check_unindexed($self, $num, $mid0);
 		} else {
 			$num = $sync->{regen}--;
-			die "BUG: ran out of article numbers\n" if $num <= 0;
+			die "BUG: ran out of article numbers" if $num <= 0;
 			if ($self->{mm}->mid_set($num, $mid) != 1) {
 				warn "E: unable to assign $num => <$mid>\n";
 				return;
@@ -983,7 +1010,7 @@ sub reindex_oid ($$$$) {
 			# do not delete from {mm_tmp}, since another
 			# single-MID message may use it.
 		} else { # handle them at the end:
-			push @{$sync->{multi_mid} //= []}, $oid;
+			multi_mid_q_push($sync, $oid);
 		}
 		return;
 	}
@@ -1275,10 +1302,21 @@ sub index_sync {
 	}
 	if (my $multi_mid = delete $sync->{multi_mid}) {
 		$git //= $self->{-inbox}->git;
-
-		while (defined(my $oid = pop(@$multi_mid))) {
-			$self->{current_info} = "multi_mid $oid";
-			reindex_oid_m($self, $sync, $git, $oid);
+		my ($min, $max) = $multi_mid->minmax;
+		if ($sync->{reindex}) {
+			# we may need to create new Message-IDs if mirrors
+			# were initially indexed with old versions
+			for (my $i = $max; $i >= $min; $i--) {
+				my $oid = $multi_mid->mid_for($i);
+				next unless defined $oid;
+				reindex_oid_m($self, $sync, $git, $oid);
+			}
+		} else { # regen on initial index
+			for my $num ($min..$max) {
+				my $oid = $multi_mid->mid_for($num);
+				next unless defined $oid;
+				reindex_oid_m($self, $sync, $git, $oid, $num);
+			}
 		}
 	}
 	$git->cleanup if $git;

^ permalink raw reply related	[relevance 6%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2019-10-16 21:14     BUG: --reindex broken on multiple Message-IDs reuse Eric Wong
2019-10-21 11:02     ` [PATCH 0/3] fix reindex on multiple + overlapping Message-IDs Eric Wong
2019-10-21 11:02       ` [PATCH 3/3] v2writable: reindex handles 3-headered monsters Eric Wong
2019-10-22  1:28         ` [PATCH 4/3] v2writable: move git->cleanup to the correct place Eric Wong
2019-10-22  1:29  6%       ` [PATCH 5/3] v2writable: use msgmap as multi_mid queue Eric Wong
2019-10-23 18:19  7%   ` [PUSHED] fix reindex on multiple + overlapping Message-IDs Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).