From 9294716ecc685c7b21ec21e2dbbbeb2c8f62c477 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2018 11:43:32 -0500 Subject: Import.pm: Don't assume {in} and {out} always exist While working on one of the tests I did: my $im = PublicInbox::V2Writable->new($ibx, 1); my $im0 = $im->importer(); $im->add($mime); Which resulted in a warning of the use of an undefined value from atfork_child, and the test failing nastily. Inspection of the code reveals this can happen anytime gfi_start has not been called. So just fix atfork_child to skip closing file descriptors that have not yet been setup. Signed-off-by: "Eric W. Biederman" --- lib/PublicInbox/Import.pm | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 4e3b4c55..bfa7a805 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -451,6 +451,7 @@ sub done { sub atfork_child { my ($self) = @_; foreach my $f (qw(in out)) { + next unless defined($self->{$f}); close $self->{$f} or die "failed to close import[$f]: $!\n"; } } -- cgit v1.2.3-24-ge0c7 From ad2080deb8102a75d2f26b448267c209bea4b4e2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2018 11:43:38 -0500 Subject: SearchIdx.pm: Always assign numbers backwards during incremental indexing When walking messages newest to oldest, assigning the larger numbers before smaller numbers ensures older messages get smaller numbers. This leads to the possibility of a msgmap that can be regenerated when needed. Signed-off-by: "Eric W. Biederman" --- lib/PublicInbox/SearchIdx.pm | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 1d259a86..ac821ac0 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -619,23 +619,28 @@ sub _git_log { my ($self, $range) = @_; my $git = $self->{git}; + # Count the new files so they can be added newest to oldest + # and still have numbers increasing from oldest to newest + my $fcount = 0; + # can't use 'rev-list --count' if we use --diff-filter + my $fh = $git->popen(qw(log --pretty=tformat:%h + --no-notes --no-color --no-renames + --diff-filter=AM), $range); + ++$fcount while <$fh>; + my (undef, $max) = $self->{mm}->minmax; + if (index($range, '..') < 0) { - my $regen_max = 0; - # can't use 'rev-list --count' if we use --diff-filter - my $fh = $git->popen(qw(log --pretty=tformat:%h - --no-notes --no-color --no-renames - --diff-filter=AM), $range); - ++$regen_max while <$fh>; - my (undef, $max) = $self->{mm}->minmax; - - if ($max && $max == $regen_max) { + if ($max && $max == $fcount) { # fix up old bugs in full indexes which caused messages to # not appear in Msgmap $self->{regen_up} = $max; } else { # normal regen is for for fresh data - $self->{regen_down} = $regen_max; + $self->{regen_down} = $fcount; } + } else { + # Give oldest messages the smallest numbers + $self->{regen_down} = $max + $fcount; } $git->popen(qw/log --no-notes --no-color --no-renames -- cgit v1.2.3-24-ge0c7 From 5e6d44673ba5e9aaeb6d8e63f27adf542c2760a0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2018 11:43:39 -0500 Subject: Msgmap.pm: Track the largest value of num ever assigned Today the only thing that prevents public-inbox not reusing the message numbers of deleted messages is the sqlite autoincrement magic and that only works part of the time. The new incremental indexing test has revealed areas where today public-inbox does try to reuse numbers of deleted messages. Reusing the message numbers of existing messages is a problem because if a client ever sees messages that are subsequently deleted the client will not see the new messages with their old numbers. In practice this is difficult to trigger because it requires the most recently added message to be removed and have the removal show up in a separate pull request. Still it can happen and it should be handled. Instead of infering the highset number ever used by finding the maximum number in the message map, track the largest number ever assigned directly. Update Msgmap to track this value and update the indexers to use this value. Signed-off-by: "Eric W. Biederman" --- lib/PublicInbox/Msgmap.pm | 23 +++++++++++++++++++++-- lib/PublicInbox/SearchIdx.pm | 8 ++++---- lib/PublicInbox/V2Writable.pm | 4 ++-- 3 files changed, 27 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm index fdc71e46..d474bade 100644 --- a/lib/PublicInbox/Msgmap.pm +++ b/lib/PublicInbox/Msgmap.pm @@ -50,6 +50,10 @@ sub new_file { create_tables($dbh); $dbh->begin_work; $self->created_at(time) unless $self->created_at; + + my (undef, $max) = $self->minmax(); + $max ||= 0; + $self->num_highwater($max); $dbh->commit; } $self; @@ -107,6 +111,17 @@ sub created_at { $self->meta_accessor('created_at', $second); } +sub num_highwater { + my ($self, $num) = @_; + my $high = $self->{num_highwater} ||= + $self->meta_accessor('num_highwater'); + if (defined($num) && (!defined($high) || ($num > $high))) { + $self->{num_highwater} = $num; + $self->meta_accessor('num_highwater', $num); + } + $self->{num_highwater}; +} + sub mid_insert { my ($self, $mid) = @_; my $dbh = $self->{dbh}; @@ -114,7 +129,9 @@ sub mid_insert { INSERT OR IGNORE INTO msgmap (mid) VALUES (?) return if $sth->execute($mid) == 0; - $dbh->last_insert_id(undef, undef, 'msgmap', 'num'); + my $num = $dbh->last_insert_id(undef, undef, 'msgmap', 'num'); + $self->num_highwater($num) unless !defined($num); + $num; } sub mid_for { @@ -213,7 +230,9 @@ sub mid_set { $self->{dbh}->prepare( 'INSERT OR IGNORE INTO msgmap (num,mid) VALUES (?,?)'); }; - $sth->execute($num, $mid); + my $result = $sth->execute($num, $mid); + $self->num_highwater($num) if (defined($result) && $result == 1); + $result; } sub DESTROY { diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index ac821ac0..2532c8df 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -627,20 +627,20 @@ sub _git_log { --no-notes --no-color --no-renames --diff-filter=AM), $range); ++$fcount while <$fh>; - my (undef, $max) = $self->{mm}->minmax; + my $high = $self->{mm}->num_highwater; if (index($range, '..') < 0) { - if ($max && $max == $fcount) { + if ($high && $high == $fcount) { # fix up old bugs in full indexes which caused messages to # not appear in Msgmap - $self->{regen_up} = $max; + $self->{regen_up} = $high; } else { # normal regen is for for fresh data $self->{regen_down} = $fcount; } } else { # Give oldest messages the smallest numbers - $self->{regen_down} = $max + $fcount; + $self->{regen_down} = $high + $fcount; } $git->popen(qw/log --no-notes --no-color --no-renames diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 934640eb..c450980c 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -879,9 +879,9 @@ sub index_sync { my $mm_tmp = $self->{mm}->tmp_clone; my $ranges = $opts->{reindex} ? [] : $self->last_commits($epoch_max); - my ($min, $max) = $mm_tmp->minmax; + my $high = $self->{mm}->num_highwater(); my $regen = $self->index_prepare($opts, $epoch_max, $ranges); - $$regen += $max if $max; + $$regen += $high if $high; my $D = {}; # "$mid\0$cid" => $oid my @cmd = qw(log --raw -r --pretty=tformat:%H --no-notes --no-color --no-abbrev --no-renames); -- cgit v1.2.3-24-ge0c7 From 3f189032fb2d7c8a9dda732e0263e697ce1cb0ac Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2018 11:43:42 -0500 Subject: SearchIdx,V2Writeable: Update num_highwater on optimized deletes When performing an incremental index update with index_sync if a message is seen to be both added and deleted update the num_highwater mark even though the message is not otherwise indexed. This ensures index_sync generates the same msgmap no matter which commit it stops at during incremental syncs. Signed-off-by: "Eric W. Biederman" --- lib/PublicInbox/SearchIdx.pm | 3 ++- lib/PublicInbox/V2Writable.pm | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 2532c8df..54f82aa8 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -583,7 +583,8 @@ sub read_log { my $blob = $1; if (delete $D{$blob}) { if (defined $self->{regen_down}) { - $self->{regen_down}--; + my $num = $self->{regen_down}--; + $self->{mm}->num_highwater($num); } next; } diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index c450980c..92d2672c 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -658,7 +658,7 @@ sub mark_deleted { } sub reindex_oid { - my ($self, $mm_tmp, $D, $git, $oid, $regen) = @_; + my ($self, $mm_tmp, $D, $git, $oid, $regen, $reindex) = @_; my $len; my $msgref = $git->cat_file($oid, \$len); my $mime = PublicInbox::MIME->new($$msgref); @@ -700,7 +700,8 @@ sub reindex_oid { if (!defined($mid0) || $del) { if (!defined($mid0) && $del) { # expected for deletes - $$regen--; + $num = $$regen--; + $self->{mm}->num_highwater($num) unless $reindex; return } @@ -877,7 +878,8 @@ sub index_sync { return unless defined $latest; $self->idx_init; # acquire lock my $mm_tmp = $self->{mm}->tmp_clone; - my $ranges = $opts->{reindex} ? [] : $self->last_commits($epoch_max); + my $reindex = $opts->{reindex}; + my $ranges = $reindex ? [] : $self->last_commits($epoch_max); my $high = $self->{mm}->num_highwater(); my $regen = $self->index_prepare($opts, $epoch_max, $ranges); @@ -903,7 +905,7 @@ sub index_sync { chomp($cmt = $_); } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) { $self->reindex_oid($mm_tmp, $D, $git, $1, - $regen); + $regen, $reindex); } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) { $self->mark_deleted($D, $git, $1); } -- cgit v1.2.3-24-ge0c7 From 65820d88976bc2fa83e9acd3b460ac304578d40a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2018 11:43:43 -0500 Subject: V2Writeable.pm: Ensure that a found message number is in the msgmap The lookup to see if a num has already been assigned to a message happens in a temporary copy of message map. It is possible that the number has been removed from the current message map. The unindex/reindex after a history rewrite triggered by a purge should be one such case. Therefore add the number to the msgmap in case it is not currently present. Signed-off-by: "Eric W. Biederman" --- lib/PublicInbox/V2Writable.pm | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 92d2672c..4dd14331 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -676,6 +676,7 @@ sub reindex_oid { if (defined $n && $n > $num) { $mid0 = $mid; $num = $n; + $self->{mm}->mid_set($num, $mid0); } } if (!defined($mid0) && $regen && !$del) { -- cgit v1.2.3-24-ge0c7 From 72fa722146912781230c54d7282bf7c1147e0455 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2018 11:43:44 -0500 Subject: V2Writeable.pm: In unindex_oid delete the message from msgmap Now that we track the num highwater mark it is safe to remove messages from msgmap that have been previously allocated. Removing even the highest numbered article will no longer cause new message numbers to move backwards. Signed-off-by: "Eric W. Biederman" --- lib/PublicInbox/V2Writable.pm | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 4dd14331..0396d9f5 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -842,7 +842,10 @@ sub unindex_oid { warn "BUG: multiple articles linked to $oid\n", join(',',sort keys %gone), "\n"; } - $self->{unindexed}->{$_}++ foreach keys %gone; + foreach my $num (keys %gone) { + $self->{unindexed}->{$_}++; + $self->{mm}->num_delete($num); + } $self->unindex_oid_remote($oid, $mid); } } -- cgit v1.2.3-24-ge0c7