From ad2080deb8102a75d2f26b448267c209bea4b4e2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2018 11:43:38 -0500 Subject: SearchIdx.pm: Always assign numbers backwards during incremental indexing When walking messages newest to oldest, assigning the larger numbers before smaller numbers ensures older messages get smaller numbers. This leads to the possibility of a msgmap that can be regenerated when needed. Signed-off-by: "Eric W. Biederman" --- lib/PublicInbox/SearchIdx.pm | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'lib/PublicInbox/SearchIdx.pm') diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 1d259a86..ac821ac0 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -619,23 +619,28 @@ sub _git_log { my ($self, $range) = @_; my $git = $self->{git}; + # Count the new files so they can be added newest to oldest + # and still have numbers increasing from oldest to newest + my $fcount = 0; + # can't use 'rev-list --count' if we use --diff-filter + my $fh = $git->popen(qw(log --pretty=tformat:%h + --no-notes --no-color --no-renames + --diff-filter=AM), $range); + ++$fcount while <$fh>; + my (undef, $max) = $self->{mm}->minmax; + if (index($range, '..') < 0) { - my $regen_max = 0; - # can't use 'rev-list --count' if we use --diff-filter - my $fh = $git->popen(qw(log --pretty=tformat:%h - --no-notes --no-color --no-renames - --diff-filter=AM), $range); - ++$regen_max while <$fh>; - my (undef, $max) = $self->{mm}->minmax; - - if ($max && $max == $regen_max) { + if ($max && $max == $fcount) { # fix up old bugs in full indexes which caused messages to # not appear in Msgmap $self->{regen_up} = $max; } else { # normal regen is for for fresh data - $self->{regen_down} = $regen_max; + $self->{regen_down} = $fcount; } + } else { + # Give oldest messages the smallest numbers + $self->{regen_down} = $max + $fcount; } $git->popen(qw/log --no-notes --no-color --no-renames -- cgit v1.2.3-24-ge0c7 From 5e6d44673ba5e9aaeb6d8e63f27adf542c2760a0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2018 11:43:39 -0500 Subject: Msgmap.pm: Track the largest value of num ever assigned Today the only thing that prevents public-inbox not reusing the message numbers of deleted messages is the sqlite autoincrement magic and that only works part of the time. The new incremental indexing test has revealed areas where today public-inbox does try to reuse numbers of deleted messages. Reusing the message numbers of existing messages is a problem because if a client ever sees messages that are subsequently deleted the client will not see the new messages with their old numbers. In practice this is difficult to trigger because it requires the most recently added message to be removed and have the removal show up in a separate pull request. Still it can happen and it should be handled. Instead of infering the highset number ever used by finding the maximum number in the message map, track the largest number ever assigned directly. Update Msgmap to track this value and update the indexers to use this value. Signed-off-by: "Eric W. Biederman" --- lib/PublicInbox/SearchIdx.pm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'lib/PublicInbox/SearchIdx.pm') diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index ac821ac0..2532c8df 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -627,20 +627,20 @@ sub _git_log { --no-notes --no-color --no-renames --diff-filter=AM), $range); ++$fcount while <$fh>; - my (undef, $max) = $self->{mm}->minmax; + my $high = $self->{mm}->num_highwater; if (index($range, '..') < 0) { - if ($max && $max == $fcount) { + if ($high && $high == $fcount) { # fix up old bugs in full indexes which caused messages to # not appear in Msgmap - $self->{regen_up} = $max; + $self->{regen_up} = $high; } else { # normal regen is for for fresh data $self->{regen_down} = $fcount; } } else { # Give oldest messages the smallest numbers - $self->{regen_down} = $max + $fcount; + $self->{regen_down} = $high + $fcount; } $git->popen(qw/log --no-notes --no-color --no-renames -- cgit v1.2.3-24-ge0c7 From 3f189032fb2d7c8a9dda732e0263e697ce1cb0ac Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2018 11:43:42 -0500 Subject: SearchIdx,V2Writeable: Update num_highwater on optimized deletes When performing an incremental index update with index_sync if a message is seen to be both added and deleted update the num_highwater mark even though the message is not otherwise indexed. This ensures index_sync generates the same msgmap no matter which commit it stops at during incremental syncs. Signed-off-by: "Eric W. Biederman" --- lib/PublicInbox/SearchIdx.pm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib/PublicInbox/SearchIdx.pm') diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 2532c8df..54f82aa8 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -583,7 +583,8 @@ sub read_log { my $blob = $1; if (delete $D{$blob}) { if (defined $self->{regen_down}) { - $self->{regen_down}--; + my $num = $self->{regen_down}--; + $self->{mm}->num_highwater($num); } next; } -- cgit v1.2.3-24-ge0c7