about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2018-08-03 20:05:24 +0000
committerEric Wong <e@80x24.org>2018-08-03 20:05:24 +0000
commit861bec7bec5908871e5b0ede244cb1e990a47403 (patch)
tree8d116f0c9ad6a3af4d1b4d4041c2be5bbdf42065 /lib
parent7808b18c63f9d754a56ad7b2bd2385545d3521fb (diff)
parent72fa722146912781230c54d7282bf7c1147e0455 (diff)
downloadpublic-inbox-861bec7bec5908871e5b0ede244cb1e990a47403.tar.gz
Incremental indexing fixes from Eric W. Biederman.

These prevent the highest message number in msgmap from
being reassigned after deletes in rare cases and ensures
messages are deleted from msgmap in v2.

* eb/index-incremental:
  V2Writeable.pm: In unindex_oid delete the message from msgmap
  V2Writeable.pm: Ensure that a found message number is in the msgmap
  SearchIdx,V2Writeable: Update num_highwater on optimized deletes
  t/v[12]reindex.t: Verify the num highwater is as expected
  t/v[12]reindex.t Verify num_highwater
  Msgmap.pm: Track the largest value of num ever assigned
  SearchIdx.pm: Always assign numbers backwards during incremental indexing
  t/v[12]reindex.t: Test incremental indexing works
  t/v[12]reindex.t: Test that the resulting msgmap is as expected
  t/v[12]reindex.t: Place expected second in Xapian tests
  t/v2reindex.t: Isolate the test cases more
  t/v1reindex.t: Isolate the test cases
  Import.pm: Don't assume {in} and {out} always exist
Diffstat (limited to 'lib')
-rw-r--r--lib/PublicInbox/Import.pm1
-rw-r--r--lib/PublicInbox/Msgmap.pm23
-rw-r--r--lib/PublicInbox/SearchIdx.pm30
-rw-r--r--lib/PublicInbox/V2Writable.pm20
4 files changed, 53 insertions, 21 deletions
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 4e3b4c55..bfa7a805 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -451,6 +451,7 @@ sub done {
 sub atfork_child {
         my ($self) = @_;
         foreach my $f (qw(in out)) {
+                next unless defined($self->{$f});
                 close $self->{$f} or die "failed to close import[$f]: $!\n";
         }
 }
diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index fdc71e46..d474bade 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -50,6 +50,10 @@ sub new_file {
                 create_tables($dbh);
                 $dbh->begin_work;
                 $self->created_at(time) unless $self->created_at;
+
+                my (undef, $max) = $self->minmax();
+                $max ||= 0;
+                $self->num_highwater($max);
                 $dbh->commit;
         }
         $self;
@@ -107,6 +111,17 @@ sub created_at {
         $self->meta_accessor('created_at', $second);
 }
 
+sub num_highwater {
+        my ($self, $num) = @_;
+        my $high = $self->{num_highwater} ||=
+            $self->meta_accessor('num_highwater');
+        if (defined($num) && (!defined($high) || ($num > $high))) {
+                $self->{num_highwater} = $num;
+                $self->meta_accessor('num_highwater', $num);
+        }
+        $self->{num_highwater};
+}
+
 sub mid_insert {
         my ($self, $mid) = @_;
         my $dbh = $self->{dbh};
@@ -114,7 +129,9 @@ sub mid_insert {
 INSERT OR IGNORE INTO msgmap (mid) VALUES (?)
 
         return if $sth->execute($mid) == 0;
-        $dbh->last_insert_id(undef, undef, 'msgmap', 'num');
+        my $num = $dbh->last_insert_id(undef, undef, 'msgmap', 'num');
+        $self->num_highwater($num) unless !defined($num);
+        $num;
 }
 
 sub mid_for {
@@ -213,7 +230,9 @@ sub mid_set {
                 $self->{dbh}->prepare(
                         'INSERT OR IGNORE INTO msgmap (num,mid) VALUES (?,?)');
         };
-        $sth->execute($num, $mid);
+        my $result = $sth->execute($num, $mid);
+        $self->num_highwater($num) if (defined($result) && $result == 1);
+        $result;
 }
 
 sub DESTROY {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 29868d99..ca832ad3 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -582,7 +582,8 @@ sub read_log {
                         my $blob = $1;
                         if (delete $D{$blob}) {
                                 if (defined $self->{regen_down}) {
-                                        $self->{regen_down}--;
+                                        my $num = $self->{regen_down}--;
+                                        $self->{mm}->num_highwater($num);
                                 }
                                 next;
                         }
@@ -618,23 +619,28 @@ sub _git_log {
         my ($self, $range) = @_;
         my $git = $self->{git};
 
+        # Count the new files so they can be added newest to oldest
+        # and still have numbers increasing from oldest to newest
+        my $fcount = 0;
+        # can't use 'rev-list --count' if we use --diff-filter
+        my $fh = $git->popen(qw(log --pretty=tformat:%h
+                             --no-notes --no-color --no-renames
+                             --diff-filter=AM), $range);
+        ++$fcount while <$fh>;
+        my $high = $self->{mm}->num_highwater;
+
         if (index($range, '..') < 0) {
-                my $regen_max = 0;
-                # can't use 'rev-list --count' if we use --diff-filter
-                my $fh = $git->popen(qw(log --pretty=tformat:%h
-                                --no-notes --no-color --no-renames
-                                --diff-filter=AM), $range);
-                ++$regen_max while <$fh>;
-                my (undef, $max) = $self->{mm}->minmax;
-
-                if ($max && $max == $regen_max) {
+                if ($high && $high == $fcount) {
                         # fix up old bugs in full indexes which caused messages to
                         # not appear in Msgmap
-                        $self->{regen_up} = $max;
+                        $self->{regen_up} = $high;
                 } else {
                         # normal regen is for for fresh data
-                        $self->{regen_down} = $regen_max;
+                        $self->{regen_down} = $fcount;
                 }
+        } else {
+                # Give oldest messages the smallest numbers
+                $self->{regen_down} = $high + $fcount;
         }
 
         $git->popen(qw/log --no-notes --no-color --no-renames
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 934640eb..0396d9f5 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -658,7 +658,7 @@ sub mark_deleted {
 }
 
 sub reindex_oid {
-        my ($self, $mm_tmp, $D, $git, $oid, $regen) = @_;
+        my ($self, $mm_tmp, $D, $git, $oid, $regen, $reindex) = @_;
         my $len;
         my $msgref = $git->cat_file($oid, \$len);
         my $mime = PublicInbox::MIME->new($$msgref);
@@ -676,6 +676,7 @@ sub reindex_oid {
                 if (defined $n && $n > $num) {
                         $mid0 = $mid;
                         $num = $n;
+                        $self->{mm}->mid_set($num, $mid0);
                 }
         }
         if (!defined($mid0) && $regen && !$del) {
@@ -700,7 +701,8 @@ sub reindex_oid {
 
         if (!defined($mid0) || $del) {
                 if (!defined($mid0) && $del) { # expected for deletes
-                        $$regen--;
+                        $num = $$regen--;
+                        $self->{mm}->num_highwater($num) unless $reindex;
                         return
                 }
 
@@ -840,7 +842,10 @@ sub unindex_oid {
                         warn "BUG: multiple articles linked to $oid\n",
                                 join(',',sort keys %gone), "\n";
                 }
-                $self->{unindexed}->{$_}++ foreach keys %gone;
+                foreach my $num (keys %gone) {
+                        $self->{unindexed}->{$_}++;
+                        $self->{mm}->num_delete($num);
+                }
                 $self->unindex_oid_remote($oid, $mid);
         }
 }
@@ -877,11 +882,12 @@ sub index_sync {
         return unless defined $latest;
         $self->idx_init; # acquire lock
         my $mm_tmp = $self->{mm}->tmp_clone;
-        my $ranges = $opts->{reindex} ? [] : $self->last_commits($epoch_max);
+        my $reindex = $opts->{reindex};
+        my $ranges = $reindex ? [] : $self->last_commits($epoch_max);
 
-        my ($min, $max) = $mm_tmp->minmax;
+        my $high = $self->{mm}->num_highwater();
         my $regen = $self->index_prepare($opts, $epoch_max, $ranges);
-        $$regen += $max if $max;
+        $$regen += $high if $high;
         my $D = {}; # "$mid\0$cid" => $oid
         my @cmd = qw(log --raw -r --pretty=tformat:%H
                         --no-notes --no-color --no-abbrev --no-renames);
@@ -903,7 +909,7 @@ sub index_sync {
                                 chomp($cmt = $_);
                         } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) {
                                 $self->reindex_oid($mm_tmp, $D, $git, $1,
-                                                $regen);
+                                                $regen, $reindex);
                         } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) {
                                 $self->mark_deleted($D, $git, $1);
                         }