about summary refs log tree commit homepage
path: root/lib/PublicInbox/ExtSearchIdx.pm
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2023-11-16 11:00:20 +0000
committerEric Wong <e@80x24.org>2023-11-16 19:46:54 +0000
commit9005cb3dced86b78715fef0472a83813003e8e0d (patch)
tree45d5c85bd367bb11bce80cc78a2f9a4cb27c2711 /lib/PublicInbox/ExtSearchIdx.pm
parent87981c405d27597a19f34b01a0763aed2be6dcf3 (diff)
downloadpublic-inbox-9005cb3dced86b78715fef0472a83813003e8e0d.tar.gz
Stale entries from newsgroup name changes (including adding
a `publicinbox.<name>.newsgroup' entry when none existed
before) can wreak havoc during a --reindex.  So give users a
hint about running -extindex with --gc to clean up those
stale entries.
Diffstat (limited to 'lib/PublicInbox/ExtSearchIdx.pm')
-rw-r--r--lib/PublicInbox/ExtSearchIdx.pm29
1 files changed, 25 insertions, 4 deletions
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 6856ae66..7b7436ea 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -113,11 +113,30 @@ sub check_batch_limit ($) {
         ${$req->{need_checkpoint}} = 1 if $n >= $self->{batch_bytes};
 }
 
+sub bad_ibx_id ($$;$) { # report a bad/stale $ibx_id through $cb
+        my ($self, $ibx_id, $cb) = @_;
+        my $msg = "E: bad/stale ibx_id=#$ibx_id encountered";
+        my $ekey = $self->{oidx}->dbh->selectrow_array(<<EOM, undef, $ibx_id);
+SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1
+EOM
+        $msg .= " (formerly `$ekey')" if defined $ekey; # name still in `inboxes'
+        $cb //= \&carp; # only warn unless the caller supplied a callback
+        $cb->($msg, "\nE: running $0 --gc may be required");
+}
+
+sub check_xr3 ($$$) { # prune @$xr3 (in place) of rows whose ibx_id lacks a pos
+        my ($self, $id2pos, $xr3) = @_;
+        @$xr3 = grep { # carp returns true, so yield 0 explicitly to drop the row
+                defined($id2pos->{$_->[0]}) or do { bad_ibx_id($self, $_->[0]); 0 }
+        } @$xr3;
+}
+
 sub apply_boost ($$) {
         my ($req, $smsg) = @_;
         my $id2pos = $req->{id2pos}; # index in ibx_sorted
         my $xr3 = $req->{self}->{oidx}->get_xref3($smsg->{num}, 1);
-        @$xr3 = sort {
+        check_xr3($req->{self}, $id2pos, $xr3);
+        @$xr3 = sort { # sort ascending
                 $id2pos->{$a->[0]} <=> $id2pos->{$b->[0]}
                                 ||
                 $a->[1] <=> $b->[1] # break ties with {xnum}
@@ -513,8 +532,9 @@ sub eidx_gc {
 
 sub _ibx_for ($$$) {
         my ($self, $sync, $smsg) = @_;
-        my $ibx_id = delete($smsg->{ibx_id}) // die '{ibx_id} unset';
-        my $pos = $sync->{id2pos}->{$ibx_id} // die "$ibx_id no pos";
+        my $ibx_id = delete($smsg->{ibx_id}) // die 'BUG: {ibx_id} unset';
+        my $pos = $sync->{id2pos}->{$ibx_id} //
+                bad_ibx_id($self, $ibx_id, \&croak); # croaks: stale id is fatal here
         $self->{-ibx_ary_known}->[$pos] //
                 die "BUG: ibx for $smsg->{blob} not mapped"
 }
@@ -657,7 +677,8 @@ BUG? #$docid $smsg->{blob} is not referenced by inboxes during reindex
         # hit the common case in _reindex_finalize without rereading
         # from git (or holding multiple messages in memory).
         my $id2pos = $sync->{id2pos}; # index in ibx_sorted
-        @$xr3 = sort {
+        check_xr3($self, $id2pos, $xr3);
+        @$xr3 = sort { # sort descending
                 $id2pos->{$b->[0]} <=> $id2pos->{$a->[0]}
                                 ||
                 $b->[1] <=> $a->[1] # break ties with {xnum}