From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF, T_SCC_BODY_TEXT_LINE shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id E2C921F406 for ; Thu, 16 Nov 2023 11:00:20 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1700132421; bh=VRl/ebgFUQ44O3KWH9UB/WNfftvQLMdKkU8XXaoDHLk=; h=From:To:Subject:Date:From; b=uUvZqamI8LDWjBiW/Tb8whVuYPLtK6rOjBHg47j6J70g9FBgEg7gVJ4Ek/urbb3rg 9Rk8ecq3BVg+pInlJ1wn8HOREeWdrOdjmGYGYBdSAL8vNF4xthZBe/xPoMgeNrLEZF l1d57jOEiJf20kXpQhmZf1FFq3Z1Z8979Z2/jrzU= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] extindex: warn and hint about --gc on bad ibx_id Date: Thu, 16 Nov 2023 11:00:20 +0000 Message-Id: <20231116110020.1224857-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Stale entries from newsgroup name changes (including adding a `publicinbox.<name>.newsgroup' entry when none existed before) can wreak havoc during a --reindex. So give the hint to users about running -extindex with --gc to clean up stale entries. --- Documentation/public-inbox-extindex.pod | 5 +++-- lib/PublicInbox/ExtSearchIdx.pm | 29 +++++++++++++++++++++---- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/Documentation/public-inbox-extindex.pod b/Documentation/public-inbox-extindex.pod index fbb12fe9..be4ea4de 100644 --- a/Documentation/public-inbox-extindex.pod +++ b/Documentation/public-inbox-extindex.pod @@ -50,8 +50,9 @@ significant space savings on Xapian indices. =item --gc Perform garbage collection instead of indexing. Use this if -inboxes are removed from the extindex, or if messages are -purged or removed from some inboxes. 
+inboxes are removed from the extindex, a newsgroup name is +set or changed, or if messages are purged or removed from +some inboxes. =item --reindex diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 6856ae66..7b7436ea 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -113,11 +113,30 @@ sub check_batch_limit ($) { ${$req->{need_checkpoint}} = 1 if $n >= $self->{batch_bytes}; } +sub bad_ibx_id ($$;$) { + my ($self, $ibx_id, $cb) = @_; + my $msg = "E: bad/stale ibx_id=#$ibx_id encountered"; + my $ekey = $self->{oidx}->dbh->selectrow_array(<($msg, "\nE: running $0 --gc may be required"); +} + +sub check_xr3 ($$$) { + my ($self, $id2pos, $xr3) = @_; + @$xr3 = grep { + defined($id2pos->{$_->[0]}) ? 1 : bad_ibx_id($self, $_->[0]) + } @$xr3; +} + sub apply_boost ($$) { my ($req, $smsg) = @_; my $id2pos = $req->{id2pos}; # index in ibx_sorted my $xr3 = $req->{self}->{oidx}->get_xref3($smsg->{num}, 1); - @$xr3 = sort { + check_xr3($req->{self}, $id2pos, $xr3); + @$xr3 = sort { # sort ascending $id2pos->{$a->[0]} <=> $id2pos->{$b->[0]} || $a->[1] <=> $b->[1] # break ties with {xnum} @@ -513,8 +532,9 @@ sub eidx_gc { sub _ibx_for ($$$) { my ($self, $sync, $smsg) = @_; - my $ibx_id = delete($smsg->{ibx_id}) // die '{ibx_id} unset'; - my $pos = $sync->{id2pos}->{$ibx_id} // die "$ibx_id no pos"; + my $ibx_id = delete($smsg->{ibx_id}) // die 'BUG: {ibx_id} unset'; + my $pos = $sync->{id2pos}->{$ibx_id} // + bad_ibx_id($self, $ibx_id, \&croak); $self->{-ibx_ary_known}->[$pos] // die "BUG: ibx for $smsg->{blob} not mapped" } @@ -657,7 +677,8 @@ BUG? #$docid $smsg->{blob} is not referenced by inboxes during reindex # hit the common case in _reindex_finalize without rereading # from git (or holding multiple messages in memory). 
my $id2pos = $sync->{id2pos}; # index in ibx_sorted - @$xr3 = sort { + check_xr3($self, $id2pos, $xr3); + @$xr3 = sort { # sort descending $id2pos->{$b->[0]} <=> $id2pos->{$a->[0]} || $b->[1] <=> $a->[1] # break ties with {xnum}