From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 246561FA12 for ; Thu, 17 Jun 2021 22:00:48 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 3/3] lei/store: cull redundant docids based on blob OID Date: Thu, 17 Jun 2021 22:00:47 +0000 Message-Id: <20210617220047.11225-4-e@80x24.org> In-Reply-To: <20210617220047.11225-1-e@80x24.org> References: <20210617220047.11225-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: I'm not sure how a single blob OID ended up indexed under multiple docids (it happened only once for me, in March), but it should not happen. In any case, we'll operate on the lowest-numbered docid and cull the redundant index entries whenever lei/store is open for read-write. This also fixes the normal lei/store removal path to clean up the xref3 table (since that cleanup is not done automatically for public-facing -eidx, due to its multi-list nature). 
--- lib/PublicInbox/LeiStore.pm | 54 +++++++++++++++++++++++------------- lib/PublicInbox/SearchIdx.pm | 2 +- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm index f978288a..4ba1e647 100644 --- a/lib/PublicInbox/LeiStore.pm +++ b/lib/PublicInbox/LeiStore.pm @@ -226,6 +226,18 @@ sub _remove_if_local { # git->cat_async arg $self->{im}->remove($bref) if $bref; } +sub remove_docids ($;@) { + my ($self, @docids) = @_; + my $eidx = eidx_init($self); + for my $docid (@docids) { + $eidx->idx_shard($docid)->ipc_do('xdb_remove', $docid); + $self->{oidx}->delete_by_num($docid); + $self->{oidx}->{dbh}->do(<cat_async($oidhex, \&_remove_if_local, $self); } - $eidx->idx_shard($docid)->ipc_do('xdb_remove', $docid); - $oidx->delete_by_num($docid); } $git->cat_async_wait; + remove_docids($self, @docids); \@docids; } +sub oid2docid ($$) { + my ($self, $oid) = @_; + my $eidx = eidx_init($self); + my ($docid, @cull) = $eidx->{oidx}->blob_exists($oid); + if (@cull) { # fixup old bugs... 
+ warn <{-fake_im} // $self->importer; # may create new epoch @@ -268,7 +292,7 @@ sub add_eml { if (scalar keys %$xoids) { my %docids = map { $_ => 1 } @$vivify_xvmd; for my $oid (keys %$xoids) { - my @id = $oidx->blob_exists($oid); + my @id = oid2docid($self, $oid); @docids{@id} = @id; } @$vivify_xvmd = sort { $a <=> $b } keys(%docids); @@ -356,15 +380,11 @@ sub update_xvmd { my $oidx = $eidx->{oidx}; my %seen; for my $oid (keys %$xoids) { - my @docids = $oidx->blob_exists($oid) or next; - scalar(@docids) > 1 and - warn "W: $oid indexed as multiple docids: @docids\n"; - for my $docid (@docids) { - next if $seen{$docid}++; - my $idx = $eidx->idx_shard($docid); - $idx->ipc_do('update_vmd', $docid, $vmd_mod); - } + my $docid = oid2docid($self, $oid) // next; delete $xoids->{$oid}; + next if $seen{$docid}++; + my $idx = $eidx->idx_shard($docid); + $idx->ipc_do('update_vmd', $docid, $vmd_mod); } return unless scalar(keys(%$xoids)); @@ -395,15 +415,11 @@ sub set_xvmd { # see if we can just update existing docs for my $oid (keys %$xoids) { - my @docids = $oidx->blob_exists($oid) or next; - scalar(@docids) > 1 and - warn "W: $oid indexed as multiple docids: @docids\n"; - for my $docid (@docids) { - next if $seen{$docid}++; - my $idx = $eidx->idx_shard($docid); - $idx->ipc_do('set_vmd', $docid, $vmd); - } + my $docid = oid2docid($self, $oid) // next; delete $xoids->{$oid}; # all done with this oid + next if $seen{$docid}++; + my $idx = $eidx->idx_shard($docid); + $idx->ipc_do('set_vmd', $docid, $vmd); } return unless scalar(keys(%$xoids)); diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index f066cc92..f553eda6 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -572,7 +572,7 @@ sub apply_vmd_mod ($$) { my $updated = 0; my @x = @VMD_MAP; while (my ($field, $pfx) = splice(@x, 0, 2)) { - # field: "label" or "kw" + # field: "L" or "kw" for my $val (@{$vmd_mod->{"-$field"} // []}) { eval { $doc->remove_term($pfx . $val);