diff options
author | Eric Wong <e@80x24.org> | 2020-11-29 05:09:35 +0000 |
---|---|---|
committer | Eric Wong <e@80x24.org> | 2020-11-29 06:42:00 +0000 |
commit | ea885bf0f76dc1795dc771667be721ec0fed5482 (patch) | |
tree | 5323da5926b6aa4c20619e2a05805f581722a389 /lib/PublicInbox/ExtSearchIdx.pm | |
parent | 33f80a367325f3dac983633cb0a59946205776c2 (diff) | |
download | public-inbox-ea885bf0f76dc1795dc771667be721ec0fed5482.tar.gz |
Inboxes may be removed or newsgroups renamed over time. Introduce a switch to do garbage collection and eliminate stale search and xref3 results based on inboxes which remain in the config file. This may also fixup stale results leftover from any bugs which may leave stale data around. This is also useful in case a clumsy BOFH (me :P) is swapping between several PI_CONFIGs and accidentally indexed a bunch of inboxes they didn't intend to.
Diffstat (limited to 'lib/PublicInbox/ExtSearchIdx.pm')
-rw-r--r-- | lib/PublicInbox/ExtSearchIdx.pm | 90 |
1 files changed, 89 insertions, 1 deletions
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index d780776f..4de47b58 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -87,6 +87,7 @@ sub _ibx_attach { # each_inbox callback sub attach_config { my ($self, $cfg) = @_; + $self->{cfg} = $cfg; $cfg->each_inbox(\&_ibx_attach, $self); } @@ -141,7 +142,8 @@ sub do_xpost ($$) { if ($nr == 0) { $idx->shard_remove($oid, $docid); } elsif ($rm_eidx_info) { - $idx->shard_remove_eidx_info($docid, $oid, $xibx, $eml); + $idx->shard_remove_eidx_info($docid, $oid, $eidx_key, + $eml); } } } @@ -324,6 +326,90 @@ sub _sync_inbox ($$$) { $ibx->git->cleanup; # done with this inbox, now } +sub unref_doc ($$$$) { + my ($self, $ibx_id, $eidx_key, $docid) = @_; + my $dbh = $self->{oidx}->dbh; + + # for debug/info purposes, oids may no longer be accessible + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT oidbin FROM xref3 WHERE docid = ? AND ibx_id = ? + + $sth->execute($docid, $ibx_id); + my @oid = map { unpack('H*', $_->[0]) } @{$sth->fetchall_arrayref}; + + $dbh->prepare_cached(<<'')->execute($docid, $ibx_id); +DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? + + my $remain = $self->{oidx}->get_xref3($docid); + my $idx = $self->idx_shard($docid); + if (@$remain) { + for my $oid (@oid) { + warn "I: unref #$docid $eidx_key $oid\n"; + $idx->shard_remove_eidx_info($docid, $oid, $eidx_key); + } + } else { + for my $oid (@oid) { + warn "I: remove #$docid $eidx_key $oid\n"; + $idx->shard_remove($oid, $docid); + } + } +} + +sub eidx_gc { + my ($self, $opt) = @_; + $self->{cfg} or die "E: GC requires ->attach_config\n"; + $opt->{-idx_gc} = 1; + $self->idx_init($opt); # acquire lock via V2Writable::_idx_init + + my $dbh = $self->{oidx}->dbh; + my $x3_doc = $dbh->prepare('SELECT docid FROM xref3 WHERE ibx_id = ?'); + my $ibx_ck = $dbh->prepare('SELECT ibx_id,eidx_key FROM inboxes'); + my $lc_i = $dbh->prepare('SELECT key FROM eidx_meta WHERE key LIKE ?'); + + $ibx_ck->execute; + while (my ($ibx_id, $eidx_key) = $ibx_ck->fetchrow_array) { + next if $self->{ibx_map}->{$eidx_key}; + $self->{midx}->remove_eidx_key($eidx_key); + warn "I: deleting messages for $eidx_key...\n"; + $x3_doc->execute($ibx_id); + while (defined(my $docid = $x3_doc->fetchrow_array)) { + unref_doc($self, $ibx_id, $eidx_key, $docid); + } + $dbh->prepare_cached(<<'')->execute($ibx_id); +DELETE FROM inboxes WHERE ibx_id = ? + + # drop last_commit info + my $pat = $eidx_key; + $pat =~ s/([_%])/\\$1/g; + $lc_i->execute("lc-%:$pat//%"); + while (my ($key) = $lc_i->fetchrow_array) { + next if $key !~ m!\Alc-v[1-9]+:\Q$eidx_key\E//!; + warn "I: removing $key\n"; + $dbh->prepare_cached(<<'')->execute($key); +DELETE FROM eidx_meta WHERE key = ? + + } + + warn "I: $eidx_key removed\n"; + } + + # it's not real unless it's in `over', we use parallelism here, + # shards will be reading directly from over, so commit + $self->{oidx}->commit_lazy; + $self->{oidx}->begin_lazy; + + for my $idx (@{$self->{idx_shards}}) { + warn "I: cleaning up shard #$idx->{shard}\n"; + $idx->shard_over_check($self->{oidx}); + } + my $nr = $dbh->do(<<''); +DELETE FROM xref3 WHERE docid NOT IN (SELECT num FROM over) + + warn "I: eliminated $nr stale xref3 entries\n" if $nr != 0; + + done($self); +} + sub eidx_sync { # main entry point my ($self, $opt) = @_; $self->idx_init($opt); # acquire lock via V2Writable::_idx_init @@ -413,6 +499,7 @@ sub idx_init { # similar to V2Writable next if $seen{"$st[0]\0$st[1]"}++; } else { warn "W: stat($d) failed (from $alt): $!\n"; + next if $opt->{-idx_gc}; } push @old, $line; } @@ -424,6 +511,7 @@ sub idx_init { # similar to V2Writable next if $seen{"$st[0]\0$st[1]"}++; } else { warn "W: stat($d) failed (from $ibx->{inboxdir}): $!\n"; + next if $opt->{-idx_gc}; } push @new, $line; } |