From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 8825D1FA11 for ; Sun, 10 Oct 2021 14:25:19 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 4/8] extindex: speed up Xapian cleanup in --gc Date: Sun, 10 Oct 2021 14:25:14 +0000 Message-Id: <20211010142518.7012-5-e@80x24.org> In-Reply-To: <20211010142518.7012-1-e@80x24.org> References: <20211010142518.7012-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Avoiding repeated SQL statements brings --gc down to 2-3 minutes from around 10. We'll also add some checkpoints around over and xref3 cleanups. --- lib/PublicInbox/ExtSearchIdx.pm | 37 ++++++++++++++++++++------------- lib/PublicInbox/SearchIdx.pm | 3 +++ 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 20c4cf78..04948b8b 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -421,34 +421,43 @@ sub eidx_gc_scan_shards ($$) { # TODO: use for lei/store DELETE FROM xref3 WHERE docid NOT IN (SELECT num FROM over) warn "I: eliminated $nr stale xref3 entries\n" if $nr != 0; + reindex_checkpoint($self, $sync) if checkpoint_due($sync); # fixup from old bugs: $nr = $self->{oidx}->dbh->do(<<''); DELETE FROM over WHERE num NOT IN (SELECT docid FROM xref3) warn "I: eliminated $nr stale over entries\n" if $nr != 0; + reindex_checkpoint($self, $sync) if checkpoint_due($sync); my ($cur) = $self->{oidx}->dbh->selectrow_array(<{oidx}->dbh->selectrow_array(<{oidx}->dbh->prepare(<execute($cur); - next if $exists->fetchrow_array != 0; - $self->idx_shard($cur)->ipc_do('xdb_remove_quiet', $cur); + $cur // return; # empty + my ($r, $n, %active); + $nr = 0; + while (1) { + $r = $self->{oidx}->dbh->selectcol_arrayref(<<"", undef, $cur); +SELECT num FROM over WHERE num >= ? ORDER BY num ASC LIMIT 10000 + + last unless scalar(@$r); + while (defined($n = shift @$r)) { + for my $i ($cur..($n - 1)) { + my $idx = idx_shard($self, $i); + $idx->ipc_do('xdb_remove_quiet', $i); + $active{$idx} = $idx; + } + $cur = $n + 1; + } if (checkpoint_due($sync)) { - $exists = undef; + for my $idx (values %active) { + $nr += $idx->ipc_do('nr_quiet_rm') + } + %active = (); reindex_checkpoint($self, $sync); - goto restart; } } + warn "I: eliminated $nr stale Xapian documents\n" if $nr != 0; } sub eidx_gc { diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 78db329d..bebe904b 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -650,8 +650,11 @@ sub xdb_remove_quiet { begin_txn_lazy($self); my $xdb = $self->{xdb} // die 'BUG: missing {xdb}'; eval { $xdb->delete_document($docid) }; + ++$self->{-quiet_rm} unless $@; } +sub nr_quiet_rm { delete($_[0]->{-quiet_rm}) // 0 } + sub index_git_blob_id { my ($doc, $pfx, $objid) = @_;