diff options
-rw-r--r-- | lib/PublicInbox/ExtSearchIdx.pm | 90 | ||||
-rw-r--r-- | lib/PublicInbox/MiscIdx.pm | 16 | ||||
-rw-r--r-- | lib/PublicInbox/SearchIdx.pm | 23 | ||||
-rw-r--r-- | lib/PublicInbox/SearchIdxShard.pm | 21 | ||||
-rw-r--r-- | script/public-inbox-extindex | 19 | ||||
-rw-r--r-- | t/extsearch.t | 19 |
6 files changed, 178 insertions, 10 deletions
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index d780776f..4de47b58 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -87,6 +87,7 @@ sub _ibx_attach { # each_inbox callback sub attach_config { my ($self, $cfg) = @_; + $self->{cfg} = $cfg; $cfg->each_inbox(\&_ibx_attach, $self); } @@ -141,7 +142,8 @@ sub do_xpost ($$) { if ($nr == 0) { $idx->shard_remove($oid, $docid); } elsif ($rm_eidx_info) { - $idx->shard_remove_eidx_info($docid, $oid, $xibx, $eml); + $idx->shard_remove_eidx_info($docid, $oid, $eidx_key, + $eml); } } } @@ -324,6 +326,90 @@ sub _sync_inbox ($$$) { $ibx->git->cleanup; # done with this inbox, now } +sub unref_doc ($$$$) { + my ($self, $ibx_id, $eidx_key, $docid) = @_; + my $dbh = $self->{oidx}->dbh; + + # for debug/info purposes, oids may no longer be accessible + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT oidbin FROM xref3 WHERE docid = ? AND ibx_id = ? + + $sth->execute($docid, $ibx_id); + my @oid = map { unpack('H*', $_->[0]) } @{$sth->fetchall_arrayref}; + + $dbh->prepare_cached(<<'')->execute($docid, $ibx_id); +DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? + + my $remain = $self->{oidx}->get_xref3($docid); + my $idx = $self->idx_shard($docid); + if (@$remain) { + for my $oid (@oid) { + warn "I: unref #$docid $eidx_key $oid\n"; + $idx->shard_remove_eidx_info($docid, $oid, $eidx_key); + } + } else { + for my $oid (@oid) { + warn "I: remove #$docid $eidx_key $oid\n"; + $idx->shard_remove($oid, $docid); + } + } +} + +sub eidx_gc { + my ($self, $opt) = @_; + $self->{cfg} or die "E: GC requires ->attach_config\n"; + $opt->{-idx_gc} = 1; + $self->idx_init($opt); # acquire lock via V2Writable::_idx_init + + my $dbh = $self->{oidx}->dbh; + my $x3_doc = $dbh->prepare('SELECT docid FROM xref3 WHERE ibx_id = ?'); + my $ibx_ck = $dbh->prepare('SELECT ibx_id,eidx_key FROM inboxes'); + my $lc_i = $dbh->prepare('SELECT key FROM eidx_meta WHERE key LIKE ?'); + + $ibx_ck->execute; + while (my ($ibx_id, $eidx_key) = $ibx_ck->fetchrow_array) { + next if $self->{ibx_map}->{$eidx_key}; + $self->{midx}->remove_eidx_key($eidx_key); + warn "I: deleting messages for $eidx_key...\n"; + $x3_doc->execute($ibx_id); + while (defined(my $docid = $x3_doc->fetchrow_array)) { + unref_doc($self, $ibx_id, $eidx_key, $docid); + } + $dbh->prepare_cached(<<'')->execute($ibx_id); +DELETE FROM inboxes WHERE ibx_id = ? + + # drop last_commit info + my $pat = $eidx_key; + $pat =~ s/([_%])/\\$1/g; + $lc_i->execute("lc-%:$pat//%"); + while (my ($key) = $lc_i->fetchrow_array) { + next if $key !~ m!\Alc-v[1-9]+:\Q$eidx_key\E//!; + warn "I: removing $key\n"; + $dbh->prepare_cached(<<'')->execute($key); +DELETE FROM eidx_meta WHERE key = ? + + } + + warn "I: $eidx_key removed\n"; + } + + # it's not real unless it's in `over', we use parallelism here, + # shards will be reading directly from over, so commit + $self->{oidx}->commit_lazy; + $self->{oidx}->begin_lazy; + + for my $idx (@{$self->{idx_shards}}) { + warn "I: cleaning up shard #$idx->{shard}\n"; + $idx->shard_over_check($self->{oidx}); + } + my $nr = $dbh->do(<<''); +DELETE FROM xref3 WHERE docid NOT IN (SELECT num FROM over) + + warn "I: eliminated $nr stale xref3 entries\n" if $nr != 0; + + done($self); +} + sub eidx_sync { # main entry point my ($self, $opt) = @_; $self->idx_init($opt); # acquire lock via V2Writable::_idx_init @@ -413,6 +499,7 @@ sub idx_init { # similar to V2Writable next if $seen{"$st[0]\0$st[1]"}++; } else { warn "W: stat($d) failed (from $alt): $!\n"; + next if $opt->{-idx_gc}; } push @old, $line; } @@ -424,6 +511,7 @@ sub idx_init { # similar to V2Writable next if $seen{"$st[0]\0$st[1]"}++; } else { warn "W: stat($d) failed (from $ibx->{inboxdir}): $!\n"; + next if $opt->{-idx_gc}; } push @new, $line; } diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm index 642d920b..64591d05 100644 --- a/lib/PublicInbox/MiscIdx.pm +++ b/lib/PublicInbox/MiscIdx.pm @@ -53,6 +53,22 @@ sub commit_txn { delete($self->{xdb})->commit_transaction; } +sub remove_eidx_key { + my ($self, $eidx_key) = @_; + my $xdb = $self->{xdb}; + my $head = $xdb->postlist_begin('Q'.$eidx_key); + my $tail = $xdb->postlist_end('Q'.$eidx_key); + my @docids; # only one, unless we had bugs + for (; $head != $tail; $head++) { + push @docids, $head->get_docid; + } + for my $docid (@docids) { + $xdb->delete_document($docid); + warn "I: remove inbox docid #$docid ($eidx_key)\n"; + } +} + +# adds or updates according to $eidx_key sub index_ibx { my ($self, $ibx) = @_; my $eidx_key = $ibx->eidx_key; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index d06c159b..c18c7c36 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -471,7 +471,7 @@ sub remove_eidx_info { my $doc = _get_doc($self, $docid, $oid) or return; eval { $doc->remove_term('O'.$eidx_key) }; warn "W: ->remove_term O$eidx_key: $@\n" if $@; - for my $l ($eml->header_raw('List-Id')) { + for my $l ($eml ? $eml->header_raw('List-Id') : ()) { $l =~ /<([^>]+)>/ or next; my $lid = lc $1; eval { $doc->remove_term('G' . $lid) }; @@ -970,4 +970,25 @@ sub eidx_shard_new { $self; } +# ensure there's no stale Xapian docs by treating $over as canonical +sub over_check { + my ($self, $over) = @_; + begin_txn_lazy($self); + my $sth = $over->dbh->prepare(<<''); +SELECT COUNT(*) FROM over WHERE num = ? + + my $xdb = $self->{xdb}; + my $cur = $xdb->postlist_begin(''); + my $end = $xdb->postlist_end(''); + my $xdir = $self->xdir; + for (; $cur != $end; $cur++) { + my $docid = $cur->get_docid; + $sth->execute($docid); + my $x = $sth->fetchrow_array; + next if $x > 0; + warn "I: removing $xdir #$docid, not in `over'\n"; + $xdb->delete_document($docid); + } +} + 1; diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index dcfeb0be..53fac9b6 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -57,6 +57,7 @@ sub spawn_worker { sub eml ($$) { my ($r, $len) = @_; + return if $len == 0; my $n = read($r, my $bref, $len) or die "read: $!\n"; $n == $len or die "short read: $n != $len\n"; PublicInbox::Eml->new(\$bref); @@ -92,6 +93,10 @@ sub shard_worker_loop ($$$$$) { chomp $eidx_key; $self->remove_eidx_info($docid, $oid, $eidx_key, eml($r, $len)); + } elsif ($line =~ s/\AO ([^\n]+)\n//) { + my $over_fn = $1; + $over_fn =~ tr/\0/\n/; + $self->over_check(PublicInbox::Over->new($over_fn)); } else { chomp $line; my $eidx_key; @@ -155,10 +160,9 @@ sub shard_add_eidx_info { } sub shard_remove_eidx_info { - my ($self, $docid, $oid, $xibx, $eml) = @_; - my $eidx_key = $xibx->eidx_key; + my ($self, $docid, $oid, $eidx_key, $eml) = @_; if (my $w = $self->{w}) { - my $hdr = $eml->header_obj->as_string; + my $hdr = $eml ? $eml->header_obj->as_string : ''; my $len = length($hdr); print $w "-X $len $docid $oid $eidx_key\n", $hdr or die "failed to write shard: $!"; @@ -212,4 +216,15 @@ sub shard_remove { } } +sub shard_over_check { + my ($self, $over) = @_; + if (my $w = $self->{w}) { # triggers remove_by_oid in a shard child + my ($over_fn) = $over->{dbh}->sqlite_db_filename; + $over_fn =~ tr/\n/\0/; + print $w "O $over_fn\n" or die "failed to write over $!"; + } else { + $self->over_check($over); + } +} + 1; diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex index 20a0737c..17ad59fa 100644 --- a/script/public-inbox-extindex +++ b/script/public-inbox-extindex @@ -16,6 +16,7 @@ usage: public-inbox-extindex [options] EXTINDEX_DIR [INBOX_DIR] --jobs=NUM set or disable parallelization (NUM=0) --batch-size=BYTES flush changes to OS after a given number of bytes --max-size=BYTES do not index messages larger than the given size + --gc perform garbage collection instead of indexing --verbose | -v increase verbosity (may be repeated) BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes) @@ -26,6 +27,7 @@ GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i fsync|sync! indexlevel|index-level|L=s max_size|max-size=s batch_size|batch-size=s + gc all help|h)) or die $help; if ($opt->{help}) { print $help; exit 0 }; @@ -36,7 +38,13 @@ my $eidx_dir = shift(@ARGV) // die "E: $help"; local $SIG{USR1} = 'IGNORE'; # to be overridden in eidx_sync require PublicInbox::Admin; my $cfg = PublicInbox::Config->new; -my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg); +my @ibxs; +if ($opt->{gc}) { + die "E: inbox paths must not be specified with --gc\n" if @ARGV; + die "E: --all not compatible --gc\n" if $opt->{all}; +} else { + @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg); +} PublicInbox::Admin::require_or_die(qw(-search)); PublicInbox::Config::json() or die "Cpanel::JSON::XS or similar missing\n"; PublicInbox::Admin::progress_prepare($opt); @@ -44,5 +52,10 @@ my $env = PublicInbox::Admin::index_prepare($opt, $cfg); local %ENV = (%ENV, %$env) if $env; require PublicInbox::ExtSearchIdx; my $eidx = PublicInbox::ExtSearchIdx->new($eidx_dir, $opt); -$eidx->attach_inbox($_) for @ibxs; -$eidx->eidx_sync($opt); +if ($opt->{gc}) { + $eidx->attach_config($cfg); + $eidx->eidx_gc($opt); +} else { + $eidx->attach_inbox($_) for @ibxs; + $eidx->eidx_sync($opt); +} diff --git a/t/extsearch.t b/t/extsearch.t index 4f1dbadf..10b14baf 100644 --- a/t/extsearch.t +++ b/t/extsearch.t @@ -16,7 +16,8 @@ my $host_port = $sock->sockhost . ':' . $sock->sockport; my ($home, $for_destroy) = tmpdir(); local $ENV{HOME} = $home; mkdir "$home/.public-inbox" or BAIL_OUT $!; -open my $fh, '>', "$home/.public-inbox/config" or BAIL_OUT $!; +my $cfg_path = "$home/.public-inbox/config"; +open my $fh, '>', $cfg_path or BAIL_OUT $!; print $fh <<EOF or BAIL_OUT $!; [publicinboxMda] spamcheck = none @@ -55,7 +56,7 @@ ok(run_script([qw(-extindex --all), "$home/extindex"]), 'extindex init'); { # TODO: -extindex should write this to config - open $fh, '>>', "$home/.public-inbox/config" or BAIL_OUT $!; + open $fh, '>>', $cfg_path or BAIL_OUT $!; print $fh <<EOF or BAIL_OUT $!; ; for ->ALL [extindex "all"] @@ -148,4 +149,18 @@ is(scalar(@it), 2, 'two inboxes'); like($it[0]->get_document->get_data, qr/v2test/, 'docdata matched v2'); like($it[1]->get_document->get_data, qr/v1test/, 'docdata matched v1'); +if ('remove v1test and test gc') { + xsys([qw(git config --unset publicinbox.v1test.inboxdir)], + { GIT_CONFIG => $cfg_path }); + my $opt = { 2 => \(my $err = '') }; + ok(run_script([qw(-extindex --gc), "$home/extindex"], undef, $opt), + 'extindex --gc'); + like($err, qr/^I: remove #1 v1\.example /ms, 'removed v1 message'); + is(scalar(grep(!/^I:/, split(/^/m, $err))), 0, + 'no non-informational messages'); + $misc->{xdb}->reopen; + @it = $misc->mset('')->items; + is(scalar(@it), 1, 'only one inbox left'); +} + done_testing; |