about summary refs log tree commit homepage
diff options
context:
space:
mode:
-rw-r--r--lib/PublicInbox/ExtSearchIdx.pm90
-rw-r--r--lib/PublicInbox/MiscIdx.pm16
-rw-r--r--lib/PublicInbox/SearchIdx.pm23
-rw-r--r--lib/PublicInbox/SearchIdxShard.pm21
-rw-r--r--script/public-inbox-extindex19
-rw-r--r--t/extsearch.t19
6 files changed, 178 insertions, 10 deletions
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index d780776f..4de47b58 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -87,6 +87,7 @@ sub _ibx_attach { # each_inbox callback
 
 sub attach_config {
         my ($self, $cfg) = @_;
+        $self->{cfg} = $cfg;
         $cfg->each_inbox(\&_ibx_attach, $self);
 }
 
@@ -141,7 +142,8 @@ sub do_xpost ($$) {
                 if ($nr == 0) {
                         $idx->shard_remove($oid, $docid);
                 } elsif ($rm_eidx_info) {
-                        $idx->shard_remove_eidx_info($docid, $oid, $xibx, $eml);
+                        $idx->shard_remove_eidx_info($docid, $oid, $eidx_key,
+                                                        $eml);
                 }
         }
 }
@@ -324,6 +326,90 @@ sub _sync_inbox ($$$) {
         $ibx->git->cleanup; # done with this inbox, now
 }
 
+sub unref_doc ($$$$) {
+        my ($self, $ibx_id, $eidx_key, $docid) = @_;
+        my $dbh = $self->{oidx}->dbh;
+
+        # for debug/info purposes, oids may no longer be accessible
+        my $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT oidbin FROM xref3 WHERE docid = ? AND ibx_id = ?
+
+        $sth->execute($docid, $ibx_id);
+        my @oid = map { unpack('H*', $_->[0]) } @{$sth->fetchall_arrayref};
+
+        $dbh->prepare_cached(<<'')->execute($docid, $ibx_id);
+DELETE FROM xref3 WHERE docid = ? AND ibx_id = ?
+
+        my $remain = $self->{oidx}->get_xref3($docid);
+        my $idx = $self->idx_shard($docid);
+        if (@$remain) {
+                for my $oid (@oid) {
+                        warn "I: unref #$docid $eidx_key $oid\n";
+                        $idx->shard_remove_eidx_info($docid, $oid, $eidx_key);
+                }
+        } else {
+                for my $oid (@oid) {
+                        warn "I: remove #$docid $eidx_key $oid\n";
+                        $idx->shard_remove($oid, $docid);
+                }
+        }
+}
+
+sub eidx_gc {
+        my ($self, $opt) = @_;
+        $self->{cfg} or die "E: GC requires ->attach_config\n";
+        $opt->{-idx_gc} = 1;
+        $self->idx_init($opt); # acquire lock via V2Writable::_idx_init
+
+        my $dbh = $self->{oidx}->dbh;
+        my $x3_doc = $dbh->prepare('SELECT docid FROM xref3 WHERE ibx_id = ?');
+        my $ibx_ck = $dbh->prepare('SELECT ibx_id,eidx_key FROM inboxes');
+        my $lc_i = $dbh->prepare('SELECT key FROM eidx_meta WHERE key LIKE ?');
+
+        $ibx_ck->execute;
+        while (my ($ibx_id, $eidx_key) = $ibx_ck->fetchrow_array) {
+                next if $self->{ibx_map}->{$eidx_key};
+                $self->{midx}->remove_eidx_key($eidx_key);
+                warn "I: deleting messages for $eidx_key...\n";
+                $x3_doc->execute($ibx_id);
+                while (defined(my $docid = $x3_doc->fetchrow_array)) {
+                        unref_doc($self, $ibx_id, $eidx_key, $docid);
+                }
+                $dbh->prepare_cached(<<'')->execute($ibx_id);
+DELETE FROM inboxes WHERE ibx_id = ?
+
+                # drop last_commit info
+                my $pat = $eidx_key;
+                $pat =~ s/([_%])/\\$1/g;
+                $lc_i->execute("lc-%:$pat//%");
+                while (my ($key) = $lc_i->fetchrow_array) {
+                        next if $key !~ m!\Alc-v[1-9]+:\Q$eidx_key\E//!;
+                        warn "I: removing $key\n";
+                        $dbh->prepare_cached(<<'')->execute($key);
+DELETE FROM eidx_meta WHERE key = ?
+
+                }
+
+                warn "I: $eidx_key removed\n";
+        }
+
+        # it's not real unless it's in `over', we use parallelism here,
+        # shards will be reading directly from over, so commit
+        $self->{oidx}->commit_lazy;
+        $self->{oidx}->begin_lazy;
+
+        for my $idx (@{$self->{idx_shards}}) {
+                warn "I: cleaning up shard #$idx->{shard}\n";
+                $idx->shard_over_check($self->{oidx});
+        }
+        my $nr = $dbh->do(<<'');
+DELETE FROM xref3 WHERE docid NOT IN (SELECT num FROM over)
+
+        warn "I: eliminated $nr stale xref3 entries\n" if $nr != 0;
+
+        done($self);
+}
+
 sub eidx_sync { # main entry point
         my ($self, $opt) = @_;
         $self->idx_init($opt); # acquire lock via V2Writable::_idx_init
@@ -413,6 +499,7 @@ sub idx_init { # similar to V2Writable
                                 next if $seen{"$st[0]\0$st[1]"}++;
                         } else {
                                 warn "W: stat($d) failed (from $alt): $!\n";
+                                next if $opt->{-idx_gc};
                         }
                         push @old, $line;
                 }
@@ -424,6 +511,7 @@ sub idx_init { # similar to V2Writable
                         next if $seen{"$st[0]\0$st[1]"}++;
                 } else {
                         warn "W: stat($d) failed (from $ibx->{inboxdir}): $!\n";
+                        next if $opt->{-idx_gc};
                 }
                 push @new, $line;
         }
diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm
index 642d920b..64591d05 100644
--- a/lib/PublicInbox/MiscIdx.pm
+++ b/lib/PublicInbox/MiscIdx.pm
@@ -53,6 +53,22 @@ sub commit_txn {
         delete($self->{xdb})->commit_transaction;
 }
 
+sub remove_eidx_key {
+        my ($self, $eidx_key) = @_;
+        my $xdb = $self->{xdb};
+        my $head = $xdb->postlist_begin('Q'.$eidx_key);
+        my $tail = $xdb->postlist_end('Q'.$eidx_key);
+        my @docids; # only one, unless we had bugs
+        for (; $head != $tail; $head++) {
+                push @docids, $head->get_docid;
+        }
+        for my $docid (@docids) {
+                $xdb->delete_document($docid);
+                warn "I: remove inbox docid #$docid ($eidx_key)\n";
+        }
+}
+
+# adds or updates according to $eidx_key
 sub index_ibx {
         my ($self, $ibx) = @_;
         my $eidx_key = $ibx->eidx_key;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index d06c159b..c18c7c36 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -471,7 +471,7 @@ sub remove_eidx_info {
         my $doc = _get_doc($self, $docid, $oid) or return;
         eval { $doc->remove_term('O'.$eidx_key) };
         warn "W: ->remove_term O$eidx_key: $@\n" if $@;
-        for my $l ($eml->header_raw('List-Id')) {
+        for my $l ($eml ? $eml->header_raw('List-Id') : ()) {
                 $l =~ /<([^>]+)>/ or next;
                 my $lid = lc $1;
                 eval { $doc->remove_term('G' . $lid) };
@@ -970,4 +970,25 @@ sub eidx_shard_new {
         $self;
 }
 
+# ensure there's no stale Xapian docs by treating $over as canonical
+sub over_check {
+        my ($self, $over) = @_;
+        begin_txn_lazy($self);
+        my $sth = $over->dbh->prepare(<<'');
+SELECT COUNT(*) FROM over WHERE num = ?
+
+        my $xdb = $self->{xdb};
+        my $cur = $xdb->postlist_begin('');
+        my $end = $xdb->postlist_end('');
+        my $xdir = $self->xdir;
+        for (; $cur != $end; $cur++) {
+                my $docid = $cur->get_docid;
+                $sth->execute($docid);
+                my $x = $sth->fetchrow_array;
+                next if $x > 0;
+                warn "I: removing $xdir #$docid, not in `over'\n";
+                $xdb->delete_document($docid);
+        }
+}
+
 1;
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index dcfeb0be..53fac9b6 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -57,6 +57,7 @@ sub spawn_worker {
 
 sub eml ($$) {
         my ($r, $len) = @_;
+        return if $len == 0;
         my $n = read($r, my $bref, $len) or die "read: $!\n";
         $n == $len or die "short read: $n != $len\n";
         PublicInbox::Eml->new(\$bref);
@@ -92,6 +93,10 @@ sub shard_worker_loop ($$$$$) {
                         chomp $eidx_key;
                         $self->remove_eidx_info($docid, $oid, $eidx_key,
                                                         eml($r, $len));
+                } elsif ($line =~ s/\AO ([^\n]+)\n//) {
+                        my $over_fn = $1;
+                        $over_fn =~ tr/\0/\n/;
+                        $self->over_check(PublicInbox::Over->new($over_fn));
                 } else {
                         chomp $line;
                         my $eidx_key;
@@ -155,10 +160,9 @@ sub shard_add_eidx_info {
 }
 
 sub shard_remove_eidx_info {
-        my ($self, $docid, $oid, $xibx, $eml) = @_;
-        my $eidx_key = $xibx->eidx_key;
+        my ($self, $docid, $oid, $eidx_key, $eml) = @_;
         if (my $w = $self->{w}) {
-                my $hdr = $eml->header_obj->as_string;
+                my $hdr = $eml ? $eml->header_obj->as_string : '';
                 my $len = length($hdr);
                 print $w "-X $len $docid $oid $eidx_key\n", $hdr or
                         die "failed to write shard: $!";
@@ -212,4 +216,15 @@ sub shard_remove {
         }
 }
 
+sub shard_over_check {
+        my ($self, $over) = @_;
+        if (my $w = $self->{w}) { # triggers remove_by_oid in a shard child
+                my ($over_fn) = $over->{dbh}->sqlite_db_filename;
+                $over_fn =~ tr/\n/\0/;
+                print $w "O $over_fn\n" or die "failed to write over $!";
+        } else {
+                $self->over_check($over);
+        }
+}
+
 1;
diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex
index 20a0737c..17ad59fa 100644
--- a/script/public-inbox-extindex
+++ b/script/public-inbox-extindex
@@ -16,6 +16,7 @@ usage: public-inbox-extindex [options] EXTINDEX_DIR [INBOX_DIR]
   --jobs=NUM          set or disable parallelization (NUM=0)
   --batch-size=BYTES  flush changes to OS after a given number of bytes
   --max-size=BYTES    do not index messages larger than the given size
+  --gc                perform garbage collection instead of indexing
   --verbose | -v      increase verbosity (may be repeated)
 
 BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes)
@@ -26,6 +27,7 @@ GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i
                 fsync|sync!
                 indexlevel|index-level|L=s max_size|max-size=s
                 batch_size|batch-size=s
+                gc
                 all help|h))
         or die $help;
 if ($opt->{help}) { print $help; exit 0 };
@@ -36,7 +38,13 @@ my $eidx_dir = shift(@ARGV) // die "E: $help";
 local $SIG{USR1} = 'IGNORE'; # to be overridden in eidx_sync
 require PublicInbox::Admin;
 my $cfg = PublicInbox::Config->new;
-my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
+my @ibxs;
+if ($opt->{gc}) {
+        die "E: inbox paths must not be specified with --gc\n" if @ARGV;
+        die "E: --all not compatible --gc\n" if $opt->{all};
+} else {
+        @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
+}
 PublicInbox::Admin::require_or_die(qw(-search));
 PublicInbox::Config::json() or die "Cpanel::JSON::XS or similar missing\n";
 PublicInbox::Admin::progress_prepare($opt);
@@ -44,5 +52,10 @@ my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
 local %ENV = (%ENV, %$env) if $env;
 require PublicInbox::ExtSearchIdx;
 my $eidx = PublicInbox::ExtSearchIdx->new($eidx_dir, $opt);
-$eidx->attach_inbox($_) for @ibxs;
-$eidx->eidx_sync($opt);
+if ($opt->{gc}) {
+        $eidx->attach_config($cfg);
+        $eidx->eidx_gc($opt);
+} else {
+        $eidx->attach_inbox($_) for @ibxs;
+        $eidx->eidx_sync($opt);
+}
diff --git a/t/extsearch.t b/t/extsearch.t
index 4f1dbadf..10b14baf 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -16,7 +16,8 @@ my $host_port = $sock->sockhost . ':' . $sock->sockport;
 my ($home, $for_destroy) = tmpdir();
 local $ENV{HOME} = $home;
 mkdir "$home/.public-inbox" or BAIL_OUT $!;
-open my $fh, '>', "$home/.public-inbox/config" or BAIL_OUT $!;
+my $cfg_path = "$home/.public-inbox/config";
+open my $fh, '>', $cfg_path or BAIL_OUT $!;
 print $fh <<EOF or BAIL_OUT $!;
 [publicinboxMda]
         spamcheck = none
@@ -55,7 +56,7 @@ ok(run_script([qw(-extindex --all), "$home/extindex"]), 'extindex init');
 
 
 { # TODO: -extindex should write this to config
-        open $fh, '>>', "$home/.public-inbox/config" or BAIL_OUT $!;
+        open $fh, '>>', $cfg_path or BAIL_OUT $!;
         print $fh <<EOF or BAIL_OUT $!;
 ; for ->ALL
 [extindex "all"]
@@ -148,4 +149,18 @@ is(scalar(@it), 2, 'two inboxes');
 like($it[0]->get_document->get_data, qr/v2test/, 'docdata matched v2');
 like($it[1]->get_document->get_data, qr/v1test/, 'docdata matched v1');
 
+if ('remove v1test and test gc') {
+        xsys([qw(git config --unset publicinbox.v1test.inboxdir)],
+                { GIT_CONFIG => $cfg_path });
+        my $opt = { 2 => \(my $err = '') };
+        ok(run_script([qw(-extindex --gc), "$home/extindex"], undef, $opt),
+                'extindex --gc');
+        like($err, qr/^I: remove #1 v1\.example /ms, 'removed v1 message');
+        is(scalar(grep(!/^I:/, split(/^/m, $err))), 0,
+                'no non-informational messages');
+        $misc->{xdb}->reopen;
+        @it = $misc->mset('')->items;
+        is(scalar(@it), 1, 'only one inbox left');
+}
+
 done_testing;