diff options
-rw-r--r-- | lib/PublicInbox/ExtSearchIdx.pm | 25 | ||||
-rwxr-xr-x | script/public-inbox-extindex | 4 |
2 files changed, 20 insertions, 9 deletions
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 357312b8..2311161e 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -887,23 +887,32 @@ sub dd_smsg { # git->cat_async callback } } -sub eidx_dedupe ($$) { - my ($self, $sync) = @_; +sub eidx_dedupe ($$$) { + my ($self, $sync, $msgids) = @_; $sync->{dedupe_cull} = 0; my $candidates = 0; my $nr_mid = 0; return unless eidxq_lock_acquire($self); - my $iter; + my ($iter, $cur_mid); my $min_id = 0; + my $idx = 0; local $sync->{-regen_fmt} = "dedupe %u/".$self->{oidx}->max."\n"; # note: we could write this query more intelligently, # but that causes lock contention with read-only processes dedupe_restart: - $iter = $self->{oidx}->dbh->prepare(<<EOS); + $cur_mid = $msgids->[$idx]; + if ($cur_mid eq '') { # all Message-IDs + $iter = $self->{oidx}->dbh->prepare(<<EOS); SELECT mid,id FROM msgid WHERE id > ? ORDER BY id ASC EOS - $iter->execute($min_id); + $iter->execute($min_id); + } else { + $iter = $self->{oidx}->dbh->prepare(<<EOS); +SELECT mid,id FROM msgid WHERE mid = ? AND id > ? ORDER BY id ASC +EOS + $iter->execute($cur_mid, $min_id); + } while (my ($mid, $id) = $iter->fetchrow_array) { last if $sync->{quit}; $self->{current_info} = "dedupe $mid"; @@ -937,6 +946,8 @@ EOS goto dedupe_restart; } } + goto dedupe_restart if defined($msgids->[++$idx]); + my $n = delete $sync->{dedupe_cull}; if (my $pr = $sync->{-opt}->{-progress}) { $pr->("culled $n/$candidates candidates ($nr_mid msgids)\n"); @@ -974,9 +985,9 @@ sub eidx_sync { # main entry point for my $ibx (@{ibx_sorted($self)}) { $ibx->{-ibx_id} //= $self->{oidx}->ibx_id($ibx->eidx_key); } - if (delete($opt->{dedupe})) { + if (my $msgids = delete($opt->{dedupe})) { local $sync->{checkpoint_unlocks} = 1; - eidx_dedupe($self, $sync); + eidx_dedupe($self, $sync, $msgids); } if (delete($opt->{reindex})) { local $sync->{checkpoint_unlocks} = 1; diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex index dcb12e5a..addd5ac6 100755 --- a/script/public-inbox-extindex +++ b/script/public-inbox-extindex @@ -17,7 +17,7 @@ usage: public-inbox-extindex [options] [EXTINDEX_DIR] [INBOX_DIR...] --batch-size=BYTES flush changes to OS after a given number of bytes --max-size=BYTES do not index messages larger than the given size --gc perform garbage collection instead of indexing - --dedupe fix prior deduplication errors + --dedupe[=MSGID] fix prior deduplication errors (may be repeated) --verbose | -v increase verbosity (may be repeated) --dry-run | -n dry-run on --dedupe @@ -29,7 +29,7 @@ GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i fsync|sync! indexlevel|index-level|L=s max_size|max-size=s batch_size|batch-size=s - dedupe gc commit-interval=i watch scan! dry-run|n + dedupe:s@ gc commit-interval=i watch scan! dry-run|n all help|h)) or die $help; if ($opt->{help}) { print $help; exit 0 }; |