about summary refs log tree commit homepage
path: root/script
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2021-07-06 12:42:02 +0000
committer Eric Wong <e@80x24.org> 2021-07-06 13:36:54 +0000
commit 8ef622d408d2e4d98ad3aada8466f539c9ac61ba (patch)
tree 5a66df2970f98c9cb591b615d0f94e28a7f9b32c /script
parent f1f2464064af3840f2f1a697b638e5b769f111af (diff)
download public-inbox-8ef622d408d2e4d98ad3aada8466f539c9ac61ba.tar.gz
The new --dedupe switch is intended to fix older indices that had
deduplication bugs for matching content.  It'll also make dealing
with future changes to ContentHash easier, since its output is
never guaranteed to be stable.

--dedupe also supports --dry-run (-n), which only prints the
changes without making them.
Diffstat (limited to 'script')
-rwxr-xr-x script/public-inbox-extindex 13
1 file changed, 10 insertions, 3 deletions
diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex
index 771486c4..dcb12e5a 100755
--- a/script/public-inbox-extindex
+++ b/script/public-inbox-extindex
@@ -17,7 +17,9 @@ usage: public-inbox-extindex [options] [EXTINDEX_DIR] [INBOX_DIR...]
   --batch-size=BYTES  flush changes to OS after a given number of bytes
   --max-size=BYTES    do not index messages larger than the given size
   --gc                perform garbage collection instead of indexing
+  --dedupe            fix prior deduplication errors
   --verbose | -v      increase verbosity (may be repeated)
+  --dry-run | -n      dry-run on --dedupe
 
 BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes)
 See public-inbox-extindex(1) man page for full documentation.
@@ -27,7 +29,7 @@ GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i
                 fsync|sync!
                 indexlevel|index-level|L=s max_size|max-size=s
                 batch_size|batch-size=s
-                gc commit-interval=i watch scan!
+                dedupe gc commit-interval=i watch scan! dry-run|n
                 all help|h))
         or die $help;
 if ($opt->{help}) { print $help; exit 0 };
@@ -50,11 +52,16 @@ unless (defined $eidx_dir) {
 my @ibxs;
 if ($opt->{gc}) {
         die "E: inbox paths must not be specified with --gc\n" if @ARGV;
-        die "E: --all not compatible with --gc\n" if $opt->{all};
-        die "E: --watch is not compatible with --gc\n" if $opt->{watch};
+        for my $sw (qw(all watch dry-run dedupe)) {
+                die "E: --$sw is not compatible with --gc\n" if $opt->{$sw};
+        }
 } else {
         @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
 }
+if ($opt->{'dry-run'} && !$opt->{dedupe}) {
+        die "E: --dry-run only affects --dedupe\n";
+}
+
 PublicInbox::Admin::require_or_die(qw(-search));
 PublicInbox::Config::json() or die "Cpanel::JSON::XS or similar missing\n";
 PublicInbox::Admin::progress_prepare($opt);