about summary refs log tree commit homepage
diff options
context:
space:
mode:
-rw-r--r--Documentation/public-inbox-index.pod10
-rw-r--r--MANIFEST1
-rw-r--r--lib/PublicInbox/Admin.pm4
-rw-r--r--lib/PublicInbox/SearchIdx.pm33
-rw-r--r--lib/PublicInbox/V2Writable.pm8
-rwxr-xr-xscript/public-inbox-index3
-rw-r--r--t/reindex-time-range.t58
7 files changed, 106 insertions, 11 deletions
diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod
index 57fedb69..c92b6de4 100644
--- a/Documentation/public-inbox-index.pod
+++ b/Documentation/public-inbox-index.pod
@@ -185,6 +185,16 @@ external indices are configured.
 Do not update the C<all> external index by default.  This negates
 all uses of C<-E> / C<--update-extindex=> on the command-line.
 
+=item --since=DATESTRING
+
+=item --after=DATESTRING
+
+=item --until=DATESTRING
+
+=item --before=DATESTRING
+
+Passed directly to L<git-log(1)> to limit changes for C<--reindex>
+
 =back
 
 =head1 FILES
diff --git a/MANIFEST b/MANIFEST
index 22b7df9b..122ceda0 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -519,6 +519,7 @@ t/psgi_v2-old.eml
 t/psgi_v2.t
 t/purge.t
 t/qspawn.t
+t/reindex-time-range.t
 t/replace.t
 t/reply.t
 t/run.perl
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index dcf17cf5..a17a632c 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -368,6 +368,10 @@ sub index_prepare ($$) {
                                         or die "`$git_key=$s' not boolean\n";
                 $opt->{$k} = $v;
         }
+        for my $k (qw(since until)) {
+                my $v = $opt->{$k} // next;
+                $opt->{reindex} or die "--$k=$v requires --reindex\n";
+        }
         $env;
 }
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 5b0e4458..e5c872d5 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -750,7 +750,8 @@ sub index_sync {
         my ($self, $opt) = @_;
         delete $self->{lock_path} if $opt->{-skip_lock};
         $self->with_umask(\&_index_sync, $self, $opt);
-        if ($opt->{reindex} && !$opt->{quit}) {
+        if ($opt->{reindex} && !$opt->{quit} &&
+                        !grep(defined, @$opt{qw(since until)})) {
                 my %again = %$opt;
                 delete @again{qw(rethread reindex)};
                 index_sync($self, \%again);
@@ -775,8 +776,8 @@ sub v1_checkpoint ($$;$) {
         # $newest may be undef
         my $newest = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}};
         if (defined($newest)) {
-                my $cur = $self->{mm}->last_commit || '';
-                if (need_update($self, $cur, $newest)) {
+                my $cur = $self->{mm}->last_commit;
+                if (need_update($self, $sync, $cur, $newest)) {
                         $self->{mm}->last_commit($newest);
                 }
         }
@@ -786,7 +787,7 @@ sub v1_checkpoint ($$;$) {
         my $xdb = $self->{xdb};
         if ($newest && $xdb) {
                 my $cur = $xdb->get_metadata('last_commit');
-                if (need_update($self, $cur, $newest)) {
+                if (need_update($self, $sync, $cur, $newest)) {
                         $xdb->set_metadata('last_commit', $newest);
                 }
         }
@@ -870,9 +871,14 @@ sub log2stack ($$$) {
 
         # Count the new files so they can be added newest to oldest
         # and still have numbers increasing from oldest to newest
-        my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H
-                                --no-notes --no-color --no-renames --no-abbrev),
-                                $range);
+        my @cmd = qw(log --raw -r --pretty=tformat:%at-%ct-%H
+                        --no-notes --no-color --no-renames --no-abbrev);
+        for my $k (qw(since until)) {
+                my $v = $sync->{-opt}->{$k} // next;
+                next if !$sync->{-opt}->{reindex};
+                push @cmd, "--$k=$v";
+        }
+        my $fh = $git->popen(@cmd, $range);
         my ($at, $ct, $stk, $cmt);
         while (<$fh>) {
                 return if $sync->{quit};
@@ -928,10 +934,17 @@ sub is_ancestor ($$$) {
         $? == 0;
 }
 
-sub need_update ($$$) {
-        my ($self, $cur, $new) = @_;
+sub need_update ($$$$) {
+        my ($self, $sync, $cur, $new) = @_;
         my $git = $self->{ibx}->git;
-        return 1 if $cur && !is_ancestor($git, $cur, $new);
+        $cur //= ''; # XS Search::Xapian ->get_metadata doesn't give undef
+
+        # don't rewind if --{since,until,before,after} are in use
+        return if $cur ne '' &&
+                grep(defined, @{$sync->{-opt}}{qw(since until)}) &&
+                is_ancestor($git, $new, $cur);
+
+        return 1 if $cur ne '' && !is_ancestor($git, $cur, $new);
         my $range = $cur eq '' ? $new : "$cur..$new";
         chomp(my $n = $git->qx(qw(rev-list --count), $range));
         ($n eq '' || $n > 0);
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 971b007b..36b84f57 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -876,6 +876,11 @@ sub update_last_commit {
                 chomp(my $n = $unit->{git}->qx(@cmd));
                 return if $n ne '' && $n == 0;
         }
+        # don't rewind if --{since,until,before,after} are in use
+        return if (defined($last) &&
+                        grep(defined, @{$sync->{-opt}}{qw(since until)}) &&
+                        is_ancestor($self->git, $latest_cmt, $last));
+
         last_epoch_commit($self, $unit->{epoch}, $latest_cmt);
 }
 
@@ -1337,7 +1342,8 @@ sub index_sync {
         }
 
         # reindex does not pick up new changes, so we rerun w/o it:
-        if ($opt->{reindex} && !$sync->{quit}) {
+        if ($opt->{reindex} && !$sync->{quit} &&
+                        !grep(defined, @$opt{qw(since until)})) {
                 my %again = %$opt;
                 $sync = undef;
                 delete @again{qw(rethread reindex -skip_lock)};
diff --git a/script/public-inbox-index b/script/public-inbox-index
index ca190a2e..053d8b94 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -25,6 +25,8 @@ options:
   --batch-size=BYTES  flush changes to OS after a given number of bytes
   --max-size=BYTES    do not index messages larger than the given size
   --reindex           index previously indexed data (if upgrading)
+  --since=DATE        limit --reindex to changes after DATE
+  --until=DATE        limit --reindex to changes before DATE
   --rethread          regenerate thread IDs (if upgrading, use sparingly)
   --prune             prune git storage on discontiguous history
   --verbose | -v      increase verbosity (may be repeated)
@@ -40,6 +42,7 @@ GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune
                 fsync|sync! xapian_only|xapian-only
                 indexlevel|index-level|L=s max_size|max-size=s
                 batch_size|batch-size=s
+                since|after=s until|before=s
                 sequential-shard|seq-shard
                 no-update-extindex update-extindex|E=s@
                 fast-noop|F skip-docdata all C=s@ help|h))
diff --git a/t/reindex-time-range.t b/t/reindex-time-range.t
new file mode 100644
index 00000000..59f5c2aa
--- /dev/null
+++ b/t/reindex-time-range.t
@@ -0,0 +1,58 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict; use v5.10.1; use PublicInbox::TestCommon;
+require_mods qw(DBD::SQLite);
+my $tmp = tmpdir();
+my $eml;
+my $cb = sub {
+        my ($im, $ibx) = @_;
+        $eml //= eml_load 't/utf8.eml';
+        for my $i (1..3) {
+                $eml->header_set('Message-ID', "<$i\@example.com>");
+                my $d = "Thu, 01 Jan 1970 0$i:30:00 +0000";
+                $eml->header_set('Date', $d);
+                $im->add($eml);
+        }
+};
+my %ibx = map {;
+        "v$_" => create_inbox("v$_", version => $_,
+                        indexlevel => 'basic', tmpdir => "$tmp/v$_", $cb);
+} (1, 2);
+
+my $env = { TZ => 'UTC' };
+my ($out, $err);
+for my $v (sort keys %ibx) {
+        my $opt = { -C => $ibx{$v}->{inboxdir}, 1 => \$out, 2 => \$err };
+
+        ($out, $err) = ('', '');
+        run_script([ qw(-index -vv) ], $env, $opt);
+        is($?, 0, 'no error on initial index');
+
+        for my $x (qw(until before)) {
+                ($out, $err) = ('', '');
+                run_script([ qw(-index --reindex -vv),
+                                "--$x=1970-01-01T02:00:00Z" ], $env, $opt);
+                is($?, 0, "no error with --$x");
+                like($err, qr! 1/1\b!, "$x only indexed one message");
+        }
+        for my $x (qw(after since)) {
+                ($out, $err) = ('', '');
+                run_script([ qw(-index --reindex -vv),
+                                "--$x=1970-01-01T02:00:00Z" ], $env, $opt);
+                is($?, 0, "no error with --$x");
+                like($err, qr! 2/2\b!, "$x only indexed one message");
+        }
+
+        ($out, $err) = ('', '');
+        run_script([ qw(-index --reindex -vv) ], $env, $opt);
+        is($?, 0, 'no error on initial index');
+
+        for my $x (qw(since before after until)) {
+                ($out, $err) = ('', '');
+                run_script([ qw(-index -v), "--$x=1970-01-01T02:00:00Z" ],
+                        $env, $opt);
+                isnt($?, 0, "--$x fails on --reindex");
+        }
+}
+
+done_testing;