From ef68ada3b207fdb511ebe6d33b072a84277e6cd6 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 5 Oct 2021 09:40:17 +0000 Subject: index: --reindex w/ --{since,until,before,after} This lets administrators reindex specific time ranges according to git "approxidate" formats. These arguments are passed directly to underlying git-log(1) invocations and may still reach into old epochs. Since these options rely on git committer dates (which we infer from the most recent Received: header), they are not guaranteed to be strictly tied to git history and it's possible to over/under-reindex some messages. It's probably not a major problem in practice, though; reindexing a few extra messages is generally harmless aside from some extra device wear. Since this currently relies on git-log, these options do not affect -extindex, yet. --- Documentation/public-inbox-index.pod | 10 +++++++ MANIFEST | 1 + lib/PublicInbox/Admin.pm | 4 +++ lib/PublicInbox/SearchIdx.pm | 33 +++++++++++++------- lib/PublicInbox/V2Writable.pm | 8 ++++- script/public-inbox-index | 3 ++ t/reindex-time-range.t | 58 ++++++++++++++++++++++++++++++++++++ 7 files changed, 106 insertions(+), 11 deletions(-) create mode 100644 t/reindex-time-range.t diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod index 57fedb69..c92b6de4 100644 --- a/Documentation/public-inbox-index.pod +++ b/Documentation/public-inbox-index.pod @@ -185,6 +185,16 @@ external indices are configured. Do not update the C external index by default. This negates all uses of C<-E> / C<--update-extindex=> on the command-line. 
+=item --since=DATESTRING + +=item --after=DATESTRING + +=item --until=DATESTRING + +=item --before=DATESTRING + +Passed directly to L<git-log(1)> to limit changes for C<--reindex> + =back =head1 FILES diff --git a/MANIFEST b/MANIFEST index 22b7df9b..122ceda0 100644 --- a/MANIFEST +++ b/MANIFEST @@ -519,6 +519,7 @@ t/psgi_v2-old.eml t/psgi_v2.t t/purge.t t/qspawn.t +t/reindex-time-range.t t/replace.t t/reply.t t/run.perl diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index dcf17cf5..a17a632c 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -368,6 +368,10 @@ sub index_prepare ($$) { or die "`$git_key=$s' not boolean\n"; $opt->{$k} = $v; } + for my $k (qw(since until)) { + my $v = $opt->{$k} // next; + $opt->{reindex} or die "--$k=$v requires --reindex\n"; + } $env; } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 5b0e4458..e5c872d5 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -750,7 +750,8 @@ sub index_sync { my ($self, $opt) = @_; delete $self->{lock_path} if $opt->{-skip_lock}; $self->with_umask(\&_index_sync, $self, $opt); - if ($opt->{reindex} && !$opt->{quit}) { + if ($opt->{reindex} && !$opt->{quit} && + !grep(defined, @$opt{qw(since until)})) { my %again = %$opt; delete @again{qw(rethread reindex)}; index_sync($self, \%again); @@ -775,8 +776,8 @@ sub v1_checkpoint ($$;$) { # $newest may be undef my $newest = $stk ? 
$stk->{latest_cmt} : ${$sync->{latest_cmt}}; if (defined($newest)) { - my $cur = $self->{mm}->last_commit || ''; - if (need_update($self, $cur, $newest)) { + my $cur = $self->{mm}->last_commit; + if (need_update($self, $sync, $cur, $newest)) { $self->{mm}->last_commit($newest); } } @@ -786,7 +787,7 @@ sub v1_checkpoint ($$;$) { my $xdb = $self->{xdb}; if ($newest && $xdb) { my $cur = $xdb->get_metadata('last_commit'); - if (need_update($self, $cur, $newest)) { + if (need_update($self, $sync, $cur, $newest)) { $xdb->set_metadata('last_commit', $newest); } } @@ -870,9 +871,14 @@ sub log2stack ($$$) { # Count the new files so they can be added newest to oldest # and still have numbers increasing from oldest to newest - my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H - --no-notes --no-color --no-renames --no-abbrev), - $range); + my @cmd = qw(log --raw -r --pretty=tformat:%at-%ct-%H + --no-notes --no-color --no-renames --no-abbrev); + for my $k (qw(since until)) { + my $v = $sync->{-opt}->{$k} // next; + next if !$sync->{-opt}->{reindex}; + push @cmd, "--$k=$v"; + } + my $fh = $git->popen(@cmd, $range); my ($at, $ct, $stk, $cmt); while (<$fh>) { return if $sync->{quit}; @@ -928,10 +934,17 @@ sub is_ancestor ($$$) { $? == 0; } -sub need_update ($$$) { - my ($self, $cur, $new) = @_; +sub need_update ($$$$) { + my ($self, $sync, $cur, $new) = @_; my $git = $self->{ibx}->git; - return 1 if $cur && !is_ancestor($git, $cur, $new); + $cur //= ''; # XS Search::Xapian ->get_metadata doesn't give undef + + # don't rewind if --{since,until,before,after} are in use + return if $cur ne '' && + grep(defined, @{$sync->{-opt}}{qw(since until)}) && + is_ancestor($git, $new, $cur); + + return 1 if $cur ne '' && !is_ancestor($git, $cur, $new); my $range = $cur eq '' ? 
$new : "$cur..$new"; chomp(my $n = $git->qx(qw(rev-list --count), $range)); ($n eq '' || $n > 0); diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 971b007b..36b84f57 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -876,6 +876,11 @@ sub update_last_commit { chomp(my $n = $unit->{git}->qx(@cmd)); return if $n ne '' && $n == 0; } + # don't rewind if --{since,until,before,after} are in use + return if (defined($last) && + grep(defined, @{$sync->{-opt}}{qw(since until)}) && + is_ancestor($self->git, $latest_cmt, $last)); + last_epoch_commit($self, $unit->{epoch}, $latest_cmt); } @@ -1337,7 +1342,8 @@ sub index_sync { } # reindex does not pick up new changes, so we rerun w/o it: - if ($opt->{reindex} && !$sync->{quit}) { + if ($opt->{reindex} && !$sync->{quit} && + !grep(defined, @$opt{qw(since until)})) { my %again = %$opt; $sync = undef; delete @again{qw(rethread reindex -skip_lock)}; diff --git a/script/public-inbox-index b/script/public-inbox-index index ca190a2e..053d8b94 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -25,6 +25,8 @@ options: --batch-size=BYTES flush changes to OS after a given number of bytes --max-size=BYTES do not index messages larger than the given size --reindex index previously indexed data (if upgrading) + --since=DATE limit --reindex to changes after DATE + --until=DATE limit --reindex to changes before DATE --rethread regenerate thread IDs (if upgrading, use sparingly) --prune prune git storage on discontiguous history --verbose | -v increase verbosity (may be repeated) @@ -40,6 +42,7 @@ GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune fsync|sync! 
xapian_only|xapian-only indexlevel|index-level|L=s max_size|max-size=s batch_size|batch-size=s + since|after=s until|before=s sequential-shard|seq-shard no-update-extindex update-extindex|E=s@ fast-noop|F skip-docdata all C=s@ help|h)) diff --git a/t/reindex-time-range.t b/t/reindex-time-range.t new file mode 100644 index 00000000..59f5c2aa --- /dev/null +++ b/t/reindex-time-range.t @@ -0,0 +1,58 @@ +# Copyright (C) all contributors +# License: AGPL-3.0+ +use strict; use v5.10.1; use PublicInbox::TestCommon; +require_mods qw(DBD::SQLite); +my $tmp = tmpdir(); +my $eml; +my $cb = sub { + my ($im, $ibx) = @_; + $eml //= eml_load 't/utf8.eml'; + for my $i (1..3) { + $eml->header_set('Message-ID', "<$i\@example.com>"); + my $d = "Thu, 01 Jan 1970 0$i:30:00 +0000"; + $eml->header_set('Date', $d); + $im->add($eml); + } +}; +my %ibx = map {; + "v$_" => create_inbox("v$_", version => $_, + indexlevel => 'basic', tmpdir => "$tmp/v$_", $cb); +} (1, 2); + +my $env = { TZ => 'UTC' }; +my ($out, $err); +for my $v (sort keys %ibx) { + my $opt = { -C => $ibx{$v}->{inboxdir}, 1 => \$out, 2 => \$err }; + + ($out, $err) = ('', ''); + run_script([ qw(-index -vv) ], $env, $opt); + is($?, 0, 'no error on initial index'); + + for my $x (qw(until before)) { + ($out, $err) = ('', ''); + run_script([ qw(-index --reindex -vv), + "--$x=1970-01-01T02:00:00Z" ], $env, $opt); + is($?, 0, "no error with --$x"); + like($err, qr! 1/1\b!, "$x only indexed one message"); + } + for my $x (qw(after since)) { + ($out, $err) = ('', ''); + run_script([ qw(-index --reindex -vv), + "--$x=1970-01-01T02:00:00Z" ], $env, $opt); + is($?, 0, "no error with --$x"); + like($err, qr! 
2/2\b!, "$x only indexed one message"); + } + + ($out, $err) = ('', ''); + run_script([ qw(-index --reindex -vv) ], $env, $opt); + is($?, 0, 'no error on initial index'); + + for my $x (qw(since before after until)) { + ($out, $err) = ('', ''); + run_script([ qw(-index -v), "--$x=1970-01-01T02:00:00Z" ], + $env, $opt); + isnt($?, 0, "--$x fails on --reindex"); + } +} + +done_testing; -- cgit v1.2.3-24-ge0c7