* [PATCH 3/3] index: --reindex w/ --{since,until,before,after}
2021-10-05 9:40 7% [PATCH 0/3] some indexing stuff Eric Wong
@ 2021-10-05 9:40 5% ` Eric Wong
0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2021-10-05 9:40 UTC (permalink / raw)
To: meta
This lets administrators reindex specific time ranges
according to git "approxidate" formats. These arguments
are passed directly to underlying git-log(1) invocations
and may still reach into old epochs.
Since these options rely on git committer dates (which we infer
from the most recent Received: header), they are not guaranteed
to be strictly tied to git history and it's possible to
over/under-reindex some messages. It's probably not a major
problem in practice, though; reindexing a few extra messages
is generally harmless aside from some extra device wear.
Since this currently relies on git-log, these options do not
affect -extindex, yet.
---
Documentation/public-inbox-index.pod | 10 +++++
MANIFEST | 1 +
lib/PublicInbox/Admin.pm | 4 ++
lib/PublicInbox/SearchIdx.pm | 33 +++++++++++-----
lib/PublicInbox/V2Writable.pm | 8 +++-
script/public-inbox-index | 3 ++
t/reindex-time-range.t | 58 ++++++++++++++++++++++++++++
7 files changed, 106 insertions(+), 11 deletions(-)
create mode 100644 t/reindex-time-range.t
diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod
index 57fedb69656d..c92b6de43b35 100644
--- a/Documentation/public-inbox-index.pod
+++ b/Documentation/public-inbox-index.pod
@@ -185,6 +185,16 @@ external indices are configured.
Do not update the C<all> external index by default. This negates
all uses of C<-E> / C<--update-extindex=> on the command-line.
+=item --since=DATESTRING
+
+=item --after=DATESTRING
+
+=item --until=DATESTRING
+
+=item --before=DATESTRING
+
+Passed directly to L<git-log(1)> to limit changes for C<--reindex>
+
=back
=head1 FILES
diff --git a/MANIFEST b/MANIFEST
index 22b7df9bb89d..122ceda0a761 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -519,6 +519,7 @@ t/psgi_v2-old.eml
t/psgi_v2.t
t/purge.t
t/qspawn.t
+t/reindex-time-range.t
t/replace.t
t/reply.t
t/run.perl
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index dcf17cf51d7a..a17a632ceb86 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -368,6 +368,10 @@ sub index_prepare ($$) {
or die "`$git_key=$s' not boolean\n";
$opt->{$k} = $v;
}
+ for my $k (qw(since until)) {
+ my $v = $opt->{$k} // next;
+ $opt->{reindex} or die "--$k=$v requires --reindex\n";
+ }
$env;
}
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 5b0e44581d27..e5c872d52fd3 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -750,7 +750,8 @@ sub index_sync {
my ($self, $opt) = @_;
delete $self->{lock_path} if $opt->{-skip_lock};
$self->with_umask(\&_index_sync, $self, $opt);
- if ($opt->{reindex} && !$opt->{quit}) {
+ if ($opt->{reindex} && !$opt->{quit} &&
+ !grep(defined, @$opt{qw(since until)})) {
my %again = %$opt;
delete @again{qw(rethread reindex)};
index_sync($self, \%again);
@@ -775,8 +776,8 @@ sub v1_checkpoint ($$;$) {
# $newest may be undef
my $newest = $stk ? $stk->{latest_cmt} : ${$sync->{latest_cmt}};
if (defined($newest)) {
- my $cur = $self->{mm}->last_commit || '';
- if (need_update($self, $cur, $newest)) {
+ my $cur = $self->{mm}->last_commit;
+ if (need_update($self, $sync, $cur, $newest)) {
$self->{mm}->last_commit($newest);
}
}
@@ -786,7 +787,7 @@ sub v1_checkpoint ($$;$) {
my $xdb = $self->{xdb};
if ($newest && $xdb) {
my $cur = $xdb->get_metadata('last_commit');
- if (need_update($self, $cur, $newest)) {
+ if (need_update($self, $sync, $cur, $newest)) {
$xdb->set_metadata('last_commit', $newest);
}
}
@@ -870,9 +871,14 @@ sub log2stack ($$$) {
# Count the new files so they can be added newest to oldest
# and still have numbers increasing from oldest to newest
- my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H
- --no-notes --no-color --no-renames --no-abbrev),
- $range);
+ my @cmd = qw(log --raw -r --pretty=tformat:%at-%ct-%H
+ --no-notes --no-color --no-renames --no-abbrev);
+ for my $k (qw(since until)) {
+ my $v = $sync->{-opt}->{$k} // next;
+ next if !$sync->{-opt}->{reindex};
+ push @cmd, "--$k=$v";
+ }
+ my $fh = $git->popen(@cmd, $range);
my ($at, $ct, $stk, $cmt);
while (<$fh>) {
return if $sync->{quit};
@@ -928,10 +934,17 @@ sub is_ancestor ($$$) {
$? == 0;
}
-sub need_update ($$$) {
- my ($self, $cur, $new) = @_;
+sub need_update ($$$$) {
+ my ($self, $sync, $cur, $new) = @_;
my $git = $self->{ibx}->git;
- return 1 if $cur && !is_ancestor($git, $cur, $new);
+ $cur //= ''; # XS Search::Xapian ->get_metadata doesn't give undef
+
+ # don't rewind if --{since,until,before,after} are in use
+ return if $cur ne '' &&
+ grep(defined, @{$sync->{-opt}}{qw(since until)}) &&
+ is_ancestor($git, $new, $cur);
+
+ return 1 if $cur ne '' && !is_ancestor($git, $cur, $new);
my $range = $cur eq '' ? $new : "$cur..$new";
chomp(my $n = $git->qx(qw(rev-list --count), $range));
($n eq '' || $n > 0);
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 971b007b02d6..36b84f5708ae 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -876,6 +876,11 @@ sub update_last_commit {
chomp(my $n = $unit->{git}->qx(@cmd));
return if $n ne '' && $n == 0;
}
+ # don't rewind if --{since,until,before,after} are in use
+ return if (defined($last) &&
+ grep(defined, @{$sync->{-opt}}{qw(since until)}) &&
+ is_ancestor($self->git, $latest_cmt, $last));
+
last_epoch_commit($self, $unit->{epoch}, $latest_cmt);
}
@@ -1337,7 +1342,8 @@ sub index_sync {
}
# reindex does not pick up new changes, so we rerun w/o it:
- if ($opt->{reindex} && !$sync->{quit}) {
+ if ($opt->{reindex} && !$sync->{quit} &&
+ !grep(defined, @$opt{qw(since until)})) {
my %again = %$opt;
$sync = undef;
delete @again{qw(rethread reindex -skip_lock)};
diff --git a/script/public-inbox-index b/script/public-inbox-index
index ca190a2e32d0..053d8b9438cb 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -25,6 +25,8 @@ options:
--batch-size=BYTES flush changes to OS after a given number of bytes
--max-size=BYTES do not index messages larger than the given size
--reindex index previously indexed data (if upgrading)
+ --since=DATE limit --reindex to changes after DATE
+ --until=DATE limit --reindex to changes before DATE
--rethread regenerate thread IDs (if upgrading, use sparingly)
--prune prune git storage on discontiguous history
--verbose | -v increase verbosity (may be repeated)
@@ -40,6 +42,7 @@ GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune
fsync|sync! xapian_only|xapian-only
indexlevel|index-level|L=s max_size|max-size=s
batch_size|batch-size=s
+ since|after=s until|before=s
sequential-shard|seq-shard
no-update-extindex update-extindex|E=s@
fast-noop|F skip-docdata all C=s@ help|h))
diff --git a/t/reindex-time-range.t b/t/reindex-time-range.t
new file mode 100644
index 000000000000..59f5c2aa2db4
--- /dev/null
+++ b/t/reindex-time-range.t
@@ -0,0 +1,58 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict; use v5.10.1; use PublicInbox::TestCommon;
+require_mods qw(DBD::SQLite);
+my $tmp = tmpdir();
+my $eml;
+my $cb = sub {
+ my ($im, $ibx) = @_;
+ $eml //= eml_load 't/utf8.eml';
+ for my $i (1..3) {
+ $eml->header_set('Message-ID', "<$i\@example.com>");
+ my $d = "Thu, 01 Jan 1970 0$i:30:00 +0000";
+ $eml->header_set('Date', $d);
+ $im->add($eml);
+ }
+};
+my %ibx = map {;
+ "v$_" => create_inbox("v$_", version => $_,
+ indexlevel => 'basic', tmpdir => "$tmp/v$_", $cb);
+} (1, 2);
+
+my $env = { TZ => 'UTC' };
+my ($out, $err);
+for my $v (sort keys %ibx) {
+ my $opt = { -C => $ibx{$v}->{inboxdir}, 1 => \$out, 2 => \$err };
+
+ ($out, $err) = ('', '');
+ run_script([ qw(-index -vv) ], $env, $opt);
+ is($?, 0, 'no error on initial index');
+
+ for my $x (qw(until before)) {
+ ($out, $err) = ('', '');
+ run_script([ qw(-index --reindex -vv),
+ "--$x=1970-01-01T02:00:00Z" ], $env, $opt);
+ is($?, 0, "no error with --$x");
+ like($err, qr! 1/1\b!, "$x only indexed one message");
+ }
+ for my $x (qw(after since)) {
+ ($out, $err) = ('', '');
+ run_script([ qw(-index --reindex -vv),
+ "--$x=1970-01-01T02:00:00Z" ], $env, $opt);
+ is($?, 0, "no error with --$x");
+ like($err, qr! 2/2\b!, "$x only indexed one message");
+ }
+
+ ($out, $err) = ('', '');
+ run_script([ qw(-index --reindex -vv) ], $env, $opt);
+ is($?, 0, 'no error on initial index');
+
+ for my $x (qw(since before after until)) {
+ ($out, $err) = ('', '');
+ run_script([ qw(-index -v), "--$x=1970-01-01T02:00:00Z" ],
+ $env, $opt);
+ isnt($?, 0, "--$x fails on --reindex");
+ }
+}
+
+done_testing;
^ permalink raw reply related [relevance 5%]
* [PATCH 0/3] some indexing stuff
@ 2021-10-05 9:40 7% Eric Wong
2021-10-05 9:40 5% ` [PATCH 3/3] index: --reindex w/ --{since,until,before,after} Eric Wong
0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2021-10-05 9:40 UTC (permalink / raw)
To: meta
-extindex still needs to support --{since,until,...},
but I got -index working with them, at least.
I initially failed to guard the last*commit SQLite
updates with since/until, and didn't notice it until
I started writing tests.
Eric Wong (3):
extsearchidx: favor 20-byte OID comparison
overidx: update comment for new sub name
index: --reindex w/ --{since,until,before,after}
Documentation/public-inbox-index.pod | 10 +++++
MANIFEST | 1 +
lib/PublicInbox/Admin.pm | 4 ++
lib/PublicInbox/ExtSearchIdx.pm | 3 +-
lib/PublicInbox/OverIdx.pm | 2 +-
lib/PublicInbox/SearchIdx.pm | 33 +++++++++++-----
lib/PublicInbox/V2Writable.pm | 8 +++-
script/public-inbox-index | 3 ++
t/reindex-time-range.t | 58 ++++++++++++++++++++++++++++
9 files changed, 108 insertions(+), 14 deletions(-)
create mode 100644 t/reindex-time-range.t
^ permalink raw reply [relevance 7%]
Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2021-10-05 9:40 7% [PATCH 0/3] some indexing stuff Eric Wong
2021-10-05 9:40 5% ` [PATCH 3/3] index: --reindex w/ --{since,until,before,after} Eric Wong
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).