From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Subject: [PATCH 3/3] searchidx: fix incremental index with indexlevel=basic on v1
Date: Tue, 14 May 2019 02:04:43 +0000 [thread overview]
Message-ID: <20190514020443.7200-4-e@80x24.org> (raw)
In-Reply-To: <20190514020443.7200-1-e@80x24.org>
We were reindexing the full history on every invocation of -index
when Xapian was not used, because we were incorrectly relying on
the 'last_commit' metadata stored in Xapian.
Rewrite the indexing logic to be less confusing while we're
at it, since we rely on `git merge-base --is-ancestor' nowadays.
Furthermore, we need to handle message removals from the
overview index correctly when Xapian is not in use.
Co-authored-by: Eric W. Biederman <ebiederm@xmission.com>
---
MANIFEST | 1 +
lib/PublicInbox/OverIdx.pm | 9 ++-
lib/PublicInbox/SearchIdx.pm | 68 +++++++++++--------
t/indexlevels-mirror.t | 125 +++++++++++++++++++++++++++++++++++
4 files changed, 176 insertions(+), 27 deletions(-)
create mode 100644 t/indexlevels-mirror.t
diff --git a/MANIFEST b/MANIFEST
index 28300e0..1da40a9 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -198,6 +198,7 @@ t/httpd.t
t/hval.t
t/import.t
t/inbox.t
+t/indexlevels-mirror.t
t/init.t
t/linkify.t
t/main-bin/spamc
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index cc9bd7d..bb3068d 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -317,14 +317,21 @@ sub delete_articles {
$self->delete_by_num($_) foreach @$nums;
}
+# returns number of removed messages
+# $oid may be undef to match only on $mid
sub remove_oid {
my ($self, $oid, $mid) = @_;
+ my $nr = 0;
$self->begin_lazy;
each_by_mid($self, $mid, ['ddd'], sub {
my ($smsg) = @_;
- $self->delete_by_num($smsg->{num}) if $smsg->{blob} eq $oid;
+ if (!defined($oid) || $smsg->{blob} eq $oid) {
+ $self->delete_by_num($smsg->{num});
+ $nr++;
+ }
1;
});
+ $nr;
}
sub create_tables {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index db0495b..1b86f72 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -365,6 +365,7 @@ sub find_doc_ids {
($db->postlist_begin($termval), $db->postlist_end($termval));
}
+# v1 only
sub batch_do {
my ($self, $termval, $cb) = @_;
my $batch_size = 1000; # don't let @ids grow too large to avoid OOM
@@ -379,25 +380,33 @@ sub batch_do {
}
}
+# v1 only, where $mid is unique
sub remove_message {
my ($self, $mid) = @_;
my $db = $self->{xdb};
- my $called;
$mid = mid_clean($mid);
- my $over = $self->{over};
+ if (my $over = $self->{over}) {
+ my $nr = eval { $over->remove_oid(undef, $mid) };
+ if ($@) {
+ warn "failed to remove <$mid> from overview: $@\n";
+ } elsif ($nr == 0) {
+ warn "<$mid> missing for removal from overview\n";
+ }
+ }
+ return if $self->{indexlevel} !~ $xapianlevels;
+ my $nr = 0;
eval {
batch_do($self, 'Q' . $mid, sub {
my ($ids) = @_;
$db->delete_document($_) for @$ids;
- $over->delete_articles($ids) if $over;
- $called = 1;
+ $nr = scalar @$ids;
});
};
if ($@) {
- warn "failed to remove message <$mid>: $@\n";
- } elsif (!$called) {
- warn "cannot remove non-existent <$mid>\n";
+ warn "failed to remove <$mid> from Xapian: $@\n";
+ } elsif ($nr == 0) {
+ warn "<$mid> missing for removal from Xapian\n";
}
}
@@ -648,12 +657,30 @@ sub need_update ($$$) {
($n eq '' || $n > 0);
}
+# The last git commit we indexed with Xapian or SQLite (msgmap)
+# This needs to account for cases where Xapian or SQLite is
+# out-of-date with respect to the other.
+sub _last_x_commit {
+ my ($self, $mm) = @_;
+ my $lm = $mm->last_commit || '';
+ my $lx = '';
+ if ($self->{indexlevel} =~ $xapianlevels) {
+ $lx = $self->{xdb}->get_metadata('last_commit') || '';
+ } else {
+ $lx = $lm;
+ }
+ # Use last_commit from msgmap if it is older or unset
+ if (!$lm || ($lx && $lx && is_ancestor($self->{git}, $lm, $lx))) {
+ $lx = $lm;
+ }
+ $lx;
+}
+
# indexes all unindexed messages (v1 only)
sub _index_sync {
my ($self, $opts) = @_;
my $tip = $opts->{ref} || 'HEAD';
- my $reindex = $opts->{reindex};
- my ($mkey, $last_commit, $lx, $xlog);
+ my ($last_commit, $lx, $xlog);
my $git = $self->{git};
$git->batch_prepare;
@@ -661,19 +688,8 @@ sub _index_sync {
my $mm = _msgmap_init($self);
do {
$xlog = undef;
- $mkey = 'last_commit';
- $last_commit = $xdb->get_metadata('last_commit');
- $lx = $last_commit;
- if ($reindex) {
- $lx = '';
- $mkey = undef if $last_commit ne '';
- }
-
- # use last_commit from msgmap if it is older or unset
- my $lm = $mm->last_commit || '';
- if (!$lm || ($lm && $lx && is_ancestor($git, $lm, $lx))) {
- $lx = $lm;
- }
+ $last_commit = _last_x_commit($self, $mm);
+ $lx = $opts->{reindex} ? '' : $last_commit;
$self->{over}->rollback_lazy;
$self->{over}->disconnect;
@@ -687,7 +703,7 @@ sub _index_sync {
$xlog = _git_log($self, $range);
$xdb = $self->begin_txn_lazy;
- } while ($xdb->get_metadata('last_commit') ne $last_commit);
+ } while (_last_x_commit($self, $mm) ne $last_commit);
my $dbh = $mm->{dbh} if $mm;
my $cb = sub {
@@ -701,10 +717,10 @@ sub _index_sync {
}
$dbh->commit;
}
- if ($mkey && $newest && $self->{indexlevel} =~ $xapianlevels) {
- my $cur = $xdb->get_metadata($mkey);
+ if ($newest && $self->{indexlevel} =~ $xapianlevels) {
+ my $cur = $xdb->get_metadata('last_commit');
if (need_update($self, $cur, $newest)) {
- $xdb->set_metadata($mkey, $newest);
+ $xdb->set_metadata('last_commit', $newest);
}
}
$self->commit_txn_lazy;
diff --git a/t/indexlevels-mirror.t b/t/indexlevels-mirror.t
new file mode 100644
index 0000000..e25b827
--- /dev/null
+++ b/t/indexlevels-mirror.t
@@ -0,0 +1,125 @@
+# Copyright (C) 2019 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use warnings;
+use Test::More;
+use PublicInbox::MIME;
+use PublicInbox::Inbox;
+use File::Temp qw/tempdir/;
+require './t/common.perl';
+require_git(2.6);
+my $this = (split('/', __FILE__))[-1];
+
+# TODO: remove Search::Xapian as a requirement for basic
+foreach my $mod (qw(DBD::SQLite Search::Xapian)) {
+ eval "require $mod";
+ plan skip_all => "$mod missing for $this" if $@;
+}
+
+my $path = 'blib/script';
+my $index = "$path/public-inbox-index";
+
+my $mime = PublicInbox::MIME->create(
+ header => [
+ From => 'a@example.com',
+ To => 'test@example.com',
+ Subject => 'this is a subject',
+ Date => 'Fri, 02 Oct 1993 00:00:00 +0000',
+ ],
+ body => "hello world\n",
+);
+
+sub import_index_incremental {
+ my ($v, $level) = @_;
+ my $tmpdir = tempdir("pi-$this-tmp-XXXXXX", TMPDIR => 1, CLEANUP => 1);
+ my $ibx = PublicInbox::Inbox->new({
+ mainrepo => "$tmpdir/testbox",
+ name => "$this-$v",
+ version => $v,
+ -primary_address => 'test@example.com',
+ indexlevel => $level,
+ });
+ my $cls = "PublicInbox::V${v}Writable";
+ use_ok $cls;
+ my $im = $cls->new($ibx, {nproc=>1});
+ $mime->header_set('Message-ID', '<m@1>');
+ ok($im->add($mime), 'first message added');
+ $im->done;
+
+ # index master (required for v1)
+ is(system($index, $ibx->{mainrepo}), 0, 'index master OK');
+ my $ro_master = PublicInbox::Inbox->new({mainrepo => $ibx->{mainrepo}});
+ my ($nr, $msgs) = $ro_master->recent;
+ is($nr, 1, 'only one message in master, so far');
+ is($msgs->[0]->{mid}, 'm@1', 'first message in master indexed');
+
+ # clone
+ my @cmd = (qw(git clone --mirror -q));
+ my $mirror = "$tmpdir/mirror-$v";
+ if ($v == 1) {
+ push @cmd, $ibx->{mainrepo}, $mirror;
+ } else {
+ push @cmd, "$ibx->{mainrepo}/git/0.git", "$mirror/git/0.git";
+ }
+ my $fetch_dir = $cmd[-1];
+ is(system(@cmd), 0, "v$v clone OK");
+
+ # inbox init
+ local $ENV{PI_CONFIG} = "$tmpdir/.picfg";
+ @cmd = ("$path/public-inbox-init", '-L', $level,
+ 'mirror', $mirror, '//example.com/test', 'test@example.com');
+ push @cmd, '-V2' if $v == 2;
+ is(system(@cmd), 0, "v$v init OK");
+
+ # index mirror
+ is(system($index, $mirror), 0, "v$v index mirror OK");
+
+ # read-only access
+ my $ro_mirror = PublicInbox::Inbox->new({mainrepo => $mirror});
+ ($nr, $msgs) = $ro_mirror->recent;
+ is($nr, 1, 'only one message, so far');
+ is($msgs->[0]->{mid}, 'm@1', 'read first message');
+
+ # update master
+ $mime->header_set('Message-ID', '<m@2>');
+ ok($im->add($mime), '2nd message added');
+ $im->done;
+
+ # mirror updates
+ is(system('git', "--git-dir=$fetch_dir", qw(fetch -q)), 0, 'fetch OK');
+ is(system($index, $mirror), 0, "v$v index mirror again OK");
+ ($nr, $msgs) = $ro_mirror->recent;
+ is($nr, 2, '2nd message seen in mirror');
+ is_deeply([sort { $a cmp $b } map { $_->{mid} } @$msgs],
+ ['m@1','m@2'], 'got both messages in mirror');
+
+ # incremental index master (required for v1)
+ is(system($index, $ibx->{mainrepo}), 0, 'index master OK');
+ ($nr, $msgs) = $ro_master->recent;
+ is($nr, 2, '2nd message seen in master');
+ is_deeply([sort { $a cmp $b } map { $_->{mid} } @$msgs],
+ ['m@1','m@2'], 'got both messages in master');
+
+ # remove message from master
+ ok($im->remove($mime), '2nd message removed');
+ $im->done;
+
+ # sync the mirror
+ is(system('git', "--git-dir=$fetch_dir", qw(fetch -q)), 0, 'fetch OK');
+ is(system($index, $mirror), 0, "v$v index mirror again OK");
+ ($nr, $msgs) = $ro_mirror->recent;
+ is($nr, 1, '2nd message gone from mirror');
+ is_deeply([map { $_->{mid} } @$msgs], ['m@1'],
+ 'message unavailable in mirror');
+}
+
+# we can probably cull some other tests and put full/medium tests, here
+for my $level (qw(basic)) {
+ for my $v (1..2) {
+ subtest("v$v indexlevel=$level" => sub {
+ import_index_incremental($v, $level);
+ })
+ }
+}
+
+done_testing();
--
EW
next prev parent reply other threads:[~2019-05-14 2:04 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-08-01 16:41 [PATCH 00/13]: Incremental index fixes Eric W. Biederman
2018-08-01 16:43 ` [PATCH 01/13] Import.pm: Don't assume {in} and {out} always exist Eric W. Biederman
2018-08-01 16:43 ` [PATCH 02/13] t/v1reindex.t: Isolate the test cases Eric W. Biederman
2018-08-01 16:43 ` [PATCH 03/13] t/v2reindex.t: Isolate the test cases more Eric W. Biederman
2018-08-01 16:43 ` [PATCH 04/13] t/v[12]reindex.t: Place expected second in Xapian tests Eric W. Biederman
2018-08-01 16:43 ` [PATCH 05/13] t/v[12]reindex.t: Test that the resulting msgmap is as expected Eric W. Biederman
2018-08-01 16:43 ` [PATCH 06/13] t/v[12]reindex.t: Test incremental indexing works Eric W. Biederman
2018-08-01 16:43 ` [PATCH 07/13] SearchIdx.pm: Always assign numbers backwards during incremental indexing Eric W. Biederman
2018-08-01 16:43 ` [PATCH 08/13] Msgmap.pm: Track the largest value of num ever assigned Eric W. Biederman
2018-08-02 3:00 ` Eric Wong
2018-08-02 3:44 ` [WIP] searchidx: support incremental indexing on indexlevel=basic Eric Wong
2018-08-02 12:25 ` Eric W. Biederman
2018-08-02 17:12 ` Eric W. Biederman
2018-08-02 18:15 ` Eric W. Biederman
2019-05-14 2:04 ` [PATCH 0/3] incremental index fixes for indexlevel=basic Eric Wong
2019-05-14 2:04 ` [PATCH 1/3] v1writable: new wrapper which is closer to v2writable Eric Wong
2019-05-14 2:04 ` [PATCH 2/3] v2writable: allow setting nproc via creat options Eric Wong
2019-05-14 2:04 ` Eric Wong [this message]
2018-08-02 12:08 ` [PATCH 08/13] Msgmap.pm: Track the largest value of num ever assigned Eric W. Biederman
2018-08-01 16:43 ` [PATCH 09/13] t/v[12]reindex.t Verify num_highwater Eric W. Biederman
2018-08-01 16:43 ` [PATCH 10/13] t/v[12]reindex.t: Verify the num highwater is as expected Eric W. Biederman
2018-08-01 16:43 ` [PATCH 11/13] SearchIdx,V2Writeable: Update num_highwater on optimized deletes Eric W. Biederman
2018-08-01 16:43 ` [PATCH 12/13] V2Writeable.pm: Ensure that a found message number is in the msgmap Eric W. Biederman
2018-08-01 16:43 ` [PATCH 13/13] V2Writeable.pm: In unindex_oid delete the message from msgmap Eric W. Biederman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190514020443.7200-4-e@80x24.org \
--to=e@80x24.org \
--cc=ebiederm@xmission.com \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).