From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 6971F1F4B9; Tue, 14 May 2019 02:04:44 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Cc: "Eric W. Biederman" Subject: [PATCH 3/3] searchidx: fix incremental index with indexlevel=basic on v1 Date: Tue, 14 May 2019 02:04:43 +0000 Message-Id: <20190514020443.7200-4-e@80x24.org> In-Reply-To: <20190514020443.7200-1-e@80x24.org> References: <20190514020443.7200-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We were reindexing the full history every invocation of -index when Xapian was not used because we were incorrectly relying on 'last_commit' metadata stored in Xapian. Rewrite the indexing logic to be less confusing while we're at it, since we rely on `git merge-base --is-ancestor' nowadays. Furthermore, we need to handle message removals from the overview index correctly when Xapian is not in use. Co-authored-by: Eric W. 
Biederman --- MANIFEST | 1 + lib/PublicInbox/OverIdx.pm | 9 ++- lib/PublicInbox/SearchIdx.pm | 68 +++++++++++-------- t/indexlevels-mirror.t | 125 +++++++++++++++++++++++++++++++++++ 4 files changed, 176 insertions(+), 27 deletions(-) create mode 100644 t/indexlevels-mirror.t diff --git a/MANIFEST b/MANIFEST index 28300e0..1da40a9 100644 --- a/MANIFEST +++ b/MANIFEST @@ -198,6 +198,7 @@ t/httpd.t t/hval.t t/import.t t/inbox.t +t/indexlevels-mirror.t t/init.t t/linkify.t t/main-bin/spamc diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index cc9bd7d..bb3068d 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -317,14 +317,21 @@ sub delete_articles { $self->delete_by_num($_) foreach @$nums; } +# returns number of removed messages +# $oid may be undef to match only on $mid sub remove_oid { my ($self, $oid, $mid) = @_; + my $nr = 0; $self->begin_lazy; each_by_mid($self, $mid, ['ddd'], sub { my ($smsg) = @_; - $self->delete_by_num($smsg->{num}) if $smsg->{blob} eq $oid; + if (!defined($oid) || $smsg->{blob} eq $oid) { + $self->delete_by_num($smsg->{num}); + $nr++; + } 1; }); + $nr; } sub create_tables { diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index db0495b..1b86f72 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -365,6 +365,7 @@ sub find_doc_ids { ($db->postlist_begin($termval), $db->postlist_end($termval)); } +# v1 only sub batch_do { my ($self, $termval, $cb) = @_; my $batch_size = 1000; # don't let @ids grow too large to avoid OOM @@ -379,25 +380,33 @@ sub batch_do { } } +# v1 only, where $mid is unique sub remove_message { my ($self, $mid) = @_; my $db = $self->{xdb}; - my $called; $mid = mid_clean($mid); - my $over = $self->{over}; + if (my $over = $self->{over}) { + my $nr = eval { $over->remove_oid(undef, $mid) }; + if ($@) { + warn "failed to remove <$mid> from overview: $@\n"; + } elsif ($nr == 0) { + warn "<$mid> missing for removal from overview\n"; + 
} + } + return if $self->{indexlevel} !~ $xapianlevels; + my $nr = 0; eval { batch_do($self, 'Q' . $mid, sub { my ($ids) = @_; $db->delete_document($_) for @$ids; - $over->delete_articles($ids) if $over; - $called = 1; + $nr = scalar @$ids; }); }; if ($@) { - warn "failed to remove message <$mid>: $@\n"; - } elsif (!$called) { - warn "cannot remove non-existent <$mid>\n"; + warn "failed to remove <$mid> from Xapian: $@\n"; + } elsif ($nr == 0) { + warn "<$mid> missing for removal from Xapian\n"; } } @@ -648,12 +657,30 @@ sub need_update ($$$) { ($n eq '' || $n > 0); } +# The last git commit we indexed with Xapian or SQLite (msgmap) +# This needs to account for cases where Xapian or SQLite is +# out-of-date with respect to the other. +sub _last_x_commit { + my ($self, $mm) = @_; + my $lm = $mm->last_commit || ''; + my $lx = ''; + if ($self->{indexlevel} =~ $xapianlevels) { + $lx = $self->{xdb}->get_metadata('last_commit') || ''; + } else { + $lx = $lm; + } + # Use last_commit from msgmap if it is older or unset + if (!$lm || ($lx && $lx && is_ancestor($self->{git}, $lm, $lx))) { + $lx = $lm; + } + $lx; +} + # indexes all unindexed messages (v1 only) sub _index_sync { my ($self, $opts) = @_; my $tip = $opts->{ref} || 'HEAD'; - my $reindex = $opts->{reindex}; - my ($mkey, $last_commit, $lx, $xlog); + my ($last_commit, $lx, $xlog); my $git = $self->{git}; $git->batch_prepare; @@ -661,19 +688,8 @@ sub _index_sync { my $mm = _msgmap_init($self); do { $xlog = undef; - $mkey = 'last_commit'; - $last_commit = $xdb->get_metadata('last_commit'); - $lx = $last_commit; - if ($reindex) { - $lx = ''; - $mkey = undef if $last_commit ne ''; - } - - # use last_commit from msgmap if it is older or unset - my $lm = $mm->last_commit || ''; - if (!$lm || ($lm && $lx && is_ancestor($git, $lm, $lx))) { - $lx = $lm; - } + $last_commit = _last_x_commit($self, $mm); + $lx = $opts->{reindex} ? 
'' : $last_commit; $self->{over}->rollback_lazy; $self->{over}->disconnect; @@ -687,7 +703,7 @@ sub _index_sync { $xlog = _git_log($self, $range); $xdb = $self->begin_txn_lazy; - } while ($xdb->get_metadata('last_commit') ne $last_commit); + } while (_last_x_commit($self, $mm) ne $last_commit); my $dbh = $mm->{dbh} if $mm; my $cb = sub { @@ -701,10 +717,10 @@ sub _index_sync { } $dbh->commit; } - if ($mkey && $newest && $self->{indexlevel} =~ $xapianlevels) { - my $cur = $xdb->get_metadata($mkey); + if ($newest && $self->{indexlevel} =~ $xapianlevels) { + my $cur = $xdb->get_metadata('last_commit'); if (need_update($self, $cur, $newest)) { - $xdb->set_metadata($mkey, $newest); + $xdb->set_metadata('last_commit', $newest); } } $self->commit_txn_lazy; diff --git a/t/indexlevels-mirror.t b/t/indexlevels-mirror.t new file mode 100644 index 0000000..e25b827 --- /dev/null +++ b/t/indexlevels-mirror.t @@ -0,0 +1,125 @@ +# Copyright (C) 2019 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use PublicInbox::MIME; +use PublicInbox::Inbox; +use File::Temp qw/tempdir/; +require './t/common.perl'; +require_git(2.6); +my $this = (split('/', __FILE__))[-1]; + +# TODO: remove Search::Xapian as a requirement for basic +foreach my $mod (qw(DBD::SQLite Search::Xapian)) { + eval "require $mod"; + plan skip_all => "$mod missing for $this" if $@; +} + +my $path = 'blib/script'; +my $index = "$path/public-inbox-index"; + +my $mime = PublicInbox::MIME->create( + header => [ + From => 'a@example.com', + To => 'test@example.com', + Subject => 'this is a subject', + Date => 'Fri, 02 Oct 1993 00:00:00 +0000', + ], + body => "hello world\n", +); + +sub import_index_incremental { + my ($v, $level) = @_; + my $tmpdir = tempdir("pi-$this-tmp-XXXXXX", TMPDIR => 1, CLEANUP => 1); + my $ibx = PublicInbox::Inbox->new({ + mainrepo => "$tmpdir/testbox", + name => "$this-$v", + version => $v, + -primary_address => 'test@example.com', + indexlevel => $level, + }); + my 
$cls = "PublicInbox::V${v}Writable"; + use_ok $cls; + my $im = $cls->new($ibx, {nproc=>1}); + $mime->header_set('Message-ID', '<m@1>'); + ok($im->add($mime), 'first message added'); + $im->done; + + # index master (required for v1) + is(system($index, $ibx->{mainrepo}), 0, 'index master OK'); + my $ro_master = PublicInbox::Inbox->new({mainrepo => $ibx->{mainrepo}}); + my ($nr, $msgs) = $ro_master->recent; + is($nr, 1, 'only one message in master, so far'); + is($msgs->[0]->{mid}, 'm@1', 'first message in master indexed'); + + # clone + my @cmd = (qw(git clone --mirror -q)); + my $mirror = "$tmpdir/mirror-$v"; + if ($v == 1) { + push @cmd, $ibx->{mainrepo}, $mirror; + } else { + push @cmd, "$ibx->{mainrepo}/git/0.git", "$mirror/git/0.git"; + } + my $fetch_dir = $cmd[-1]; + is(system(@cmd), 0, "v$v clone OK"); + + # inbox init + local $ENV{PI_CONFIG} = "$tmpdir/.picfg"; + @cmd = ("$path/public-inbox-init", '-L', $level, + 'mirror', $mirror, '//example.com/test', 'test@example.com'); + push @cmd, '-V2' if $v == 2; + is(system(@cmd), 0, "v$v init OK"); + + # index mirror + is(system($index, $mirror), 0, "v$v index mirror OK"); + + # read-only access + my $ro_mirror = PublicInbox::Inbox->new({mainrepo => $mirror}); + ($nr, $msgs) = $ro_mirror->recent; + is($nr, 1, 'only one message, so far'); + is($msgs->[0]->{mid}, 'm@1', 'read first message'); + + # update master + $mime->header_set('Message-ID', '<m@2>'); + ok($im->add($mime), '2nd message added'); + $im->done; + + # mirror updates + is(system('git', "--git-dir=$fetch_dir", qw(fetch -q)), 0, 'fetch OK'); + is(system($index, $mirror), 0, "v$v index mirror again OK"); + ($nr, $msgs) = $ro_mirror->recent; + is($nr, 2, '2nd message seen in mirror'); + is_deeply([sort { $a cmp $b } map { $_->{mid} } @$msgs], + ['m@1','m@2'], 'got both messages in mirror'); + + # incremental index master (required for v1) + is(system($index, $ibx->{mainrepo}), 0, 'index master OK'); + ($nr, $msgs) = $ro_master->recent; + is($nr, 2, '2nd message seen 
in master'); + is_deeply([sort { $a cmp $b } map { $_->{mid} } @$msgs], + ['m@1','m@2'], 'got both messages in master'); + + # remove message from master + ok($im->remove($mime), '2nd message removed'); + $im->done; + + # sync the mirror + is(system('git', "--git-dir=$fetch_dir", qw(fetch -q)), 0, 'fetch OK'); + is(system($index, $mirror), 0, "v$v index mirror again OK"); + ($nr, $msgs) = $ro_mirror->recent; + is($nr, 1, '2nd message gone from mirror'); + is_deeply([map { $_->{mid} } @$msgs], ['m@1'], + 'message unavailable in mirror'); +} + +# we can probably cull some other tests and put full/medium tests, here +for my $level (qw(basic)) { + for my $v (1..2) { + subtest("v$v indexlevel=$level" => sub { + import_index_incremental($v, $level); + }) + } +} + +done_testing(); -- EW