user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 1/7] overidx: remove unused delete_articles sub
  @ 2019-10-21 11:22  6% ` Eric Wong
  0 siblings, 0 replies; 3+ results
From: Eric Wong @ 2019-10-21 11:22 UTC (permalink / raw)
  To: meta

This hasn't been used since commit 1b7e935ab1690e28
("searchidx: fix incremental index with indexlevel=basic on v1")
---
 lib/PublicInbox/OverIdx.pm | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 7fd1905d..e8df01c4 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -320,12 +320,6 @@ INSERT INTO id2num (id, num) VALUES (?,?)
 	}
 }
 
-sub delete_articles {
-	my ($self, $nums) = @_;
-	my $dbh = $self->connect;
-	$self->delete_by_num($_) foreach @$nums;
-}
-
 # returns number of removed messages
 # $oid may be undef to match only on $mid
 sub remove_oid {

^ permalink raw reply	[relevance 6%]

* [PATCH 3/3] searchidx: fix incremental index with indexlevel=basic on v1
  2019-05-14  2:04  7% ` [PATCH 0/3] incremental index fixes for indexlevel=basic Eric Wong
@ 2019-05-14  2:04  4%   ` Eric Wong
  0 siblings, 0 replies; 3+ results
From: Eric Wong @ 2019-05-14  2:04 UTC (permalink / raw)
  To: meta; +Cc: Eric W. Biederman

We were reindexing the full history on every invocation of -index
when Xapian was not used because we were incorrectly relying on
'last_commit' metadata stored in Xapian.

Rewrite the indexing logic to be less confusing while we're
at it, since we rely on `git merge-base --is-ancestor' nowadays.

Furthermore, we need to handle message removals from the
overview index correctly when Xapian is not in use.

Co-authored-by: Eric W. Biederman <ebiederm@xmission.com>
---
 MANIFEST                     |   1 +
 lib/PublicInbox/OverIdx.pm   |   9 ++-
 lib/PublicInbox/SearchIdx.pm |  68 +++++++++++--------
 t/indexlevels-mirror.t       | 125 +++++++++++++++++++++++++++++++++++
 4 files changed, 176 insertions(+), 27 deletions(-)
 create mode 100644 t/indexlevels-mirror.t

diff --git a/MANIFEST b/MANIFEST
index 28300e0..1da40a9 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -198,6 +198,7 @@ t/httpd.t
 t/hval.t
 t/import.t
 t/inbox.t
+t/indexlevels-mirror.t
 t/init.t
 t/linkify.t
 t/main-bin/spamc
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index cc9bd7d..bb3068d 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -317,14 +317,21 @@ sub delete_articles {
 	$self->delete_by_num($_) foreach @$nums;
 }
 
+# returns number of removed messages
+# $oid may be undef to match only on $mid
 sub remove_oid {
 	my ($self, $oid, $mid) = @_;
+	my $nr = 0;
 	$self->begin_lazy;
 	each_by_mid($self, $mid, ['ddd'], sub {
 		my ($smsg) = @_;
-		$self->delete_by_num($smsg->{num}) if $smsg->{blob} eq $oid;
+		if (!defined($oid) || $smsg->{blob} eq $oid) {
+			$self->delete_by_num($smsg->{num});
+			$nr++;
+		}
 		1;
 	});
+	$nr;
 }
 
 sub create_tables {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index db0495b..1b86f72 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -365,6 +365,7 @@ sub find_doc_ids {
 	($db->postlist_begin($termval), $db->postlist_end($termval));
 }
 
+# v1 only
 sub batch_do {
 	my ($self, $termval, $cb) = @_;
 	my $batch_size = 1000; # don't let @ids grow too large to avoid OOM
@@ -379,25 +380,33 @@ sub batch_do {
 	}
 }
 
+# v1 only, where $mid is unique
 sub remove_message {
 	my ($self, $mid) = @_;
 	my $db = $self->{xdb};
-	my $called;
 	$mid = mid_clean($mid);
-	my $over = $self->{over};
 
+	if (my $over = $self->{over}) {
+		my $nr = eval { $over->remove_oid(undef, $mid) };
+		if ($@) {
+			warn "failed to remove <$mid> from overview: $@\n";
+		} elsif ($nr == 0) {
+			warn "<$mid> missing for removal from overview\n";
+		}
+	}
+	return if $self->{indexlevel} !~ $xapianlevels;
+	my $nr = 0;
 	eval {
 		batch_do($self, 'Q' . $mid, sub {
 			my ($ids) = @_;
 			$db->delete_document($_) for @$ids;
-			$over->delete_articles($ids) if $over;
-			$called = 1;
+			$nr = scalar @$ids;
 		});
 	};
 	if ($@) {
-		warn "failed to remove message <$mid>: $@\n";
-	} elsif (!$called) {
-		warn "cannot remove non-existent <$mid>\n";
+		warn "failed to remove <$mid> from Xapian: $@\n";
+	} elsif ($nr == 0) {
+		warn "<$mid> missing for removal from Xapian\n";
 	}
 }
 
@@ -648,12 +657,30 @@ sub need_update ($$$) {
 	($n eq '' || $n > 0);
 }
 
+# The last git commit we indexed with Xapian or SQLite (msgmap)
+# This needs to account for cases where Xapian or SQLite is
+# out-of-date with respect to the other.
+sub _last_x_commit {
+	my ($self, $mm) = @_;
+	my $lm = $mm->last_commit || '';
+	my $lx = '';
+	if ($self->{indexlevel} =~ $xapianlevels) {
+		$lx = $self->{xdb}->get_metadata('last_commit') || '';
+	} else {
+		$lx = $lm;
+	}
+	# Use last_commit from msgmap if it is older or unset
+	if (!$lm || ($lx && $lx && is_ancestor($self->{git}, $lm, $lx))) {
+		$lx = $lm;
+	}
+	$lx;
+}
+
 # indexes all unindexed messages (v1 only)
 sub _index_sync {
 	my ($self, $opts) = @_;
 	my $tip = $opts->{ref} || 'HEAD';
-	my $reindex = $opts->{reindex};
-	my ($mkey, $last_commit, $lx, $xlog);
+	my ($last_commit, $lx, $xlog);
 	my $git = $self->{git};
 	$git->batch_prepare;
 
@@ -661,19 +688,8 @@ sub _index_sync {
 	my $mm = _msgmap_init($self);
 	do {
 		$xlog = undef;
-		$mkey = 'last_commit';
-		$last_commit = $xdb->get_metadata('last_commit');
-		$lx = $last_commit;
-		if ($reindex) {
-			$lx = '';
-			$mkey = undef if $last_commit ne '';
-		}
-
-		# use last_commit from msgmap if it is older or unset
-		my $lm = $mm->last_commit || '';
-		if (!$lm || ($lm && $lx && is_ancestor($git, $lm, $lx))) {
-			$lx = $lm;
-		}
+		$last_commit = _last_x_commit($self, $mm);
+		$lx = $opts->{reindex} ? '' : $last_commit;
 
 		$self->{over}->rollback_lazy;
 		$self->{over}->disconnect;
@@ -687,7 +703,7 @@ sub _index_sync {
 		$xlog = _git_log($self, $range);
 
 		$xdb = $self->begin_txn_lazy;
-	} while ($xdb->get_metadata('last_commit') ne $last_commit);
+	} while (_last_x_commit($self, $mm) ne $last_commit);
 
 	my $dbh = $mm->{dbh} if $mm;
 	my $cb = sub {
@@ -701,10 +717,10 @@ sub _index_sync {
 			}
 			$dbh->commit;
 		}
-		if ($mkey && $newest && $self->{indexlevel} =~ $xapianlevels) {
-			my $cur = $xdb->get_metadata($mkey);
+		if ($newest && $self->{indexlevel} =~ $xapianlevels) {
+			my $cur = $xdb->get_metadata('last_commit');
 			if (need_update($self, $cur, $newest)) {
-				$xdb->set_metadata($mkey, $newest);
+				$xdb->set_metadata('last_commit', $newest);
 			}
 		}
 		$self->commit_txn_lazy;
diff --git a/t/indexlevels-mirror.t b/t/indexlevels-mirror.t
new file mode 100644
index 0000000..e25b827
--- /dev/null
+++ b/t/indexlevels-mirror.t
@@ -0,0 +1,125 @@
+# Copyright (C) 2019 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use warnings;
+use Test::More;
+use PublicInbox::MIME;
+use PublicInbox::Inbox;
+use File::Temp qw/tempdir/;
+require './t/common.perl';
+require_git(2.6);
+my $this = (split('/', __FILE__))[-1];
+
+# TODO: remove Search::Xapian as a requirement for basic
+foreach my $mod (qw(DBD::SQLite Search::Xapian)) {
+	eval "require $mod";
+	plan skip_all => "$mod missing for $this" if $@;
+}
+
+my $path = 'blib/script';
+my $index = "$path/public-inbox-index";
+
+my $mime = PublicInbox::MIME->create(
+	header => [
+		From => 'a@example.com',
+		To => 'test@example.com',
+		Subject => 'this is a subject',
+		Date => 'Fri, 02 Oct 1993 00:00:00 +0000',
+	],
+	body => "hello world\n",
+);
+
+sub import_index_incremental {
+	my ($v, $level) = @_;
+	my $tmpdir = tempdir("pi-$this-tmp-XXXXXX", TMPDIR => 1, CLEANUP => 1);
+	my $ibx = PublicInbox::Inbox->new({
+		mainrepo => "$tmpdir/testbox",
+		name => "$this-$v",
+		version => $v,
+		-primary_address => 'test@example.com',
+		indexlevel => $level,
+	});
+	my $cls = "PublicInbox::V${v}Writable";
+	use_ok $cls;
+	my $im = $cls->new($ibx, {nproc=>1});
+	$mime->header_set('Message-ID', '<m@1>');
+	ok($im->add($mime), 'first message added');
+	$im->done;
+
+	# index master (required for v1)
+	is(system($index, $ibx->{mainrepo}), 0, 'index master OK');
+	my $ro_master = PublicInbox::Inbox->new({mainrepo => $ibx->{mainrepo}});
+	my ($nr, $msgs) = $ro_master->recent;
+	is($nr, 1, 'only one message in master, so far');
+	is($msgs->[0]->{mid}, 'm@1', 'first message in master indexed');
+
+	# clone
+	my @cmd = (qw(git clone --mirror -q));
+	my $mirror = "$tmpdir/mirror-$v";
+	if ($v == 1) {
+		push @cmd, $ibx->{mainrepo}, $mirror;
+	} else {
+		push @cmd, "$ibx->{mainrepo}/git/0.git", "$mirror/git/0.git";
+	}
+	my $fetch_dir = $cmd[-1];
+	is(system(@cmd), 0, "v$v clone OK");
+
+	# inbox init
+	local $ENV{PI_CONFIG} = "$tmpdir/.picfg";
+	@cmd = ("$path/public-inbox-init", '-L', $level,
+		'mirror', $mirror, '//example.com/test', 'test@example.com');
+	push @cmd, '-V2' if $v == 2;
+	is(system(@cmd), 0, "v$v init OK");
+
+	# index mirror
+	is(system($index, $mirror), 0, "v$v index mirror OK");
+
+	# read-only access
+	my $ro_mirror = PublicInbox::Inbox->new({mainrepo => $mirror});
+	($nr, $msgs) = $ro_mirror->recent;
+	is($nr, 1, 'only one message, so far');
+	is($msgs->[0]->{mid}, 'm@1', 'read first message');
+
+	# update master
+	$mime->header_set('Message-ID', '<m@2>');
+	ok($im->add($mime), '2nd message added');
+	$im->done;
+
+	# mirror updates
+	is(system('git', "--git-dir=$fetch_dir", qw(fetch -q)), 0, 'fetch OK');
+	is(system($index, $mirror), 0, "v$v index mirror again OK");
+	($nr, $msgs) = $ro_mirror->recent;
+	is($nr, 2, '2nd message seen in mirror');
+	is_deeply([sort { $a cmp $b } map { $_->{mid} } @$msgs],
+		['m@1','m@2'], 'got both messages in mirror');
+
+	# incremental index master (required for v1)
+	is(system($index, $ibx->{mainrepo}), 0, 'index master OK');
+	($nr, $msgs) = $ro_master->recent;
+	is($nr, 2, '2nd message seen in master');
+	is_deeply([sort { $a cmp $b } map { $_->{mid} } @$msgs],
+		['m@1','m@2'], 'got both messages in master');
+
+	# remove message from master
+	ok($im->remove($mime), '2nd message removed');
+	$im->done;
+
+	# sync the mirror
+	is(system('git', "--git-dir=$fetch_dir", qw(fetch -q)), 0, 'fetch OK');
+	is(system($index, $mirror), 0, "v$v index mirror again OK");
+	($nr, $msgs) = $ro_mirror->recent;
+	is($nr, 1, '2nd message gone from mirror');
+	is_deeply([map { $_->{mid} } @$msgs], ['m@1'],
+		'message unavailable in mirror');
+}
+
+# we can probably cull some other tests and put full/medium tests, here
+for my $level (qw(basic)) {
+	for my $v (1..2) {
+		subtest("v$v indexlevel=$level" => sub {
+			import_index_incremental($v, $level);
+		})
+	}
+}
+
+done_testing();
-- 
EW


^ permalink raw reply	[relevance 4%]

* [PATCH 0/3] incremental index fixes for indexlevel=basic
  @ 2019-05-14  2:04  7% ` Eric Wong
  2019-05-14  2:04  4%   ` [PATCH 3/3] searchidx: fix incremental index with indexlevel=basic on v1 Eric Wong
  0 siblings, 1 reply; 3+ results
From: Eric Wong @ 2019-05-14  2:04 UTC (permalink / raw)
  To: meta; +Cc: Eric W. Biederman

Resurrecting an old issue from last year I completely forgot
about :x  Thanks to Eric Biederman for the cleanup to 3/3
to simplify the v1 indexing logic.

The first two are preparatory patches which make tests
easier to write; and I have plans to clean up and cull
redundancies in tests because they take too long.

Eric Wong (3):
  v1writable: new wrapper which is closer to v2writable
  v2writable: allow setting nproc via creat options
  searchidx: fix incremental index with indexlevel=basic on v1

 MANIFEST                      |   2 +
 lib/PublicInbox/Import.pm     |  15 +++-
 lib/PublicInbox/OverIdx.pm    |   9 ++-
 lib/PublicInbox/SearchIdx.pm  |  68 +++++++++++-------
 lib/PublicInbox/V1Writable.pm |  34 +++++++++
 lib/PublicInbox/V2Writable.pm |  19 ++++--
 script/public-inbox-index     |   3 +-
 script/public-inbox-init      |  23 +++----
 t/indexlevels-mirror.t        | 125 ++++++++++++++++++++++++++++++++++
 t/purge.t                     |   3 +-
 t/v2reindex.t                 |   3 +-
 t/v2writable.t                |   8 +--
 12 files changed, 252 insertions(+), 60 deletions(-)
 create mode 100644 lib/PublicInbox/V1Writable.pm
 create mode 100644 t/indexlevels-mirror.t

-- 
EW

^ permalink raw reply	[relevance 7%]

Results 1-3 of 3 | reverse | sort options + mbox downloads above
-- links below jump to the message on this page --
2018-08-02  3:44     [WIP] searchidx: support incremental indexing on indexlevel=basic Eric Wong
2019-05-14  2:04  7% ` [PATCH 0/3] incremental index fixes for indexlevel=basic Eric Wong
2019-05-14  2:04  4%   ` [PATCH 3/3] searchidx: fix incremental index with indexlevel=basic on v1 Eric Wong
2019-10-21 11:22     [PATCH 0/7] dead code elimination Eric Wong
2019-10-21 11:22  6% ` [PATCH 1/7] overidx: remove unused delete_articles sub Eric Wong

Code repositories for project(s) associated with this inbox:

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).