user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Cc: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
Subject: [PATCH 09/12] searchidx: regenerate and avoid article number gaps on full index
Date: Wed, 18 Apr 2018 09:13:13 +0000	[thread overview]
Message-ID: <20180418091316.29114-10-e@80x24.org> (raw)
In-Reply-To: <20180418091316.29114-1-e@80x24.org>

Some messages to git@vger went missing from Msgmap from old bugs
and became inaccessible via NNTP.  Forcing NNTP article numbers
when the overview DB came about made the problem more visible when
reindexing old (v1) repositories as all removed spam messages
took up AUTOINCREMENT numbers again before they were removed.

Having large gaps in NNTP article numbers is not good since it
throws off NNTP clients.  This does NOT prevent NNTP clients from
seeing some messages twice, but is better than having them
miss several messages entirely.

We also avoid depending on --reverse in git-log, as
git requires storing an entire commit list in memory for
--reverse, so it's cheaper to store only deleted blobs in the %D
hash since they do not live long.
---
 MANIFEST                      |   1 +
 lib/PublicInbox/Msgmap.pm     |   8 +-
 lib/PublicInbox/SearchIdx.pm  | 186 +++++++++++++++++++++-------------
 lib/PublicInbox/V2Writable.pm |  15 +--
 t/v1-add-remove-add.t         |  45 ++++++++
 5 files changed, 171 insertions(+), 84 deletions(-)
 create mode 100644 t/v1-add-remove-add.t

diff --git a/MANIFEST b/MANIFEST
index 00a0970..8038ad4 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -192,6 +192,7 @@ t/spawn.t
 t/thread-cycle.t
 t/time.t
 t/utf8.mbox
+t/v1-add-remove-add.t
 t/v2-add-remove-add.t
 t/v2mda.t
 t/v2mirror.t
diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index feef8ba..3237a5e 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -108,10 +108,10 @@ sub created_at {
 sub mid_insert {
 	my ($self, $mid) = @_;
 	my $dbh = $self->{dbh};
-	my $sql = 'INSERT OR IGNORE INTO msgmap (mid) VALUES (?)';
-	my $sth = $self->{mid_insert} ||= $dbh->prepare($sql);
-	$sth->bind_param(1, $mid);
-	return if $sth->execute == 0;
+	my $sth = $dbh->prepare_cached(<<'');
+INSERT OR IGNORE INTO msgmap (mid) VALUES (?)
+
+	return if $sth->execute($mid) == 0;
 	$dbh->last_insert_id(undef, undef, 'msgmap', 'num');
 }
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 0b1dc21..2239c90 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -17,6 +17,7 @@ use PublicInbox::MsgIter;
 use Carp qw(croak);
 use POSIX qw(strftime);
 use PublicInbox::OverIdx;
+use PublicInbox::Spawn qw(spawn);
 require PublicInbox::Git;
 use Compress::Zlib qw(compress);
 
@@ -266,8 +267,8 @@ sub add_message {
 	my $mids = mids($mime->header_obj);
 	$mid0 = $mids->[0] unless defined $mid0; # v1 compatibility
 	unless (defined $num) { # v1
-		my $mm = $self->_msgmap_init;
-		$num = $mm->mid_insert($mid0) || $mm->num_for($mid0);
+		$self->_msgmap_init;
+		$num = index_mm($self, $mime);
 	}
 	eval {
 		my $smsg = PublicInbox::SearchMsg->new($mime);
@@ -464,8 +465,28 @@ sub index_mm {
 	my ($self, $mime) = @_;
 	my $mid = mid_clean(mid_mime($mime));
 	my $mm = $self->{mm};
-	my $num = $mm->mid_insert($mid);
-	return $num if defined $num;
+	my $num;
+
+	if (defined $self->{regen_down}) {
+		$num = $mm->num_for($mid) and return $num;
+
+		while (($num = $self->{regen_down}--) > 0) {
+			if ($mm->mid_set($num, $mid) != 0) {
+				return $num;
+			}
+		}
+	} elsif (defined $self->{regen_up}) {
+		$num = $mm->num_for($mid) and return $num;
+
+		# this is to fixup old bugs due to add-remove-add
+		while (($num = ++$self->{regen_up})) {
+			if ($mm->mid_set($num, $mid) != 0) {
+				return $num;
+			}
+		}
+	}
+
+	$num = $mm->mid_insert($mid) and return $num;
 
 	# fallback to num_for since filters like RubyLang set the number
 	$mm->num_for($mid);
@@ -476,18 +497,6 @@ sub unindex_mm {
 	$self->{mm}->mid_delete(mid_clean(mid_mime($mime)));
 }
 
-sub index_mm2 {
-	my ($self, $mime, $bytes, $blob) = @_;
-	my $num = $self->{mm}->num_for(mid_clean(mid_mime($mime)));
-	add_message($self, $mime, $bytes, $num, $blob);
-}
-
-sub unindex_mm2 {
-	my ($self, $mime) = @_;
-	$self->{mm}->mid_delete(mid_clean(mid_mime($mime)));
-	unindex_blob($self, $mime);
-}
-
 sub index_both {
 	my ($self, $mime, $bytes, $blob) = @_;
 	my $num = index_mm($self, $mime);
@@ -521,12 +530,12 @@ sub batch_adjust ($$$$) {
 	$$max -= $bytes;
 	if ($$max <= 0) {
 		$$max = BATCH_BYTES;
-		$batch_cb->($latest, 1);
+		$batch_cb->($latest);
 	}
 }
 
 # only for v1
-sub rlog {
+sub read_log {
 	my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_;
 	my $hex = '[a-f0-9]';
 	my $h40 = $hex .'{40}';
@@ -537,23 +546,39 @@ sub rlog {
 	my $bytes;
 	my $max = BATCH_BYTES;
 	local $/ = "\n";
+	my %D;
 	my $line;
+	my $newest;
+	my $mid = '20170114215743.5igbjup6qpsh3jfg@genre.crustytoothpaste.net';
 	while (defined($line = <$log>)) {
 		if ($line =~ /$addmsg/o) {
 			my $blob = $1;
+			delete $D{$blob} and next;
 			my $mime = do_cat_mail($git, $blob, \$bytes) or next;
+			my $mids = mids($mime->header_obj);
+			foreach (@$mids) {
+				warn "ADD $mid\n" if ($_ eq $mid);
+			}
 			batch_adjust(\$max, $bytes, $batch_cb, $latest);
 			$add_cb->($self, $mime, $bytes, $blob);
 		} elsif ($line =~ /$delmsg/o) {
 			my $blob = $1;
-			my $mime = do_cat_mail($git, $blob, \$bytes) or next;
-			batch_adjust(\$max, $bytes, $batch_cb, $latest);
-			$del_cb->($self, $mime);
+			$D{$blob} = 1;
 		} elsif ($line =~ /^commit ($h40)/o) {
 			$latest = $1;
+			$newest ||= $latest;
+		}
+	}
+	# get the leftovers
+	foreach my $blob (keys %D) {
+		my $mime = do_cat_mail($git, $blob, \$bytes) or next;
+		my $mids = mids($mime->header_obj);
+		foreach (@$mids) {
+			warn "DEL $mid\n" if ($_ eq $mid);
 		}
+		$del_cb->($self, $mime);
 	}
-	$batch_cb->($latest, 0);
+	$batch_cb->($latest, $newest);
 }
 
 sub _msgmap_init {
@@ -567,18 +592,62 @@ sub _msgmap_init {
 
 sub _git_log {
 	my ($self, $range) = @_;
-	$self->{git}->popen(qw/log --reverse --no-notes --no-color
+	my $git = $self->{git};
+
+	if (index($range, '..') < 0) {
+		my $regen_max = 0;
+		# can't use 'rev-list --count' if we use --diff-filter
+		my $fh = $git->popen(qw(log --pretty=tformat:%h
+				--no-notes --no-color --no-renames
+				--diff-filter=AM), $range);
+		++$regen_max while <$fh>;
+		my (undef, $max) = $self->{mm}->minmax;
+
+		if ($max && $max == $regen_max) {
+			# fix up old bugs in full indexes which caused messages to
+			# not appear in Msgmap
+			$self->{regen_up} = $max;
+		} else {
+			# normal regen is for for fresh data
+			$self->{regen_down} = $regen_max;
+		}
+	}
+
+	$git->popen(qw/log --no-notes --no-color --no-renames
 				--raw -r --no-abbrev/, $range);
 }
 
+sub is_ancestor ($$$) {
+	my ($git, $cur, $tip) = @_;
+	return 0 unless $git->check($cur);
+	my $cmd = [ 'git', "--git-dir=$git->{git_dir}",
+		qw(merge-base --is-ancestor), $cur, $tip ];
+	my $pid = spawn($cmd);
+	defined $pid or die "spawning ".join(' ', @$cmd)." failed: $!";
+	waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish';
+	$? == 0;
+}
+
+sub need_update ($$$) {
+	my ($self, $cur, $new) = @_;
+	my $git = $self->{git};
+	return 1 if $cur && !is_ancestor($git, $cur, $new);
+	my $range = $cur eq '' ? $new : "$cur..$new";
+	chomp(my $n = $git->qx(qw(rev-list --count), $range));
+	($n eq '' || $n > 0);
+}
+
 # indexes all unindexed messages (v1 only)
 sub _index_sync {
 	my ($self, $opts) = @_;
 	my $tip = $opts->{ref} || 'HEAD';
 	my $reindex = $opts->{reindex};
 	my ($mkey, $last_commit, $lx, $xlog);
-	$self->{git}->batch_prepare;
+	my $git = $self->{git};
+	$git->batch_prepare;
+
 	my $xdb = $self->begin_txn_lazy;
+	my $mm = _msgmap_init($self);
 	do {
 		$xlog = undef;
 		$mkey = 'last_commit';
@@ -588,73 +657,54 @@ sub _index_sync {
 			$lx = '';
 			$mkey = undef if $last_commit ne '';
 		}
+
+		# use last_commit from msgmap if it is older or unset
+		my $lm = $mm->last_commit || '';
+		if (!$lm || ($lm && $lx && is_ancestor($git, $lm, $lx))) {
+			$lx = $lm;
+		}
+
 		$self->{over}->rollback_lazy;
 		$self->{over}->disconnect;
 		delete $self->{txn};
 		$xdb->cancel_transaction;
 		$xdb = _xdb_release($self);
 
-		# ensure we leak no FDs to "git log"
+		# ensure we leak no FDs to "git log" with Xapian <= 1.2
 		my $range = $lx eq '' ? $tip : "$lx..$tip";
 		$xlog = _git_log($self, $range);
 
 		$xdb = $self->begin_txn_lazy;
 	} while ($xdb->get_metadata('last_commit') ne $last_commit);
 
-	my $mm = _msgmap_init($self);
 	my $dbh = $mm->{dbh} if $mm;
-	my $mm_only;
 	my $cb = sub {
-		my ($commit, $more) = @_;
+		my ($commit, $newest) = @_;
 		if ($dbh) {
-			$mm->last_commit($commit) if $commit;
+			if ($newest) {
+				my $cur = $mm->last_commit || '';
+				if (need_update($self, $cur, $newest)) {
+					$mm->last_commit($newest);
+				}
+			}
 			$dbh->commit;
 		}
-		if (!$mm_only) {
-			$xdb->set_metadata($mkey, $commit) if $mkey && $commit;
-			$self->commit_txn_lazy;
+		if ($mkey && $newest) {
+			my $cur = $xdb->get_metadata($mkey);
+			if (need_update($self, $cur, $newest)) {
+				$xdb->set_metadata($mkey, $newest);
+			}
 		}
+		$self->commit_txn_lazy;
 		# let another process do some work... <
-		if ($more) {
-			if (!$mm_only) {
-				$xdb = $self->begin_txn_lazy;
-			}
+		if (!$newest) {
+			$xdb = $self->begin_txn_lazy;
 			$dbh->begin_work if $dbh;
 		}
 	};
 
-	if ($mm) {
-		$dbh->begin_work;
-		my $lm = $mm->last_commit || '';
-		if ($lm eq $lx) {
-			# Common case is the indexes are synced,
-			# we only need to run git-log once:
-			rlog($self, $xlog, *index_both, *unindex_both, $cb);
-		} else {
-			# Uncommon case, msgmap and xapian are out-of-sync
-			# do not care for performance (but git is fast :>)
-			# This happens if we have to reindex Xapian since
-			# msgmap is a frozen format and our Xapian format
-			# is evolving.
-			my $r = $lm eq '' ? $tip : "$lm..$tip";
-
-			# first, ensure msgmap is up-to-date:
-			my $mkey_prev = $mkey;
-			$mkey = undef; # ignore xapian, for now
-			my $mlog = _git_log($self, $r);
-			$mm_only = 1;
-			rlog($self, $mlog, *index_mm, *unindex_mm, $cb);
-			$mm_only = $mlog = undef;
-
-			# now deal with Xapian
-			$mkey = $mkey_prev;
-			$dbh = undef;
-			rlog($self, $xlog, *index_mm2, *unindex_mm2, $cb);
-		}
-	} else {
-		# user didn't install DBD::SQLite and DBI
-		rlog($self, $xlog, *add_message, *unindex_blob, $cb);
-	}
+	$dbh->begin_work;
+	read_log($self, $xlog, *index_both, *unindex_both, $cb);
 }
 
 sub DESTROY {
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index e9fd502..2cc8730 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -15,7 +15,8 @@ use PublicInbox::ContentId qw(content_id content_digest);
 use PublicInbox::Inbox;
 use PublicInbox::OverIdx;
 use PublicInbox::Msgmap;
-use PublicInbox::Spawn;
+use PublicInbox::Spawn qw(spawn);
+use PublicInbox::SearchIdx;
 use IO::Handle;
 
 # an estimate of the post-packed size to the raw uncompressed size
@@ -561,7 +562,6 @@ sub import_init {
 sub diff ($$$) {
 	my ($mid, $cur, $new) = @_;
 	use File::Temp qw(tempfile);
-	use PublicInbox::Spawn qw(spawn);
 
 	my ($ah, $an) = tempfile('email-cur-XXXXXXXX', TMPDIR => 1);
 	print $ah $cur->as_string or die "print: $!";
@@ -738,16 +738,7 @@ sub last_commits {
 	$heads;
 }
 
-sub is_ancestor ($$$) {
-	my ($git, $cur, $tip) = @_;
-	return 0 unless $git->check($cur);
-	my $cmd = [ 'git', "--git-dir=$git->{git_dir}",
-		qw(merge-base --is-ancestor), $cur, $tip ];
-	my $pid = spawn($cmd);
-	defined $pid or die "spawning ".join(' ', @$cmd)." failed: $!";
-	waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish';
-	$? == 0;
-}
+*is_ancestor = *PublicInbox::SearchIdx::is_ancestor;
 
 sub index_prepare {
 	my ($self, $opts, $epoch_max, $ranges) = @_;
diff --git a/t/v1-add-remove-add.t b/t/v1-add-remove-add.t
new file mode 100644
index 0000000..cd6e281
--- /dev/null
+++ b/t/v1-add-remove-add.t
@@ -0,0 +1,45 @@
+# Copyright (C) 2018 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use warnings;
+use Test::More;
+use PublicInbox::MIME;
+use PublicInbox::Import;
+use PublicInbox::SearchIdx;
+use File::Temp qw/tempdir/;
+
+foreach my $mod (qw(DBD::SQLite Search::Xapian)) {
+	eval "require $mod";
+	plan skip_all => "$mod missing for v1-add-remove-add.t" if $@;
+}
+my $mainrepo = tempdir('pi-add-remove-add-XXXXXX', TMPDIR => 1, CLEANUP => 1);
+is(system(qw(git init --bare), $mainrepo), 0);
+my $ibx = {
+	mainrepo => $mainrepo,
+	name => 'test-add-remove-add',
+	-primary_address => 'test@example.com',
+};
+$ibx = PublicInbox::Inbox->new($ibx);
+my $mime = PublicInbox::MIME->create(
+	header => [
+		From => 'a@example.com',
+		To => 'test@example.com',
+		Subject => 'this is a subject',
+		Date => 'Fri, 02 Oct 1993 00:00:00 +0000',
+		'Message-ID' => '<a-mid@b>',
+	],
+	body => "hello world\n",
+);
+my $im = PublicInbox::Import->new($ibx->git, undef, undef, $ibx);
+ok($im->add($mime), 'message added');
+ok($im->remove($mime), 'message added');
+ok($im->add($mime), 'message added again');
+$im->done;
+my $rw = PublicInbox::SearchIdx->new($ibx, 1);
+$rw->index_sync;
+my $msgs = $ibx->recent({limit => 10});
+is($msgs->[0]->{mid}, 'a-mid@b', 'message exists in history');
+is(scalar @$msgs, 1, 'only one message in history');
+is($ibx->mm->num_for('a-mid@b'), 2, 'exists with second article number');
+
+done_testing();
-- 
EW


  parent reply	other threads:[~2018-04-18  9:13 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-04-18  9:13 [PATCH 00/12] better dedupe, contiguous article numbers Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 01/12] feed: respect feedmax, again Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 02/12] v1: remove articles from overview DB Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 03/12] compact: do not merge v2 repos by default Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 04/12] v2writable: reduce partititions by one Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 05/12] search: preserve References in Xapian smsg for x=t view Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 06/12] v2: generate better Message-IDs for duplicates Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 07/12] v2: improve deduplication checks Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 08/12] import: cat_blob drops leading 'From ' lines like Inbox Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-04-18  9:13 ` [PATCH 10/12] extmsg: remove expensive git path checks Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 11/12] use %H consistently to disable abbreviations Eric Wong (Contractor, The Linux Foundation)
2018-04-18  9:13 ` [PATCH 12/12] searchidx: increase term positions for all text terms Eric Wong (Contractor, The Linux Foundation)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180418091316.29114-10-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).