about summary refs log tree commit homepage
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2020-09-01 05:55:45 +0000
committerEric Wong <e@80x24.org>2020-09-02 08:53:49 +0000
commitc654417ce342a999ead50ba4cf7745071e8e8b0f (patch)
tree609481f7bfbee651c52957487c1ae39e205184a4
parent60a13fdee967a003e17a03a6e4978a57965ab419 (diff)
downloadpublic-inbox-c654417ce342a999ead50ba4cf7745071e8e8b0f.tar.gz
While it's not a known problem, our deduplicating logic may
change in the future; or a BOFH could be manually injecting
duplicate messages directly into the git epoch repositories.

Ensure indexing in mirrors doesn't break when there are
duplicates.  This is in preparation for detached indices
for multi-inbox search.
-rw-r--r--MANIFEST1
-rw-r--r--t/v2dupindex.t61
2 files changed, 62 insertions, 0 deletions
diff --git a/MANIFEST b/MANIFEST
index b65e96b0..44670c7e 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -357,6 +357,7 @@ t/utf8.eml
 t/v1-add-remove-add.t
 t/v1reindex.t
 t/v2-add-remove-add.t
+t/v2dupindex.t
 t/v2mda.t
 t/v2mirror.t
 t/v2reindex.t
diff --git a/t/v2dupindex.t b/t/v2dupindex.t
new file mode 100644
index 00000000..b1abccd9
--- /dev/null
+++ b/t/v2dupindex.t
@@ -0,0 +1,61 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# we can index a message from a mirror which bypasses dedupe.
+use strict;
+use Test::More;
+use PublicInbox::TestCommon;
+require_git(2.6);
+require_mods(qw(DBD::SQLite));
+my ($tmpdir, $for_destroy) = tmpdir();
+use_ok 'PublicInbox::Import';
+use_ok 'PublicInbox::Git';
+use_ok 'PublicInbox::InboxWritable';
+my $ibx = PublicInbox::InboxWritable->new({
+        inboxdir => $tmpdir,
+        name => 'test-v2dupindex',
+        version => 2,
+        indexlevel => 'basic',
+        -primary_address => 'test@example.com',
+}, { nproc => 1 });
+$ibx->init_inbox(1);
+my $v2w = $ibx->importer;
+$v2w->add(eml_load('t/plack-qp.eml'));
+$v2w->add(eml_load('t/mda-mime.eml'));
+$v2w->done;
+
+my $git0 = PublicInbox::Git->new("$tmpdir/git/0.git");
+my $im = PublicInbox::Import->new($git0, undef, undef, $ibx);
+$im->{path_type} = 'v2';
+$im->{lock_path} = undef;
+
+# bypass duplicate filters (->header_set is optional)
+my $eml = eml_load('t/plack-qp.eml');
+$eml->header_set('X-This-Is-Not-Checked-By-ContentHash', 'blah');
+ok($im->add($eml), 'add seen message directly');
+ok($im->add(eml_load('t/mda-mime.eml')), 'add another seen message directly');
+
+ok($im->add(eml_load('t/iso-2202-jp.eml')), 'add another new message');
+$im->done;
+
+# mimic a fresh clone by dropping indices
+my @sqlite = (glob("$tmpdir/*sqlite3*"), glob("$tmpdir/xap*/*sqlite3*"));
+is(unlink(@sqlite), scalar(@sqlite), 'unlinked SQLite indices');
+my @shards = glob("$tmpdir/xap*/?");
+is(scalar(@shards), 0, 'no Xapian shards to drop');
+
+my $rdr = { 2 => \(my $err = '') };
+ok(run_script([qw(-index -Lbasic), $tmpdir], undef, $rdr), '-indexed');
+my @n = $ibx->over->dbh->selectrow_array('SELECT COUNT(*) FROM over');
+is_deeply(\@n, [ 3 ], 'identical message not re-indexed');
+my $mm = $ibx->mm->{dbh}->selectall_arrayref(<<'');
+SELECT num,mid FROM msgmap ORDER BY num ASC
+
+is_deeply($mm, [
+        [ 1, 'qp@example.com' ],
+        [ 2, 'multipart-html-sucks@11' ],
+        [ 3, '199707281508.AAA24167@hoyogw.example' ]
+], 'msgmap omits redundant message');
+
+done_testing;