about summary refs log tree commit homepage
path: root/t/extsearch.t
diff options
context:
space:
mode:
Diffstat (limited to 't/extsearch.t')
-rw-r--r-- t/extsearch.t 313
1 files changed, 270 insertions, 43 deletions
diff --git a/t/extsearch.t b/t/extsearch.t
index 2c3f7547..797aa8f5 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -1,30 +1,25 @@
 #!perl -w
-# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
-use strict;
-use Test::More;
+use v5.12;
 use PublicInbox::TestCommon;
 use PublicInbox::Config;
-use PublicInbox::Search;
 use PublicInbox::InboxWritable;
-use Fcntl qw(:seek);
 require_git(2.6);
-require_mods(qw(json DBD::SQLite Search::Xapian));
+require_mods(qw(json DBD::SQLite Xapian));
+use autodie qw(open rename truncate unlink);
+require PublicInbox::Search;
 use_ok 'PublicInbox::ExtSearch';
 use_ok 'PublicInbox::ExtSearchIdx';
 use_ok 'PublicInbox::OverIdx';
-my $sock = tcp_server();
-my $host_port = $sock->sockhost . ':' . $sock->sockport;
 my ($home, $for_destroy) = tmpdir();
 local $ENV{HOME} = $home;
 mkdir "$home/.public-inbox" or BAIL_OUT $!;
 my $cfg_path = "$home/.public-inbox/config";
-open my $fh, '>', $cfg_path or BAIL_OUT $!;
-print $fh <<EOF or BAIL_OUT $!;
+PublicInbox::IO::write_file '>', $cfg_path, <<EOF;
 [publicinboxMda]
         spamcheck = none
 EOF
-close $fh or BAIL_OUT $!;
 my $v2addr = 'v2test@example.com';
 my $v1addr = 'v1test@example.com';
 ok(run_script([qw(-init -Lbasic -V2 v2test --newsgroup v2.example),
@@ -33,42 +28,78 @@ my $env = { ORIGINAL_RECIPIENT => $v2addr };
 my $eml = eml_load('t/utf8.eml');
 
 $eml->header_set('List-Id', '<v2.example.com>');
-open($fh, '+>', undef) or BAIL_OUT $!;
-$fh->autoflush(1);
-print $fh $eml->as_string or BAIL_OUT $!;
-seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
 
-run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda';
+my $in = \($eml->as_string);
+run_script(['-mda', '--no-precheck'], $env, { 0 => $in }) or BAIL_OUT '-mda';
 
 ok(run_script([qw(-init -V1 v1test --newsgroup v1.example), "$home/v1test",
         'http://example.com/v1test', $v1addr ]), 'v1test init');
 
 $eml->header_set('List-Id', '<v1.example.com>');
-seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
-truncate($fh, 0) or BAIL_OUT $!;
-print $fh $eml->as_string or BAIL_OUT $!;
-seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
+$in = \$eml->as_string;
 
 $env = { ORIGINAL_RECIPIENT => $v1addr };
-run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda';
+run_script(['-mda', '--no-precheck'], $env, { 0 => $in }) or BAIL_OUT '-mda';
 
 run_script([qw(-index -Lbasic), "$home/v1test"]) or BAIL_OUT "index $?";
 
-ok(run_script([qw(-extindex --all), "$home/extindex"]), 'extindex init');
+ok(run_script([qw(-extindex --dangerous --all), "$home/extindex"]),
+        'extindex init');
 {
         my $es = PublicInbox::ExtSearch->new("$home/extindex");
         ok($es->has_threadid, '->has_threadid');
 }
 
+if ('with boost') { # always-true string; it only labels this group of checks
+        xsys([qw(git config publicinbox.v1test.boost), 10],
+                { GIT_CONFIG => $cfg_path });
+        ok(run_script([qw(-extindex --all), "$home/extindex-b"]),
+                'extindex init with boost');
+        my $es = PublicInbox::ExtSearch->new("$home/extindex-b");
+        my $smsg = $es->over->get_art(1);
+        ok($smsg, 'got first article');
+        my $xref3 = $es->over->get_xref3($smsg->{num});
+        my @v1 = grep(/\Av1/, @$xref3); # xref3 entries starting with "v1"
+        my @v2 = grep(/\Av2/, @$xref3); # xref3 entries starting with "v2"
+        like($v1[0], qr/\Av1\.example.*?\b\Q$smsg->{blob}\E\b/,
+                'smsg->{blob} respected boost');
+        is(scalar(@$xref3), 2, 'only two entries');
+        undef $es;
+
+        xsys([qw(git config publicinbox.v2test.boost), 20],
+                { GIT_CONFIG => $cfg_path });
+        ok(run_script([qw(-extindex --all --reindex), "$home/extindex-b"]),
+                'extindex --reindex with altered boost');
+
+        $es = PublicInbox::ExtSearch->new("$home/extindex-b");
+        $smsg = $es->over->get_art(1);
+        like($v2[0], qr/\Av2\.example.*?\b\Q$smsg->{blob}\E\b/,
+                        'smsg->{blob} respects boost after reindex');
+
+        # high boost added later: index v1test alone, then v2test, into a new dir
+        my $b2 = "$home/extindex-bb";
+        ok(run_script([qw(-extindex), $b2, "$home/v1test"]),
+                'extindex with low boost inbox only');
+        ok(run_script([qw(-extindex), $b2, "$home/v2test"]),
+                'extindex with high boost inbox only');
+        $es = PublicInbox::ExtSearch->new($b2);
+        $smsg = $es->over->get_art(1);
+        @v2 = grep(/\Av2/, @{$es->over->get_xref3($smsg->{num})}); # from $b2
+        like($v2[0], qr/\Av2\.example.*?\b\Q$smsg->{blob}\E\b/,
+                'smsg->{blob} respected boost across 2 index runs');
+
+        xsys([qw(git config --unset publicinbox.v1test.boost)],
+                { GIT_CONFIG => $cfg_path });
+        xsys([qw(git config --unset publicinbox.v2test.boost)],
+                { GIT_CONFIG => $cfg_path });
+}
+
 { # TODO: -extindex should write this to config
-        open $fh, '>>', $cfg_path or BAIL_OUT $!;
-        print $fh <<EOF or BAIL_OUT $!;
+        PublicInbox::IO::write_file '>>', $cfg_path, <<EOF;
 ; for ->ALL
 [extindex "all"]
         topdir = $home/extindex
 EOF
-        close $fh or BAIL_OUT $!;
-
         my $pi_cfg = PublicInbox::Config->new;
         $pi_cfg->fill_all;
         ok($pi_cfg->ALL, '->ALL');
@@ -80,6 +111,8 @@ EOF
 
 SKIP: {
         require_mods(qw(Net::NNTP), 1);
+        my $sock = tcp_server();
+        my $host_port = tcp_host_port($sock);
         my ($out, $err) = ("$home/nntpd.out.log", "$home/nntpd.err.log");
         my $cmd = [ '-nntpd', '-W0', "--stdout=$out", "--stderr=$err" ];
         my $td = start_script($cmd, undef, { 3 => $sock });
@@ -107,7 +140,7 @@ if ('inbox edited') {
         my ($in, $out, $err);
         $in = $out = $err = '';
         my $opt = { 0 => \$in, 1 => \$out, 2 => \$err };
-        my $env = { MAIL_EDITOR => "$^X -i -p -e 's/test message/BEST MSG/'" };
+        my $env = { MAIL_EDITOR => "$^X -w -i -p -e 's/test message/BEST MSG/'" };
         my $cmd = [ qw(-edit -Ft/utf8.eml), "$home/v2test" ];
         ok(run_script($cmd, $env, $opt), '-edit');
         ok(run_script([qw(-extindex --all), "$home/extindex"], undef, $opt),
@@ -132,7 +165,7 @@ if ('inbox edited') {
         is($mset->size, 1, 'new message found');
         $mset = $es->mset('b:"test message"');
         is($mset->size, 1, 'old message found');
-        delete @$es{qw(git over xdb)}; # fork preparation
+        delete @$es{qw(git over xdb qp)}; # fork preparation
 
         my $pi_cfg = PublicInbox::Config->new;
         $pi_cfg->fill_all;
@@ -158,11 +191,7 @@ if ('inbox edited') {
         is_deeply($res, $exp, 'isearch limited results');
         $pi_cfg = $res = $exp = undef;
 
-        open my $rmfh, '+>', undef or BAIL_OUT $!;
-        $rmfh->autoflush(1);
-        print $rmfh $eml2->as_string or BAIL_OUT $!;
-        seek($rmfh, 0, SEEK_SET) or BAIL_OUT $!;
-        $opt->{0} = $rmfh;
+        $opt->{0} = \($eml2->as_string);
         ok(run_script([qw(-learn rm --all)], undef, $opt), '-learn rm');
 
         ok(run_script([qw(-extindex --all), "$home/extindex"], undef, undef),
@@ -201,13 +230,11 @@ if ('inject w/o indexing') {
         isnt($tip, $cmt, '0.git v2 updated');
 
         # inject a message w/o updating index
-        rename("$home/v1test/public-inbox", "$home/v1test/skip-index") or
-                BAIL_OUT $!;
-        open(my $eh, '<', 't/iso-2202-jp.eml') or BAIL_OUT $!;
+        rename("$home/v1test/public-inbox", "$home/v1test/skip-index");
+        open(my $eh, '<', 't/iso-2202-jp.eml');
         run_script(['-mda', '--no-precheck'], $env, { 0 => $eh}) or
                 BAIL_OUT '-mda';
-        rename("$home/v1test/skip-index", "$home/v1test/public-inbox") or
-                BAIL_OUT $!;
+        rename("$home/v1test/skip-index", "$home/v1test/public-inbox");
 
         my ($in, $out, $err);
         $in = $out = $err = '';
@@ -236,6 +263,7 @@ if ('inject w/o indexing') {
 
 if ('reindex catches missed messages') {
         my $v2ibx = $cfg->lookup_name('v2test');
+        $v2ibx->{-no_fsync} = 1;
         my $im = PublicInbox::InboxWritable->new($v2ibx)->importer(0);
         my $cmt_a = $v2ibx->mm->last_commit_xap($schema_version, 0);
         my $eml = eml_load('t/data/0001.patch');
@@ -263,12 +291,17 @@ if ('reindex catches missed messages') {
         is($oidx->eidx_meta($lc_key), $cmt_b, 'lc-v2 stays unchanged');
         my @err = split(/^/, $err);
         is(scalar(@err), 1, 'only one warning') or diag "err=$err";
-        like($err[0], qr/I: reindex_unseen/, 'got reindex_unseen message');
+        like($err[0], qr/# reindex_unseen/, 'got reindex_unseen message');
         my $new = $oidx->get_art($max + 1);
         is($new->{subject}, $eml->header('Subject'), 'new message added');
 
         $es->{xdb}->reopen;
-        my $mset = $es->mset("mid:$new->{mid}");
+        # git patch-id --stable <t/data/0001.patch | awk '{print $1}'
+        my $patchid = '91ee6b761fc7f47cad9f2b09b10489f313eb5b71';
+        my $mset = $es->search->mset("patchid:$patchid");
+        is($mset->size, 1, 'patchid search works');
+
+        $mset = $es->mset("mid:$new->{mid}");
         is($mset->size, 1, 'previously unseen, now indexed in Xapian');
 
         ok($im->remove($eml), 'remove new message from v2 inbox');
@@ -291,11 +324,17 @@ if ('reindex catches missed messages') {
         $es->{xdb}->reopen;
         $mset = $es->mset("mid:$new->{mid}");
         is($mset->size, 0, 'stale mid gone Xapian');
+
+        ok(run_script([qw(-extindex --reindex --all --fast), "$home/extindex"],
+                        undef, $opt), '--reindex w/ --fast');
+        ok(!run_script([qw(-extindex --all --fast), "$home/extindex"],
+                        undef, $opt), '--fast alone makes no sense');
 }
 
 if ('reindex catches content bifurcation') {
         use PublicInbox::MID qw(mids);
         my $v2ibx = $cfg->lookup_name('v2test');
+        $v2ibx->{-no_fsync} = 1;
         my $im = PublicInbox::InboxWritable->new($v2ibx)->importer(0);
         my $eml = eml_load('t/data/message_embed.eml');
         my $cmt_a = $v2ibx->mm->last_commit_xap($schema_version, 0);
@@ -324,7 +363,7 @@ if ('reindex catches content bifurcation') {
         is($oidx->max, $oldmax, 'oidx->max unchanged');
         $oidx->dbh_close;
         ok(run_script([qw(-extindex --reindex --all), "$home/extindex"],
-                undef, $opt), 'extindex --reindex');
+                undef, $opt), 'extindex --reindex') or diag explain($opt);
         $oidx->dbh;
         ok($oidx->max > $oldmax, 'oidx->max bumped');
         like($err, qr/split into 2 due to deduplication change/,
@@ -358,12 +397,200 @@ if ('remove v1test and test gc') {
         my $opt = { 2 => \(my $err = '') };
         ok(run_script([qw(-extindex --gc), "$home/extindex"], undef, $opt),
                 'extindex --gc');
-        like($err, qr/^I: remove #1 v1\.example /ms, 'removed v1 message');
-        is(scalar(grep(!/^I:/, split(/^/m, $err))), 0,
+        like($err, qr/^# remove #1 v1\.example /ms, 'removed v1 message');
+        is(scalar(grep(!/^#/, split(/^/m, $err))), 0,
                 'no non-informational messages');
         $misc->{xdb}->reopen;
         @it = $misc->mset('')->items;
         is(scalar(@it), 1, 'only one inbox left');
 }
 
+if ('dedupe + dry-run') { # always-true string; it only labels these checks
+        my @cmd = ('-extindex', "$home/extindex");
+        my $opt = { 2 => \(my $err = '') }; # fd 2 of each run is tied to $err
+        ok(run_script([@cmd, '--dedupe'], undef, $opt), '--dedupe');
+        ok(run_script([@cmd, qw(--dedupe --dry-run)], undef, $opt),
+                '--dry-run --dedupe');
+        is $err, '', 'no errors';
+        ok(!run_script([@cmd, qw(--dry-run)], undef, $opt),
+                '--dry-run alone fails');
+}
+
+# chmod 0755, $home or xbail "chmod: $!";
+for my $j (1, 3, 6) { # create one fresh extindex per -j value
+        my $o = { 2 => \(my $err = '') };
+        my $d = "$home/extindex-j$j";
+        ok(run_script(['-extindex', "-j$j", '--all', $d], undef, $o),
+                "init with -j$j");
+        my $max = $j - 2; # name the last globbed shard dir is expected to have
+        $max = 0 if $max < 0; # -j1 is still expected to end at shard 0
+        my @dirs = glob("$d/ei*/?"); # entries with one-character names
+        like($dirs[-1], qr!/ei[0-9]+/$max\z!, '-j works');
+}
+
+SKIP: { # reshard the -j1 index with -xcpdb, then -compact, --dedupe and --gc
+        my $d = "$home/extindex-j1";
+        my $es = PublicInbox::ExtSearch->new($d);
+        ok(my $nresult0 = $es->mset('z:0..')->size, 'got results');
+        ok(ref($es->{xdb}), '{xdb} created');
+        my $nshards1 = $es->{nshard};
+        is($nshards1, 1, 'correct shard count');
+
+        my @ei_dir = glob("$d/ei*/");
+        chmod 0755, $ei_dir[0] or xbail "chmod: $!";
+        my $mode = sprintf('%04o', 07777 & (stat($ei_dir[0]))[2]); # perm bits
+        is($mode, '0755', 'mode set on ei*/ dir');
+        my $o = { 2 => \(my $err = '') };
+        ok(run_script([qw(-xcpdb -R4), $d]), 'xcpdb R4');
+        my @dirs = glob("$d/ei*/?");
+        for my $i (0..3) { # each new shard must exist with the parent's mode
+                is(grep(m!/ei[0-9]+/$i\z!, @dirs), 1, "shard [$i] created");
+                my $m = sprintf('%04o', 07777 & (stat($dirs[$i]))[2]);
+                is($m, $mode, "shard [$i] mode");
+        }
+        delete @$es{qw(xdb qp)}; # presumably forces a reopen on the next mset
+        is($es->mset('z:0..')->size, $nresult0, 'new shards, same results');
+
+        for my $i (4..5) {
+                is(grep(m!/ei[0-9]+/$i\z!, @dirs), 0, "no shard [$i]");
+        }
+
+        ok(run_script([qw(-xcpdb -R2), $d]), 'xcpdb -R2');
+        @dirs = glob("$d/ei*/?");
+        for my $i (0..1) {
+                is(grep(m!/ei[0-9]+/$i\z!, @dirs), 1, "shard [$i] kept");
+        }
+        for my $i (2..3) {
+                is(grep(m!/ei[0-9]+/$i\z!, @dirs), 0, "no shard [$i]");
+        }
+        have_xapian_compact 1; # NOTE(review): presumably skips if unavailable
+        ok(run_script([qw(-compact), $d], undef, $o), 'compact');
+        # n.b. stderr contains xapian-compact output
+
+        my @d2 = glob("$d/ei*/?");
+        is_deeply(\@d2, \@dirs, 'dirs consistent after compact');
+        ok(run_script([qw(-extindex --dedupe --all), $d]),
+                '--dedupe works after compact');
+        ok(run_script([qw(-extindex --gc), $d], undef, $o),
+                '--gc works after compact');
+}
+
+{ # ensure --gc removes non-xposted messages
+        my $old_size = -s $cfg_path // xbail "stat $cfg_path $!"; # for truncate
+        my $tmp_addr = 'v2tmp@example.com';
+        run_script([qw(-init v2tmp --indexlevel basic
+                --newsgroup v2tmp.example),
+                "$home/v2tmp", 'http://example.com/v2tmp', $tmp_addr ])
+                or xbail '-init';
+        $env = { ORIGINAL_RECIPIENT => $tmp_addr };
+        my $mid = 'tmpmsg@example.com';
+        my $in = \<<EOM;
+From: b\@z
+To: b\@r
+Message-Id: <$mid>
+Subject: tmpmsg
+Date: Tue, 19 Jan 2038 03:14:07 +0000
+
+EOM
+        run_script([qw(-mda --no-precheck)], $env, {0 => $in}) or xbail '-mda';
+        ok(run_script([qw(-extindex --all), "$home/extindex"]), 'update');
+        my $nr; # Xapian result count while tmpmsg is still indexed
+        {
+                my $es = PublicInbox::ExtSearch->new("$home/extindex");
+                my ($id, $prv);
+                my $smsg = $es->over->next_by_mid($mid, \$id, \$prv);
+                ok($smsg, 'tmpmsg indexed');
+                my $mset = $es->search->mset("mid:$mid");
+                is($mset->size, 1, 'new message found');
+                $mset = $es->search->mset('z:0..');
+                $nr = $mset->size;
+        }
+        truncate($cfg_path, $old_size); # cut config back to its recorded size
+        my $rdr = { 2 => \(my $err = '') }; # defined even if nothing is written
+        ok(run_script([qw(-extindex --gc), "$home/extindex"], undef, $rdr),
+                'gc to get rid of removed inbox');
+        is_deeply([ grep(!/^(?:I:|#)/, split(/^/m, $err)) ], [],
+                'no non-informational errors in stderr');
+
+        my $es = PublicInbox::ExtSearch->new("$home/extindex");
+        my $mset = $es->search->mset("mid:$mid");
+        is($mset->size, 0, 'tmpmsg gone from search');
+        my ($id, $prv);
+        is($es->over->next_by_mid($mid, \$id, \$prv), undef,
+                'tmpmsg gone from over');
+        $id = $prv = undef; # reset the iteration state before the next lookup
+        is($es->over->next_by_mid('testmessage@example.com', \$id, \$prv),
+                undef, 'remaining message not indavderover'); # NOTE(review): garbled description?
+        $mset = $es->search->mset('z:0..');
+        is($mset->size, $nr - 1, 'existing messages not clobbered from search');
+        my $o = $es->over->{dbh}->selectall_arrayref(<<EOM);
+SELECT num FROM over ORDER BY num
+EOM
+        is(scalar(@$o), $mset->size, 'over row count matches Xapian');
+        my $x = $es->over->{dbh}->selectall_arrayref(<<EOM);
+SELECT DISTINCT(docid) FROM xref3 ORDER BY docid
+EOM
+        is_deeply($x, $o, 'xref3 and over docids match');
+}
+
+{ # indexlevel metadata across -xcpdb resharding, then --no-multi-pack-index
+        my $d = "$home/eidx-med";
+        ok(run_script([qw(-extindex --dangerous --all -L medium -j3), $d]),
+                'extindex medium init');
+        my $es = PublicInbox::ExtSearch->new($d);
+        is($es->xdb->get_metadata('indexlevel'), 'medium',
+                'es indexlevel before');
+        my @xdb = $es->xdb_shards_flat;
+        is($xdb[0]->get_metadata('indexlevel'), 'medium',
+                '0 indexlevel before');
+        shift @xdb; # first shard was checked above; the rest must lack the key
+        for (@xdb) {
+                ok(!$_->get_metadata('indexlevel'), 'no indexlevel in >0 shard')
+        }
+        is($es->xdb->get_metadata('indexlevel'), 'medium', 'indexlevel before');
+        ok(run_script([qw(-xcpdb -R5), $d]), 'xcpdb R5');
+        $es = PublicInbox::ExtSearch->new($d); # fresh object after resharding
+        is($es->xdb->get_metadata('indexlevel'), 'medium',
+                '0 indexlevel after');
+        @xdb = $es->xdb_shards_flat;
+        is(scalar(@xdb), 5, 'got 5 shards');
+        is($xdb[0]->get_metadata('indexlevel'), 'medium', '0 indexlevel after');
+        shift @xdb; # same check as before: only the first shard has the key
+        for (@xdb) {
+                ok(!$_->get_metadata('indexlevel'), 'no indexlevel in >0 shard')
+        }
+        my $mpi = "$d/ALL.git/objects/pack/multi-pack-index";
+        SKIP: {
+                skip 'git too old for multi-pack-index', 2 if !-f $mpi;
+                unlink glob("$d/ALL.git/objects/pack/*");
+                ok run_script([qw(-extindex --all -L medium -j3
+                                --no-multi-pack-index), $d]),
+                                'test --no-multi-pack-index';
+                ok !-f $mpi, '--no-multi-pack-index respected';
+        }
+}
+
+test_lei(sub { # lei convert must accept an extindex with or without the prefix
+        my $d = "$home/extindex";
+        lei_ok('convert', '-o', "$home/md1", $d);
+        lei_ok('convert', '-o', "$home/md2", "extindex:$d");
+        my $dst = [];
+        my $cb = sub { push @$dst, $_[2]->as_string }; # appends to current $dst
+        require PublicInbox::MdirReader;
+        PublicInbox::MdirReader->new->maildir_each_eml("$home/md1", $cb);
+        my @md1 = sort { $a cmp $b } @$dst; # sorted so read order is irrelevant
+        ok(scalar(@md1), 'dumped messages to md1');
+        $dst = []; # $cb looks up $dst at call time, so md2 fills this new array
+        PublicInbox::MdirReader->new->maildir_each_eml("$home/md2", $cb);
+        @$dst = sort { $a cmp $b } @$dst;
+        is_deeply($dst, \@md1,
+                "convert from extindex w/ or w/o `extindex' prefix");
+
+        my @o = glob "$home/extindex/ei*/over.sqlite*";
+        unlink(@o); # with over.sqlite* removed, the convert below must fail
+        ok(!lei('convert', '-o', "$home/fail", "extindex:$d"));
+        like($lei_err, qr/unindexed .*?not supported/,
+                'noted unindexed extindex is unsupported');
+});
+
 done_testing;