diff options
author | Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> | 2018-04-22 08:01:48 +0000 |
---|---|---|
committer | Eric Wong <e@80x24.org> | 2018-04-22 08:02:13 +0000 |
commit | a46893a2b5dabfdbcf7b593ac19967daecfb1772 (patch) | |
tree | 4b49778a165ec769a6412b07f965413567954c95 /t | |
parent | 866837def71b9d70198f51e634e6141f75f0df3e (diff) | |
download | public-inbox-a46893a2b5dabfdbcf7b593ac19967daecfb1772.tar.gz |
"LIKE" in SQLite (and other SQL implementations I've seen) is expensive with nearly 3 million messages in the archives. This caused some partial Message-ID lookups to take over 600ms on my workstation (~300ms on a faster Xeon). Cut that to under 30ms on average on my workstation by relying exclusively on Xapian for partial Message-ID lookups, as we have in the past. Unlike in the past, when we tried using Xapian to match partial Message-IDs, we now optimize our indexing of Message-IDs to break apart "words" in Message-IDs for searching, yielding (hopefully) "good enough" accuracy for folks who get long URLs broken across lines when copy+pasting. We'll also drop the (in retrospect) pointless stripping of "/[tTf]" suffixes for the partial match, since anybody who hits that codepath would be hitting an invalid message ID. Finally, limit wildcard expansion to prevent easy DoS vectors on short terms. And blame Pine and alpine for generating Message-IDs with low-entropy prefixes :P
Diffstat (limited to 't')
-rw-r--r-- | t/cgi.t | 6 | ||||
-rw-r--r-- | t/msgmap.t | 3 | ||||
-rw-r--r-- | t/psgi_search.t | 23 | ||||
-rw-r--r-- | t/search.t | 30 |
4 files changed, 47 insertions, 15 deletions
@@ -155,22 +155,16 @@ EOF $res = cgi_run("/test/blahblah\@example.com/raw"); like($res->{body}, qr/Message-Id: <blahblah\@example\.com>/, "mid raw hit"); - $res = cgi_run("/test/blahblah\@example.con/raw"); - like($res->{head}, qr/Status: 300 Multiple Choices/, "mid raw miss"); $res = cgi_run("/test/blahblah\@example.com/"); like($res->{body}, qr/\A<html>/, "mid html hit"); like($res->{head}, qr/Status: 200 OK/, "200 response"); - $res = cgi_run("/test/blahblah\@example.con/"); - like($res->{head}, qr/Status: 300 Multiple Choices/, "mid html miss"); $res = cgi_run("/test/blahblah\@example.com/f/"); like($res->{head}, qr/Status: 301 Moved/, "301 response"); like($res->{head}, qr!^Location: http://[^/]+/test/blahblah\@example\.com/\r\n!ms, '301 redirect location'); - $res = cgi_run("/test/blahblah\@example.con/"); - like($res->{head}, qr/Status: 300 Multiple Choices/, "mid html miss"); $res = cgi_run("/test/new.html"); like($res->{body}, qr/slashy%2Fasdf\@example\.com/, @@ -38,9 +38,6 @@ foreach my $mid (@mids) { is($d->num_for($mid), $mid2num{$mid}, "mid:$mid maps correctly"); } -is_deeply($d->mid_prefixes('a'), [qw(aa@cc aa@bb a@b)], "mid_prefixes match"); -is_deeply($d->mid_prefixes('A'), [], "mid_prefixes is case sensitive"); - is(undef, $d->last_commit, "last commit not set"); my $lc = 'deadbeef' x 5; is(undef, $d->last_commit($lc), 'previous last commit (undef) returned'); diff --git a/t/psgi_search.t b/t/psgi_search.t index 2f033016..a057a994 100644 --- a/t/psgi_search.t +++ b/t/psgi_search.t @@ -20,11 +20,14 @@ my $git_dir = "$tmpdir/a.git"; is(0, system(qw(git init -q --bare), $git_dir), "git init (main)"); my $rw = PublicInbox::SearchIdx->new($git_dir, 1); ok($rw, "search indexer created"); -my $data = <<'EOF'; +my $digits = '10010260936330'; +my $ua = 'Pine.LNX.4.10'; +my $mid = "$ua.$digits.2460-100000\@penguin.transmeta.com"; +my $data = <<"EOF"; Subject: test -Message-Id: <utf8@example> -From: Ævar Arnfjörð Bjarmason <avarab@example> -To: 
git@vger.kernel.org +Message-ID: <$mid> +From: Ævar Arnfjörð Bjarmason <avarab\@example> +To: git\@vger.kernel.org EOF @@ -37,8 +40,7 @@ foreach (reverse split(/\n\n/, $data)) { my $mime = Email::MIME->new(\$_); my $bytes = bytes::length($mime->as_string); my $doc_id = $rw->add_message($mime, $bytes, ++$num, 'ignored'); - my $mid = $mime->header('Message-Id'); - ok($doc_id, 'message added: '. $mid); + ok($doc_id, 'message added'); } $rw->commit_txn_lazy; @@ -72,6 +74,15 @@ test_psgi(sub { $www->call(@_) }, sub { $res = $cb->(POST('/test/?q=s:bogus&x=m')); is($res->code, 404, 'failed search result gives 404'); is_deeply([], $warn, 'no warnings'); + + my $mid_re = qr/\Q$mid\E/o; + while (length($digits) > 8) { + $res = $cb->(GET("/test/$ua.$digits/")); + is($res->code, 300, 'partial match found while truncated'); + like($res->content, qr/\b1 partial match found\b/); + like($res->content, $mid_re, 'found mid in response'); + chop($digits); + } }); done_testing(); @@ -430,6 +430,36 @@ foreach my $f ("$git_dir/public-inbox/msgmap.sqlite3", "sharedRepository respected for $bn"); } +$ibx->with_umask(sub { + $rw_commit->(); + my $digits = '10010260936330'; + my $ua = 'Pine.LNX.4.10'; + my $mid = "$ua.$digits.2460-100000\@penguin.transmeta.com"; + is($ro->reopen->query("m:$digits", { mset => 1})->size, 0, + 'no results yet'); + my $pine = Email::MIME->create( + header_str => [ + Subject => 'blah', + 'Message-ID' => "<$mid>", + From => 'torvalds@transmeta', + To => 'list@example.com', + ], + body => "" + ); + my $x = $rw->add_message($pine); + $rw->commit_txn_lazy; + is($ro->reopen->query("m:$digits", { mset => 1})->size, 1, + 'searching only digit yielded result'); + + my $wild = $digits; + for my $i (1..6) { + chop($wild); + is($ro->query("m:$wild*", { mset => 1})->size, 1, + "searching chopped($i) digit yielded result $wild "); + } + is($ro->query("m:Pine m:LNX m:10010260936330", {mset=>1})->size, 1); +}); + done_testing(); 1; |