From a46893a2b5dabfdbcf7b593ac19967daecfb1772 Mon Sep 17 00:00:00 2001 From: "Eric Wong (Contractor, The Linux Foundation)" Date: Sun, 22 Apr 2018 08:01:48 +0000 Subject: extmsg: use Xapian only for partial matches "LIKE" in SQLite (and other SQL implementations I've seen) is expensive with nearly 3 million messages in the archives. This caused some partial Message-ID lookups to take over 600ms on my workstation (~300ms on a faster Xeon). Cut that to below under 30ms on average on my workstation by relying exclusively on Xapian for partial Message-ID lookups as we have in the past. Unlike in the past when we tried using Xapian to match partial Message-IDs; we now optimize our indexing of Message-IDs to break apart "words" in Message-IDs for searching, yielding (hopefully) "good enough" accuracy for folks who get long URLs broken across lines when copy+pasting. We'll also drop the (in retrospect) pointless stripping of "/[tTf]" suffixes for the partial match, since anybody who hits that codepath would be hitting an invalid message ID. Finally, limit wildcard expansion to prevent easy DoS vectors on short terms. And blame Pine and alpine for generating Message-IDs with low-entropy prefixes :P --- lib/PublicInbox/SearchIdx.pm | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'lib/PublicInbox/SearchIdx.pm') diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 4dc81352..aeb363e0 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -331,6 +331,13 @@ sub add_message { foreach my $mid (@$mids) { $tg->index_text($mid, 1, 'XM'); + + # because too many Message-IDs are prefixed with + # "Pine.LNX."... + if ($mid =~ /\w{12,}/) { + my @long = ($mid =~ /(\w{3,}+)/g); + $tg->index_text(join(' ', @long), 1, 'XM'); + } $tg->increase_termpos; } $smsg->{to} = $smsg->{cc} = ''; -- cgit v1.2.3-24-ge0c7