about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
author: Eric Wong (Contractor, The Linux Foundation) <e@80x24.org> 2018-04-22 08:01:48 +0000
committer: Eric Wong <e@80x24.org> 2018-04-22 08:02:13 +0000
commit: a46893a2b5dabfdbcf7b593ac19967daecfb1772 (patch)
tree: 4b49778a165ec769a6412b07f965413567954c95 /lib
parent: 866837def71b9d70198f51e634e6141f75f0df3e (diff)
download: public-inbox-a46893a2b5dabfdbcf7b593ac19967daecfb1772.tar.gz
"LIKE" in SQLite (and other SQL implementations I've seen) is
expensive with nearly 3 million messages in the archives.

This caused some partial Message-ID lookups to take over 600ms
on my workstation (~300ms on a faster Xeon).  Cut that to
under 30ms on average on my workstation by relying exclusively
on Xapian for partial Message-ID lookups as we have in the past.

Unlike in the past when we tried using Xapian to match partial
Message-IDs, we now optimize our indexing of Message-IDs to
break apart "words" in Message-IDs for searching, yielding
(hopefully) "good enough" accuracy for folks who get long URLs
broken across lines when copy+pasting.

We'll also drop the (in retrospect) pointless stripping of
"/[tTf]" suffixes for the partial match, since anybody who
hits that codepath would be hitting an invalid message ID.

Finally, limit wildcard expansion to prevent easy DoS vectors
on short terms.

And blame Pine and alpine for generating Message-IDs with
low-entropy prefixes :P
Diffstat (limited to 'lib')
-rw-r--r--  lib/PublicInbox/ExtMsg.pm     87
-rw-r--r--  lib/PublicInbox/Msgmap.pm     18
-rw-r--r--  lib/PublicInbox/Search.pm      1
-rw-r--r--  lib/PublicInbox/SearchIdx.pm   7
4 files changed, 66 insertions, 47 deletions
diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm
index 04cb4062..51e7799d 100644
--- a/lib/PublicInbox/ExtMsg.pm
+++ b/lib/PublicInbox/ExtMsg.pm
@@ -26,6 +26,52 @@ our @EXT_URL = (
                 'doc-url=/lurker&format=en.html&query=id:%s'
 );
 
+sub PARTIAL_MAX () { 100 }
+
+sub search_partial ($$) {
+        my ($srch, $mid) = @_;
+        my $opt = { limit => PARTIAL_MAX, mset => 2 };
+        my @try = ("m:$mid*");
+        my $chop = $mid;
+        if ($chop =~ s/(\W+)(\w*)\z//) {
+                my ($delim, $word) = ($1, $2);
+                if (length($word)) {
+                        push @try, "m:$chop$delim";
+                        push @try, "m:$chop$delim*";
+                }
+                push @try, "m:$chop";
+                push @try, "m:$chop*";
+        }
+
+        # break out long words individually to search for, because
+        # too many messages begin with "Pine.LNX." (or "alpine" or "nycvar")
+        if ($mid =~ /\w{9,}/) {
+                my @long = ($mid =~ m!(\w{3,})!g);
+                push(@try, join(' ', map { "m:$_" } @long));
+
+                # is the last element long enough to not trigger excessive
+                # wildcard matches?
+                if (length($long[-1]) > 8) {
+                        $long[-1] .= '*';
+                        push(@try, join(' ', map { "m:$_" } @long));
+                }
+        }
+
+        foreach my $m (@try) {
+                my $mset = eval { $srch->query($m, $opt) };
+                if (ref($@) eq 'Search::Xapian::QueryParserError') {
+                        # If Xapian can't handle the wildcard since it
+                        # has too many results.
+                        next;
+                }
+                my @mids = map {
+                        my $doc = $_->get_document;
+                        PublicInbox::SearchMsg->load_doc($doc)->mid;
+                } $mset->items;
+                return \@mids if scalar(@mids);
+        }
+}
+
 sub ext_msg {
         my ($ctx) = @_;
         my $cur = $ctx->{-inbox};
@@ -56,41 +102,23 @@ sub ext_msg {
         return exact($ctx, \@found, $mid) if @found;
 
         # fall back to partial MID matching
-        my $n_partial = 0;
         my @partial;
-
-        if (my $mm = $cur->mm) {
-                my $tmp_mid = $mid;
-                my $res = $mm->mid_prefixes($tmp_mid, 100);
-                if ($res && scalar(@$res)) {
-                        $n_partial += scalar(@$res);
-                        push @partial, [ $cur, $res ];
-                # fixup common errors:
-                } elsif ($tmp_mid =~ s,/[tTf],,) {
-                        $res = $mm->mid_prefixes($tmp_mid, 100);
-                        if ($res && scalar(@$res)) {
-                                $n_partial += scalar(@$res);
-                                push @partial, [ $cur, $res ];
-                        }
-                }
+        my $n_partial = 0;
+        my $srch = $cur->search;
+        my $mids = search_partial($srch, $mid) if $srch;
+        if ($mids) {
+                $n_partial = scalar(@$mids);
+                push @partial, [ $cur, $mids ];
         }
 
         # can't find a partial match in current inbox, try the others:
         if (!$n_partial && length($mid) >= 16) {
-                my $tmp_mid = $mid;
-again:
                 foreach my $ibx (@ibx) {
-                        my $mm = $ibx->mm or next;
-                        my $res = $mm->mid_prefixes($tmp_mid, 100);
-                        if ($res && scalar(@$res)) {
-                                $n_partial += scalar(@$res);
-                                push @partial, [ $ibx, $res ];
-                                last if $n_partial >= 100;
-                        }
-                }
-                # fixup common errors:
-                if (!$n_partial && $tmp_mid =~ s,/[tTf],,) {
-                        goto again;
+                        $srch = $ibx->search or next;
+                        $mids = search_partial($srch, $mid) or next;
+                        $n_partial += scalar(@$mids);
+                        push @partial, [ $ibx, $mids];
+                        last if $n_partial >= PARTIAL_MAX;
                 }
         }
 
@@ -103,6 +131,7 @@ again:
         if ($n_partial) {
                 $code = 300;
                 my $es = $n_partial == 1 ? '' : 'es';
+                $n_partial .= '+' if ($n_partial == PARTIAL_MAX);
                 $s .= "\n$n_partial partial match$es found:\n\n";
                 my $cur_name = $cur->{name};
                 foreach my $pair (@partial) {
diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index 6e758c1a..192e311a 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -36,7 +36,6 @@ sub dbh_new {
                 ReadOnly => !$writable,
                 sqlite_use_immediate_transaction => 1,
         });
-        $dbh->do('PRAGMA case_sensitive_like = ON');
         $dbh;
 }
 
@@ -151,23 +150,6 @@ sub minmax {
         ($min, $sth->fetchrow_array);
 }
 
-sub mid_prefixes {
-        my ($self, $pfx, $limit) = @_;
-
-        die "No prefix given" unless (defined $pfx && $pfx ne '');
-        $pfx =~ s/([%_])/\\$1/g;
-        $pfx .= '%';
-
-        $limit ||= 100;
-        $limit += 0; # force to integer
-        $limit ||= 100;
-
-        $self->{dbh}->selectcol_arrayref('SELECT mid FROM msgmap ' .
-                                         'WHERE mid LIKE ? ESCAPE ? ' .
-                                         "ORDER BY num DESC LIMIT $limit",
-                                         undef, $pfx, '\\');
-}
-
 sub mid_delete {
         my ($self, $mid) = @_;
         my $dbh = $self->{dbh};
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 7175ddc5..5aabda02 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -260,6 +260,7 @@ sub qp {
         $qp->set_database($self->{xdb});
         $qp->set_stemmer($self->stemmer);
         $qp->set_stemming_strategy(STEM_SOME);
+        $qp->set_max_wildcard_expansion(100);
         $qp->add_valuerangeprocessor(
                 Search::Xapian::NumberValueRangeProcessor->new(YYYYMMDD, 'd:'));
         $qp->add_valuerangeprocessor(
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 4dc81352..aeb363e0 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -331,6 +331,13 @@ sub add_message {
 
                 foreach my $mid (@$mids) {
                         $tg->index_text($mid, 1, 'XM');
+
+                        # because too many Message-IDs are prefixed with
+                        # "Pine.LNX."...
+                        if ($mid =~ /\w{12,}/) {
+                                my @long = ($mid =~ /(\w{3,}+)/g);
+                                $tg->index_text(join(' ', @long), 1, 'XM');
+                        }
                         $tg->increase_termpos;
                 }
                 $smsg->{to} = $smsg->{cc} = '';