about summary refs log tree commit homepage
path: root/lib/PublicInbox/Over.pm
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2018-08-05 06:04:40 +0000
committerEric Wong <e@80x24.org>2018-08-05 06:12:33 +0000
commit0c22c3e3a99527ebf04a647ca287ea2fc9bdeddd (patch)
tree381fa5ea5c80452950c774fb278dc9f95cc0dfdc /lib/PublicInbox/Over.pm
parent861bec7bec5908871e5b0ede244cb1e990a47403 (diff)
downloadpublic-inbox-0c22c3e3a99527ebf04a647ca287ea2fc9bdeddd.tar.gz
The "loose" (Subject:-based) thread matching yields too many
hits for some common subjects (e.g. "[GIT] Networking" on LKML)
and causes thread skeletons to not show the current messages.
Favor strict matches in the query and only add loose matches
if there's space.

While working on this, I noticed the backwards --reindex walk
breaks `tid' on v1 repositories, at least.  That bug was hidden
by the Subject: match logic and not discovered until now.  It
will be fixed separately.

Reported-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Diffstat (limited to 'lib/PublicInbox/Over.pm')
-rw-r--r--lib/PublicInbox/Over.pm43
1 files changed, 35 insertions, 8 deletions
diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm
index b2f68835..dda1e6d0 100644
--- a/lib/PublicInbox/Over.pm
+++ b/lib/PublicInbox/Over.pm
@@ -11,6 +11,7 @@ use DBI;
 use DBD::SQLite;
 use PublicInbox::SearchMsg;
 use Compress::Zlib qw(uncompress);
+use constant DEFAULT_LIMIT => 1000;
 
 sub dbh_new {
         my ($self) = @_;
@@ -53,7 +54,7 @@ sub load_from_row {
 sub do_get {
         my ($self, $sql, $opts, @args) = @_;
         my $dbh = $self->connect;
-        my $lim = (($opts->{limit} || 0) + 0) || 1000;
+        my $lim = (($opts->{limit} || 0) + 0) || DEFAULT_LIMIT;
         $sql .= "LIMIT $lim";
         my $msgs = $dbh->selectall_arrayref($sql, { Slice => {} }, @args);
         load_from_row($_) for @$msgs;
@@ -97,21 +98,47 @@ ORDER BY num ASC LIMIT 1
 SELECT tid,sid FROM over WHERE num = ? LIMIT 1
 
         defined $tid or return nothing; # $sid may be undef
+
+        my $cond_all = '(tid = ? OR sid = ?) AND num > ?';
         my $sort_col = 'ds';
         $num = 0;
-        if ($prev) {
+        if ($prev) { # mboxrd stream, only
                 $num = $prev->{num} || 0;
                 $sort_col = 'num';
         }
-        my $cond = '(tid = ? OR sid = ?) AND num > ?';
-        my $msgs = do_get($self, <<"", {}, $tid, $sid, $num);
-SELECT num,ts,ds,ddd FROM over WHERE $cond ORDER BY $sort_col ASC
 
-        return $msgs unless wantarray;
+        my $cols = 'num,ts,ds,ddd';
+        unless (wantarray) {
+                return do_get($self, <<"", {}, $tid, $sid, $num);
+SELECT $cols FROM over WHERE $cond_all
+ORDER BY $sort_col ASC
 
-        my $nr = $dbh->selectrow_array(<<"", undef, $tid, $sid, $num);
-SELECT COUNT(num) FROM over WHERE $cond
+        }
 
+        # HTML view always wants an array and never uses $prev,
+        # but the mbox stream never wants an array and always has $prev
+        die '$prev not supported with wantarray' if $prev;
+        my $nr = $dbh->selectrow_array(<<"", undef, $tid, $sid, $num);
+SELECT COUNT(num) FROM over WHERE $cond_all
+
+        # giant thread, prioritize strict (tid) matches and throw
+        # in the loose (sid) matches at the end
+        my $msgs = do_get($self, <<"", {}, $tid, $num);
+SELECT $cols FROM over WHERE tid = ? AND num > ?
+ORDER BY $sort_col ASC
+
+        # do we have room for loose matches? get the most recent ones, first:
+        my $lim = DEFAULT_LIMIT - scalar(@$msgs);
+        if ($lim > 0) {
+                my $opts = { limit => $lim };
+                my $loose = do_get($self, <<"", $opts, $tid, $sid, $num);
+SELECT $cols FROM over WHERE tid != ? AND sid = ? AND num > ?
+ORDER BY $sort_col DESC
+
+                # TODO separate strict and loose matches here once --reindex
+                # is fixed to preserve `tid' properly
+                push @$msgs, @$loose;
+        }
         ($nr, $msgs);
 }