From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.1 (2015-04-28) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.1 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id B4F3C1F597; Sun, 5 Aug 2018 06:04:40 +0000 (UTC) Date: Sun, 5 Aug 2018 06:04:40 +0000 From: Eric Wong To: Konstantin Ryabitsev Cc: meta@public-inbox.org Subject: [PATCH] view: distinguish strict and loose thread matches Message-ID: <20180805060440.fhl7zvyis246e3ym@dcvr> References: <20180803182647.GA28438@chatter> <20180803192056.5swqcf67bsdxbpg6@dcvr> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: List-Id: The "loose" (Subject:-based) thread matching yields too many hits for some common subjects (e.g. "[GIT] Networking" on LKML) and causes thread skeletons to not show the current messages. Favor strict matches in the query and only add loose matches if there's space. While working on this, I noticed the backwards --reindex walk breaks `tid' on v1 repositories, at least. That bug was hidden by the Subject: match logic and not discovered until now. It will be fixed separately. Reported-by: Konstantin Ryabitsev --- lib/PublicInbox/Over.pm | 43 ++++++++++++++++++++++------ lib/PublicInbox/View.pm | 63 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 93 insertions(+), 13 deletions(-) diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm index b2f6883..dda1e6d 100644 --- a/lib/PublicInbox/Over.pm +++ b/lib/PublicInbox/Over.pm @@ -11,6 +11,7 @@ package PublicInbox::Over; use DBD::SQLite; use PublicInbox::SearchMsg; use Compress::Zlib qw(uncompress); +use constant DEFAULT_LIMIT => 1000; sub dbh_new { my ($self) = @_; @@ -53,7 +54,7 @@ sub load_from_row { sub do_get { my ($self, $sql, $opts, @args) = @_; my $dbh = $self->connect; - my $lim = (($opts->{limit} || 0) + 0) || 1000; + my $lim = (($opts->{limit} || 0) + 0) || DEFAULT_LIMIT; $sql .= "LIMIT $lim"; my $msgs = $dbh->selectall_arrayref($sql, { Slice => {} }, @args); load_from_row($_) for @$msgs; @@ -97,21 +98,47 @@ sub get_thread { SELECT tid,sid FROM over WHERE num = ? LIMIT 1 defined $tid or return nothing; # $sid may be undef + + my $cond_all = '(tid = ? OR sid = ?) AND num > ?'; my $sort_col = 'ds'; $num = 0; - if ($prev) { + if ($prev) { # mboxrd stream, only $num = $prev->{num} || 0; $sort_col = 'num'; } - my $cond = '(tid = ? OR sid = ?) AND num > ?'; - my $msgs = do_get($self, <<"", {}, $tid, $sid, $num); -SELECT num,ts,ds,ddd FROM over WHERE $cond ORDER BY $sort_col ASC - return $msgs unless wantarray; + my $cols = 'num,ts,ds,ddd'; + unless (wantarray) { + return do_get($self, <<"", {}, $tid, $sid, $num); +SELECT $cols FROM over WHERE $cond_all +ORDER BY $sort_col ASC - my $nr = $dbh->selectrow_array(<<"", undef, $tid, $sid, $num); -SELECT COUNT(num) FROM over WHERE $cond + } + # HTML view always wants an array and never uses $prev, + # but the mbox stream never wants an array and always has $prev + die '$prev not supported with wantarray' if $prev; + my $nr = $dbh->selectrow_array(<<"", undef, $tid, $sid, $num); +SELECT COUNT(num) FROM over WHERE $cond_all + + # giant thread, prioritize strict (tid) matches and throw + # in the loose (sid) matches at the end + my $msgs = do_get($self, <<"", {}, $tid, $num); +SELECT $cols FROM over WHERE tid = ? AND num > ? +ORDER BY $sort_col ASC + + # do we have room for loose matches? get the most recent ones, first: + my $lim = DEFAULT_LIMIT - scalar(@$msgs); + if ($lim > 0) { + my $opts = { limit => $lim }; + my $loose = do_get($self, <<"", $opts, $tid, $sid, $num); +SELECT $cols FROM over WHERE tid != ? AND sid = ? AND num > ? +ORDER BY $sort_col DESC + + # TODO separate strict and loose matches here once --reindex + # is fixed to preserve `tid' properly + push @$msgs, @$loose; + } ($nr, $msgs); } diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 58851ed..eb002ae 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -365,7 +365,7 @@ sub walk_thread { while (@q) { my ($level, $node, $i) = splice(@q, 0, 3); defined $node or next; - $cb->($ctx, $level, $node, $i); + $cb->($ctx, $level, $node, $i) or return; ++$level; $i = 0; unshift @q, map { ($level, $_, $i++) } @{$node->{children}}; @@ -818,10 +818,56 @@ sub indent_for { $level ? INDENT x ($level - 1) : ''; } +sub find_mid_root { + my ($ctx, $level, $node, $idx) = @_; + ++$ctx->{root_idx} if $level == 0; + if ($node->{id} eq $ctx->{mid}) { + $ctx->{found_mid_at} = $ctx->{root_idx}; + return 0; + } + 1; +} + +sub strict_loose_note ($) { + my ($nr) = @_; + my $msg = +" -- strict thread matches above, loose matches on Subject: below --\n"; + + if ($nr > PublicInbox::Over::DEFAULT_LIMIT()) { + $msg .= +" -- use mbox.gz link to download all $nr messages --\n"; + } + $msg; +} + sub thread_results { my ($ctx, $msgs) = @_; require PublicInbox::SearchThread; - PublicInbox::SearchThread::thread($msgs, *sort_ds, $ctx->{-inbox}); + my $ibx = $ctx->{-inbox}; + my $rootset = PublicInbox::SearchThread::thread($msgs, *sort_ds, $ibx); + + # FIXME: `tid' is broken on --reindex, so that needs to be fixed + # and preserved in the future. This bug is hidden by `sid' matches + # in get_thread, so we never noticed it until now. And even when + # reindexing is fixed, we'll keep this code until a SCHEMA_VERSION + # bump since reindexing is expensive and users may not do it + + # loose threading could've returned too many results, + # put the root the message we care about at the top: + my $mid = $ctx->{mid}; + if (defined($mid) && scalar(@$rootset) > 1) { + $ctx->{root_idx} = -1; + my $nr = scalar @$msgs; + walk_thread($rootset, $ctx, *find_mid_root); + my $idx = $ctx->{found_mid_at}; + if (defined($idx) && $idx != 0) { + my $tip = splice(@$rootset, $idx, 1); + @$rootset = reverse @$rootset; + unshift @$rootset, $tip; + $ctx->{sl_note} = strict_loose_note($nr); + } + } + $rootset } sub missing_thread { @@ -864,6 +910,10 @@ sub skel_dump { my $cur = $ctx->{cur}; my $mid = $smsg->{mid}; + if ($level == 0 && $ctx->{skel_dump_roots}++) { + $$dst .= delete $ctx->{sl_note} || ''; + } + my $f = ascii_html($smsg->from_name); my $obfs_ibx = $ctx->{-obfs_ibx}; obfuscate_addrs($obfs_ibx, $f) if $obfs_ibx; @@ -882,7 +932,7 @@ sub skel_dump { delete $ctx->{cur}; $$dst .= "$d". "$attr [this message]\n"; - return; + return 1; } else { $ctx->{prev_msg} = $mid; } @@ -922,6 +972,7 @@ sub skel_dump { $m = $ctx->{-upfx}.mid_escape($mid).'/'; } $$dst .= $d . "" . $end; + 1; } sub _skel_ghost { @@ -947,6 +998,7 @@ sub _skel_ghost { } my $dst = $ctx->{dst}; $$dst .= $d; + 1; } sub sort_ds { @@ -973,7 +1025,7 @@ sub acc_topic { $topic = [ $ds, 1, { $subj => $mid }, $subj ]; $ctx->{-cur_topic} = $topic; push @{$ctx->{order}}, $topic; - return; + return 1; } $topic = $ctx->{-cur_topic}; # should never be undef @@ -987,11 +1039,12 @@ sub acc_topic { } $seen->{$subj} = $mid; # latest for subject } else { # ghost message - return if $level != 0; # ignore child ghosts + return 1 if $level != 0; # ignore child ghosts $topic = [ -666, 0, {} ]; $ctx->{-cur_topic} = $topic; push @{$ctx->{order}}, $topic; } + 1; } sub dump_topics { -- EW