From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 547E81FA10 for ; Sat, 22 Aug 2020 06:06:28 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 3/5] searchidx: index THREADID in Xapian Date: Sat, 22 Aug 2020 06:06:25 +0000 Message-Id: <20200822060627.15595-4-e@yhbt.net> In-Reply-To: <20200822060627.15595-1-e@yhbt.net> References: <20200822060627.15595-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This is the `tid' column from over.sqlite3; it will be used for IMAP and JMAP search (among other things). --- Documentation/standards.perl | 4 ++++ lib/PublicInbox/Over.pm | 2 +- lib/PublicInbox/OverIdx.pm | 18 +++++++++--------- lib/PublicInbox/Search.pm | 4 ++-- lib/PublicInbox/SearchIdx.pm | 1 + lib/PublicInbox/SearchIdxShard.pm | 7 ++++--- lib/PublicInbox/Smsg.pm | 3 ++- t/over.t | 13 +++++++------ 8 files changed, 30 insertions(+), 22 deletions(-) diff --git a/Documentation/standards.perl b/Documentation/standards.perl index a64f033e..0ac6cc52 100755 --- a/Documentation/standards.perl +++ b/Documentation/standards.perl @@ -48,8 +48,12 @@ my $rfcs = [ # 5032 = 'WITHIN search extension for IMAP', 4978 => 'IMAP COMPRESS Extension', # 5182 = 'IMAP Extension for Referencing the Last SEARCH Result', + # 5256 => 'IMAP SORT and THREAD extensions', # 5738 => 'IMAP Support for UTF-8', # 8474 => 'IMAP Extension for Object Identifiers', + + # 8620 => JSON Meta Application Protocol (JMAP) + # 8621 => JSON Meta Application Protocol (JMAP) for Mail # ... 
# TODO: flesh this out diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm index a055b4cd..34d0b05d 100644 --- a/lib/PublicInbox/Over.pm +++ b/lib/PublicInbox/Over.pm @@ -213,7 +213,7 @@ sub get_art { my ($self, $num) = @_; # caching $sth ourselves is faster than prepare_cached my $sth = $self->{-get_art} //= $self->connect->prepare(<<''); -SELECT num,ds,ts,ddd FROM over WHERE num = ? LIMIT 1 +SELECT num,tid,ds,ts,ddd FROM over WHERE num = ? LIMIT 1 $sth->execute($num); my $smsg = $sth->fetchrow_hashref; diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 4543bfa1..d42d6fe7 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -284,7 +284,7 @@ sub add_overview { my $dd = $smsg->to_doc_data; utf8::encode($dd); $dd = compress($dd); - add_over($self, [ @$smsg{qw(ts ds num)}, $mids, $refs, $xpath, $dd ]); + add_over($self, $smsg, $mids, $refs, $xpath, $dd); } sub _add_over { @@ -311,10 +311,10 @@ sub _add_over { } sub add_over { - my ($self, $values) = @_; - my ($ts, $ds, $num, $mids, $refs, $xpath, $ddd) = @$values; + my ($self, $smsg, $mids, $refs, $xpath, $ddd) = @_; my $old_tid; my $vivified = 0; + my $num = $smsg->{num}; begin_lazy($self); delete_by_num($self, $num, \$old_tid); @@ -326,17 +326,17 @@ sub add_over { $v > 1 and warn "BUG: vivified multiple ($v) ghosts for $mid\n"; $vivified += $v; } - my $tid = $vivified ? $old_tid : link_refs($self, $refs, $old_tid); - my $sid = sid($self, $xpath); + $smsg->{tid} = $vivified ? $old_tid : link_refs($self, $refs, $old_tid); + $smsg->{sid} = sid($self, $xpath); my $dbh = $self->{dbh}; my $sth = $dbh->prepare_cached(<<''); INSERT INTO over (num, tid, sid, ts, ds, ddd) VALUES (?,?,?,?,?,?) 
- my $n = 0; - my @v = ($num, $tid, $sid, $ts, $ds); - foreach (@v) { $sth->bind_param(++$n, $_) } - $sth->bind_param(++$n, $ddd, SQL_BLOB); + my $nc = 1; + $sth->bind_param($nc, $num); + $sth->bind_param(++$nc, $smsg->{$_}) for (qw(tid sid ts ds)); + $sth->bind_param(++$nc, $ddd, SQL_BLOB); $sth->execute; $sth = $dbh->prepare_cached(<<''); INSERT INTO id2num (id, num) VALUES (?,?) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index c18e19d4..4cfb7b38 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -18,9 +18,9 @@ use constant { # added for public-inbox 1.6.0+ BYTES => 3, # IMAP RFC822.SIZE UID => 4, # IMAP UID == NNTP article number == Xapian docid + THREADID => 5, # RFC 8474, RFC 8621 # TODO - # THREADID => ? # REPLYCNT => ?, # IMAP ANSWERED # SCHEMA_VERSION history @@ -47,7 +47,7 @@ use constant { # public-inbox v1.5.0 adds (still SCHEMA_VERSION=15): # * "lid:" and "l:" for List-Id searches # - # v1.6.0 adds BYTES and UID values + # v1.6.0 adds BYTES, UID and THREADID values SCHEMA_VERSION => 15, }; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 098fead7..baa6f41a 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -356,6 +356,7 @@ sub add_xapian ($$$$) { add_val($doc, PublicInbox::Search::DT(), $dt); add_val($doc, PublicInbox::Search::BYTES(), $smsg->{bytes}); add_val($doc, PublicInbox::Search::UID(), $smsg->{num}); + add_val($doc, PublicInbox::Search::THREADID, $smsg->{tid}); my $tg = term_generator($self); $tg->set_document($doc); diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index c0f8be89..f23d23d0 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -68,8 +68,8 @@ sub shard_worker_loop ($$$$$) { } else { chomp $line; # n.b. $mid may contain spaces(!) 
- my ($to_read, $bytes, $num, $blob, $ds, $ts, $mid) = - split(/ /, $line, 7); + my ($to_read, $bytes, $num, $blob, $ds, $ts, $tid, $mid) + = split(/ /, $line, 8); $self->begin_txn_lazy; my $n = read($r, my $msg, $to_read) or die "read: $!\n"; $n == $to_read or die "short read: $n != $to_read\n"; @@ -79,6 +79,7 @@ sub shard_worker_loop ($$$$$) { num => $num + 0, blob => $blob, mid => $mid, + tid => $tid, ds => $ds, ts => $ts, }, 'PublicInbox::Smsg'; @@ -93,7 +94,7 @@ sub index_raw { if (my $w = $self->{w}) { # mid must be last, it can contain spaces (but not LF) print $w join(' ', @$smsg{qw(raw_bytes bytes - num blob ds ts mid)}), + num blob ds ts tid mid)}), "\n", $$msgref or die "failed to write shard $!\n"; } else { if ($eml) { diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm index 51226b8e..0a0384ef 100644 --- a/lib/PublicInbox/Smsg.pm +++ b/lib/PublicInbox/Smsg.pm @@ -82,7 +82,8 @@ sub psgi_cull ($) { # drop NNTP-only fields which aren't relevant to PSGI results: # saves ~80K on a 200 item search result: - delete @$self{qw(ts to cc bytes lines)}; + # TODO: we may need to keep some of these for JMAP... 
+ delete @$self{qw(ts tid to cc bytes lines)}; $self; } diff --git a/t/over.t b/t/over.t index 734fdaa3..07672aa7 100644 --- a/t/over.t +++ b/t/over.t @@ -40,22 +40,23 @@ $y = $over->create_ghost('NEVAR'); is($y, $x + 1, 'integer tid for ghost increases'); my $ddd = compress(''); +my $msg = sub { { ts => 0, ds => 0, num => $_[0] } }; foreach my $s ('', undef) { - $over->add_over([0, 0, 98, [ 'a' ], [], $s, $ddd]); - $over->add_over([0, 0, 99, [ 'b' ], [], $s, $ddd]); + $over->add_over($msg->(98), [ 'a' ], [], $s, $ddd); + $over->add_over($msg->(99), [ 'b' ], [], $s, $ddd); my $msgs = [ map { $_->{num} } @{$over->get_thread('a')} ]; is_deeply([98], $msgs, 'messages not linked by empty subject'); } -$over->add_over([0, 0, 98, [ 'a' ], [], 's', $ddd]); -$over->add_over([0, 0, 99, [ 'b' ], [], 's', $ddd]); +$over->add_over($msg->(98), [ 'a' ], [], 's', $ddd); +$over->add_over($msg->(99), [ 'b' ], [], 's', $ddd); foreach my $mid (qw(a b)) { my $msgs = [ map { $_->{num} } @{$over->get_thread('a')} ]; is_deeply([98, 99], $msgs, 'linked messages by subject'); } -$over->add_over([0, 0, 98, [ 'a' ], [], 's', $ddd]); -$over->add_over([0, 0, 99, [ 'b' ], ['a'], 'diff', $ddd]); +$over->add_over($msg->(98), [ 'a' ], [], 's', $ddd); +$over->add_over($msg->(99), [ 'b' ], ['a'], 'diff', $ddd); foreach my $mid (qw(a b)) { my $msgs = [ map { $_->{num} } @{$over->get_thread($mid)} ]; is_deeply([98, 99], $msgs, "linked messages by Message-ID: <$mid>");