user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 3/5] searchidx: index THREADID in Xapian
  2020-08-22  6:06  7% [PATCH 0/5] "mairix -t" workalike for mbox.gz downloads Eric Wong
@ 2020-08-22  6:06  5% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-08-22  6:06 UTC (permalink / raw)
  To: meta

This is the `tid' column from over.sqlite3, and it will be used for
IMAP and JMAP search (among other things).
---
 Documentation/standards.perl      |  4 ++++
 lib/PublicInbox/Over.pm           |  2 +-
 lib/PublicInbox/OverIdx.pm        | 18 +++++++++---------
 lib/PublicInbox/Search.pm         |  4 ++--
 lib/PublicInbox/SearchIdx.pm      |  1 +
 lib/PublicInbox/SearchIdxShard.pm |  7 ++++---
 lib/PublicInbox/Smsg.pm           |  3 ++-
 t/over.t                          | 13 +++++++------
 8 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/Documentation/standards.perl b/Documentation/standards.perl
index a64f033e..0ac6cc52 100755
--- a/Documentation/standards.perl
+++ b/Documentation/standards.perl
@@ -48,8 +48,12 @@ my $rfcs = [
 	# 5032 = 'WITHIN search extension for IMAP',
 	4978 => 'IMAP COMPRESS Extension',
 	# 5182 = 'IMAP Extension for Referencing the Last SEARCH Result',
+	# 5256 => 'IMAP SORT and THREAD extensions',
 	# 5738 =>  'IMAP Support for UTF-8',
 	# 8474 => 'IMAP Extension for Object Identifiers',
+
+	# 8620 => JSON Meta Application Protocol (JMAP)
+	# 8621 => JSON Meta Application Protocol (JMAP) for Mail
 	# ...
 
 	# TODO: flesh this out
diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm
index a055b4cd..34d0b05d 100644
--- a/lib/PublicInbox/Over.pm
+++ b/lib/PublicInbox/Over.pm
@@ -213,7 +213,7 @@ sub get_art {
 	my ($self, $num) = @_;
 	# caching $sth ourselves is faster than prepare_cached
 	my $sth = $self->{-get_art} //= $self->connect->prepare(<<'');
-SELECT num,ds,ts,ddd FROM over WHERE num = ? LIMIT 1
+SELECT num,tid,ds,ts,ddd FROM over WHERE num = ? LIMIT 1
 
 	$sth->execute($num);
 	my $smsg = $sth->fetchrow_hashref;
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 4543bfa1..d42d6fe7 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -284,7 +284,7 @@ sub add_overview {
 	my $dd = $smsg->to_doc_data;
 	utf8::encode($dd);
 	$dd = compress($dd);
-	add_over($self, [ @$smsg{qw(ts ds num)}, $mids, $refs, $xpath, $dd ]);
+	add_over($self, $smsg, $mids, $refs, $xpath, $dd);
 }
 
 sub _add_over {
@@ -311,10 +311,10 @@ sub _add_over {
 }
 
 sub add_over {
-	my ($self, $values) = @_;
-	my ($ts, $ds, $num, $mids, $refs, $xpath, $ddd) = @$values;
+	my ($self, $smsg, $mids, $refs, $xpath, $ddd) = @_;
 	my $old_tid;
 	my $vivified = 0;
+	my $num = $smsg->{num};
 
 	begin_lazy($self);
 	delete_by_num($self, $num, \$old_tid);
@@ -326,17 +326,17 @@ sub add_over {
 		$v > 1 and warn "BUG: vivified multiple ($v) ghosts for $mid\n";
 		$vivified += $v;
 	}
-	my $tid = $vivified ? $old_tid : link_refs($self, $refs, $old_tid);
-	my $sid = sid($self, $xpath);
+	$smsg->{tid} = $vivified ? $old_tid : link_refs($self, $refs, $old_tid);
+	$smsg->{sid} = sid($self, $xpath);
 	my $dbh = $self->{dbh};
 	my $sth = $dbh->prepare_cached(<<'');
 INSERT INTO over (num, tid, sid, ts, ds, ddd)
 VALUES (?,?,?,?,?,?)
 
-	my $n = 0;
-	my @v = ($num, $tid, $sid, $ts, $ds);
-	foreach (@v) { $sth->bind_param(++$n, $_) }
-	$sth->bind_param(++$n, $ddd, SQL_BLOB);
+	my $nc = 1;
+	$sth->bind_param($nc, $num);
+	$sth->bind_param(++$nc, $smsg->{$_}) for (qw(tid sid ts ds));
+	$sth->bind_param(++$nc, $ddd, SQL_BLOB);
 	$sth->execute;
 	$sth = $dbh->prepare_cached(<<'');
 INSERT INTO id2num (id, num) VALUES (?,?)
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index c18e19d4..4cfb7b38 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -18,9 +18,9 @@ use constant {
 	# added for public-inbox 1.6.0+
 	BYTES => 3, # IMAP RFC822.SIZE
 	UID => 4, # IMAP UID == NNTP article number == Xapian docid
+	THREADID => 5, # RFC 8474, RFC 8621
 
 	# TODO
-	# THREADID => ?
 	# REPLYCNT => ?, # IMAP ANSWERED
 
 	# SCHEMA_VERSION history
@@ -47,7 +47,7 @@ use constant {
 	#      public-inbox v1.5.0 adds (still SCHEMA_VERSION=15):
 	#      * "lid:" and "l:" for List-Id searches
 	#
-	#      v1.6.0 adds BYTES and UID values
+	#      v1.6.0 adds BYTES, UID and THREADID values
 	SCHEMA_VERSION => 15,
 };
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 098fead7..baa6f41a 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -356,6 +356,7 @@ sub add_xapian ($$$$) {
 	add_val($doc, PublicInbox::Search::DT(), $dt);
 	add_val($doc, PublicInbox::Search::BYTES(), $smsg->{bytes});
 	add_val($doc, PublicInbox::Search::UID(), $smsg->{num});
+	add_val($doc, PublicInbox::Search::THREADID, $smsg->{tid});
 
 	my $tg = term_generator($self);
 	$tg->set_document($doc);
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index c0f8be89..f23d23d0 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -68,8 +68,8 @@ sub shard_worker_loop ($$$$$) {
 		} else {
 			chomp $line;
 			# n.b. $mid may contain spaces(!)
-			my ($to_read, $bytes, $num, $blob, $ds, $ts, $mid) =
-							split(/ /, $line, 7);
+			my ($to_read, $bytes, $num, $blob, $ds, $ts, $tid, $mid)
+				= split(/ /, $line, 8);
 			$self->begin_txn_lazy;
 			my $n = read($r, my $msg, $to_read) or die "read: $!\n";
 			$n == $to_read or die "short read: $n != $to_read\n";
@@ -79,6 +79,7 @@ sub shard_worker_loop ($$$$$) {
 				num => $num + 0,
 				blob => $blob,
 				mid => $mid,
+				tid => $tid,
 				ds => $ds,
 				ts => $ts,
 			}, 'PublicInbox::Smsg';
@@ -93,7 +94,7 @@ sub index_raw {
 	if (my $w = $self->{w}) {
 		# mid must be last, it can contain spaces (but not LF)
 		print $w join(' ', @$smsg{qw(raw_bytes bytes
-						num blob ds ts mid)}),
+						num blob ds ts tid mid)}),
 			"\n", $$msgref or die "failed to write shard $!\n";
 	} else {
 		if ($eml) {
diff --git a/lib/PublicInbox/Smsg.pm b/lib/PublicInbox/Smsg.pm
index 51226b8e..0a0384ef 100644
--- a/lib/PublicInbox/Smsg.pm
+++ b/lib/PublicInbox/Smsg.pm
@@ -82,7 +82,8 @@ sub psgi_cull ($) {
 
 	# drop NNTP-only fields which aren't relevant to PSGI results:
 	# saves ~80K on a 200 item search result:
-	delete @$self{qw(ts to cc bytes lines)};
+	# TODO: we may need to keep some of these for JMAP...
+	delete @$self{qw(ts tid to cc bytes lines)};
 	$self;
 }
 
diff --git a/t/over.t b/t/over.t
index 734fdaa3..07672aa7 100644
--- a/t/over.t
+++ b/t/over.t
@@ -40,22 +40,23 @@ $y = $over->create_ghost('NEVAR');
 is($y, $x + 1, 'integer tid for ghost increases');
 
 my $ddd = compress('');
+my $msg = sub { { ts => 0, ds => 0, num => $_[0] } };
 foreach my $s ('', undef) {
-	$over->add_over([0, 0, 98, [ 'a' ], [], $s, $ddd]);
-	$over->add_over([0, 0, 99, [ 'b' ], [], $s, $ddd]);
+	$over->add_over($msg->(98), [ 'a' ], [], $s, $ddd);
+	$over->add_over($msg->(99), [ 'b' ], [], $s, $ddd);
 	my $msgs = [ map { $_->{num} } @{$over->get_thread('a')} ];
 	is_deeply([98], $msgs,
 		'messages not linked by empty subject');
 }
 
-$over->add_over([0, 0, 98, [ 'a' ], [], 's', $ddd]);
-$over->add_over([0, 0, 99, [ 'b' ], [], 's', $ddd]);
+$over->add_over($msg->(98), [ 'a' ], [], 's', $ddd);
+$over->add_over($msg->(99), [ 'b' ], [], 's', $ddd);
 foreach my $mid (qw(a b)) {
 	my $msgs = [ map { $_->{num} } @{$over->get_thread('a')} ];
 	is_deeply([98, 99], $msgs, 'linked messages by subject');
 }
-$over->add_over([0, 0, 98, [ 'a' ], [], 's', $ddd]);
-$over->add_over([0, 0, 99, [ 'b' ], ['a'], 'diff', $ddd]);
+$over->add_over($msg->(98), [ 'a' ], [], 's', $ddd);
+$over->add_over($msg->(99), [ 'b' ], ['a'], 'diff', $ddd);
 foreach my $mid (qw(a b)) {
 	my $msgs = [ map { $_->{num} } @{$over->get_thread($mid)} ];
 	is_deeply([98, 99], $msgs, "linked messages by Message-ID: <$mid>");

^ permalink raw reply related	[relevance 5%]

* [PATCH 0/5] "mairix -t" workalike for mbox.gz downloads
@ 2020-08-22  6:06  7% Eric Wong
  2020-08-22  6:06  5% ` [PATCH 3/5] searchidx: index THREADID in Xapian Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-08-22  6:06 UTC (permalink / raw)
  To: meta

Actually, the Xapian aspect of it turned out to be easy once
I learned ->set_collapse_key.

Getting the tests and compatibility with existing (pre-upgrade)
inboxes was more work.

It requires "public-inbox-index --reindex" to activate;
but PATCH 5/5 makes it safe to upgrade WWW either before
or after --reindex.  That means BOFHs can upgrade without
regard to ordering.

Tested with w3m, links, and lynx (I actually split out
my lynx fix separately):
  https://public-inbox.org/meta/20200822004125.9458-1-e@80x24.org/

TODO: CLI tool support, HTML interface, JMAP, etc...

Eric Wong (5):
  searchidxshard: clear $msgref buffer properly
  searchidx: put all shard-related stuff in SearchIdxShard.pm
  searchidx: index THREADID in Xapian
  search: support downloading mboxes results with full thread
  mbox: disable "&t" on existing Xapian until full reindex

 Documentation/standards.perl      |  4 +++
 lib/PublicInbox/Mbox.pm           | 54 +++++++++++++++++++++++++------
 lib/PublicInbox/Over.pm           | 31 +++++++++++++++++-
 lib/PublicInbox/OverIdx.pm        | 18 +++++------
 lib/PublicInbox/Search.pm         | 16 +++++++--
 lib/PublicInbox/SearchIdx.pm      | 51 +++++++++--------------------
 lib/PublicInbox/SearchIdxShard.pm | 48 ++++++++++++++++++++++-----
 lib/PublicInbox/SearchQuery.pm    |  8 +++--
 lib/PublicInbox/SearchView.pm     | 30 +++++++++++------
 lib/PublicInbox/Smsg.pm           |  3 +-
 lib/PublicInbox/V2Writable.pm     | 19 ++++++++---
 t/init.t                          |  1 +
 t/over.t                          | 13 ++++----
 t/psgi_search.t                   | 39 ++++++++++++++++++++--
 14 files changed, 244 insertions(+), 91 deletions(-)

^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-08-22  6:06  7% [PATCH 0/5] "mairix -t" workalike for mbox.gz downloads Eric Wong
2020-08-22  6:06  5% ` [PATCH 3/5] searchidx: index THREADID in Xapian Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).