user/dev discussion of public-inbox itself
 help / Atom feed
* [PATCH 0/2] improve threading performance
@ 2016-12-20  3:03 Eric Wong
  2016-12-20  3:03 ` [PATCH 1/2] tests: add thread-all testing for benchmarking Eric Wong
  2016-12-20  3:03 ` [PATCH 2/2] searchmsg: remove ensure_metadata Eric Wong
  0 siblings, 2 replies; 3+ messages in thread
From: Eric Wong @ 2016-12-20  3:03 UTC (permalink / raw)
  To: meta

Eric Wong (2):
      tests: add thread-all testing for benchmarking
      searchmsg: remove ensure_metadata

 MANIFEST                     |  1 +
 lib/PublicInbox/Search.pm    |  6 ------
 lib/PublicInbox/SearchMsg.pm | 39 ++++++++++++---------------------------
 lib/PublicInbox/View.pm      |  2 +-
 t/search.t                   |  2 --
 t/thread-all.t               | 38 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 52 insertions(+), 36 deletions(-)

^ permalink raw reply	[flat|threaded] 3+ messages in thread

* [PATCH 1/2] tests: add thread-all testing for benchmarking
  2016-12-20  3:03 [PATCH 0/2] improve threading performance Eric Wong
@ 2016-12-20  3:03 ` Eric Wong
  2016-12-20  3:03 ` [PATCH 2/2] searchmsg: remove ensure_metadata Eric Wong
  1 sibling, 0 replies; 3+ messages in thread
From: Eric Wong @ 2016-12-20  3:03 UTC (permalink / raw)
  To: meta; +Cc: Eric Wong

I'll be using this to improve message threading performance.
---
 MANIFEST       |  1 +
 t/thread-all.t | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)
 create mode 100644 t/thread-all.t

diff --git a/MANIFEST b/MANIFEST
index 3388b1a..8f5e487 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -156,6 +156,7 @@ t/qspawn.t
 t/search.t
 t/spamcheck_spamc.t
 t/spawn.t
+t/thread-all.t
 t/thread-cycle.t
 t/utf8.mbox
 t/view.t
diff --git a/t/thread-all.t b/t/thread-all.t
new file mode 100644
index 0000000..8ccf4f8
--- /dev/null
+++ b/t/thread-all.t
@@ -0,0 +1,38 @@
+# Copyright (C) 2016 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+#
+# real-world testing of search threading
+use strict;
+use warnings;
+use Test::More;
+use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC);
+my $pi_dir = $ENV{GIANT_PI_DIR};
+plan skip_all => "GIANT_PI_DIR not defined for $0" unless $pi_dir;
+eval { require PublicInbox::Search; };
+plan skip_all => "Xapian missing for $0" if $@;
+my $srch = eval { PublicInbox::Search->new($pi_dir) };
+plan skip_all => "$pi_dir not initialized for $0" if $@;
+
+require PublicInbox::View;
+require PublicInbox::SearchThread;
+
+my $pfx = PublicInbox::Search::xpfx('thread');
+my $opts = { limit => 1000000, asc => 1 };
+my $t0 = clock_gettime(CLOCK_MONOTONIC);
+my $elapsed;
+
+my $sres = $srch->_do_enquire(undef, $opts);
+$elapsed = clock_gettime(CLOCK_MONOTONIC) - $t0;
+diag "enquire: $elapsed";
+
+$t0 = clock_gettime(CLOCK_MONOTONIC);
+my $msgs = PublicInbox::View::load_results($srch, $sres);
+$elapsed = clock_gettime(CLOCK_MONOTONIC) - $t0;
+diag "load_results $elapsed";
+
+$t0 = clock_gettime(CLOCK_MONOTONIC);
+PublicInbox::View::thread_results($msgs);
+$elapsed = clock_gettime(CLOCK_MONOTONIC) - $t0;
+diag "thread_results $elapsed";
+
+done_testing();
-- 
EW


^ permalink raw reply	[flat|threaded] 3+ messages in thread

* [PATCH 2/2] searchmsg: remove ensure_metadata
  2016-12-20  3:03 [PATCH 0/2] improve threading performance Eric Wong
  2016-12-20  3:03 ` [PATCH 1/2] tests: add thread-all testing for benchmarking Eric Wong
@ 2016-12-20  3:03 ` Eric Wong
  1 sibling, 0 replies; 3+ messages in thread
From: Eric Wong @ 2016-12-20  3:03 UTC (permalink / raw)
  To: meta; +Cc: Eric Wong

Instead, only preload the ->mid field for threading,
as we only need ->thread and ->path once in Search->get_thread
(but we will need the ->mid field repeatedly).

This more than doubles View->load_results performance,
according to thread-all on an inbox with over 300K messages.
---
 lib/PublicInbox/Search.pm    |  6 ------
 lib/PublicInbox/SearchMsg.pm | 39 ++++++++++++---------------------------
 lib/PublicInbox/View.pm      |  2 +-
 t/search.t                   |  2 --
 4 files changed, 13 insertions(+), 36 deletions(-)

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 24cb266..d4f6f77 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -108,12 +108,6 @@ my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix);
 
 sub xpfx { $all_pfx{$_[0]} }
 
-our %PFX2TERM_RMAP;
-my %meta_pfx = (mid => 1, thread => 1, path => 1);
-while (my ($k, $v) = each %all_pfx) {
-	$PFX2TERM_RMAP{$v} = $k if $meta_pfx{$k};
-}
-
 my $mail_query = Search::Xapian::Query->new(xpfx('type') . 'mail');
 
 sub xdir {
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index d62f02c..96406c6 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -10,7 +10,6 @@ use Search::Xapian;
 use Date::Parse qw/str2time/;
 use PublicInbox::MID qw/mid_clean/;
 use PublicInbox::Address;
-our $PFX2TERM_RE = undef;
 
 sub new {
 	my ($class, $mime) = @_;
@@ -121,29 +120,17 @@ sub references {
 	defined $x ? $x : '';
 }
 
-sub ensure_metadata {
-	my ($self) = @_;
+sub _get_term_val ($$$) {
+	my ($self, $pfx, $re) = @_;
 	my $doc = $self->{doc};
 	my $end = $doc->termlist_end;
-
-	unless (defined $PFX2TERM_RE) {
-		my $or = join('|', keys %PublicInbox::Search::PFX2TERM_RMAP);
-		$PFX2TERM_RE = qr/\A($or)/;
-	}
-
-	while (my ($pfx, $field) = each %PublicInbox::Search::PFX2TERM_RMAP) {
-		# ideally we'd move this out of the loop:
-		my $i = $doc->termlist_begin;
-
-		$i->skip_to($pfx);
-		if ($i != $end) {
-			my $val = $i->get_termname;
-
-			if ($val =~ s/$PFX2TERM_RE//o) {
-				$self->{$field} = $val;
-			}
-		}
+	my $i = $doc->termlist_begin;
+	$i->skip_to($pfx);
+	if ($i != $end) {
+		my $val = $i->get_termname;
+		$val =~ s/$re// and return $val;
 	}
+	undef;
 }
 
 sub mid ($;$) {
@@ -154,8 +141,8 @@ sub mid ($;$) {
 	} elsif (my $rv = $self->{mid}) {
 		$rv;
 	} else {
-		$self->ensure_metadata; # needed for ghosts
-		$self->{mid} ||= $self->_extract_mid;
+		$self->{mid} = _get_term_val($self, 'Q', qr/\AQ/) ||
+				$self->_extract_mid;
 	}
 }
 
@@ -194,16 +181,14 @@ sub thread_id {
 	my ($self) = @_;
 	my $tid = $self->{thread};
 	return $tid if defined $tid;
-	$self->ensure_metadata;
-	$self->{thread};
+	$self->{thread} = _get_term_val($self, 'G', qr/\AG/); # *G*roup
 }
 
 sub path {
 	my ($self) = @_;
 	my $path = $self->{path};
 	return $path if defined $path;
-	$self->ensure_metadata;
-	$self->{path};
+	$self->{path} = _get_term_val($self, 'XPATH', qr/\AXPATH/); # path
 }
 
 1;
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index fa47a16..a50cb64 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -737,7 +737,7 @@ sub indent_for {
 sub load_results {
 	my ($srch, $sres) = @_;
 	my $msgs = delete $sres->{msgs};
-	$srch->retry_reopen(sub { [ map { $_->ensure_metadata; $_ } @$msgs ] });
+	$srch->retry_reopen(sub { [ map { $_->mid; $_ } @$msgs ] });
 }
 
 sub msg_timestamp {
diff --git a/t/search.t b/t/search.t
index eed9c9b..c16811d 100644
--- a/t/search.t
+++ b/t/search.t
@@ -109,7 +109,6 @@ sub filter_mids {
 	my $found = $ro->lookup_message('<root@s>');
 	ok($found, "message found");
 	is($root_id, $found->{doc_id}, 'doc_id set correctly');
-	$found->ensure_metadata;
 	is($found->mid, 'root@s', 'mid set correctly');
 	ok(int($found->thread_id) > 0, 'thread_id is an integer');
 
@@ -290,7 +289,6 @@ sub filter_mids {
 		body => "LOOP!\n"));
 	ok($doc_id > 0, "doc_id defined with circular reference");
 	my $smsg = $rw->lookup_message('circle@a');
-	$smsg->ensure_metadata;
 	is($smsg->references, '', "no references created");
 	my $msg = PublicInbox::SearchMsg->load_doc($smsg->{doc});
 	is($s, $msg->subject, 'long subject not rewritten');
-- 
EW


^ permalink raw reply	[flat|threaded] 3+ messages in thread

end of thread, back to index

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-20  3:03 [PATCH 0/2] improve threading performance Eric Wong
2016-12-20  3:03 ` [PATCH 1/2] tests: add thread-all testing for benchmarking Eric Wong
2016-12-20  3:03 ` [PATCH 2/2] searchmsg: remove ensure_metadata Eric Wong

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.org/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/
       or Tor2web: https://www.tor2web.org/

AGPL code for this site: git clone https://public-inbox.org/ public-inbox