user/dev discussion of public-inbox itself
 help / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 14/17] thread: use hash + array instead of hand-rolled linked list
Date: Wed,  5 Oct 2016 23:57:19 +0000
Message-ID: <20161005235722.14857-15-e@80x24.org> (raw)
In-Reply-To: <20161005235722.14857-1-e@80x24.org>

This starts to show noticeable performance improvements when
attempting to thread over 400 messages; but the improvement
may not be measurable with less.

However, the resulting code is much shorter and (IMHO)
much easier to understand.
---
 MANIFEST                        |   1 +
 lib/PublicInbox/SearchThread.pm | 158 +++++++++-------------------------------
 lib/PublicInbox/View.pm         |  28 ++++---
 t/thread-cycle.t                |  86 ++++++++++++++++++++++
 4 files changed, 138 insertions(+), 135 deletions(-)
 create mode 100644 t/thread-cycle.t

diff --git a/MANIFEST b/MANIFEST
index bcc4121..3a4d7c4 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -155,6 +155,7 @@ t/qspawn.t
 t/search.t
 t/spamcheck_spamc.t
 t/spawn.t
+t/thread-cycle.t
 t/utf8.mbox
 t/view.t
 t/watch_maildir.t
diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm
index 153eef2..05de9ec 100644
--- a/lib/PublicInbox/SearchThread.pm
+++ b/lib/PublicInbox/SearchThread.pm
@@ -32,9 +32,8 @@ sub new {
 sub thread {
 	my $self = shift;
 	_add_message($self, $_) foreach @{$self->{messages}};
-	$self->{rootset} = [
-			grep { !$_->{parent} } values %{$self->{id_table}} ];
-	delete $self->{id_table};
+	my $id_table = delete $self->{id_table};
+	$self->{rootset} = [ grep { !$_->{parent} } values %$id_table ];
 }
 
 sub _get_cont_for_id ($$) {
@@ -82,156 +81,67 @@ sub _add_message ($$) {
 
 sub order {
 	my ($self, $ordersub) = @_;
-
-	# make a fake root
-	my $root = _get_cont_for_id($self, 'fakeroot');
-	$root->add_child( $_ ) for @{ $self->{rootset} };
-
-	# sort it
-	$root->order_children( $ordersub );
-
-	# and untangle it
-	my $kids = $root->children;
-	$self->{rootset} = $kids;
-	$root->remove_child($_) for @$kids;
+	my $rootset = $ordersub->($self->{rootset});
+	$self->{rootset} = $rootset;
+	$_->order_children($ordersub) for @$rootset;
 }
 
 package PublicInbox::SearchThread::Msg;
+use strict;
+use warnings;
 use Carp qw(croak);
 use Scalar::Util qw(weaken);
 
-sub new { my $self = shift; bless { id => shift }, $self; }
+sub new {
+	bless {
+		id => $_[1],
+		children => {}, # becomes an array when sorted by ->order(...)
+	}, $_[0];
+}
 
 sub add_child {
 	my ($self, $child) = @_;
 	croak "Cowardly refusing to become my own parent: $self"
 	  if $self == $child;
 
-	if (grep { $_ == $child } @{$self->children}) {
-		# All is potentially correct with the world
-		weaken($child->{parent} = $self);
-		return;
-	}
-
-	my $parent = $child->{parent};
-	remove_child($parent, $child) if $parent;
+	my $cid = $child->{id};
+	$self->{children}->{$cid} = $child;
 
-	$child->{next} = $self->{child};
-	$self->{child} = $child;
-	weaken($child->{parent} = $self);
-}
-
-sub remove_child {
-	my ($self, $child) = @_;
-
-	my $x = $self->{child} or return;
-	if ($x == $child) {  # First one's easy.
-		$self->{child} = $child->{next};
-		$child->{parent} = $child->{next} = undef;
-		return;
+	# reparenting:
+	if (defined(my $parent = $child->{parent})) {
+		delete $parent->{children}->{$cid};
 	}
 
-	my $prev = $x;
-	while ($x = $x->{next}) {
-		if ($x == $child) {
-			$prev->{next} = $x->{next}; # Unlink x
-			$x->{next} = $x->{parent} = undef; # Deparent it
-			return;
-		}
-		$prev = $x;
-	}
-	# oddly, we can get here
-	$child->{next} = $child->{parent} = undef;
+	weaken($child->{parent} = $self);
 }
 
 sub has_descendent {
-	my ($self, $child) = @_;
+	my ($cur, $child) = @_;
 	my %seen;
-	my @q = ($self);
-	while (my $cont = shift @q) {
-		$seen{$cont} = 1;
+	my @q = ($cur->{parent} || $cur);
 
-		return 1 if $cont == $child;
+	while (defined($cur = shift @q)) {
+		return 1 if $cur == $child;
 
-		if (my $next = $cont->{next}) {
-			if ($seen{$next}) {
-				$cont->{next} = undef;
-			} else {
-				push @q, $next;
-			}
-		}
-		if (my $child = $cont->{child}) {
-			if ($seen{$child}) {
-				$cont->{child} = undef;
-			} else {
-				push @q, $child;
-			}
+		if (!$seen{$cur}++) {
+			push @q, values %{$cur->{children}};
 		}
 	}
 	0;
 }
 
-sub children {
-	my $self = shift;
-	my @children;
-	my $visitor = $self->{child};
-	while ($visitor) {
-		push @children, $visitor;
-		$visitor = $visitor->{next};
-	}
-	\@children;
-}
-
-sub set_children {
-	my ($self, $children) = @_;
-	my $walk = $self->{child} = shift @$children;
-	do {
-		$walk = $walk->{next} = shift @$children;
-	} while ($walk);
-}
-
 sub order_children {
-	my ($walk, $ordersub) = @_;
+	my ($cur, $ordersub) = @_;
 
-	my %seen;
-	my @visited;
-	while ($walk) {
-		push @visited, $walk;
+	my %seen = ($cur => 1);
+	my @q = ($cur);
+	while (defined($cur = shift @q)) {
+		my $c = $cur->{children}; # The hashref here...
 
-		# spot/break loops
-		$seen{$walk} = 1;
-
-		my $child = $walk->{child};
-		if ($child && $seen{$child}) {
-			$walk->{child} = $child = undef;
-		}
-
-		my $next = $walk->{next};
-		if ($next && $seen{$next}) {
-			$walk->{next} = $next = undef;
-		}
-
-		# go down, or across
-		$next = $child if $child;
-
-		# no next?  look up
-		if (!$next) {
-			my $up = $walk;
-			while ($up && !$next) {
-				$up = $up->{parent};
-				$next = $up->{next} if $up;
-			}
-		}
-		$walk = $next;
-	}
-	foreach my $cont (@visited) {
-		my $children = $cont->children;
-		next if @$children < 2;
-		$children = $ordersub->($children);
-		$cont = $cont->{child} = shift @$children;
-		do {
-			$cont = $cont->{next} = shift @$children;
-		} while ($cont);
+		$c = [ grep { !$seen{$_}++ } values %$c ]; # spot/break loops
+		$c = $ordersub->($c) if scalar @$c > 1;
+		$cur->{children} = $c; # ...becomes an arrayref
+		push @q, @$c;
 	}
 }
 
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index c09b4a2..d0c6d33 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -203,13 +203,15 @@ sub _th_index_lite {
 	my $pad = '  ';
 	# map = [children, attr, node, idx, level]
 	my $map = $mapping->{$mid_raw};
-	my $nr_c = scalar @{$map->[0]};
+	my $children = $map->[0];
+	my $nr_c = scalar @$children;
 	my $nr_s = 0;
 	my $level = $map->[4];
 	my $idx = $map->[3];
+	my $siblings;
 	my $irt_map = $mapping->{$irt} if defined $irt;
 	if (defined $irt_map) {
-		my $siblings = $irt_map->[0];
+		$siblings = $irt_map->[0];
 		$nr_s = scalar(@$siblings) - 1;
 		$rv .= $pad . $irt_map->[1];
 		if ($idx > 0) {
@@ -233,24 +235,26 @@ sub _th_index_lite {
 	$this =~ s!<a\nhref=[^>]+>([^<]+)</a>!$1!s; # no point linking to self
 	$rv .= "<b>@ $this";
 	my $node = $map->[2];
-	if (my $child = $node->{child}) {
-		my $cmid = $child->{id};
+	if ($nr_c) {
+		my $cmid = $children->[0]->{id};
 		$rv .= $pad . $mapping->{$cmid}->[1];
 		if ($nr_c > 2) {
 			my $s = ($nr_c - 1). ' more replies';
 			$rv .= pad_link($cmid, $level + 1, $s);
-		} elsif (my $cn = $child->{next}) {
+		} elsif (my $cn = $children->[1]) {
 			$rv .= $pad . $mapping->{$cn->{id}}->[1];
 		}
 	}
-	if (my $next = $node->{next}) {
+
+	my $next = $siblings->[$idx+1] if $siblings && $idx >= 0;
+	if ($next) {
 		my $nmid = $next->{id};
 		$rv .= $pad . $mapping->{$nmid}->[1];
 		my $nnext = $nr_s - $idx;
 		if ($nnext > 2) {
 			my $s = ($nnext - 1).' subsequent siblings';
 			$rv .= pad_link($nmid, $level, $s);
-		} elsif (my $nn = $next->{next}) {
+		} elsif (my $nn = $siblings->[$idx + 2]) {
 			$rv .= $pad . $mapping->{$nn->{id}}->[1];
 		}
 	}
@@ -264,7 +268,8 @@ sub walk_thread {
 		my $level = shift @q;
 		my $node = shift @q or next;
 		$cb->($ctx, $level, $node);
-		unshift @q, $level+1, $node->{child}, $level, $node->{next};
+		++$level;
+		unshift @q, map { ($level, $_) } @{$node->{children}};
 	}
 }
 
@@ -296,7 +301,8 @@ sub stream_thread ($$) {
 	while (@q) {
 		$level = shift @q;
 		my $node = shift @q or next;
-		unshift @q, $level+1, $node->{child}, $level, $node->{next};
+		my $cl = $level + 1;
+		unshift @q, map { ($cl, $_) } @{$node->{children}};
 		$mime = $inbox->msg_by_smsg($node->{smsg}) and last;
 	}
 	return missing_thread($ctx) unless $mime;
@@ -309,8 +315,8 @@ sub stream_thread ($$) {
 		while (@q) {
 			$level = shift @q;
 			my $node = shift @q or next;
-			unshift @q, $level+1, $node->{child},
-					$level, $node->{next};
+			my $cl = $level + 1;
+			unshift @q, map { ($cl, $_) } @{$node->{children}};
 			my $mid = $node->{id};
 			if ($mime = $inbox->msg_by_smsg($node->{smsg})) {
 				$mime = Email::MIME->new($mime);
diff --git a/t/thread-cycle.t b/t/thread-cycle.t
new file mode 100644
index 0000000..4d60f7e
--- /dev/null
+++ b/t/thread-cycle.t
@@ -0,0 +1,86 @@
+# Copyright (C) 2016 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use warnings;
+use Test::More;
+use_ok('PublicInbox::SearchMsg');
+use_ok('PublicInbox::SearchThread');
+use Email::Simple;
+my $mt = eval {
+	require Mail::Thread;
+	no warnings 'once';
+	$Mail::Thread::nosubject = 1;
+	$Mail::Thread::noprune = 1;
+};
+my @check;
+my @msgs = map {
+	my $msg = $_;
+	$msg->{references} =~ s/\s+/ /sg if $msg->{references};
+	my $simple = Email::Simple->create(header => [
+		'Message-Id' => "<$msg->{mid}>",
+		'References' => $msg->{references},
+	]);
+	push @check, $simple;
+	bless $msg, 'PublicInbox::SearchMsg'
+} (
+
+# data from t/testbox-6 in Mail::Thread 2.55:
+	{ mid => '20021124145312.GA1759@nlin.net' },
+	{ mid => 'slrnau448m.7l4.markj+0111@cloaked.freeserve.co.uk',
+	  references => '<20021124145312.GA1759@nlin.net>',
+	},
+	{ mid => '15842.10677.577458.656565@jupiter.akutech-local.de',
+	  references => '<20021124145312.GA1759@nlin.net>
+			<slrnau448m.7l4.markj+0111@cloaked.freeserve.co.uk>',
+	},
+	{ mid => '20021125171807.GK8236@somanetworks.com',
+	  references => '<20021124145312.GA1759@nlin.net>
+			<slrnau448m.7l4.markj+0111@cloaked.freeserve.co.uk>
+			<15842.10677.577458.656565@jupiter.akutech-local.de>',
+	},
+	{ mid => '15843.12163.554914.469248@jupiter.akutech-local.de',
+	  references => '<20021124145312.GA1759@nlin.net>
+			<slrnau448m.7l4.markj+0111@cloaked.freeserve.co.uk>
+			<15842.10677.577458.656565@jupiter.akutech-local.de>
+			<E18GPHf-0000zp-00@cloaked.freeserve.co.uk>',
+	},
+	{ mid => 'E18GPHf-0000zp-00@cloaked.freeserve.co.uk',
+	  references => '<20021124145312.GA1759@nlin.net>
+			<slrnau448m.7l4.markj+0111@cloaked.freeserve.co.uk>
+			<15842.10677.577458.656565@jupiter.akutech-local.de>'
+	}
+);
+
+my $th = PublicInbox::SearchThread->new(\@msgs);
+$th->thread;
+$th->order(sub { [ sort { $a->{id} cmp $b->{id} } @{$_[0]} ] });
+my $st = '';
+my @q = map { (0, $_) } @{$th->{rootset}};
+while (@q) {
+	my $level = shift @q;
+	my $node = shift @q or next;
+	$st .= (" "x$level). "$node->{id}\n";
+	my $cl = $level + 1;
+	unshift @q, map { ($cl, $_) } @{$node->{children}}
+}
+
+SKIP: {
+	skip 'Mail::Thread missing', 1 unless $mt;
+	$mt = Mail::Thread->new(@check);
+	$mt->thread;
+	$mt->order(sub { sort { $a->messageid cmp $b->messageid } @_ });
+	my $check = '';
+
+	@q = map { (0, $_) } $mt->rootset;
+	while (@q) {
+		my $level = shift @q;
+		my $node = shift @q or next;
+		$check .= (" "x$level) . $node->messageid . "\n";
+		unshift @q, $level + 1, $node->child, $level, $node->next;
+	}
+	is($check, $st, 'Mail::Thread output matches');
+}
+
+done_testing();
+
+1;
-- 
EW


  parent reply index

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-05 23:57 [PATCH 0/17] remove Mail::Thread dependency Eric Wong
2016-10-05 23:57 ` [PATCH 01/17] view: remove "subject dummy" references Eric Wong
2016-10-05 23:57 ` [PATCH 02/17] thread: remove Mail::Thread dependency Eric Wong
2016-10-05 23:57 ` [PATCH 03/17] thread: pass array refs instead of entire arrays Eric Wong
2016-10-05 23:57 ` [PATCH 04/17] thread: remove accessor usage in internals Eric Wong
2016-10-05 23:57 ` [PATCH 05/17] inbox: deal with ghost smsg Eric Wong
2016-10-05 23:57 ` [PATCH 06/17] thread: remove Email::Abstract wrapping Eric Wong
2016-10-05 23:57 ` [PATCH 07/17] thread: remove rootset accessor method Eric Wong
2016-10-05 23:57 ` [PATCH 08/17] thread: simplify Eric Wong
2016-10-05 23:57 ` [PATCH 09/17] thread: remove iterate_down Eric Wong
2016-10-05 23:57 ` [PATCH 10/17] thread: avoid incrementing undefined value Eric Wong
2016-10-05 23:57 ` [PATCH 11/17] thread: order_children no longer cares about depth Eric Wong
2016-10-05 23:57 ` [PATCH 12/17] thread: inline and remove recurse_down logic Eric Wong
2016-10-05 23:57 ` [PATCH 13/17] thread: fix sorting without topmost Eric Wong
2016-10-14 21:17   ` [PATCH] thread: reinstates stable ordering when ghosts are present Eric Wong
2016-10-05 23:57 ` Eric Wong [this message]
2016-10-05 23:57 ` [PATCH 15/17] view: remove redundant children array in thread views Eric Wong
2016-10-05 23:57 ` [PATCH 16/17] t/thread-cycle: test self-referential messages Eric Wong
2016-10-05 23:57 ` [PATCH 17/17] thread: remove weaken dependency Eric Wong
2016-10-06  8:22 ` [PATCH 0/17] remove Mail::Thread dependency Eric Wong
2016-10-13  3:59   ` [PATCH 0/2] thread: fix regressions from Mail::Thread removal Eric Wong
2016-10-13  3:59     ` [PATCH 1/2] thread: reduce indentation level Eric Wong
2016-10-13  3:59     ` [PATCH 2/2] thread: fix parent/child relationships Eric Wong

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20161005235722.14857-15-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.org/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/
       or Tor2web: https://www.tor2web.org/

AGPL code for this site: git clone https://public-inbox.org/ public-inbox