From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 27198209C7 for ; Wed, 5 Oct 2016 23:57:28 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 14/17] thread: use hash + array instead of hand-rolled linked list Date: Wed, 5 Oct 2016 23:57:19 +0000 Message-Id: <20161005235722.14857-15-e@80x24.org> In-Reply-To: <20161005235722.14857-1-e@80x24.org> References: <20161005235722.14857-1-e@80x24.org> List-Id: This starts to show noticeable performance improvements when attempting to thread over 400 messages; but the improvement may not be measurable with less. However, the resulting code is much shorter and (IMHO) much easier to understand. --- MANIFEST | 1 + lib/PublicInbox/SearchThread.pm | 158 +++++++++------------------------------- lib/PublicInbox/View.pm | 28 ++++--- t/thread-cycle.t | 86 ++++++++++++++++++++++ 4 files changed, 138 insertions(+), 135 deletions(-) create mode 100644 t/thread-cycle.t diff --git a/MANIFEST b/MANIFEST index bcc4121..3a4d7c4 100644 --- a/MANIFEST +++ b/MANIFEST @@ -155,6 +155,7 @@ t/qspawn.t t/search.t t/spamcheck_spamc.t t/spawn.t +t/thread-cycle.t t/utf8.mbox t/view.t t/watch_maildir.t diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm index 153eef2..05de9ec 100644 --- a/lib/PublicInbox/SearchThread.pm +++ b/lib/PublicInbox/SearchThread.pm @@ -32,9 +32,8 @@ sub new { sub thread { my $self = shift; _add_message($self, $_) foreach @{$self->{messages}}; - $self->{rootset} = [ - grep { !$_->{parent} } values %{$self->{id_table}} ]; - delete $self->{id_table}; + my $id_table = delete $self->{id_table}; + $self->{rootset} = [ grep { !$_->{parent} } values %$id_table ]; } sub _get_cont_for_id ($$) { @@ -82,156 +81,67 @@ sub _add_message ($$) { sub order { my ($self, $ordersub) = @_; - - # make a fake root - my $root = _get_cont_for_id($self, 'fakeroot'); - $root->add_child( $_ ) for @{ $self->{rootset} }; - - # sort it - $root->order_children( $ordersub ); - - # and untangle it - my $kids = $root->children; - $self->{rootset} = $kids; - $root->remove_child($_) for @$kids; + my $rootset = $ordersub->($self->{rootset}); + $self->{rootset} = $rootset; + $_->order_children($ordersub) for @$rootset; } package PublicInbox::SearchThread::Msg; +use strict; +use warnings; use Carp qw(croak); use Scalar::Util qw(weaken); -sub new { my $self = shift; bless { id => shift }, $self; } +sub new { + bless { + id => $_[1], + children => {}, # becomes an array when sorted by ->order(...) + }, $_[0]; +} sub add_child { my ($self, $child) = @_; croak "Cowardly refusing to become my own parent: $self" if $self == $child; - if (grep { $_ == $child } @{$self->children}) { - # All is potentially correct with the world - weaken($child->{parent} = $self); - return; - } - - my $parent = $child->{parent}; - remove_child($parent, $child) if $parent; + my $cid = $child->{id}; + $self->{children}->{$cid} = $child; - $child->{next} = $self->{child}; - $self->{child} = $child; - weaken($child->{parent} = $self); -} - -sub remove_child { - my ($self, $child) = @_; - - my $x = $self->{child} or return; - if ($x == $child) { # First one's easy. - $self->{child} = $child->{next}; - $child->{parent} = $child->{next} = undef; - return; + # reparenting: + if (defined(my $parent = $child->{parent})) { + delete $parent->{children}->{$cid}; } - my $prev = $x; - while ($x = $x->{next}) { - if ($x == $child) { - $prev->{next} = $x->{next}; # Unlink x - $x->{next} = $x->{parent} = undef; # Deparent it - return; - } - $prev = $x; - } - # oddly, we can get here - $child->{next} = $child->{parent} = undef; + weaken($child->{parent} = $self); } sub has_descendent { - my ($self, $child) = @_; + my ($cur, $child) = @_; my %seen; - my @q = ($self); - while (my $cont = shift @q) { - $seen{$cont} = 1; + my @q = ($cur->{parent} || $cur); - return 1 if $cont == $child; + while (defined($cur = shift @q)) { + return 1 if $cur == $child; - if (my $next = $cont->{next}) { - if ($seen{$next}) { - $cont->{next} = undef; - } else { - push @q, $next; - } - } - if (my $child = $cont->{child}) { - if ($seen{$child}) { - $cont->{child} = undef; - } else { - push @q, $child; - } + if (!$seen{$cur}++) { + push @q, values %{$cur->{children}}; } } 0; } -sub children { - my $self = shift; - my @children; - my $visitor = $self->{child}; - while ($visitor) { - push @children, $visitor; - $visitor = $visitor->{next}; - } - \@children; -} - -sub set_children { - my ($self, $children) = @_; - my $walk = $self->{child} = shift @$children; - do { - $walk = $walk->{next} = shift @$children; - } while ($walk); -} - sub order_children { - my ($walk, $ordersub) = @_; + my ($cur, $ordersub) = @_; - my %seen; - my @visited; - while ($walk) { - push @visited, $walk; + my %seen = ($cur => 1); + my @q = ($cur); + while (defined($cur = shift @q)) { + my $c = $cur->{children}; # The hashref here... - # spot/break loops - $seen{$walk} = 1; - - my $child = $walk->{child}; - if ($child && $seen{$child}) { - $walk->{child} = $child = undef; - } - - my $next = $walk->{next}; - if ($next && $seen{$next}) { - $walk->{next} = $next = undef; - } - - # go down, or across - $next = $child if $child; - - # no next? look up - if (!$next) { - my $up = $walk; - while ($up && !$next) { - $up = $up->{parent}; - $next = $up->{next} if $up; - } - } - $walk = $next; - } - foreach my $cont (@visited) { - my $children = $cont->children; - next if @$children < 2; - $children = $ordersub->($children); - $cont = $cont->{child} = shift @$children; - do { - $cont = $cont->{next} = shift @$children; - } while ($cont); + $c = [ grep { !$seen{$_}++ } values %$c ]; # spot/break loops + $c = $ordersub->($c) if scalar @$c > 1; + $cur->{children} = $c; # ...becomes an arrayref + push @q, @$c; } } diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index c09b4a2..d0c6d33 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -203,13 +203,15 @@ sub _th_index_lite { my $pad = ' '; # map = [children, attr, node, idx, level] my $map = $mapping->{$mid_raw}; - my $nr_c = scalar @{$map->[0]}; + my $children = $map->[0]; + my $nr_c = scalar @$children; my $nr_s = 0; my $level = $map->[4]; my $idx = $map->[3]; + my $siblings; my $irt_map = $mapping->{$irt} if defined $irt; if (defined $irt_map) { - my $siblings = $irt_map->[0]; + $siblings = $irt_map->[0]; $nr_s = scalar(@$siblings) - 1; $rv .= $pad . $irt_map->[1]; if ($idx > 0) { @@ -233,24 +235,26 @@ sub _th_index_lite { $this =~ s!]+>([^<]+)!$1!s; # no point linking to self $rv .= "@ $this"; my $node = $map->[2]; - if (my $child = $node->{child}) { - my $cmid = $child->{id}; + if ($nr_c) { + my $cmid = $children->[0]->{id}; $rv .= $pad . $mapping->{$cmid}->[1]; if ($nr_c > 2) { my $s = ($nr_c - 1). ' more replies'; $rv .= pad_link($cmid, $level + 1, $s); - } elsif (my $cn = $child->{next}) { + } elsif (my $cn = $children->[1]) { $rv .= $pad . $mapping->{$cn->{id}}->[1]; } } - if (my $next = $node->{next}) { + + my $next = $siblings->[$idx+1] if $siblings && $idx >= 0; + if ($next) { my $nmid = $next->{id}; $rv .= $pad . $mapping->{$nmid}->[1]; my $nnext = $nr_s - $idx; if ($nnext > 2) { my $s = ($nnext - 1).' subsequent siblings'; $rv .= pad_link($nmid, $level, $s); - } elsif (my $nn = $next->{next}) { + } elsif (my $nn = $siblings->[$idx + 2]) { $rv .= $pad . $mapping->{$nn->{id}}->[1]; } } @@ -264,7 +268,8 @@ sub walk_thread { my $level = shift @q; my $node = shift @q or next; $cb->($ctx, $level, $node); - unshift @q, $level+1, $node->{child}, $level, $node->{next}; + ++$level; + unshift @q, map { ($level, $_) } @{$node->{children}}; } } @@ -296,7 +301,8 @@ sub stream_thread ($$) { while (@q) { $level = shift @q; my $node = shift @q or next; - unshift @q, $level+1, $node->{child}, $level, $node->{next}; + my $cl = $level + 1; + unshift @q, map { ($cl, $_) } @{$node->{children}}; $mime = $inbox->msg_by_smsg($node->{smsg}) and last; } return missing_thread($ctx) unless $mime; @@ -309,8 +315,8 @@ sub stream_thread ($$) { while (@q) { $level = shift @q; my $node = shift @q or next; - unshift @q, $level+1, $node->{child}, - $level, $node->{next}; + my $cl = $level + 1; + unshift @q, map { ($cl, $_) } @{$node->{children}}; my $mid = $node->{id}; if ($mime = $inbox->msg_by_smsg($node->{smsg})) { $mime = Email::MIME->new($mime); diff --git a/t/thread-cycle.t b/t/thread-cycle.t new file mode 100644 index 0000000..4d60f7e --- /dev/null +++ b/t/thread-cycle.t @@ -0,0 +1,86 @@ +# Copyright (C) 2016 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use_ok('PublicInbox::SearchMsg'); +use_ok('PublicInbox::SearchThread'); +use Email::Simple; +my $mt = eval { + require Mail::Thread; + no warnings 'once'; + $Mail::Thread::nosubject = 1; + $Mail::Thread::noprune = 1; +}; +my @check; +my @msgs = map { + my $msg = $_; + $msg->{references} =~ s/\s+/ /sg if $msg->{references}; + my $simple = Email::Simple->create(header => [ + 'Message-Id' => "<$msg->{mid}>", + 'References' => $msg->{references}, + ]); + push @check, $simple; + bless $msg, 'PublicInbox::SearchMsg' +} ( + +# data from t/testbox-6 in Mail::Thread 2.55: + { mid => '20021124145312.GA1759@nlin.net' }, + { mid => 'slrnau448m.7l4.markj+0111@cloaked.freeserve.co.uk', + references => '<20021124145312.GA1759@nlin.net>', + }, + { mid => '15842.10677.577458.656565@jupiter.akutech-local.de', + references => '<20021124145312.GA1759@nlin.net> + ', + }, + { mid => '20021125171807.GK8236@somanetworks.com', + references => '<20021124145312.GA1759@nlin.net> + + <15842.10677.577458.656565@jupiter.akutech-local.de>', + }, + { mid => '15843.12163.554914.469248@jupiter.akutech-local.de', + references => '<20021124145312.GA1759@nlin.net> + + <15842.10677.577458.656565@jupiter.akutech-local.de> + ', + }, + { mid => 'E18GPHf-0000zp-00@cloaked.freeserve.co.uk', + references => '<20021124145312.GA1759@nlin.net> + + <15842.10677.577458.656565@jupiter.akutech-local.de>' + } +); + +my $th = PublicInbox::SearchThread->new(\@msgs); +$th->thread; +$th->order(sub { [ sort { $a->{id} cmp $b->{id} } @{$_[0]} ] }); +my $st = ''; +my @q = map { (0, $_) } @{$th->{rootset}}; +while (@q) { + my $level = shift @q; + my $node = shift @q or next; + $st .= (" "x$level). "$node->{id}\n"; + my $cl = $level + 1; + unshift @q, map { ($cl, $_) } @{$node->{children}} +} + +SKIP: { + skip 'Mail::Thread missing', 1 unless $mt; + $mt = Mail::Thread->new(@check); + $mt->thread; + $mt->order(sub { sort { $a->messageid cmp $b->messageid } @_ }); + my $check = ''; + + @q = map { (0, $_) } $mt->rootset; + while (@q) { + my $level = shift @q; + my $node = shift @q or next; + $check .= (" "x$level) . $node->messageid . "\n"; + unshift @q, $level + 1, $node->child, $level, $node->next; + } + is($check, $st, 'Mail::Thread output matches'); +} + +done_testing(); + +1; -- EW