user/dev discussion of public-inbox itself
 help / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 02/17] thread: remove Mail::Thread dependency
Date: Wed,  5 Oct 2016 23:57:07 +0000
Message-ID: <20161005235722.14857-3-e@80x24.org> (raw)
In-Reply-To: <20161005235722.14857-1-e@80x24.org>

Introduce our own SearchThread class for threading messages.
This should allow us to specialize and optimize away objects
in future commits.
---
 INSTALL                         |   1 -
 MANIFEST                        |   2 +-
 Makefile.PL                     |   1 -
 lib/PublicInbox/SearchIdx.pm    |   4 +-
 lib/PublicInbox/SearchThread.pm | 323 ++++++++++++++++++++++++++++++++++++++++
 lib/PublicInbox/SearchView.pm   |   4 +-
 lib/PublicInbox/Thread.pm       |  86 -----------
 lib/PublicInbox/View.pm         |   4 +-
 lib/PublicInbox/WWW.pm          |   2 +-
 t/plack.t                       |   3 +-
 10 files changed, 332 insertions(+), 98 deletions(-)
 create mode 100644 lib/PublicInbox/SearchThread.pm
 delete mode 100644 lib/PublicInbox/Thread.pm

diff --git a/INSTALL b/INSTALL
index 5851892..3a2f840 100644
--- a/INSTALL
+++ b/INSTALL
@@ -37,7 +37,6 @@ Optional components:
 Optional Perl modules:
 
   - Plack[1]                   libplack-perl
-  - Mail::Thread (2.5+)[1]     libmail-thread-perl
   - URI::Escape[1]             liburi-perl
   - Search::Xapian[2][3]       libsearch-xapian-perl
   - IO::Compress::Gzip[3]      perl-modules (or libio-compress-perl)
diff --git a/MANIFEST b/MANIFEST
index c39fa26..bcc4121 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -78,11 +78,11 @@ lib/PublicInbox/SaPlugin/ListMirror.pm
 lib/PublicInbox/Search.pm
 lib/PublicInbox/SearchIdx.pm
 lib/PublicInbox/SearchMsg.pm
+lib/PublicInbox/SearchThread.pm
 lib/PublicInbox/SearchView.pm
 lib/PublicInbox/Spamcheck/Spamc.pm
 lib/PublicInbox/Spawn.pm
 lib/PublicInbox/SpawnPP.pm
-lib/PublicInbox/Thread.pm
 lib/PublicInbox/Unsubscribe.pm
 lib/PublicInbox/View.pm
 lib/PublicInbox/WWW.pm
diff --git a/Makefile.PL b/Makefile.PL
index 4a91103..0bac7c9 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -22,7 +22,6 @@ WriteMakefile(
 		'Email::MIME::ContentType' => 0,
 		'Email::Simple' => 0,
 		'Encode::MIME::Header' => 0,
-		'Mail::Thread' => '2.5', # 2.5+ needed for Email::Simple compat
 		'Plack' => 0,
 		'URI::Escape' => 0,
 		# We have more test dependencies, but do not force
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 23aef9f..4aac028 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -4,8 +4,8 @@
 #
 # Indexes mail with Xapian and our (SQLite-based) ::Msgmap for use
 # with the web and NNTP interfaces.  This index maintains thread
-# relationships for use by Mail::Thread.  This writes to the search
-# index.
+# relationships for use by PublicInbox::SearchThread.
+# This writes to the search index.
 package PublicInbox::SearchIdx;
 use strict;
 use warnings;
diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm
new file mode 100644
index 0000000..41fe859
--- /dev/null
+++ b/lib/PublicInbox/SearchThread.pm
@@ -0,0 +1,323 @@
+# This library is free software; you can redistribute it and/or modify
+# it under the same terms as Perl itself.
+#
+# This license differs from the rest of public-inbox
+#
+# Our own jwz-style threading class based on Mail::Thread from CPAN.
+# Mail::Thread is unmaintained and available on some distros.
+# We also do not want pruning or subject grouping, since we want
+# to encourage strict threading and hopefully encourage people
+# to use proper In-Reply-To.
+#
+# This includes fixes from several open bugs for Mail::Thread
+#
+# Avoid circular references
+# - https://rt.cpan.org/Public/Bug/Display.html?id=22817
+#
+# And avoid recursion in recurse_down:
+# - https://rt.cpan.org/Ticket/Display.html?id=116727
+# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=833479
+package PublicInbox::SearchThread;
+use strict;
+use warnings;
+use Email::Abstract;
+
+sub new {
+	return bless {
+		messages => $_[1],
+		id_table => {},
+		rootset  => []
+	}, $_[0];
+}
+
+sub _get_hdr {
+	my ($class, $msg, $hdr) = @_;
+	Email::Abstract->get_header($msg, $hdr) || '';
+}
+
+sub _uniq {
+	my %seen;
+	return grep { !$seen{$_}++ } @_;
+}
+
+sub _references {
+	my $class = shift;
+	my $msg = shift;
+	my @references = ($class->_get_hdr($msg, "References") =~ /<([^>]+)>/g);
+	my $foo = $class->_get_hdr($msg, "In-Reply-To");
+	chomp $foo;
+	$foo =~ s/.*?<([^>]+)>.*/$1/;
+	push @references, $foo
+	  if $foo =~ /^\S+\@\S+$/ && (!@references || $references[-1] ne $foo);
+	return _uniq(@references);
+}
+
+sub _msgid {
+	my ($class, $msg) = @_;
+	my $id = $class->_get_hdr($msg, "Message-ID");
+	die "attempt to thread message with no id" unless $id;
+	chomp $id;
+	$id =~ s/^<([^>]+)>.*/$1/; # We expect this not to have <>s
+	return $id;
+}
+
+sub rootset { @{$_[0]{rootset}} }
+
+sub thread {
+	my $self = shift;
+	$self->_setup();
+	$self->{rootset} = [ grep { !$_->parent } values %{$self->{id_table}} ];
+	$self->_finish();
+}
+
+sub _finish {
+	my $self = shift;
+	delete $self->{id_table};
+	delete $self->{seen};
+}
+
+sub _get_cont_for_id {
+	my $self = shift;
+	my $id = shift;
+	$self->{id_table}{$id} ||= $self->_container_class->new($id);
+}
+
+sub _container_class { 'PublicInbox::SearchThread::Container' }
+
+sub _setup {
+	my ($self) = @_;
+
+	_add_message($self, $_) foreach @{$self->{messages}};
+}
+
+sub _add_message ($$) {
+	my ($self, $message) = @_;
+
+	# A. if id_table...
+	my $this_container = $self->_get_cont_for_id($self->_msgid($message));
+	$this_container->message($message);
+
+	# B. For each element in the message's References field:
+	my @refs = $self->_references($message);
+
+	my $prev;
+	for my $ref (@refs) {
+		# Find a Container object for the given Message-ID
+		my $container = $self->_get_cont_for_id($ref);
+
+		# Link the References field's Containers together in the
+		# order implied by the References header
+		# * If they are already linked don't change the existing links
+		# * Do not add a link if adding that link would introduce
+		#   a loop...
+
+		if ($prev &&
+			!$container->parent &&  # already linked
+			!$container->has_descendent($prev) # would loop
+		   ) {
+			$prev->add_child($container);
+		}
+		$prev = $container;
+	}
+
+	# C. Set the parent of this message to be the last element in
+	# References...
+	if ($prev &&
+		!$this_container->has_descendent($prev) # would loop
+	   ) {
+		$prev->add_child($this_container)
+	}
+}
+
+sub order {
+	my $self = shift;
+	my $ordersub = shift;
+
+	# make a fake root
+	my $root = $self->_container_class->new( 'fakeroot' );
+	$root->add_child( $_ ) for @{ $self->{rootset} };
+
+	# sort it
+	$root->order_children( $ordersub );
+
+	# and untangle it
+	my @kids = $root->children;
+	$self->{rootset} = \@kids;
+	$root->remove_child($_) for @kids;
+}
+
+package PublicInbox::SearchThread::Container;
+use Carp qw(croak);
+use Scalar::Util qw(weaken);
+
+sub new { my $self = shift; bless { id => shift }, $self; }
+
+sub message { $_[0]->{message} = $_[1] if @_ == 2; $_[0]->{message} }
+sub parent { @_ == 2 ? weaken($_[0]->{parent} = $_[1]) : $_[0]->{parent} }
+sub child { $_[0]->{child} = $_[1] if @_ == 2; $_[0]->{child} }
+sub next { $_[0]->{next} = $_[1] if @_ == 2; $_[0]->{next} }
+sub messageid { $_[0]->{id} }
+
+sub add_child {
+	my ($self, $child) = @_;
+	croak "Cowardly refusing to become my own parent: $self"
+	  if $self == $child;
+
+	if (grep { $_ == $child } $self->children) {
+		# All is potentially correct with the world
+		$child->parent($self);
+		return;
+	}
+
+	$child->parent->remove_child($child) if $child->parent;
+
+	$child->next($self->child);
+	$self->child($child);
+	$child->parent($self);
+}
+
+sub remove_child {
+	my ($self, $child) = @_;
+	return unless $self->child;
+	if ($self->child == $child) {  # First one's easy.
+		$self->child($child->next);
+		$child->next(undef);
+		$child->parent(undef);
+		return;
+	}
+
+	my $x = $self->child;
+	my $prev = $x;
+	while ($x = $x->next) {
+		if ($x == $child) {
+			$prev->next($x->next); # Unlink x
+			$x->next(undef);
+			$x->parent(undef);	 # Deparent it
+			return;
+		}
+		$prev = $x;
+	}
+	# oddly, we can get here
+	$child->next(undef);
+	$child->parent(undef);
+}
+
+sub has_descendent {
+	my $self = shift;
+	my $child = shift;
+	die "Assertion failed: $child" unless eval {$child};
+	my $there = 0;
+	$self->recurse_down(sub { $there = 1 if $_[0] == $child });
+
+	return $there;
+}
+
+sub children {
+	my $self = shift;
+	my @children;
+	my $visitor = $self->child;
+	while ($visitor) {
+		push @children, $visitor;
+		$visitor = $visitor->next
+	}
+	return @children;
+}
+
+sub set_children {
+	my $self = shift;
+	my $walk = $self->child( shift );
+	while (@_) { $walk = $walk->next( shift ) }
+	$walk->next(undef) if $walk;
+}
+
+sub order_children {
+	my $self = shift;
+	my $ordersub = shift;
+
+	return unless $ordersub;
+
+	my $sub = sub {
+		my $cont = shift;
+		my @children = $cont->children;
+		return if @children < 2;
+		$cont->set_children( $ordersub->( @children ) );
+	};
+	$self->iterate_down( undef, $sub );
+	undef $sub;
+}
+
+# non-recursive version of recurse_down to avoid stack depth warnings
+sub recurse_down {
+	my ($self, $callback) = @_;
+	my %seen;
+	my @q = ($self);
+	while (my $cont = shift @q) {
+		$seen{$cont}++;
+		$callback->($cont);
+
+		if (my $next = $cont->next) {
+			if ($seen{$next}) {
+				$cont->next(undef);
+			} else {
+				push @q, $next;
+			}
+		}
+		if (my $child = $cont->child) {
+			if ($seen{$child}) {
+				$cont->child(undef);
+			} else {
+				push @q, $child;
+			}
+		}
+	}
+}
+
+sub iterate_down {
+	my $self = shift;
+	my ($before, $after) = @_;
+
+	my %seen;
+	my $walk = $self;
+	my $depth = 0;
+	my @visited;
+	while ($walk) {
+		push @visited, [ $walk, $depth ];
+		$before->($walk, $depth) if $before;
+
+		# spot/break loops
+		$seen{$walk}++;
+
+		my $child = $walk->child;
+		if ($child && $seen{$child}) {
+			$walk->child(undef);
+			$child = undef;
+		}
+
+		my $next = $walk->next;
+		if ($next && $seen{$next}) {
+			$walk->next(undef);
+			$next = undef;
+		}
+
+		# go down, or across
+		if ($child) {
+			$next = $child;
+			++$depth;
+		}
+
+		# no next?  look up
+		if (!$next) {
+			my $up = $walk;
+			while ($up && !$next) {
+				$up = $up->parent;
+				--$depth;
+				$next = $up->next if $up;
+			}
+		}
+		$walk = $next;
+	}
+	return unless $after;
+	while (@visited) { $after->(@{ pop @visited }) }
+}
+
+1;
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 4f0811a..da31109 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -11,7 +11,7 @@ use PublicInbox::View;
 use PublicInbox::MID qw(mid2path mid_mime mid_clean mid_escape);
 use Email::MIME;
 require PublicInbox::Git;
-require PublicInbox::Thread;
+require PublicInbox::SearchThread;
 our $LIM = 50;
 
 sub noop {}
@@ -152,7 +152,7 @@ sub mset_thread {
 		$m;
 	} ($mset->items);
 
-	my $th = PublicInbox::Thread->new(@m);
+	my $th = PublicInbox::SearchThread->new(\@m);
 	$th->thread;
 	if ($q->{r}) { # order by relevance
 		$th->order(sub {
diff --git a/lib/PublicInbox/Thread.pm b/lib/PublicInbox/Thread.pm
deleted file mode 100644
index 8af9461..0000000
--- a/lib/PublicInbox/Thread.pm
+++ /dev/null
@@ -1,86 +0,0 @@
-# subclass Mail::Thread and use this to workaround a memory leak
-# Based on the patch in: https://rt.cpan.org/Public/Bug/Display.html?id=22817
-#
-# Additionally, workaround for a bug where $walk->topmost returns undef:
-# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=795913
-# - https://rt.cpan.org/Ticket/Display.html?id=106498
-#
-# And avoid recursion in recurse_down:
-# - https://rt.cpan.org/Ticket/Display.html?id=116727
-# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=833479
-#
-# License differs from the rest of public-inbox (but is compatible):
-# This library is free software; you can redistribute it and/or modify
-# it under the same terms as Perl itself.
-package PublicInbox::Thread;
-use strict;
-use warnings;
-use base qw(Mail::Thread);
-# WARNING! both these Mail::Thread knobs were found by inspecting
-# the Mail::Thread 2.55 source code, and we have some monkey patches
-# in PublicInbox::Thread to fix memory leaks.  Since Mail::Thread
-# appears unmaintained, I suppose it's safe to depend on these
-# variables for now:
-{
-	no warnings 'once';
-	# we want strict threads to expose (and hopefully discourage)
-	# use of broken email clients
-	$Mail::Thread::nosubject = 1;
-	# Keep ghosts with only a single direct child,
-	# don't hide that there may be missing messages.
-	$Mail::Thread::noprune = 1;
-}
-
-if ($Mail::Thread::VERSION <= 2.55) {
-	eval q(sub _container_class { 'PublicInbox::Thread::Container' });
-}
-
-package PublicInbox::Thread::Container;
-use strict;
-use warnings;
-use base qw(Mail::Thread::Container);
-use Scalar::Util qw(weaken);
-sub parent { @_ == 2 ? weaken($_[0]->{parent} = $_[1]) : $_[0]->{parent} }
-
-sub topmost {
-	$_[0]->SUPER::topmost || PublicInbox::Thread::CPANRTBug106498->new;
-}
-
-# non-recursive version of recurse_down to avoid stack depth warnings
-sub recurse_down {
-	my ($self, $callback) = @_;
-	my %seen;
-	my @q = ($self);
-	while (my $cont = shift @q) {
-		$seen{$cont}++;
-		$callback->($cont);
-
-		if (my $next = $cont->next) {
-			if ($seen{$next}) {
-				$cont->next(undef);
-			} else {
-				push @q, $next;
-			}
-		}
-		if (my $child = $cont->child) {
-			if ($seen{$child}) {
-				$cont->child(undef);
-			} else {
-				push @q, $child;
-			}
-		}
-	}
-}
-
-# ref:
-# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=795913
-# - https://rt.cpan.org/Ticket/Display.html?id=106498
-package PublicInbox::Thread::CPANRTBug106498;
-use strict;
-use warnings;
-
-sub new { bless {}, $_[0] }
-
-sub simple_subject {}
-
-1;
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index a3b2681..9f1bf46 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -749,8 +749,8 @@ sub msg_timestamp {
 
 sub thread_results {
 	my ($msgs) = @_;
-	require PublicInbox::Thread;
-	my $th = PublicInbox::Thread->new(@$msgs);
+	require PublicInbox::SearchThread;
+	my $th = PublicInbox::SearchThread->new($msgs);
 	$th->thread;
 	$th->order(*sort_ts);
 	$th
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 4d599fc..11fc92e 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -112,7 +112,7 @@ sub call {
 sub preload {
 	require PublicInbox::Feed;
 	require PublicInbox::View;
-	require PublicInbox::Thread;
+	require PublicInbox::SearchThread;
 	require Email::MIME;
 	require Digest::SHA;
 	require POSIX;
diff --git a/t/plack.t b/t/plack.t
index 608afb9..1d62458 100644
--- a/t/plack.t
+++ b/t/plack.t
@@ -11,8 +11,7 @@ my $pi_config = "$tmpdir/config";
 my $maindir = "$tmpdir/main.git";
 my $addr = 'test-public@example.com';
 my $cfgpfx = "publicinbox.test";
-my @mods = qw(HTTP::Request::Common Plack::Test
-	Mail::Thread URI::Escape);
+my @mods = qw(HTTP::Request::Common Plack::Test URI::Escape);
 foreach my $mod (@mods) {
 	eval "require $mod";
 	plan skip_all => "$mod missing for plack.t" if $@;
-- 
EW


  parent reply index

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-05 23:57 [PATCH 0/17] " Eric Wong
2016-10-05 23:57 ` [PATCH 01/17] view: remove "subject dummy" references Eric Wong
2016-10-05 23:57 ` Eric Wong [this message]
2016-10-05 23:57 ` [PATCH 03/17] thread: pass array refs instead of entire arrays Eric Wong
2016-10-05 23:57 ` [PATCH 04/17] thread: remove accessor usage in internals Eric Wong
2016-10-05 23:57 ` [PATCH 05/17] inbox: deal with ghost smsg Eric Wong
2016-10-05 23:57 ` [PATCH 06/17] thread: remove Email::Abstract wrapping Eric Wong
2016-10-05 23:57 ` [PATCH 07/17] thread: remove rootset accessor method Eric Wong
2016-10-05 23:57 ` [PATCH 08/17] thread: simplify Eric Wong
2016-10-05 23:57 ` [PATCH 09/17] thread: remove iterate_down Eric Wong
2016-10-05 23:57 ` [PATCH 10/17] thread: avoid incrementing undefined value Eric Wong
2016-10-05 23:57 ` [PATCH 11/17] thread: order_children no longer cares about depth Eric Wong
2016-10-05 23:57 ` [PATCH 12/17] thread: inline and remove recurse_down logic Eric Wong
2016-10-05 23:57 ` [PATCH 13/17] thread: fix sorting without topmost Eric Wong
2016-10-14 21:17   ` [PATCH] thread: reinstates stable ordering when ghosts are present Eric Wong
2016-10-05 23:57 ` [PATCH 14/17] thread: use hash + array instead of hand-rolled linked list Eric Wong
2016-10-05 23:57 ` [PATCH 15/17] view: remove redundant children array in thread views Eric Wong
2016-10-05 23:57 ` [PATCH 16/17] t/thread-cycle: test self-referential messages Eric Wong
2016-10-05 23:57 ` [PATCH 17/17] thread: remove weaken dependency Eric Wong
2016-10-06  8:22 ` [PATCH 0/17] remove Mail::Thread dependency Eric Wong
2016-10-13  3:59   ` [PATCH 0/2] thread: fix regressions from Mail::Thread removal Eric Wong

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20161005235722.14857-3-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.org/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/
       or Tor2web: https://www.tor2web.org/

AGPL code for this site: git clone https://public-inbox.org/ public-inbox