user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 02/17] thread: remove Mail::Thread dependency
  2016-10-05 23:57  7% [PATCH 0/17] remove Mail::Thread dependency Eric Wong
@ 2016-10-05 23:57  3% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2016-10-05 23:57 UTC (permalink / raw)
  To: meta

Introduce our own SearchThread class for threading messages.
This should allow us to specialize and optimize away objects
in future commits.
---
 INSTALL                         |   1 -
 MANIFEST                        |   2 +-
 Makefile.PL                     |   1 -
 lib/PublicInbox/SearchIdx.pm    |   4 +-
 lib/PublicInbox/SearchThread.pm | 323 ++++++++++++++++++++++++++++++++++++++++
 lib/PublicInbox/SearchView.pm   |   4 +-
 lib/PublicInbox/Thread.pm       |  86 -----------
 lib/PublicInbox/View.pm         |   4 +-
 lib/PublicInbox/WWW.pm          |   2 +-
 t/plack.t                       |   3 +-
 10 files changed, 332 insertions(+), 98 deletions(-)
 create mode 100644 lib/PublicInbox/SearchThread.pm
 delete mode 100644 lib/PublicInbox/Thread.pm

diff --git a/INSTALL b/INSTALL
index 5851892..3a2f840 100644
--- a/INSTALL
+++ b/INSTALL
@@ -37,7 +37,6 @@ Optional components:
 Optional Perl modules:
 
   - Plack[1]                   libplack-perl
-  - Mail::Thread (2.5+)[1]     libmail-thread-perl
   - URI::Escape[1]             liburi-perl
   - Search::Xapian[2][3]       libsearch-xapian-perl
   - IO::Compress::Gzip[3]      perl-modules (or libio-compress-perl)
diff --git a/MANIFEST b/MANIFEST
index c39fa26..bcc4121 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -78,11 +78,11 @@ lib/PublicInbox/SaPlugin/ListMirror.pm
 lib/PublicInbox/Search.pm
 lib/PublicInbox/SearchIdx.pm
 lib/PublicInbox/SearchMsg.pm
+lib/PublicInbox/SearchThread.pm
 lib/PublicInbox/SearchView.pm
 lib/PublicInbox/Spamcheck/Spamc.pm
 lib/PublicInbox/Spawn.pm
 lib/PublicInbox/SpawnPP.pm
-lib/PublicInbox/Thread.pm
 lib/PublicInbox/Unsubscribe.pm
 lib/PublicInbox/View.pm
 lib/PublicInbox/WWW.pm
diff --git a/Makefile.PL b/Makefile.PL
index 4a91103..0bac7c9 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -22,7 +22,6 @@ WriteMakefile(
 		'Email::MIME::ContentType' => 0,
 		'Email::Simple' => 0,
 		'Encode::MIME::Header' => 0,
-		'Mail::Thread' => '2.5', # 2.5+ needed for Email::Simple compat
 		'Plack' => 0,
 		'URI::Escape' => 0,
 		# We have more test dependencies, but do not force
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 23aef9f..4aac028 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -4,8 +4,8 @@
 #
 # Indexes mail with Xapian and our (SQLite-based) ::Msgmap for use
 # with the web and NNTP interfaces.  This index maintains thread
-# relationships for use by Mail::Thread.  This writes to the search
-# index.
+# relationships for use by PublicInbox::SearchThread.
+# This writes to the search index.
 package PublicInbox::SearchIdx;
 use strict;
 use warnings;
diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm
new file mode 100644
index 0000000..41fe859
--- /dev/null
+++ b/lib/PublicInbox/SearchThread.pm
@@ -0,0 +1,323 @@
+# This library is free software; you can redistribute it and/or modify
+# it under the same terms as Perl itself.
+#
+# This license differs from the rest of public-inbox
+#
+# Our own jwz-style threading class based on Mail::Thread from CPAN.
+# Mail::Thread is unmaintained and available on some distros.
+# We also do not want pruning or subject grouping, since we want
+# to encourage strict threading and hopefully encourage people
+# to use proper In-Reply-To.
+#
+# This includes fixes from several open bugs for Mail::Thread
+#
+# Avoid circular references
+# - https://rt.cpan.org/Public/Bug/Display.html?id=22817
+#
+# And avoid recursion in recurse_down:
+# - https://rt.cpan.org/Ticket/Display.html?id=116727
+# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=833479
+package PublicInbox::SearchThread;
+use strict;
+use warnings;
+use Email::Abstract;
+
+sub new {
+	return bless {
+		messages => $_[1],
+		id_table => {},
+		rootset  => []
+	}, $_[0];
+}
+
+sub _get_hdr {
+	my ($class, $msg, $hdr) = @_;
+	Email::Abstract->get_header($msg, $hdr) || '';
+}
+
+sub _uniq {
+	my %seen;
+	return grep { !$seen{$_}++ } @_;
+}
+
+sub _references {
+	my $class = shift;
+	my $msg = shift;
+	my @references = ($class->_get_hdr($msg, "References") =~ /<([^>]+)>/g);
+	my $foo = $class->_get_hdr($msg, "In-Reply-To");
+	chomp $foo;
+	$foo =~ s/.*?<([^>]+)>.*/$1/;
+	push @references, $foo
+	  if $foo =~ /^\S+\@\S+$/ && (!@references || $references[-1] ne $foo);
+	return _uniq(@references);
+}
+
+sub _msgid {
+	my ($class, $msg) = @_;
+	my $id = $class->_get_hdr($msg, "Message-ID");
+	die "attempt to thread message with no id" unless $id;
+	chomp $id;
+	$id =~ s/^<([^>]+)>.*/$1/; # We expect this not to have <>s
+	return $id;
+}
+
+sub rootset { @{$_[0]{rootset}} }
+
+sub thread {
+	my $self = shift;
+	$self->_setup();
+	$self->{rootset} = [ grep { !$_->parent } values %{$self->{id_table}} ];
+	$self->_finish();
+}
+
+sub _finish {
+	my $self = shift;
+	delete $self->{id_table};
+	delete $self->{seen};
+}
+
+sub _get_cont_for_id {
+	my $self = shift;
+	my $id = shift;
+	$self->{id_table}{$id} ||= $self->_container_class->new($id);
+}
+
+sub _container_class { 'PublicInbox::SearchThread::Container' }
+
+sub _setup {
+	my ($self) = @_;
+
+	_add_message($self, $_) foreach @{$self->{messages}};
+}
+
+sub _add_message ($$) {
+	my ($self, $message) = @_;
+
+	# A. if id_table...
+	my $this_container = $self->_get_cont_for_id($self->_msgid($message));
+	$this_container->message($message);
+
+	# B. For each element in the message's References field:
+	my @refs = $self->_references($message);
+
+	my $prev;
+	for my $ref (@refs) {
+		# Find a Container object for the given Message-ID
+		my $container = $self->_get_cont_for_id($ref);
+
+		# Link the References field's Containers together in the
+		# order implied by the References header
+		# * If they are already linked don't change the existing links
+		# * Do not add a link if adding that link would introduce
+		#   a loop...
+
+		if ($prev &&
+			!$container->parent &&  # already linked
+			!$container->has_descendent($prev) # would loop
+		   ) {
+			$prev->add_child($container);
+		}
+		$prev = $container;
+	}
+
+	# C. Set the parent of this message to be the last element in
+	# References...
+	if ($prev &&
+		!$this_container->has_descendent($prev) # would loop
+	   ) {
+		$prev->add_child($this_container)
+	}
+}
+
+sub order {
+	my $self = shift;
+	my $ordersub = shift;
+
+	# make a fake root
+	my $root = $self->_container_class->new( 'fakeroot' );
+	$root->add_child( $_ ) for @{ $self->{rootset} };
+
+	# sort it
+	$root->order_children( $ordersub );
+
+	# and untangle it
+	my @kids = $root->children;
+	$self->{rootset} = \@kids;
+	$root->remove_child($_) for @kids;
+}
+
+package PublicInbox::SearchThread::Container;
+use Carp qw(croak);
+use Scalar::Util qw(weaken);
+
+sub new { my $self = shift; bless { id => shift }, $self; }
+
+sub message { $_[0]->{message} = $_[1] if @_ == 2; $_[0]->{message} }
+sub parent { @_ == 2 ? weaken($_[0]->{parent} = $_[1]) : $_[0]->{parent} }
+sub child { $_[0]->{child} = $_[1] if @_ == 2; $_[0]->{child} }
+sub next { $_[0]->{next} = $_[1] if @_ == 2; $_[0]->{next} }
+sub messageid { $_[0]->{id} }
+
+sub add_child {
+	my ($self, $child) = @_;
+	croak "Cowardly refusing to become my own parent: $self"
+	  if $self == $child;
+
+	if (grep { $_ == $child } $self->children) {
+		# All is potentially correct with the world
+		$child->parent($self);
+		return;
+	}
+
+	$child->parent->remove_child($child) if $child->parent;
+
+	$child->next($self->child);
+	$self->child($child);
+	$child->parent($self);
+}
+
+sub remove_child {
+	my ($self, $child) = @_;
+	return unless $self->child;
+	if ($self->child == $child) {  # First one's easy.
+		$self->child($child->next);
+		$child->next(undef);
+		$child->parent(undef);
+		return;
+	}
+
+	my $x = $self->child;
+	my $prev = $x;
+	while ($x = $x->next) {
+		if ($x == $child) {
+			$prev->next($x->next); # Unlink x
+			$x->next(undef);
+			$x->parent(undef);	 # Deparent it
+			return;
+		}
+		$prev = $x;
+	}
+	# oddly, we can get here
+	$child->next(undef);
+	$child->parent(undef);
+}
+
+sub has_descendent {
+	my $self = shift;
+	my $child = shift;
+	die "Assertion failed: $child" unless eval {$child};
+	my $there = 0;
+	$self->recurse_down(sub { $there = 1 if $_[0] == $child });
+
+	return $there;
+}
+
+sub children {
+	my $self = shift;
+	my @children;
+	my $visitor = $self->child;
+	while ($visitor) {
+		push @children, $visitor;
+		$visitor = $visitor->next
+	}
+	return @children;
+}
+
+sub set_children {
+	my $self = shift;
+	my $walk = $self->child( shift );
+	while (@_) { $walk = $walk->next( shift ) }
+	$walk->next(undef) if $walk;
+}
+
+sub order_children {
+	my $self = shift;
+	my $ordersub = shift;
+
+	return unless $ordersub;
+
+	my $sub = sub {
+		my $cont = shift;
+		my @children = $cont->children;
+		return if @children < 2;
+		$cont->set_children( $ordersub->( @children ) );
+	};
+	$self->iterate_down( undef, $sub );
+	undef $sub;
+}
+
+# non-recursive version of recurse_down to avoid stack depth warnings
+sub recurse_down {
+	my ($self, $callback) = @_;
+	my %seen;
+	my @q = ($self);
+	while (my $cont = shift @q) {
+		$seen{$cont}++;
+		$callback->($cont);
+
+		if (my $next = $cont->next) {
+			if ($seen{$next}) {
+				$cont->next(undef);
+			} else {
+				push @q, $next;
+			}
+		}
+		if (my $child = $cont->child) {
+			if ($seen{$child}) {
+				$cont->child(undef);
+			} else {
+				push @q, $child;
+			}
+		}
+	}
+}
+
+sub iterate_down {
+	my $self = shift;
+	my ($before, $after) = @_;
+
+	my %seen;
+	my $walk = $self;
+	my $depth = 0;
+	my @visited;
+	while ($walk) {
+		push @visited, [ $walk, $depth ];
+		$before->($walk, $depth) if $before;
+
+		# spot/break loops
+		$seen{$walk}++;
+
+		my $child = $walk->child;
+		if ($child && $seen{$child}) {
+			$walk->child(undef);
+			$child = undef;
+		}
+
+		my $next = $walk->next;
+		if ($next && $seen{$next}) {
+			$walk->next(undef);
+			$next = undef;
+		}
+
+		# go down, or across
+		if ($child) {
+			$next = $child;
+			++$depth;
+		}
+
+		# no next?  look up
+		if (!$next) {
+			my $up = $walk;
+			while ($up && !$next) {
+				$up = $up->parent;
+				--$depth;
+				$next = $up->next if $up;
+			}
+		}
+		$walk = $next;
+	}
+	return unless $after;
+	while (@visited) { $after->(@{ pop @visited }) }
+}
+
+1;
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 4f0811a..da31109 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -11,7 +11,7 @@ use PublicInbox::View;
 use PublicInbox::MID qw(mid2path mid_mime mid_clean mid_escape);
 use Email::MIME;
 require PublicInbox::Git;
-require PublicInbox::Thread;
+require PublicInbox::SearchThread;
 our $LIM = 50;
 
 sub noop {}
@@ -152,7 +152,7 @@ sub mset_thread {
 		$m;
 	} ($mset->items);
 
-	my $th = PublicInbox::Thread->new(@m);
+	my $th = PublicInbox::SearchThread->new(\@m);
 	$th->thread;
 	if ($q->{r}) { # order by relevance
 		$th->order(sub {
diff --git a/lib/PublicInbox/Thread.pm b/lib/PublicInbox/Thread.pm
deleted file mode 100644
index 8af9461..0000000
--- a/lib/PublicInbox/Thread.pm
+++ /dev/null
@@ -1,86 +0,0 @@
-# subclass Mail::Thread and use this to workaround a memory leak
-# Based on the patch in: https://rt.cpan.org/Public/Bug/Display.html?id=22817
-#
-# Additionally, workaround for a bug where $walk->topmost returns undef:
-# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=795913
-# - https://rt.cpan.org/Ticket/Display.html?id=106498
-#
-# And avoid recursion in recurse_down:
-# - https://rt.cpan.org/Ticket/Display.html?id=116727
-# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=833479
-#
-# License differs from the rest of public-inbox (but is compatible):
-# This library is free software; you can redistribute it and/or modify
-# it under the same terms as Perl itself.
-package PublicInbox::Thread;
-use strict;
-use warnings;
-use base qw(Mail::Thread);
-# WARNING! both these Mail::Thread knobs were found by inspecting
-# the Mail::Thread 2.55 source code, and we have some monkey patches
-# in PublicInbox::Thread to fix memory leaks.  Since Mail::Thread
-# appears unmaintained, I suppose it's safe to depend on these
-# variables for now:
-{
-	no warnings 'once';
-	# we want strict threads to expose (and hopefully discourage)
-	# use of broken email clients
-	$Mail::Thread::nosubject = 1;
-	# Keep ghosts with only a single direct child,
-	# don't hide that there may be missing messages.
-	$Mail::Thread::noprune = 1;
-}
-
-if ($Mail::Thread::VERSION <= 2.55) {
-	eval q(sub _container_class { 'PublicInbox::Thread::Container' });
-}
-
-package PublicInbox::Thread::Container;
-use strict;
-use warnings;
-use base qw(Mail::Thread::Container);
-use Scalar::Util qw(weaken);
-sub parent { @_ == 2 ? weaken($_[0]->{parent} = $_[1]) : $_[0]->{parent} }
-
-sub topmost {
-	$_[0]->SUPER::topmost || PublicInbox::Thread::CPANRTBug106498->new;
-}
-
-# non-recursive version of recurse_down to avoid stack depth warnings
-sub recurse_down {
-	my ($self, $callback) = @_;
-	my %seen;
-	my @q = ($self);
-	while (my $cont = shift @q) {
-		$seen{$cont}++;
-		$callback->($cont);
-
-		if (my $next = $cont->next) {
-			if ($seen{$next}) {
-				$cont->next(undef);
-			} else {
-				push @q, $next;
-			}
-		}
-		if (my $child = $cont->child) {
-			if ($seen{$child}) {
-				$cont->child(undef);
-			} else {
-				push @q, $child;
-			}
-		}
-	}
-}
-
-# ref:
-# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=795913
-# - https://rt.cpan.org/Ticket/Display.html?id=106498
-package PublicInbox::Thread::CPANRTBug106498;
-use strict;
-use warnings;
-
-sub new { bless {}, $_[0] }
-
-sub simple_subject {}
-
-1;
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index a3b2681..9f1bf46 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -749,8 +749,8 @@ sub msg_timestamp {
 
 sub thread_results {
 	my ($msgs) = @_;
-	require PublicInbox::Thread;
-	my $th = PublicInbox::Thread->new(@$msgs);
+	require PublicInbox::SearchThread;
+	my $th = PublicInbox::SearchThread->new($msgs);
 	$th->thread;
 	$th->order(*sort_ts);
 	$th
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 4d599fc..11fc92e 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -112,7 +112,7 @@ sub call {
 sub preload {
 	require PublicInbox::Feed;
 	require PublicInbox::View;
-	require PublicInbox::Thread;
+	require PublicInbox::SearchThread;
 	require Email::MIME;
 	require Digest::SHA;
 	require POSIX;
diff --git a/t/plack.t b/t/plack.t
index 608afb9..1d62458 100644
--- a/t/plack.t
+++ b/t/plack.t
@@ -11,8 +11,7 @@ my $pi_config = "$tmpdir/config";
 my $maindir = "$tmpdir/main.git";
 my $addr = 'test-public@example.com';
 my $cfgpfx = "publicinbox.test";
-my @mods = qw(HTTP::Request::Common Plack::Test
-	Mail::Thread URI::Escape);
+my @mods = qw(HTTP::Request::Common Plack::Test URI::Escape);
 foreach my $mod (@mods) {
 	eval "require $mod";
 	plan skip_all => "$mod missing for plack.t" if $@;
-- 
EW


^ permalink raw reply related	[relevance 3%]

* [PATCH 0/17] remove Mail::Thread dependency
@ 2016-10-05 23:57  7% Eric Wong
  2016-10-05 23:57  3% ` [PATCH 02/17] thread: " Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2016-10-05 23:57 UTC (permalink / raw)
  To: meta

This greatly reduces the amount of code we need to load while
reducing abstractions which slow us down and hurt memory use
when displaying gigantic threads.

More may be done and we may use SearchMsg directly for threading
in the future and obviate the need for the container
abstraction.

Eric Wong (17):
      view: remove "subject dummy" references
      thread: remove Mail::Thread dependency
      thread: pass array refs instead of entire arrays
      thread: remove accessor usage in internals
      inbox: deal with ghost smsg
      thread: remove Email::Abstract wrapping
      thread: remove rootset accessor method
      thread: simplify
      thread: remove iterate_down
      thread: avoid incrementing undefined value
      thread: order_children no longer cares about depth
      thread: inline and remove recurse_down logic
      thread: fix sorting without topmost
      thread: use hash + array instead of hand-rolled linked list
      view: remove redundant children array in thread views
      t/thread-cycle: test self-referential messages
      thread: remove weaken dependency

 INSTALL                         |   1 -
 MANIFEST                        |   3 +-
 Makefile.PL                     |   1 -
 lib/PublicInbox/Inbox.pm        |   2 +
 lib/PublicInbox/SearchIdx.pm    |   4 +-
 lib/PublicInbox/SearchMsg.pm    |  29 -------
 lib/PublicInbox/SearchThread.pm | 147 +++++++++++++++++++++++++++++++++++
 lib/PublicInbox/SearchView.pm   |  15 ++--
 lib/PublicInbox/Thread.pm       |  86 ---------------------
 lib/PublicInbox/View.pm         | 165 ++++++++++++++++++----------------------
 lib/PublicInbox/WWW.pm          |   2 +-
 t/plack.t                       |   3 +-
 t/search.t                      |   7 +-
 t/thread-cycle.t                |  97 +++++++++++++++++++++++
 14 files changed, 333 insertions(+), 229 deletions(-)


^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2016-10-05 23:57  7% [PATCH 0/17] remove Mail::Thread dependency Eric Wong
2016-10-05 23:57  3% ` [PATCH 02/17] thread: " Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).