public-inbox.git  about / heads / tags
an "archives first" approach to mailing lists
blob 931bd579c3f259d5158644b4216c0986bdd3f47b 4689 bytes (raw)
$ git show viewdiff:lib/PublicInbox/SearchThread.pm	# shows this blob on the CLI

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
 
# This library is free software; you can redistribute it and/or modify
# it under the same terms as Perl itself.
#
# This license differs from the rest of public-inbox
#
# Our own jwz-style threading class based on Mail::Thread from CPAN.
# Mail::Thread is unmaintained and unavailable on some distros.
# We also do not want pruning or subject grouping, since we want
# to encourage strict threading and hopefully encourage people
# to use proper In-Reply-To/References.
#
# This includes fixes from several open bugs for Mail::Thread
#
# Avoid circular references
# - https://rt.cpan.org/Public/Bug/Display.html?id=22817
#
# And avoid recursion in recurse_down:
# - https://rt.cpan.org/Ticket/Display.html?id=116727
# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=833479
package PublicInbox::SearchThread;
use strict;
use warnings;

sub thread {
	my ($msgs, $ordersub, $ibx) = @_;
	my $id_table = {};

	# Sadly, we sort here anyways since the fill-in-the-blanks References:
	# can be shakier if somebody used In-Reply-To with multiple, disparate
	# messages.  So, take the client Date: into account since we can't
	# alway determine ordering when somebody uses multiple In-Reply-To.
	# We'll trust the client Date: header here instead of the Received:
	# time since this is for display (and not retrieval)
	_add_message($id_table, $_) for sort { $a->{ds} <=> $b->{ds} } @$msgs;
	my $rootset = [ grep {
			!delete($_->{parent}) && $_->visible($ibx)
		} values %$id_table ];
	$id_table = undef;
	$rootset = $ordersub->($rootset);
	$_->order_children($ordersub, $ibx) for @$rootset;
	$rootset;
}

sub _get_cont_for_id ($$) {
	my ($id_table, $mid) = @_;
	$id_table->{$mid} ||= PublicInbox::SearchThread::Msg->new($mid);
}

sub _add_message ($$) {
	my ($id_table, $smsg) = @_;

	# A. if id_table...
	my $this = _get_cont_for_id($id_table, $smsg->{mid});
	$this->{smsg} = $smsg;

	# saves around 4K across 1K messages
	# TODO: move this to a more appropriate place, breaks tests
	# if we do it during psgi_cull
	delete $smsg->{num};

	# B. For each element in the message's References field:
	defined(my $refs = $smsg->{references}) or return;

	# This loop exists to help fill in gaps left from missing
	# messages.  It is not needed in a perfect world where
	# everything is perfectly referenced, only the last ref
	# matters.
	my $prev;
	foreach my $ref ($refs =~ m/<([^>]+)>/g) {
		# Find a Container object for the given Message-ID
		my $cont = _get_cont_for_id($id_table, $ref);

		# Link the References field's Containers together in
		# the order implied by the References header
		#
		# * If they are already linked don't change the
		#   existing links
		# * Do not add a link if adding that link would
		#   introduce a loop...
		if ($prev &&
			!$cont->{parent} &&  # already linked
			!$cont->has_descendent($prev) # would loop
		   ) {
			$prev->add_child($cont);
		}
		$prev = $cont;
	}

	# C. Set the parent of this message to be the last element in
	# References.
	if (defined $prev && !$this->has_descendent($prev)) { # would loop
		$prev->add_child($this);
	}
}

package PublicInbox::SearchThread::Msg;
use strict;
use warnings;
use Carp qw(croak);

sub new {
	bless {
		id => $_[1],
		children => {}, # becomes an array when sorted by ->order(...)
	}, $_[0];
}

sub topmost {
	my ($self) = @_;
	my @q = ($self);
	while (my $cont = shift @q) {
		return $cont if $cont->{smsg};
		push @q, values %{$cont->{children}};
	}
	undef;
}

sub add_child {
	my ($self, $child) = @_;
	croak "Cowardly refusing to become my own parent: $self"
	  if $self == $child;

	my $cid = $child->{id};

	# reparenting:
	if (defined(my $parent = $child->{parent})) {
		delete $parent->{children}->{$cid};
	}

	$self->{children}->{$cid} = $child;
	$child->{parent} = $self;
}

sub has_descendent {
	my ($self, $child) = @_;
	my %seen; # loop prevention
	while ($child) {
		return 1 if $self == $child || $seen{$child}++;
		$child = $child->{parent};
	}
	0;
}

# Do not show/keep ghosts iff they have no children.  Sometimes
# a ghost Message-ID is the result of a long header line
# being folded/mangled by a MUA, and not a missing message.
sub visible ($$) {
	my ($self, $ibx) = @_;
	($self->{smsg} ||= eval { $ibx->smsg_by_mid($self->{id}) }) ||
	 (scalar values %{$self->{children}});
}

sub order_children {
	my ($cur, $ordersub, $ibx) = @_;

	my %seen = ($cur => 1); # self-referential loop prevention
	my @q = ($cur);
	while (defined($cur = shift @q)) {
		my $c = $cur->{children}; # The hashref here...

		$c = [ grep { !$seen{$_}++ && visible($_, $ibx) } values %$c ];
		$c = $ordersub->($c) if scalar @$c > 1;
		$cur->{children} = $c; # ...becomes an arrayref
		push @q, @$c;
	}
}

1;

git clone https://public-inbox.org/public-inbox.git
git clone http://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/public-inbox.git