user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH] handle repeated References and In-Reply-To headers
@ 2017-02-12  0:04 Eric Wong
  2017-02-12  0:37 ` Eric Wong
  0 siblings, 1 reply; 2+ messages in thread
From: Eric Wong @ 2017-02-12  0:04 UTC (permalink / raw)
  To: meta

It seems possible for git-send-email(1) to generate
repeated instances of References and In-Reply-To headers,
as evidenced in:

https://public-inbox.org/git/20161111124541.8216-17-vascomalmeida@sapo.pt/raw

This causes a mismatch between how our search indexer threads
and how our HTML view handles threading.  In the future, View.pm
will use the smsg-parsed {references} field and avoid redoing
Email::MIME header parsing.

We will still need to figure out a way to deal with messages
with repeated Message-IDs, at some point, too.
---
 lib/PublicInbox/SearchIdx.pm    | 30 ++++++------------------------
 lib/PublicInbox/SearchThread.pm |  2 +-
 lib/PublicInbox/View.pm         | 19 +++++++++++--------
 3 files changed, 18 insertions(+), 33 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 1142ca7..8a529c6 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -291,17 +291,12 @@ sub link_message {
 	my $mid = $smsg->mid;
 	my $mime = $smsg->{mime};
 	my $hdr = $mime->header_obj;
-	my $refs = $hdr->header_raw('References');
-	my @refs = defined $refs ? ($refs =~ /<([^>]+)>/g) : ();
-	my $irt = $hdr->header_raw('In-Reply-To');
-	if (defined $irt) {
-		if ($irt eq '') {
-			$irt = undef;
-		} else {
-			$irt = mid_clean($irt);
-			$irt = undef if $mid eq $irt;
-		}
-	}
+
+	# last References should be IRT, but some mail clients do things
+	# out of order, so trust IRT over References iff IRT exists
+	my @refs = ($hdr->header_raw('References'),
+			$hdr->header_raw('In-Reply-To'));
+	@refs = ((join(' ', @refs)) =~ /<([^>]+)>/g);
 
 	my $tid;
 	if (@refs) {
@@ -309,15 +304,6 @@ sub link_message {
 		my @orig_refs = @refs;
 		@refs = ();
 
-		if (defined $irt) {
-			# to check MAX_MID_SIZE
-			push @orig_refs, $irt;
-
-			# below, we will ensure IRT (if specified)
-			# is the last References
-			$uniq{$irt} = 1;
-		}
-
 		# prevent circular references via References: here:
 		foreach my $ref (@orig_refs) {
 			if (length($ref) > MAX_MID_SIZE) {
@@ -329,10 +315,6 @@ sub link_message {
 		}
 	}
 
-	# last References should be IRT, but some mail clients do things
-	# out of order, so trust IRT over References iff IRT exists
-	push @refs, $irt if defined $irt;
-
 	if (@refs) {
 		$smsg->{references} = '<'.join('> <', @refs).'>';
 
diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm
index 2cd066d..2966907 100644
--- a/lib/PublicInbox/SearchThread.pm
+++ b/lib/PublicInbox/SearchThread.pm
@@ -7,7 +7,7 @@
 # Mail::Thread is unmaintained and unavailable on some distros.
 # We also do not want pruning or subject grouping, since we want
 # to encourage strict threading and hopefully encourage people
-# to use proper In-Reply-To.
+# to use proper In-Reply-To/References.
 #
 # This includes fixes from several open bugs for Mail::Thread
 #
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 2c37cd4..0b1ec75 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -92,13 +92,13 @@ EOF
 
 sub in_reply_to {
 	my ($hdr) = @_;
-	my $irt = $hdr->header_raw('In-Reply-To');
-
-	return mid_clean($irt) if defined $irt && $irt ne '';
-
-	my $refs = $hdr->header_raw('References');
-	if ($refs && $refs =~ /<([^>]+)>\s*\z/s) {
-		return $1;
+	my %mid = map { $_ => 1 } $hdr->header_raw('Message-ID');
+	my @refs = ($hdr->header_raw('References'),
+			$hdr->header_raw('In-Reply-To'));
+	@refs = ((join(' ', @refs)) =~ /<([^>]+)>/g);
+	while (defined(my $irt = pop @refs)) {
+		next if $mid{"<$irt>"};
+		return $irt;
 	}
 	undef;
 }
@@ -201,7 +201,10 @@ sub _th_index_lite {
 	my $rv = '';
 	my $mapping = $ctx->{mapping} or return $rv;
 	my $pad = '  ';
-	my ($attr, $node, $idx, $level) = @{$mapping->{$mid_raw}};
+	my $mid_map = $mapping->{$mid_raw};
+	defined $mid_map or
+		return 'public-inbox BUG: '.ascii_html($mid_raw).' not mapped';
+	my ($attr, $node, $idx, $level) = @$mid_map;
 	my $children = $node->{children};
 	my $nr_c = scalar @$children;
 	my $nr_s = 0;
-- 
EW


^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2017-02-12  0:37 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-02-12  0:04 [PATCH] handle repeated References and In-Reply-To headers Eric Wong
2017-02-12  0:37 ` Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).