user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 4/4] search: preserve References: order in document data
Date: Thu, 20 Aug 2015 10:20:29 +0000	[thread overview]
Message-ID: <1440066029-7400-4-git-send-email-e@80x24.org> (raw)
In-Reply-To: <1440066029-7400-1-git-send-email-e@80x24.org>

We need proper ordering of References to thread messages
correctly.  We would lose this order if we load the terms
from the database, so set it directly in the document data.

Do not bother with a separate In-Reply-To, since Mail::Thread
just merges the IRT into References.  This bumps our schema
version once again.
---
 lib/PublicInbox/Search.pm    | 23 +++++++++++------------
 lib/PublicInbox/SearchMsg.pm | 39 +++++++++++++++++++--------------------
 2 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 2c66e55..f004050 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -23,7 +23,8 @@ use constant {
 	# 3 - message-ID is compressed if it includes '%' (hack!)
 	# 4 - change "Re: " normalization, avoid circular Reference ghosts
 	# 5 - subject_path drops trailing '.'
-	SCHEMA_VERSION => 5,
+	# 6 - preserve References: order in document data
+	SCHEMA_VERSION => 6,
 	QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD,
 };
 
@@ -49,9 +50,9 @@ my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix);
 sub xpfx { $all_pfx{$_[0]} }
 
 our %PFX2TERM_RMAP;
+my %meta_pfx = (mid => 1, thread => 1, path => 1, type => 1);
 while (my ($k, $v) = each %all_pfx) {
-	next if $prob_prefix{$k};
-	$PFX2TERM_RMAP{$v} = $k;
+	$PFX2TERM_RMAP{$v} = $k if $meta_pfx{$k};
 }
 
 my $mail_query = Search::Xapian::Query->new(xpfx('type') . 'mail');
@@ -129,8 +130,6 @@ sub add_message {
 		my $ts = Search::Xapian::sortable_serialise($smsg->ts);
 		$doc->add_value(PublicInbox::Search::TS, $ts);
 
-		$doc->set_data($smsg->to_doc_data);
-
 		my $tg = $self->term_generator;
 
 		$tg->set_document($doc);
@@ -176,9 +175,11 @@ sub add_message {
 		if ($was_ghost) {
 			$doc_id = $smsg->doc_id;
 			$self->link_message($smsg, 0);
+			$doc->set_data($smsg->to_doc_data);
 			$db->replace_document($doc_id, $doc);
 		} else {
 			$self->link_message($smsg, 0);
+			$doc->set_data($smsg->to_doc_data);
 			$doc_id = $db->add_document($doc);
 		}
 	};
@@ -352,14 +353,14 @@ sub link_message_to_parents {
 	my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : ();
 	my $irt = $mime->header_obj->header('In-Reply-To');
 	if ($irt) {
-		if ($irt =~ /<([^>]+)>/) {
-			$irt = $1;
-		}
+		$irt = mid_compressed(mid_clean($irt));
 
 		# maybe some crazies will try to make a circular reference:
 		if ($irt eq $mid) {
 			$irt = undef;
 		} else {
+			# last References should be $irt
+			# we will de-dupe later
 			push @refs, $irt;
 		}
 	}
@@ -376,12 +377,10 @@ sub link_message_to_parents {
 			$uniq{$ref} = 1;
 			push @refs, $ref;
 		}
-		$irt = undef if (defined $irt && !$uniq{$irt});
 	}
 	if (@refs) {
-		if (defined $irt) {
-			$doc->add_term(xpfx('inreplyto') . $irt);
-		}
+		$doc->add_term(xpfx('inreplyto') . $irt) if defined $irt;
+		$smsg->{references_sorted} = '<'.join('><', @refs).'>';
 
 		my $ref_pfx = xpfx('references');
 
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 14a62eb..03df7ab 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -31,13 +31,14 @@ sub load_doc {
 	my ($class, $doc) = @_;
 	my $data = $doc->get_data;
 	$data = $enc_utf8->decode($data);
-	my ($mid, $subj, $from, $date) = split(/\n/, $data);
+	my ($mid, $subj, $from, $date, $refs) = split(/\n/, $data);
 	bless {
 		doc => $doc,
 		mid => $mid,
 		subject => $subj,
 		date => $date,
 		from_name => $from,
+		references_sorted => $refs,
 	}, $class;
 }
 
@@ -78,17 +79,16 @@ sub ts {
 	my ($self) = @_;
 	my $ts = $self->{ts};
 	return $ts if $ts;
-	$self->{date} = undef;
-	$self->date;
-	$self->{ts};
+	$self->{ts} = eval {
+		str2time($self->date || $self->mime->header('Date'))
+	} || 0;
 }
 
 sub date {
 	my ($self) = @_;
 	my $date = $self->{date};
 	return $date if $date;
-	my $ts = eval { str2time($self->mime->header('Date')) } || 0;
-	$self->{ts} = $ts;
+	my $ts = eval { str2time($self->mime->header('Date')) };
 	$self->{date} = POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts));
 }
 
@@ -98,7 +98,14 @@ sub to_doc_data {
 	$self->mid . "\n" .
 	$self->subject . "\n" .
 	$self->from_name . "\n".
-	$self->date;
+	$self->date . "\n" .
+	$self->references_sorted;
+}
+
+sub references_sorted {
+	my ($self) = @_;
+	my $x = $self->{references_sorted};
+	defined $x ? $x : '';
 }
 
 sub ensure_metadata {
@@ -117,12 +124,7 @@ sub ensure_metadata {
 
 		if ($val =~ s/$PFX2TERM_RE//o) {
 			my $field = $PublicInbox::Search::PFX2TERM_RMAP{$1};
-			if ($field eq 'references') {
-				my $refs = $self->{references} ||= [];
-				push @$refs, $val;
-			} else {
-				$self->{$field} = $val;
-			}
+			$self->{$field} = $val;
 		}
 	}
 }
@@ -138,14 +140,11 @@ sub mini_mime {
 		'X-PI-TS' => $self->ts,
 		'Message-ID' => "<$self->{mid}>",
 	);
-	if (my $refs = $self->{references}) {
-		push @h, References => '<' . join('> <', @$refs) . '>';
-	}
-	if (my $irt = $self->{inreplyto}) {
-		push @h, 'In-Reply-To' => "<$irt>";
-	}
 
-	Email::MIME->create(header_str => \@h);
+	my $refs = $self->{references_sorted};
+	my $mime = Email::MIME->create(header_str => \@h);
+	$mime->header_set('References', $refs) if (defined $refs);
+	$mime;
 }
 
 sub mid {
-- 
EW


      parent reply	other threads:[~2015-08-20 10:20 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-08-20 10:20 [PATCH 1/4] dead code cleanup Eric Wong
2015-08-20 10:20 ` [PATCH 2/4] view: simplify message threading dumpers Eric Wong
2015-08-20 10:20 ` [PATCH 3/4] avoid using header_raw for Message-ID retrieval Eric Wong
2015-08-20 10:20 ` Eric Wong [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1440066029-7400-4-git-send-email-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).