From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.3.2 (2011-06-06) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-3.1 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00 shortcircuit=no autolearn=unavailable version=3.3.2 X-Original-To: meta@public-inbox.org Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 7419D1F8B7 for ; Thu, 20 Aug 2015 10:20:30 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 4/4] search: preserve References: order in document data Date: Thu, 20 Aug 2015 10:20:29 +0000 Message-Id: <1440066029-7400-4-git-send-email-e@80x24.org> In-Reply-To: <1440066029-7400-1-git-send-email-e@80x24.org> References: <1440066029-7400-1-git-send-email-e@80x24.org> List-Id: We need proper ordering of References to thread messages correctly. We would lose this order if we load the terms from the database, so set it directly in document data. Do not bother with a separate In-Reply-To, since Mail::Thread just merges the IRT into References. This bumps our schema version once again. --- lib/PublicInbox/Search.pm | 23 +++++++++++------------ lib/PublicInbox/SearchMsg.pm | 39 +++++++++++++++++++-------------------- 2 files changed, 30 insertions(+), 32 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 2c66e55..f004050 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -23,7 +23,8 @@ use constant { # 3 - message-ID is compressed if it includes '%' (hack!) # 4 - change "Re: " normalization, avoid circular Reference ghosts # 5 - subject_path drops trailing '.' 
- SCHEMA_VERSION => 5, + # 6 - preserve References: order in document data + SCHEMA_VERSION => 6, QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD, }; @@ -49,9 +50,9 @@ my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix); sub xpfx { $all_pfx{$_[0]} } our %PFX2TERM_RMAP; +my %meta_pfx = (mid => 1, thread => 1, path => 1, type => 1); while (my ($k, $v) = each %all_pfx) { - next if $prob_prefix{$k}; - $PFX2TERM_RMAP{$v} = $k; + $PFX2TERM_RMAP{$v} = $k if $meta_pfx{$k}; } my $mail_query = Search::Xapian::Query->new(xpfx('type') . 'mail'); @@ -129,8 +130,6 @@ sub add_message { my $ts = Search::Xapian::sortable_serialise($smsg->ts); $doc->add_value(PublicInbox::Search::TS, $ts); - $doc->set_data($smsg->to_doc_data); - my $tg = $self->term_generator; $tg->set_document($doc); @@ -176,9 +175,11 @@ sub add_message { if ($was_ghost) { $doc_id = $smsg->doc_id; $self->link_message($smsg, 0); + $doc->set_data($smsg->to_doc_data); $db->replace_document($doc_id, $doc); } else { $self->link_message($smsg, 0); + $doc->set_data($smsg->to_doc_data); $doc_id = $db->add_document($doc); } }; @@ -352,14 +353,14 @@ sub link_message_to_parents { my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : (); my $irt = $mime->header_obj->header('In-Reply-To'); if ($irt) { - if ($irt =~ /<([^>]+)>/) { - $irt = $1; - } + $irt = mid_compressed(mid_clean($irt)); # maybe some crazies will try to make a circular reference: if ($irt eq $mid) { $irt = undef; } else { + # last References should be $irt + # we will de-dupe later push @refs, $irt; } } @@ -376,12 +377,10 @@ sub link_message_to_parents { $uniq{$ref} = 1; push @refs, $ref; } - $irt = undef if (defined $irt && !$uniq{$irt}); } if (@refs) { - if (defined $irt) { - $doc->add_term(xpfx('inreplyto') . $irt); - } + $doc->add_term(xpfx('inreplyto') . 
$irt) if defined $irt; + $smsg->{references_sorted} = '<'.join('><', @refs).'>'; my $ref_pfx = xpfx('references'); diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 14a62eb..03df7ab 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -31,13 +31,14 @@ sub load_doc { my ($class, $doc) = @_; my $data = $doc->get_data; $data = $enc_utf8->decode($data); - my ($mid, $subj, $from, $date) = split(/\n/, $data); + my ($mid, $subj, $from, $date, $refs) = split(/\n/, $data); bless { doc => $doc, mid => $mid, subject => $subj, date => $date, from_name => $from, + references_sorted => $refs, }, $class; } @@ -78,17 +79,16 @@ sub ts { my ($self) = @_; my $ts = $self->{ts}; return $ts if $ts; - $self->{date} = undef; - $self->date; - $self->{ts}; + $self->{ts} = eval { + str2time($self->date || $self->mime->header('Date')) + } || 0; } sub date { my ($self) = @_; my $date = $self->{date}; return $date if $date; - my $ts = eval { str2time($self->mime->header('Date')) } || 0; - $self->{ts} = $ts; + my $ts = eval { str2time($self->mime->header('Date')) }; $self->{date} = POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts)); } @@ -98,7 +98,14 @@ sub to_doc_data { $self->mid . "\n" . $self->subject . "\n" . $self->from_name . "\n". - $self->date; + $self->date . "\n" . + $self->references_sorted; +} + +sub references_sorted { + my ($self) = @_; + my $x = $self->{references_sorted}; + defined $x ? $x : ''; } sub ensure_metadata { @@ -117,12 +124,7 @@ sub ensure_metadata { if ($val =~ s/$PFX2TERM_RE//o) { my $field = $PublicInbox::Search::PFX2TERM_RMAP{$1}; - if ($field eq 'references') { - my $refs = $self->{references} ||= []; - push @$refs, $val; - } else { - $self->{$field} = $val; - } + $self->{$field} = $val; } } } @@ -138,14 +140,11 @@ sub mini_mime { 'X-PI-TS' => $self->ts, 'Message-ID' => "<$self->{mid}>", ); - if (my $refs = $self->{references}) { - push @h, References => '<' . join('> <', @$refs) . 
'>'; - } - if (my $irt = $self->{inreplyto}) { - push @h, 'In-Reply-To' => "<$irt>"; - } - Email::MIME->create(header_str => \@h); + my $refs = $self->{references_sorted}; + my $mime = Email::MIME->create(header_str => \@h); + $mime->header_set('References', $refs) if (defined $refs); + $mime; } sub mid { -- EW