From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id EE6191FAEC for ; Tue, 6 Mar 2018 08:42:42 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Subject: [PATCH 10/34] searchidx: use new `references' method for parsing References Date: Tue, 6 Mar 2018 08:42:18 +0000 Message-Id: <20180306084242.19988-11-e@80x24.org> In-Reply-To: <20180306084242.19988-1-e@80x24.org> References: <20180306084242.19988-1-e@80x24.org> List-Id: It's shorter and more convenient, here. --- lib/PublicInbox/MID.pm | 3 +++ lib/PublicInbox/SearchIdx.pm | 39 +++++++++++++++------------------------ 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm index 786c056..4ccb704 100644 --- a/lib/PublicInbox/MID.pm +++ b/lib/PublicInbox/MID.pm @@ -68,6 +68,9 @@ sub uniq_mids { } sub mids { uniq_mids($_[0], 'Message-Id') } + +# last References should be IRT, but some mail clients do things +# out of order, so trust IRT over References iff IRT exists sub references { uniq_mids($_[0], 'References', 'In-Reply-To') } # RFC3986, section 3.3: diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index ed52e38..57aed75 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -12,7 +12,7 @@ use warnings; use Fcntl qw(:flock :DEFAULT); use PublicInbox::MIME; use base qw(PublicInbox::Search); -use PublicInbox::MID qw/mid_clean id_compress mid_mime/; +use PublicInbox::MID qw/mid_clean id_compress mid_mime mids references/; use PublicInbox::MsgIter; use Carp qw(croak); use POSIX qw(strftime); @@ -447,33 +447,24 @@ sub next_thread_id { sub parse_references ($) { my ($smsg) = @_; - my $doc = $smsg->{doc}; - my $mid = $smsg->mid; my $mime = $smsg->{mime}; my $hdr = $mime->header_obj; - - # last References should be IRT, but some mail clients do things - # out of order, so trust IRT over References iff IRT exists - my @refs = (($hdr->header_raw('References') || '') =~ /<([^>]+)>/g); - push(@refs, (($hdr->header_raw('In-Reply-To') || '') =~ /<([^>]+)>/g)); - - if (@refs) { - my %uniq = ($mid => 1); - my @orig_refs = @refs; - @refs = (); - - # prevent circular references via References: here: - foreach my $ref (@orig_refs) { - if (length($ref) > MAX_MID_SIZE) { - warn "References: <$ref> too long, ignoring\n"; - } - next if $uniq{$ref}; - $uniq{$ref} = 1; - push @refs, $ref; + my $refs = references($hdr); + return $refs if scalar(@$refs) == 0; + + # prevent circular references via References here: + my %mids = map { $_ => 1 } @{mids($hdr)}; + my @keep; + foreach my $ref (@$refs) { + # FIXME: this is an archive-prevention vector like X-No-Archive + if (length($ref) > MAX_MID_SIZE) { + warn "References: <$ref> too long, ignoring\n"; } + next if $mids{$ref}; + push @keep, $ref; } - $smsg->{references} = '<'.join('> <', @refs).'>' if @refs; - \@refs + $smsg->{references} = '<'.join('> <', @keep).'>' if @keep; + \@keep; } sub link_message { -- EW