From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id DDAA01F453 for ; Tue, 29 Jan 2019 07:56:44 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [RFC] mid: filter out 'y', 'n', and email addresses from references() Date: Tue, 29 Jan 2019 07:56:44 +0000 Message-Id: <20190129075644.3917-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Looking at git@vger history, several emails had broken References/In-Reply-To pointing to <y>, <n> and email addresses as Message-IDs in References and In-Reply-To headers. This was causing too many unrelated messages to be linked together in the same thread. --- lib/PublicInbox/MID.pm | 25 +++++++++++++++++++------ t/mid.t | 4 ++++ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm index cd56f27..7f1ab15 100644 --- a/lib/PublicInbox/MID.pm +++ b/lib/PublicInbox/MID.pm @@ -10,6 +10,7 @@ our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime mid_escape MID_ESC mids references/; use URI::Escape qw(uri_escape_utf8); use Digest::SHA qw/sha1_hex/; +require PublicInbox::Address; use constant { MID_MAX => 40, # SHA-1 hex length # TODO: get rid of this MAX_MID_SIZE => 244, # max term size (Xapian limitation) - length('Q') @@ -79,22 +80,34 @@ sub references ($) { push(@mids, ($v =~ /<([^>]+)>/sg)); } } - uniq_mids(\@mids); + + # old versions of git-send-email would prompt users for + # In-Reply-To and users' muscle memory would use 'y' or 'n' + # as responses: + my %addr = ( y => 1, n => 1 ); + + foreach my $f (qw(To From Cc)) { + my @v = $hdr->header_raw($f); + foreach my $v (@v) { + $addr{$_} = 1 for 
(PublicInbox::Address::emails($v)); + } + } + uniq_mids(\@mids, \%addr); } -sub uniq_mids ($) { - my ($mids) = @_; +sub uniq_mids ($;$) { + my ($mids, $seen) = @_; my @ret; - my %seen; + $seen ||= {}; foreach my $mid (@$mids) { $mid =~ tr/\n\t\r//d; if (length($mid) > MAX_MID_SIZE) { warn "Message-ID: <$mid> too long, truncating\n"; $mid = substr($mid, 0, MAX_MID_SIZE); } - next if $seen{$mid}; + next if $seen->{$mid}; push @ret, $mid; - $seen{$mid} = 1; + $seen->{$mid} = 1; } \@ret; } diff --git a/t/mid.t b/t/mid.t index 8c307c8..69a8a70 100644 --- a/t/mid.t +++ b/t/mid.t @@ -36,6 +36,10 @@ is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)'); $mime->header_set('Message-ID', ""); is_deeply(mids($mime->header_obj), ['helloworld'], 'drop \t in Message-ID'); + + $mime->header_set('To', 'u@example.com'); + $mime->header_set('References', ' '); + is_deeply(references($mime->header_obj), [qw(hello world)]); } done_testing(); -- EW