From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id ACD711FAF8 for ; Tue, 6 Mar 2018 08:42:43 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Subject: [PATCH 21/34] mid: truncate excessively long MIDs early Date: Tue, 6 Mar 2018 08:42:29 +0000 Message-Id: <20180306084242.19988-22-e@80x24.org> In-Reply-To: <20180306084242.19988-1-e@80x24.org> References: <20180306084242.19988-1-e@80x24.org> List-Id: Since we support duplicate MIDs in v2, we can safely truncate long MID terms in the database and let the normal duplicate-resolution logic sort it out. It seems only spammers use excessively long MIDs, and there'll always be abuse/misuse vectors for causing mis-threaded messages, so it's not worth worrying about excessively long MIDs. 
--- lib/PublicInbox/MID.pm | 11 ++++++++++- lib/PublicInbox/SearchIdx.pm | 15 ++++++--------- lib/PublicInbox/SearchIdxSkeleton.pm | 3 --- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm index 9608539..422902f 100644 --- a/lib/PublicInbox/MID.pm +++ b/lib/PublicInbox/MID.pm @@ -10,7 +10,10 @@ our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime mid_escape MID_ESC mids references/; use URI::Escape qw(uri_escape_utf8); use Digest::SHA qw/sha1_hex/; -use constant MID_MAX => 40; # SHA-1 hex length +use constant { + MID_MAX => 40, # SHA-1 hex length # TODO: get rid of this + MAX_MID_SIZE => 244, # max term size (Xapian limitation) - length('Q') +}; sub mid_clean { my ($mid) = @_; @@ -61,6 +64,12 @@ sub mids ($) { push(@mids, $v); } } + foreach my $i (0..$#mids) { + next if length($mids[$i]) <= MAX_MID_SIZE; + warn "Message-ID: <$mids[$i]> too long, truncating\n"; + $mids[$i] = substr($mids[$i], 0, MAX_MID_SIZE); + } + uniq_mids(\@mids); } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 3ef444d..a70e1eb 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -19,7 +19,6 @@ use POSIX qw(strftime); require PublicInbox::Git; use constant { - MAX_MID_SIZE => 244, # max term size (Xapian limitation) - length('Q') PERM_UMASK => 0, OLD_PERM_GROUP => 1, OLD_PERM_EVERYBODY => 2, @@ -311,12 +310,6 @@ sub add_message { eval { my $smsg = PublicInbox::SearchMsg->new($mime); my $doc = $smsg->{doc}; - foreach my $mid (@$mids) { - # FIXME: may be abused to prevent archival - length($mid) > MAX_MID_SIZE and - die 'Message-ID too long'; - $doc->add_term('Q' . $mid); - } my $subj = $smsg->subject; my $xpath; if ($subj ne '') { @@ -392,9 +385,11 @@ sub add_message { } } } + if ($skel) { push @values, $mids, $xpath, $data; $skel->index_skeleton(\@values); + $doc->add_boolean_term('Q' . 
$_) foreach @$mids; $doc_id = $self->{xdb}->add_document($doc); } else { $doc_id = link_and_save($self, $doc, $mids, $refs, @@ -469,9 +464,9 @@ sub parse_references ($) { my %mids = map { $_ => 1 } @{mids($hdr)}; my @keep; foreach my $ref (@$refs) { - # FIXME: this is an archive-prevention vector like X-No-Archive - if (length($ref) > MAX_MID_SIZE) { + if (length($ref) > PublicInbox::MID::MAX_MID_SIZE) { warn "References: <$ref> too long, ignoring\n"; + next; } next if $mids{$ref}; push @keep, $ref; @@ -510,6 +505,8 @@ sub link_and_save { my $doc_id; $doc->add_boolean_term('XNUM' . $num) if defined $num; $doc->add_boolean_term('XPATH' . $xpath) if defined $xpath; + $doc->add_boolean_term('Q' . $_) foreach @$mids; + my $vivified = 0; foreach my $mid (@$mids) { $self->each_smsg_by_mid($mid, sub { diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm index 4066b59..40b28c5 100644 --- a/lib/PublicInbox/SearchIdxSkeleton.pm +++ b/lib/PublicInbox/SearchIdxSkeleton.pm @@ -98,9 +98,6 @@ sub index_skeleton_real ($$) { my $ts = $values->[PublicInbox::Search::TS]; my $smsg = PublicInbox::SearchMsg->new(undef); my $doc = $smsg->{doc}; - foreach my $mid (@$mids) { - $doc->add_term('Q' . $mid); - } PublicInbox::SearchIdx::add_values($doc, $values); $doc->set_data($doc_data); $smsg->{ts} = $ts; -- EW