Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 977481F45A; Sat, 18 Apr 2020 22:25:08 +0000 (UTC) Date: Sat, 18 Apr 2020 22:25:08 +0000 From: Eric Wong To: test@public-inbox.org Subject: Re: embedded message test Message-ID: <20200418222508.GA13918@dcvr> References: <20200418222020.GA2745@dcvr> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="TB36FDmn/VVEgNH/" Content-Disposition: inline In-Reply-To: <20200418222020.GA2745@dcvr> --TB36FDmn/VVEgNH/ Content-Type: text/plain; charset=utf-8 Content-Disposition: inline testing embedded message harder --TB36FDmn/VVEgNH/ Content-Type: message/rfc822 Content-Disposition: attachment; filename="embed2x.eml" Date: Sat, 18 Apr 2020 22:20:20 +0000 From: Eric Wong To: test@public-inbox.org Subject: embedded message test Message-ID: <20200418222020.GA2745@dcvr> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="/04w6evG8XlLl3ft" Content-Disposition: inline --/04w6evG8XlLl3ft Content-Type: text/plain; charset=utf-8 Content-Disposition: inline testing embedded message --/04w6evG8XlLl3ft Content-Type: message/rfc822 Content-Disposition: attachment; filename="test.eml" From: Eric Wong To: spew@80x24.org Subject: [PATCH] mail header experiments Date: Sat, 18 Apr 2020 21:41:14 +0000 Message-Id: <20200418214114.7575-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit --- lib/PublicInbox/MailHeader.pm | 55 +++++++++++++++++++++++++++++++++++ t/mail_header.t | 31 ++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 lib/PublicInbox/MailHeader.pm create mode 100644 t/mail_header.t diff --git a/lib/PublicInbox/MailHeader.pm b/lib/PublicInbox/MailHeader.pm new file mode 100644 index 00000000..166baf91 --- /dev/null +++ b/lib/PublicInbox/MailHeader.pm @@ -0,0 +1,55 @@ +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ +package PublicInbox::MailHeader; +use strict; +use HTTP::Parser::XS qw(parse_http_response HEADERS_AS_ARRAYREF); +use bytes (); #bytes::length +my %casemap; + +sub _headerx_to_list { + my (undef, $head, $crlf) = @_; + + # picohttpparser uses `int' as the return value, so the + # actual limit is 2GB on most platforms. However, headers + # exceeding (or even close to) 1MB seems unreasonable + die 'headers too big' if bytes::length($$head) > 0x100000; + my ($ret, undef, undef, undef, $headers) = + parse_http_response('HTTP/1.0 1 X'. $crlf . $$head, + HEADERS_AS_ARRAYREF); + die 'failed to parse headers' if $ret <= 0; + # %casemap = map {; lc($_) => $_ } ($$head =~ m/^([^:]+):/gsm); + # my $nr = @$headers; + for (my $i = 0; $i < @$headers; $i += 2) { + my $key = $headers->[$i]; # = $casemap{$headers->[$i]}; + my $val = $headers->[$i + 1]; + (my $trimmed = $val) =~ s/\r?\n\s+/ /; + $headers->[$i + 1] = [ + $trimmed, + "$key: $val" + ] + } + $headers; +} + +sub _header_to_list { + my (undef, $head, $crlf) = @_; + my @tmp = ($$head =~ m/^(([^ \t:][^:\n]*):[ \t]* + ([^\n]*\n(?:[ \t]+[^\n]*\n)*))/gsmx); + my @headers; + $#headers = scalar @tmp; + @headers = (); + while (@tmp) { + my ($orig, $key, $val) = splice(@tmp, 0, 3); + # my $v = $tmp[$i + 2]; + # $v =~ s/\r?\n[ \t]+/ /sg; + # $v =~ s/\r?\n\z//s; + $val =~ s/\n[ \t]+/ /sg; + chomp($val, $orig); + # $val =~ s/\r?\n\z//s; + # $orig =~ s/\r?\n\z//s; + push @headers, $key, [ $val, $orig ]; + } + \@headers; +} + +1; diff --git a/t/mail_header.t b/t/mail_header.t new file mode 100644 index 00000000..4dc62c50 --- /dev/null +++ b/t/mail_header.t @@ -0,0 +1,31 @@ +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ +use strict; +use Test::More; +use PublicInbox::TestCommon; +require_mods('PublicInbox::MailHeader'); + +my $head = <<'EOF'; +From d0147582e289fdd4cdd84e91d8b0f8ae9c230124 Mon Sep 17 00:00:00 2001 +From: Eric Wong +Date: Fri, 17 Apr 2020 09:28:49 +0000 +Subject: [PATCH] searchthread: reduce indirection by removing container + +EOF +my $orig = $head; +use Email::Simple; +my $xshdr = PublicInbox::MailHeader->_header_to_list(\$head, "\n"); +my $simpl = Email::Simple::Header->_header_to_list(\$head, "\n"); +is_deeply($xshdr, $simpl); +use Benchmark qw(:all); +my $res = timethese(100000, { + pmh => sub { + PublicInbox::MailHeader->_header_to_list(\$head, "\n"); + }, + esh => sub { + PublicInbox::MailHeader->_header_to_list(\$head, "\n"); + } +}); +is($head, $orig); +use Data::Dumper; diag Dumper($res); +done_testing; --/04w6evG8XlLl3ft-- --TB36FDmn/VVEgNH/--