From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id C80451FBCC for ; Wed, 10 Jun 2020 07:05:21 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 14/82] imap: allow fetch of partial of BODY[...] and headers Date: Wed, 10 Jun 2020 07:04:11 +0000 Message-Id: <20200610070519.18252-15-e@yhbt.net> In-Reply-To: <20200610070519.18252-1-e@yhbt.net> References: <20200610070519.18252-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: IMAP supports a high level of granularity when it comes to fetching, but fortunately Perl makes it fairly easy to support. --- MANIFEST | 1 + lib/PublicInbox/IMAP.pm | 154 ++++++++++++++++++++++++++++++++++++++-- t/imap.t | 43 +++++++++++ t/imapd.t | 61 +++++++++++++++- 4 files changed, 253 insertions(+), 6 deletions(-) create mode 100644 t/imap.t diff --git a/MANIFEST b/MANIFEST index 8aff192c7ee..0803c3654d6 100644 --- a/MANIFEST +++ b/MANIFEST @@ -266,6 +266,7 @@ t/httpd-https.t t/httpd-unix.t t/httpd.t t/hval.t +t/imap.t t/imapd-tls.t t/imapd.t t/import.t diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm index 6c5e0290925..86e0a176be1 100644 --- a/lib/PublicInbox/IMAP.pm +++ b/lib/PublicInbox/IMAP.pm @@ -390,7 +390,9 @@ sub uid_fetch_cb { # called by git->cat_async $self->msg_more(' BODYSTRUCTURE '.fetch_body($eml, 1)); $want->{BODY} and $self->msg_more(' BODY '.fetch_body($eml)); - + if (my $partial = delete $want->{-partial}) { + partial_emit($self, $partial, $eml); + } $self->msg_more(")\r\n"); } @@ -454,16 +456,158 @@ sub cmd_list ($$$$) { \(join('', @$l, "$tag OK List complete\r\n")); } +sub eml_index_offs_i { # PublicInbox::Eml::each_part callback + my ($p, $all) = @_; + my ($eml, undef, $idx) = @$p; + if ($idx && lc($eml->ct->{type}) eq 'multipart') { + $eml->{imap_bdy} = $eml->{bdy} // \''; + } + $all->{$idx} = $eml; # $idx => Eml +} + +# prepares an index for BODY[$SECTION_IDX] fetches +sub eml_body_idx ($$) { + my ($eml, $section_idx) = @_; + my $idx = $eml->{imap_all_parts} //= do { + my $all = {}; + $eml->each_part(\&eml_index_offs_i, $all, 0, 1); + # top-level of multipart, BODY[0] not allowed (nz-number) + delete $all->{0}; + $all; + }; + $idx->{$section_idx}; +} + +# BODY[($SECTION_IDX)?(.$SECTION_NAME)?]<$offset.$bytes> +sub partial_body { + my ($eml, $section_idx, $section_name) = @_; + if (defined $section_idx) { + $eml = eml_body_idx($eml, $section_idx) or return; + } + if (defined $section_name) { + if ($section_name eq 'MIME') { + # RFC 3501 6.4.5 states: + # The MIME part specifier MUST be prefixed + # by one or more numeric part specifiers + return unless defined $section_idx; + return $eml->header_obj->as_string . "\r\n"; + } + my $bdy = $eml->{bdy} // $eml->{imap_bdy} // \''; + $eml = PublicInbox::Eml->new($$bdy); + if ($section_name eq 'TEXT') { + return $eml->body_raw; + } elsif ($section_name eq 'HEADER') { + return $eml->header_obj->as_string . "\r\n"; + } else { + die "BUG: bad section_name=$section_name"; + } + } + ${$eml->{bdy} // $eml->{imap_bdy} // \''}; +} + +# similar to what's in PublicInbox::Eml::re_memo, but doesn't memoize +# to avoid OOM with malicious users +sub hdrs_regexp ($) { + my ($hdrs) = @_; + my $names = join('|', map { "\Q$_" } split(/[ \t]+/, $hdrs)); + qr/^(?:$names):[ \t]*[^\n]*\r?\n # 1st line + # continuation lines: + (?:[^:\n]*?[ \t]+[^\n]*\r?\n)* + /ismx; +} + +# BODY[($SECTION_IDX.)?HEADER.FIELDS.NOT ($HDRS)]<$offset.$bytes> +sub partial_hdr_not { + my ($eml, $section_idx, $hdrs) = @_; + if (defined $section_idx) { + $eml = eml_body_idx($eml, $section_idx) or return; + } + my $str = $eml->header_obj->as_string; + my $re = hdrs_regexp($hdrs); + $str =~ s/$re//g; + $str .= "\r\n"; +} + +# BODY[($SECTION_IDX.)?HEADER.FIELDS ($HDRS)]<$offset.$bytes> +sub partial_hdr_get { + my ($eml, $section_idx, $hdrs) = @_; + if (defined $section_idx) { + $eml = eml_body_idx($eml, $section_idx) or return; + } + my $str = $eml->header_obj->as_string; + my $re = hdrs_regexp($hdrs); + join('', ($str =~ m/($re)/g), "\r\n"); +} + +sub partial_prepare ($$$) { + my ($partial, $want, $att) = @_; + + # recombine [ "BODY[1.HEADER.FIELDS", "(foo", "bar)]" ] + # back to: "BODY[1.HEADER.FIELDS (foo bar)]" + return unless $att =~ /\ABODY(?:\.PEEK)?\[/s; + until (rindex($att, ']') >= 0) { + my $next = shift @$want or return; + $att .= ' ' . uc($next); + } + if ($att =~ /\ABODY(?:\.PEEK)?\[ + ([0-9]+(?:\.[0-9]+)*)? # 1 - section_idx + (?:\.(HEADER|MIME|TEXT))? # 2 - section_name + \](?:<([0-9]+)(?:\.([0-9]+))?>)?\z/sx) { # 3, 4 + $partial->{$att} = [ \&partial_body, $1, $2, $3, $4 ]; + } elsif ($att =~ /\ABODY(?:\.PEEK)?\[ + (?:([0-9]+(?:\.[0-9]+)*)\.)? # 1 - section_idx + (?:HEADER\.FIELDS(\.NOT)?)\x20 # 2 + \(([A-Z0-9\-\x20]+)\) # 3 - hdrs + \](?:<([0-9]+)(?:\.([0-9]+))?>)?\z/sx) { # 4 5 + $partial->{$att} = [ $2 ? \&partial_hdr_not + : \&partial_hdr_get, + $1, $3, $4, $5 ]; + } else { + undef; + } +} + +sub partial_emit ($$$) { + my ($self, $partial, $eml) = @_; + for my $k (sort keys %$partial) { + my ($cb, @args) = @{$partial->{$k}}; + my ($offset, $len) = splice(@args, -2); + # $cb is partial_body|partial_hdr_get|partial_hdr_not + my $str = $cb->($eml, @args) // ''; + if (defined $offset) { + if (defined $len) { + $str = substr($str, $offset, $len); + $k =~ s/\.$len>\z/>/ or warn +"BUG: unable to remove `.$len>' from `$k'"; + } else { + $str = substr($str, $offset); + $len = length($str); + } + } else { + $len = length($str); + } + $self->msg_more(" $k {$len}\r\n"); + $self->msg_more($str); + } +} + sub cmd_uid_fetch ($$$;@) { my ($self, $tag, $range, @want) = @_; my $ibx = $self->{ibx} or return "$tag BAD No mailbox selected\r\n"; if ($want[0] =~ s/\A\(//s) { $want[-1] =~ s/\)\z//s or return "$tag BAD no rparen\r\n"; } - my %want = map {; - my $x = $FETCH_ATT{uc($_)} or return "$tag BAD param: $_\r\n"; - %$x; - } @want; + my (%partial, %want); + while (defined(my $att = shift @want)) { + $att = uc($att); + my $x = $FETCH_ATT{$att}; + if ($x) { + %want = (%want, %$x); + } elsif (!partial_prepare(\%partial, \@want, $att)) { + return "$tag BAD param: $att\r\n"; + } + } + $want{-partial} = \%partial if scalar keys %partial; my ($beg, $end); my $msgs = []; if ($range =~ /\A([0-9]+):([0-9]+)\z/s) { diff --git a/t/imap.t b/t/imap.t new file mode 100644 index 00000000000..c435d365543 --- /dev/null +++ b/t/imap.t @@ -0,0 +1,43 @@ +#!perl -w +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ +use strict; +use Test::More; +use PublicInbox::IMAP; +{ + my $partial_prepare = \&PublicInbox::IMAP::partial_prepare; + my $x = {}; + my $r = $partial_prepare->($x, [], my $p = 'BODY.PEEK[9]'); + ok($r, $p); + $r = $partial_prepare->($x, [], $p = 'BODY.PEEK[9]<5>'); + ok($r, $p); + $r = $partial_prepare->($x, [], $p = 'BODY.PEEK[9]<5.1>'); + ok($r, $p); + $r = $partial_prepare->($x, [], $p = 'BODY[1.1]'); + ok($r, $p); + $r = $partial_prepare->($x, [], $p = 'BODY[HEADER.FIELDS (DATE FROM)]'); + ok($r, $p); + $r = $partial_prepare->($x, [], $p = 'BODY[HEADER.FIELDS.NOT (TO)]'); + ok($r, $p); + $r = $partial_prepare->($x, [], $p = 'BODY[HEDDER.FIELDS.NOT (TO)]'); + ok(!$r, "rejected misspelling $p"); + $r = $partial_prepare->($x, [], $p = 'BODY[1.1.HEADER.FIELDS (TO)]'); + ok($r, $p); + my $partial_body = \&PublicInbox::IMAP::partial_body; + my $partial_hdr_get = \&PublicInbox::IMAP::partial_hdr_get; + my $partial_hdr_not = \&PublicInbox::IMAP::partial_hdr_not; + is_deeply($x, { + 'BODY.PEEK[9]' => [ $partial_body, 9, undef, undef, undef ], + 'BODY.PEEK[9]<5>' => [ $partial_body, 9, undef, 5, undef ], + 'BODY.PEEK[9]<5.1>' => [ $partial_body, 9, undef, 5, 1 ], + 'BODY[1.1]' => [ $partial_body, '1.1', undef, undef, undef ], + 'BODY[HEADER.FIELDS (DATE FROM)]' => [ $partial_hdr_get, + undef, 'DATE FROM', undef, undef ], + 'BODY[HEADER.FIELDS.NOT (TO)]' => [ $partial_hdr_not, + undef, 'TO', undef, undef ], + 'BODY[1.1.HEADER.FIELDS (TO)]' => [ $partial_hdr_get, + '1.1', 'TO', undef, undef ], + }, 'structure matches expected'); +} + +done_testing; diff --git a/t/imapd.t b/t/imapd.t index 31154bdf3e7..a63be0fd0c2 100644 --- a/t/imapd.t +++ b/t/imapd.t @@ -191,6 +191,31 @@ for my $r ('1:*', '1') { $ret = $mic->fetch_hash($r, 'FLAGS') or BAIL_OUT "FETCH $@"; is_deeply($ret->{1}->{FLAGS}, '', 'no flags'); + $ret = $mic->fetch_hash($r, 'BODY[1]') or BAIL_OUT "FETCH $@"; + like($ret->{1}->{'BODY[1]'}, qr/\AThis is a test message/, 'BODY[1]'); + + $ret = $mic->fetch_hash($r, 'BODY[1]<1>') or BAIL_OUT "FETCH $@"; + like($ret->{1}->{'BODY[1]<1>'}, qr/\Ahis is a test message/, + 'BODY[1]<1>'); + + $ret = $mic->fetch_hash($r, 'BODY[1]<2.3>') or BAIL_OUT "FETCH $@"; + is($ret->{1}->{'BODY[1]<2>'}, "is ", 'BODY[1]<2.3>'); + $ret = $mic->bodypart_string($r, 1, 3, 2) or + BAIL_OUT "bodypart_string $@"; + is($ret, "is ", 'bodypart string'); + + $ret = $mic->fetch_hash($r, 'BODY[HEADER.FIELDS.NOT (Message-ID)]') + or BAIL_OUT "FETCH $@"; + $ret = $ret->{1}->{'BODY[HEADER.FIELDS.NOT (MESSAGE-ID)]'}; + unlike($ret, qr/message-id/i, 'Message-ID excluded'); + like($ret, qr/\r\n\r\n\z/s, 'got header end'); + + $ret = $mic->fetch_hash($r, 'BODY[HEADER.FIELDS (Message-ID)]') + or BAIL_OUT "FETCH $@"; + is($ret->{1}->{'BODY[HEADER.FIELDS (MESSAGE-ID)]'}, + 'Message-ID: '."\r\n\r\n", + 'got only Message-ID'); + my $bs = $mic->get_bodystructure($r) or BAIL_OUT("bodystructure: $@"); ok($bs, 'got a bodystructure'); is(lc($bs->bodytype), 'text', '->bodytype'); @@ -292,7 +317,41 @@ $pi_config->each_inbox(sub { ok($bs, 'BODYSTRUCTURE ok for deeply nested'); $ret = $mic->fetch_hash($uidnext, 'BODY') or BAIL_OUT "FETCH $@"; ok($ret->{$uidnext}->{BODY}, 'got something in BODY'); -}); + + # this matches dovecot behavior + $ret = $mic->fetch_hash($uidnext, 'BODY[1]') or BAIL_OUT "FETCH $@"; + is($ret->{$uidnext}->{'BODY[1]'}, + "testing embedded message harder\r\n", 'BODY[1]'); + $ret = $mic->fetch_hash($uidnext, 'BODY[2]') or BAIL_OUT "FETCH $@"; + like($ret->{$uidnext}->{'BODY[2]'}, + qr/\ADate: Sat, 18 Apr 2020 22:20:20 /, 'BODY[2]'); + + $ret = $mic->fetch_hash($uidnext, 'BODY[2.1.1]') or BAIL_OUT "FETCH $@"; + is($ret->{$uidnext}->{'BODY[2.1.1]'}, + "testing embedded message\r\n", 'BODY[2.1.1]'); + + $ret = $mic->fetch_hash($uidnext, 'BODY[2.1.2]') or BAIL_OUT "FETCH $@"; + like($ret->{$uidnext}->{'BODY[2.1.2]'}, qr/\AFrom: /, + 'BODY[2.1.2] tip matched'); + like($ret->{$uidnext}->{'BODY[2.1.2]'}, + # trailing CRLF may vary depending on MIME parser + qr/done_testing;(?:\r\n){1,2}\z/, + 'BODY[2.1.2] tail matched'); + + $ret = $mic->fetch_hash($uidnext, 'BODY[2.HEADER]') or + BAIL_OUT "2.HEADER $@"; + like($ret->{$uidnext}->{'BODY[2.HEADER]'}, + qr/\ADate: Sat, 18 Apr 2020 22:20:20 /, + '2.HEADER of message/rfc822'); + + $ret = $mic->fetch_hash($uidnext, 'BODY[2.MIME]') or + BAIL_OUT "2.MIME $@"; + is($ret->{$uidnext}->{'BODY[2.MIME]'}, <kill; $td->join;