From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 921721F8D0 for ; Thu, 7 May 2020 21:05:59 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 11/13] xt: eml comparison tests Date: Thu, 7 May 2020 21:05:54 +0000 Message-Id: <20200507210556.22995-12-e@yhbt.net> In-Reply-To: <20200507210556.22995-1-e@yhbt.net> References: <20200507210556.22995-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: While our codebase can still work with either MIME implementation, add comparison tests to ensure we handle corner cases in existing archives. --- MANIFEST | 2 + xt/cmp-msgstr.t | 108 +++++++++++++++++++++++++++++++++++++++++++++++ xt/cmp-msgview.t | 95 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+) create mode 100644 xt/cmp-msgstr.t create mode 100644 xt/cmp-msgview.t diff --git a/MANIFEST b/MANIFEST index 055c8c9a..9c804a07 100644 --- a/MANIFEST +++ b/MANIFEST @@ -331,6 +331,8 @@ t/www_listing.t t/www_static.t t/x-unknown-alpine.eml t/xcpdb-reshard.t +xt/cmp-msgstr.t +xt/cmp-msgview.t xt/git-http-backend.t xt/git_async_cmp.t xt/mem-msgview.t diff --git a/xt/cmp-msgstr.t b/xt/cmp-msgstr.t new file mode 100644 index 00000000..6bae0f66 --- /dev/null +++ b/xt/cmp-msgstr.t @@ -0,0 +1,108 @@ +#!perl -w +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ +use strict; +use Test::More; +use Benchmark qw(:all); +use PublicInbox::Inbox; +use PublicInbox::View; +use PublicInbox::TestCommon; +use PublicInbox::Eml; +use Digest::MD5; +use PublicInbox::MsgIter; +require_mods(qw(Data::Dumper Email::MIME)); +Data::Dumper->import('Dumper'); +require PublicInbox::MIME; +require_git(2.19); +my ($tmpdir, $for_destroy) = tmpdir(); +my $inboxdir = $ENV{GIANT_INBOX_DIR}; +plan skip_all => "GIANT_INBOX_DIR not defined for $0" unless $inboxdir; +my @cat = qw(cat-file --buffer --batch-check --batch-all-objects --unordered); +my $ibx = PublicInbox::Inbox->new({ inboxdir => $inboxdir, name => 'cmp' }); +my $git = $ibx->git; +my $fh = $git->popen(@cat); +vec(my $vec = '', fileno($fh), 1) = 1; +select($vec, undef, undef, 60) or die "timed out waiting for --batch-check"; +my $n = 0; +my $m = 0; +my $dig_cls = 'Digest::MD5'; +sub h ($) { + s/\s+\z//s; # E::M leaves trailing white space + s/\s+/ /sg; + "$_[0]: $_"; +} + +my $cmp = sub { + my ($p, $cmp_arg) = @_; + my $part = shift @$p; + push @$cmp_arg, '---'.join(', ', @$p).'---'; + my $ct = $part->content_type // 'text/plain'; + $ct =~ s/[ \t]+.*\z//s; + my ($s, $err); + eval { + push @$cmp_arg, map { h 'f' } $part->header('From'); + push @$cmp_arg, map { h 't' } $part->header('To'); + push @$cmp_arg, map { h 'cc' } $part->header('Cc'); + push @$cmp_arg, map { h 'mid' } $part->header('Message-ID'); + push @$cmp_arg, map { h 'refs' } $part->header('References'); + push @$cmp_arg, map { h 'irt' } $part->header('In-Reply-To'); + push @$cmp_arg, map { h 's' } $part->header('Subject'); + push @$cmp_arg, map { h 'cd' } + $part->header('Content-Description'); + ($s, $err) = msg_part_text($part, $ct); + if (defined $s) { + $s =~ s/\s+\z//s; + push @$cmp_arg, "S: ".$s; + } else { + $part = $part->body; + push @$cmp_arg, "T: $ct"; + if ($part =~ /[^\p{XPosixPrint}\s]/s) { # binary + my $dig = $dig_cls->new; + $dig->add($part); + push @$cmp_arg, "M: ".$dig->hexdigest; + push @$cmp_arg, "B: ".bytes::length($part); + } else { + $part =~ s/\s+\z//s; + push @$cmp_arg, "X: ".$part; + } + } + }; + if ($@) { + $err //= ''; + push @$cmp_arg, "E: $@ ($err)"; + } +}; + +my $ndiff = 0; +my $git_cb = sub { + my ($bref, $oid) = @_; + local $SIG{__WARN__} = sub { diag "$inboxdir $oid ", @_ }; + ++$m; + PublicInbox::MIME->new($$bref)->each_part($cmp, my $m_ctx = [], 1); + PublicInbox::Eml->new($$bref)->each_part($cmp, my $e_ctx = [], 1); + if (join("\0", @$e_ctx) ne join("\0", @$m_ctx)) { + ++$ndiff; + open my $fh, '>', "$tmpdir/mime" or die $!; + print $fh Dumper($m_ctx) or die $!; + close $fh or die $!; + open $fh, '>', "$tmpdir/eml" or die $!; + print $fh Dumper($e_ctx) or die $!; + close $fh or die $!; + diag "$inboxdir $oid differ"; + # using `git diff', diff(1) may not be installed + diag xqx([qw(git diff), "$tmpdir/mime", "$tmpdir/eml"]); + } +}; +$git->cat_async_begin; +my $t = timeit(1, sub { + while (<$fh>) { + my ($oid, $type) = split / /; + next if $type ne 'blob'; + ++$n; + $git->cat_async($oid, $git_cb); + } + $git->cat_async_wait; +}); +is($m, $n, "$inboxdir rendered all $m <=> $n messages"); +is($ndiff, 0, "$inboxdir $ndiff differences"); +done_testing(); diff --git a/xt/cmp-msgview.t b/xt/cmp-msgview.t new file mode 100644 index 00000000..66fb467e --- /dev/null +++ b/xt/cmp-msgview.t @@ -0,0 +1,95 @@ +#!perl -w +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ +use strict; +use Test::More; +use Benchmark qw(:all); +use PublicInbox::Inbox; +use PublicInbox::View; +use PublicInbox::TestCommon; +use PublicInbox::Eml; +use Digest::MD5; +require_git(2.19); +require_mods qw(Data::Dumper Email::MIME Plack::Util); +Data::Dumper->import('Dumper'); +require PublicInbox::MIME; +my ($tmpdir, $for_destroy) = tmpdir(); +my $inboxdir = $ENV{GIANT_INBOX_DIR}; +plan skip_all => "GIANT_INBOX_DIR not defined for $0" unless $inboxdir; +my @cat = qw(cat-file --buffer --batch-check --batch-all-objects --unordered); +my $ibx = PublicInbox::Inbox->new({ inboxdir => $inboxdir, name => 'perf' }); +my $git = $ibx->git; +my $fh = $git->popen(@cat); +vec(my $vec = '', fileno($fh), 1) = 1; +select($vec, undef, undef, 60) or die "timed out waiting for --batch-check"; +my $mime_ctx = { + env => { HTTP_HOST => 'example.com', 'psgi.url_scheme' => 'https' }, + -inbox => $ibx, + www => Plack::Util::inline_object(style => sub {''}), + obuf => \(my $mime_buf = ''), + mhref => '../', +}; +my $eml_ctx = { %$mime_ctx, obuf => \(my $eml_buf = '') }; +my $n = 0; +my $m = 0; +my $ndiff_html = 0; +my $dig_cls = 'Digest::MD5'; +my $digest_attach = sub { # ensure ->body (not ->body_raw) matches + my ($p, $cmp_arg) = @_; + my $part = shift @$p; + my $dig = $cmp_arg->[0] //= $dig_cls->new; + $dig->add($part->body_raw); + push @$cmp_arg, join(', ', @$p); +}; + +my $git_cb = sub { + my ($bref, $oid) = @_; + local $SIG{__WARN__} = sub { diag "$inboxdir $oid ", @_ }; + ++$m; + my $mime = PublicInbox::MIME->new($$bref); + PublicInbox::View::multipart_text_as_html($mime, $mime_ctx); + my $eml = PublicInbox::Eml->new($$bref); + PublicInbox::View::multipart_text_as_html($eml, $eml_ctx); + if ($eml_buf ne $mime_buf) { + ++$ndiff_html; + open my $fh, '>', "$tmpdir/mime" or die $!; + print $fh $mime_buf or die $!; + close $fh or die $!; + open $fh, '>', "$tmpdir/eml" or die $!; + print $fh $eml_buf or die $!; + close $fh or die $!; + # using `git diff', diff(1) may not be installed + diag "$inboxdir $oid differs"; + diag xqx([qw(git diff), "$tmpdir/mime", "$tmpdir/eml"]); + } + $eml_buf = $mime_buf = ''; + + # don't tolerate differences in attachment downloads + $mime = PublicInbox::MIME->new($$bref); + $mime->each_part($digest_attach, my $mime_cmp = [], 1); + $eml = PublicInbox::Eml->new($$bref); + $eml->each_part($digest_attach, my $eml_cmp = [], 1); + $mime_cmp->[0] = $mime_cmp->[0]->hexdigest; + $eml_cmp->[0] = $eml_cmp->[0]->hexdigest; + # don't have millions of "ok" lines + if (join("\0", @$eml_cmp) ne join("\0", @$mime_cmp)) { + diag Dumper([ $oid, eml => $eml_cmp, mime =>$mime_cmp ]); + is_deeply($eml_cmp, $mime_cmp, "$inboxdir $oid match"); + } +}; +$git->cat_async_begin; +my $t = timeit(1, sub { + while (<$fh>) { + my ($oid, $type) = split / /; + next if $type ne 'blob'; + ++$n; + $git->cat_async($oid, $git_cb); + } + $git->cat_async_wait; +}); +is($m, $n, 'rendered all messages'); + +# we'll tolerate minor differences in HTML rendering +diag "$ndiff_html HTML differences"; + +done_testing();