From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 11/13] xt: eml comparison tests
Date: Thu, 7 May 2020 21:05:54 +0000 [thread overview]
Message-ID: <20200507210556.22995-12-e@yhbt.net> (raw)
In-Reply-To: <20200507210556.22995-1-e@yhbt.net>
While our codebase can still work with either MIME
implementation, add comparison tests to ensure we
handle corner cases in existing archives.
---
MANIFEST | 2 +
xt/cmp-msgstr.t | 108 +++++++++++++++++++++++++++++++++++++++++++++++
xt/cmp-msgview.t | 95 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 205 insertions(+)
create mode 100644 xt/cmp-msgstr.t
create mode 100644 xt/cmp-msgview.t
diff --git a/MANIFEST b/MANIFEST
index 055c8c9a..9c804a07 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -331,6 +331,8 @@ t/www_listing.t
t/www_static.t
t/x-unknown-alpine.eml
t/xcpdb-reshard.t
+xt/cmp-msgstr.t
+xt/cmp-msgview.t
xt/git-http-backend.t
xt/git_async_cmp.t
xt/mem-msgview.t
diff --git a/xt/cmp-msgstr.t b/xt/cmp-msgstr.t
new file mode 100644
index 00000000..6bae0f66
--- /dev/null
+++ b/xt/cmp-msgstr.t
@@ -0,0 +1,108 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use Benchmark qw(:all);
+use PublicInbox::Inbox;
+use PublicInbox::View;
+use PublicInbox::TestCommon;
+use PublicInbox::Eml;
+use Digest::MD5;
+use PublicInbox::MsgIter;
+require_mods(qw(Data::Dumper Email::MIME));
+Data::Dumper->import('Dumper');
+require PublicInbox::MIME;
+require_git(2.19);
+my ($tmpdir, $for_destroy) = tmpdir();
+my $inboxdir = $ENV{GIANT_INBOX_DIR}; # inbox to scan; the whole test is skipped when unset
+plan skip_all => "GIANT_INBOX_DIR not defined for $0" unless $inboxdir;
+my @cat = qw(cat-file --buffer --batch-check --batch-all-objects --unordered); # lists every object without its content
+my $ibx = PublicInbox::Inbox->new({ inboxdir => $inboxdir, name => 'cmp' });
+my $git = $ibx->git;
+my $fh = $git->popen(@cat); # read handle for the listing; presumably spawns git with @cat (confirm)
+vec(my $vec = '', fileno($fh), 1) = 1; # select() bitmask containing only $fh
+select($vec, undef, undef, 60) or die "timed out waiting for --batch-check"; # wait up to 60s for $fh to become readable
+my $n = 0; # blobs queued with cat_async
+my $m = 0; # blobs that reached $git_cb
+my $dig_cls = 'Digest::MD5'; # digest class used for binary part bodies
+sub h ($) { # returns LABEL: VALUE where LABEL is $_[0] and VALUE is $_ (set by the map in the caller)
+ s/\s+\z//s; # E::M leaves trailing white space
+ s/\s+/ /sg; # collapse every whitespace run in $_ to one space (modifies $_ in place)
+ "$_[0]: $_"; # last expression is the return value
+}
+
+my $cmp = sub { # each_part callback: append a normalized summary of one part to @$cmp_arg
+ my ($p, $cmp_arg) = @_; # $p: arrayref with the part object first; $cmp_arg: arrayref collecting the summary
+ my $part = shift @$p;
+ push @$cmp_arg, '---'.join(', ', @$p).'---'; # separator built from what is left in @$p
+ my $ct = $part->content_type // 'text/plain';
+ $ct =~ s/[ \t]+.*\z//s; # cut at the first space or tab
+ my ($s, $err);
+ eval { # header or body access may die; the error is recorded after the eval
+ push @$cmp_arg, map { h 'f' } $part->header('From');
+ push @$cmp_arg, map { h 't' } $part->header('To');
+ push @$cmp_arg, map { h 'cc' } $part->header('Cc');
+ push @$cmp_arg, map { h 'mid' } $part->header('Message-ID');
+ push @$cmp_arg, map { h 'refs' } $part->header('References');
+ push @$cmp_arg, map { h 'irt' } $part->header('In-Reply-To');
+ push @$cmp_arg, map { h 's' } $part->header('Subject');
+ push @$cmp_arg, map { h 'cd' }
+ $part->header('Content-Description');
+ ($s, $err) = msg_part_text($part, $ct); # not defined in this file; presumably exported by PublicInbox::MsgIter (confirm)
+ if (defined $s) { # text was returned: record it with trailing whitespace removed
+ $s =~ s/\s+\z//s;
+ push @$cmp_arg, "S: ".$s;
+ } else {
+ $part = $part->body; # $part now holds the body value, no longer the part object
+ push @$cmp_arg, "T: $ct";
+ if ($part =~ /[^\p{XPosixPrint}\s]/s) { # binary
+ my $dig = $dig_cls->new;
+ $dig->add($part);
+ push @$cmp_arg, "M: ".$dig->hexdigest; # record a digest instead of the body itself
+ push @$cmp_arg, "B: ".bytes::length($part); # NOTE(review): this file shows no load of the bytes module; confirm a dependency loads it
+ } else {
+ $part =~ s/\s+\z//s;
+ push @$cmp_arg, "X: ".$part;
+ }
+ }
+ };
+ if ($@) {
+ $err //= '';
+ push @$cmp_arg, "E: $@ ($err)"; # the exception text becomes part of the compared summary
+ }
+};
+
+my $ndiff = 0; # number of blobs whose two summaries differ
+my $git_cb = sub { # cat_async callback: summarize one blob with PublicInbox::MIME and PublicInbox::Eml and compare
+ my ($bref, $oid) = @_; # $bref: scalar ref handed to both parsers; $oid: object id used in diagnostics
+ local $SIG{__WARN__} = sub { diag "$inboxdir $oid ", @_ }; # prefix warnings with inbox dir and oid
+ ++$m;
+ PublicInbox::MIME->new($$bref)->each_part($cmp, my $m_ctx = [], 1); # $m_ctx collects the MIME summary
+ PublicInbox::Eml->new($$bref)->each_part($cmp, my $e_ctx = [], 1); # $e_ctx collects the Eml summary
+ if (join("\0", @$e_ctx) ne join("\0", @$m_ctx)) { # compare the NUL-joined summaries
+ ++$ndiff;
+ open my $fh, '>', "$tmpdir/mime" or die $!; # this $fh is lexical to the block, not the listing handle
+ print $fh Dumper($m_ctx) or die $!;
+ close $fh or die $!;
+ open $fh, '>', "$tmpdir/eml" or die $!;
+ print $fh Dumper($e_ctx) or die $!;
+ close $fh or die $!;
+ diag "$inboxdir $oid differ";
+ # using `git diff', diff(1) may not be installed
+ diag xqx([qw(git diff), "$tmpdir/mime", "$tmpdir/eml"]);
+ }
+};
+$git->cat_async_begin;
+my $t = timeit(1, sub { # single timed run; $t is not read again in this file
+ while (<$fh>) { # one line of the object listing per iteration
+ my ($oid, $type) = split / /; # first two space-separated fields of the line
+ next if $type ne 'blob'; # only blobs are compared
+ ++$n;
+ $git->cat_async($oid, $git_cb);
+ }
+ $git->cat_async_wait; # presumably returns once queued callbacks have run (confirm)
+});
+is($m, $n, "$inboxdir rendered all $m <=> $n messages"); # every queued blob must have reached $git_cb
+is($ndiff, 0, "$inboxdir $ndiff differences");
+done_testing();
diff --git a/xt/cmp-msgview.t b/xt/cmp-msgview.t
new file mode 100644
index 00000000..66fb467e
--- /dev/null
+++ b/xt/cmp-msgview.t
@@ -0,0 +1,95 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use Benchmark qw(:all);
+use PublicInbox::Inbox;
+use PublicInbox::View;
+use PublicInbox::TestCommon;
+use PublicInbox::Eml;
+use Digest::MD5;
+require_git(2.19);
+require_mods qw(Data::Dumper Email::MIME Plack::Util);
+Data::Dumper->import('Dumper');
+require PublicInbox::MIME;
+my ($tmpdir, $for_destroy) = tmpdir();
+my $inboxdir = $ENV{GIANT_INBOX_DIR}; # inbox to scan; the whole test is skipped when unset
+plan skip_all => "GIANT_INBOX_DIR not defined for $0" unless $inboxdir;
+my @cat = qw(cat-file --buffer --batch-check --batch-all-objects --unordered); # lists every object without its content
+my $ibx = PublicInbox::Inbox->new({ inboxdir => $inboxdir, name => 'perf' });
+my $git = $ibx->git;
+my $fh = $git->popen(@cat); # read handle for the listing; presumably spawns git with @cat (confirm)
+vec(my $vec = '', fileno($fh), 1) = 1; # select() bitmask containing only $fh
+select($vec, undef, undef, 60) or die "timed out waiting for --batch-check"; # wait up to 60s for $fh to become readable
+my $mime_ctx = { # context passed to multipart_text_as_html for the PublicInbox::MIME object
+ env => { HTTP_HOST => 'example.com', 'psgi.url_scheme' => 'https' },
+ -inbox => $ibx,
+ www => Plack::Util::inline_object(style => sub {''}), # stub object whose style method returns an empty string
+ obuf => \(my $mime_buf = ''), # reference to $mime_buf, which is compared after rendering
+ mhref => '../',
+};
+my $eml_ctx = { %$mime_ctx, obuf => \(my $eml_buf = '') }; # shallow copy of $mime_ctx with its own obuf target
+my $n = 0; # blobs queued with cat_async
+my $m = 0; # blobs that reached $git_cb
+my $ndiff_html = 0; # blobs whose two output buffers differ
+my $dig_cls = 'Digest::MD5'; # digest class used by $digest_attach
+my $digest_attach = sub { # ensure ->body (not ->body_raw) matches
+ my ($p, $cmp_arg) = @_; # $p: arrayref with the part object first; $cmp_arg: arrayref, slot 0 holds the running digest
+ my $part = shift @$p;
+ my $dig = $cmp_arg->[0] //= $dig_cls->new; # created on the first part, then reused for later parts
+ $dig->add($part->body_raw); # NOTE(review): digests ->body_raw although the comment on the sub names ->body; confirm which is intended
+ push @$cmp_arg, join(', ', @$p); # record what is left in @$p after the part was shifted off
+};
+
+my $git_cb = sub { # cat_async callback: compare HTML output and part digests of PublicInbox::MIME and PublicInbox::Eml for one blob
+ my ($bref, $oid) = @_; # $bref: scalar ref handed to both parsers; $oid: object id used in diagnostics
+ local $SIG{__WARN__} = sub { diag "$inboxdir $oid ", @_ }; # prefix warnings with inbox dir and oid
+ ++$m;
+ my $mime = PublicInbox::MIME->new($$bref);
+ PublicInbox::View::multipart_text_as_html($mime, $mime_ctx); # presumably fills $mime_buf through the obuf reference (confirm)
+ my $eml = PublicInbox::Eml->new($$bref);
+ PublicInbox::View::multipart_text_as_html($eml, $eml_ctx); # presumably fills $eml_buf through the obuf reference (confirm)
+ if ($eml_buf ne $mime_buf) { # a difference is counted and shown, but no test assertion is made here
+ ++$ndiff_html;
+ open my $fh, '>', "$tmpdir/mime" or die $!; # this $fh is lexical to the block, not the listing handle
+ print $fh $mime_buf or die $!;
+ close $fh or die $!;
+ open $fh, '>', "$tmpdir/eml" or die $!;
+ print $fh $eml_buf or die $!;
+ close $fh or die $!;
+ # using `git diff', diff(1) may not be installed
+ diag "$inboxdir $oid differs";
+ diag xqx([qw(git diff), "$tmpdir/mime", "$tmpdir/eml"]);
+ }
+ $eml_buf = $mime_buf = ''; # reset both buffers for the next blob
+
+ # don't tolerate differences in attachment downloads
+ $mime = PublicInbox::MIME->new($$bref);
+ $mime->each_part($digest_attach, my $mime_cmp = [], 1);
+ $eml = PublicInbox::Eml->new($$bref);
+ $eml->each_part($digest_attach, my $eml_cmp = [], 1);
+ $mime_cmp->[0] = $mime_cmp->[0]->hexdigest; # replace the digest object in slot 0 with its hex string
+ $eml_cmp->[0] = $eml_cmp->[0]->hexdigest;
+ # don't have millions of "ok" lines
+ if (join("\0", @$eml_cmp) ne join("\0", @$mime_cmp)) { # is_deeply below is only reached on a mismatch
+ diag Dumper([ $oid, eml => $eml_cmp, mime =>$mime_cmp ]);
+ is_deeply($eml_cmp, $mime_cmp, "$inboxdir $oid match");
+ }
+};
+$git->cat_async_begin;
+my $t = timeit(1, sub { # single timed run; $t is not read again in this file
+ while (<$fh>) { # one line of the object listing per iteration
+ my ($oid, $type) = split / /; # first two space-separated fields of the line
+ next if $type ne 'blob'; # only blobs are compared
+ ++$n;
+ $git->cat_async($oid, $git_cb);
+ }
+ $git->cat_async_wait; # presumably returns once queued callbacks have run (confirm)
+});
+is($m, $n, 'rendered all messages'); # every queued blob must have reached $git_cb
+
+# we'll tolerate minor differences in HTML rendering
+diag "$ndiff_html HTML differences"; # reported with diag only, not asserted
+
+done_testing();
next prev parent reply other threads:[~2020-05-07 21:05 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-05-07 21:05 [PATCH 00/13] eml: pure-Perl replacement for Email::MIME Eric Wong
2020-05-07 21:05 ` [PATCH 01/13] msg_iter: make ->each_part method for PublicInbox::MIME Eric Wong
2020-05-07 21:05 ` [PATCH 02/13] msg_iter: pass $idx as a scalar, not array Eric Wong
2020-05-07 21:05 ` [PATCH 03/13] filter/rubylang: avoid recursing subparts to strip trailers Eric Wong
2020-05-07 21:05 ` [PATCH 04/13] smsg: use capitalization for header retrieval Eric Wong
2020-05-07 21:05 ` [PATCH 05/13] eml: pure-Perl replacement for Email::MIME Eric Wong
2020-05-07 21:05 ` [PATCH 06/13] switch read-only Email::Simple users to Eml Eric Wong
2020-05-07 21:05 ` [PATCH 07/13] replace most uses of PublicInbox::MIME with Eml Eric Wong
2020-05-07 21:05 ` [PATCH 08/13] EmlContentFoo: Email::MIME::ContentType replacement Eric Wong
2020-05-07 21:05 ` [PATCH 09/13] EmlContentFoo: relax Encode version requirement Eric Wong
2020-05-07 21:05 ` [PATCH 10/13] eml: remove dependency on Email::MIME::Encodings Eric Wong
2020-05-07 21:05 ` Eric Wong [this message]
2020-05-08 4:47 ` [PATCH 11/13] xt: eml comparison tests Eric Wong
2020-05-07 21:05 ` [PATCH 12/13] remove most internal Email::MIME usage Eric Wong
2020-05-07 21:05 ` [PATCH 13/13] eml: drop trailing blank line on missing epilogue Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200507210556.22995-12-e@yhbt.net \
--to=e@yhbt.net \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).