user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* Re: [PATCH 11/13] xt: eml comparison tests
  2020-05-07 21:05  4% ` [PATCH 11/13] xt: eml comparison tests Eric Wong
@ 2020-05-08  4:47  7%   ` Eric Wong
  0 siblings, 0 replies; 3+ results
From: Eric Wong @ 2020-05-08  4:47 UTC (permalink / raw)
  To: meta

Eric Wong <e@yhbt.net> wrote:
>  xt/cmp-msgstr.t  | 108 +++++++++++++++++++++++++++++++++++++++++++++++
>  xt/cmp-msgview.t |  95 +++++++++++++++++++++++++++++++++++++++++

Btw, I run these in parallel on inboxes I have:

N=$(nproc)
find ~/v2/*/git/ -type d -name '*.git' -print0 | xargs -0 -P$N -n1 sh -c \
	'GIANT_INBOX_DIR=$1 perl -I lib -w xt/cmp-msgview.t' --
find ~/v1/ -type d -name '*.git' -print0 | xargs -0 -P$N -n1 sh -c \
	'GIANT_INBOX_DIR=$1 perl -I lib -w xt/cmp-msgstr.t' --

And the main differences I see are minor:

* trailing whitespace may still be different for broken messages
  missing epilogues (MIMEDefang, or some old gnus + GPG)

* trailing whitespace differences for header extraction
  (Eml strips all trailing spaces, not just LF/CRLF)

* empty parts of multipart messages are skipped for efficiency

^ permalink raw reply	[relevance 7%]

* [PATCH 11/13] xt: eml comparison tests
  2020-05-07 21:05  6% [PATCH 00/13] eml: pure-Perl replacement for Email::MIME Eric Wong
@ 2020-05-07 21:05  4% ` Eric Wong
  2020-05-08  4:47  7%   ` Eric Wong
  0 siblings, 1 reply; 3+ results
From: Eric Wong @ 2020-05-07 21:05 UTC (permalink / raw)
  To: meta

While our codebase can still work with either MIME
implementation, add comparison tests to ensure we
handle corner cases in existing archives.
---
 MANIFEST         |   2 +
 xt/cmp-msgstr.t  | 108 +++++++++++++++++++++++++++++++++++++++++++++++
 xt/cmp-msgview.t |  95 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 205 insertions(+)
 create mode 100644 xt/cmp-msgstr.t
 create mode 100644 xt/cmp-msgview.t

diff --git a/MANIFEST b/MANIFEST
index 055c8c9a..9c804a07 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -331,6 +331,8 @@ t/www_listing.t
 t/www_static.t
 t/x-unknown-alpine.eml
 t/xcpdb-reshard.t
+xt/cmp-msgstr.t
+xt/cmp-msgview.t
 xt/git-http-backend.t
 xt/git_async_cmp.t
 xt/mem-msgview.t
diff --git a/xt/cmp-msgstr.t b/xt/cmp-msgstr.t
new file mode 100644
index 00000000..6bae0f66
--- /dev/null
+++ b/xt/cmp-msgstr.t
@@ -0,0 +1,108 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use Benchmark qw(:all);
+use PublicInbox::Inbox;
+use PublicInbox::View;
+use PublicInbox::TestCommon;
+use PublicInbox::Eml;
+use Digest::MD5;
+use PublicInbox::MsgIter;
+require_mods(qw(Data::Dumper Email::MIME));
+Data::Dumper->import('Dumper');
+require PublicInbox::MIME;
+require_git(2.19);
+my ($tmpdir, $for_destroy) = tmpdir(); # dump files for `git diff' go here
+my $inboxdir = $ENV{GIANT_INBOX_DIR};
+plan skip_all => "GIANT_INBOX_DIR not defined for $0" unless $inboxdir;
+my @cat = qw(cat-file --buffer --batch-check --batch-all-objects --unordered);
+my $ibx = PublicInbox::Inbox->new({ inboxdir => $inboxdir, name => 'cmp' });
+my $git = $ibx->git;
+my $fh = $git->popen(@cat); # object listing, parsed in the read loop below
+vec(my $vec = '', fileno($fh), 1) = 1; # select() read set holding only $fh
+select($vec, undef, undef, 60) or die "timed out waiting for --batch-check";
+my $n = 0; # blobs handed to cat_async
+my $m = 0; # blobs that reached $git_cb
+my $dig_cls = 'Digest::MD5'; # digest class used for binary part bodies
+sub h ($) { # returns "LABEL: value" for the header value in the caller's $_
+	s/\s+\z//s; # E::M leaves trailing white space
+	s/\s+/ /sg; # squeeze every whitespace run to one space ($_ is modified)
+	"$_[0]: $_";
+}
+
+my $cmp = sub { # each_part callback: push a comparable summary of one part
+	my ($p, $cmp_arg) = @_; # $cmp_arg: array ref collecting the summary
+	my $part = shift @$p; # rest of @$p is only used for the marker below
+	push @$cmp_arg, '---'.join(', ', @$p).'---';
+	my $ct = $part->content_type // 'text/plain'; # default if undefined
+	$ct =~ s/[ \t]+.*\z//s; # keep only what precedes the first space/tab
+	my ($s, $err);
+	eval { # NOTE(review): bytes::length below; bytes.pm isn't loaded here
+		push @$cmp_arg, map { h 'f' } $part->header('From');
+		push @$cmp_arg, map { h 't' } $part->header('To');
+		push @$cmp_arg, map { h 'cc' } $part->header('Cc');
+		push @$cmp_arg, map { h 'mid' } $part->header('Message-ID');
+		push @$cmp_arg, map { h 'refs' } $part->header('References');
+		push @$cmp_arg, map { h 'irt' } $part->header('In-Reply-To');
+		push @$cmp_arg, map { h 's' } $part->header('Subject');
+		push @$cmp_arg, map { h 'cd' }
+					$part->header('Content-Description');
+		($s, $err) = msg_part_text($part, $ct);
+		if (defined $s) { # got text: compare it without trailing space
+			$s =~ s/\s+\z//s;
+			push @$cmp_arg, "S: ".$s;
+		} else { # no text: compare the type plus the body itself
+			$part = $part->body; # variable reused for the body
+			push @$cmp_arg, "T: $ct";
+			if ($part =~ /[^\p{XPosixPrint}\s]/s) { # binary
+				my $dig = $dig_cls->new;
+				$dig->add($part);
+				push @$cmp_arg, "M: ".$dig->hexdigest;
+				push @$cmp_arg, "B: ".bytes::length($part);
+			} else {
+				$part =~ s/\s+\z//s;
+				push @$cmp_arg, "X: ".$part;
+			}
+		}
+	};
+	if ($@) { # record failures so they take part in the comparison too
+		$err //= '';
+		push @$cmp_arg, "E: $@ ($err)";
+	}
+};
+
+my $ndiff = 0; # number of blobs whose two summaries differed
+my $git_cb = sub { # passed to cat_async below along with each blob OID
+	my ($bref, $oid) = @_; # $bref: scalar ref, $$bref is what gets parsed
+	local $SIG{__WARN__} = sub { diag "$inboxdir $oid ", @_ };
+	++$m;
+	PublicInbox::MIME->new($$bref)->each_part($cmp, my $m_ctx = [], 1);
+	PublicInbox::Eml->new($$bref)->each_part($cmp, my $e_ctx = [], 1);
+	if (join("\0", @$e_ctx) ne join("\0", @$m_ctx)) { # summaries differ
+		++$ndiff;
+		open my $fh, '>', "$tmpdir/mime" or die $!; # shadows outer $fh
+		print $fh Dumper($m_ctx) or die $!;
+		close $fh or die $!;
+		open $fh, '>', "$tmpdir/eml" or die $!;
+		print $fh Dumper($e_ctx) or die $!;
+		close $fh or die $!;
+		diag "$inboxdir $oid differ";
+		# using `git diff', diff(1) may not be installed
+		diag xqx([qw(git diff), "$tmpdir/mime", "$tmpdir/eml"]);
+	}
+};
+$git->cat_async_begin;
+my $t = timeit(1, sub { # NOTE(review): $t is not read anywhere in this file
+	while (<$fh>) {
+		my ($oid, $type) = split / /; # later fields are discarded
+		next if $type ne 'blob';
+		++$n;
+		$git->cat_async($oid, $git_cb);
+	}
+	$git->cat_async_wait;
+});
+is($m, $n, "$inboxdir rendered all $m <=> $n messages"); # callbacks == queued
+is($ndiff, 0, "$inboxdir $ndiff differences");
+done_testing();
diff --git a/xt/cmp-msgview.t b/xt/cmp-msgview.t
new file mode 100644
index 00000000..66fb467e
--- /dev/null
+++ b/xt/cmp-msgview.t
@@ -0,0 +1,95 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use Benchmark qw(:all);
+use PublicInbox::Inbox;
+use PublicInbox::View;
+use PublicInbox::TestCommon;
+use PublicInbox::Eml;
+use Digest::MD5;
+require_git(2.19);
+require_mods qw(Data::Dumper Email::MIME Plack::Util);
+Data::Dumper->import('Dumper');
+require PublicInbox::MIME;
+my ($tmpdir, $for_destroy) = tmpdir(); # dump files for `git diff' go here
+my $inboxdir = $ENV{GIANT_INBOX_DIR};
+plan skip_all => "GIANT_INBOX_DIR not defined for $0" unless $inboxdir;
+my @cat = qw(cat-file --buffer --batch-check --batch-all-objects --unordered);
+my $ibx = PublicInbox::Inbox->new({ inboxdir => $inboxdir, name => 'perf' });
+my $git = $ibx->git;
+my $fh = $git->popen(@cat); # object listing, parsed in the read loop below
+vec(my $vec = '', fileno($fh), 1) = 1; # select() read set holding only $fh
+select($vec, undef, undef, 60) or die "timed out waiting for --batch-check";
+my $mime_ctx = { # context passed to multipart_text_as_html for ::MIME
+	env => { HTTP_HOST => 'example.com', 'psgi.url_scheme' => 'https' },
+	-inbox => $ibx,
+	www => Plack::Util::inline_object(style => sub {''}), # stub object
+	obuf => \(my $mime_buf = ''), # ref to the buffer compared later
+	mhref => '../',
+};
+my $eml_ctx = { %$mime_ctx, obuf => \(my $eml_buf = '') }; # own buffer
+my $n = 0; # blobs handed to cat_async
+my $m = 0; # blobs that reached $git_cb
+my $ndiff_html = 0; # blobs whose two HTML buffers differed
+my $dig_cls = 'Digest::MD5'; # digest class used by $digest_attach
+my $digest_attach = sub { # ensure ->body (not ->body_raw) matches
+	my ($p, $cmp_arg) = @_;
+	my $part = shift @$p;
+	my $dig = $cmp_arg->[0] //= $dig_cls->new; # slot 0: shared digest
+	$dig->add($part->body_raw); # NOTE(review): comment above says ->body
+	push @$cmp_arg, join(', ', @$p); # later slots: rest of @$p, joined
+};
+
+my $git_cb = sub { # passed to cat_async below along with each blob OID
+	my ($bref, $oid) = @_; # $bref: scalar ref, $$bref is what gets parsed
+	local $SIG{__WARN__} = sub { diag "$inboxdir $oid ", @_ };
+	++$m;
+	my $mime = PublicInbox::MIME->new($$bref);
+	PublicInbox::View::multipart_text_as_html($mime, $mime_ctx);
+	my $eml = PublicInbox::Eml->new($$bref);
+	PublicInbox::View::multipart_text_as_html($eml, $eml_ctx);
+	if ($eml_buf ne $mime_buf) { # counted and shown, but not a failure
+		++$ndiff_html;
+		open my $fh, '>', "$tmpdir/mime" or die $!; # shadows outer $fh
+		print $fh $mime_buf or die $!;
+		close $fh or die $!;
+		open $fh, '>', "$tmpdir/eml" or die $!;
+		print $fh $eml_buf or die $!;
+		close $fh or die $!;
+		diag "$inboxdir $oid differs";
+		# using `git diff', diff(1) may not be installed
+		diag xqx([qw(git diff), "$tmpdir/mime", "$tmpdir/eml"]);
+	}
+	$eml_buf = $mime_buf = ''; # start the next message with empty buffers
+
+	# don't tolerate differences in attachment downloads
+	$mime = PublicInbox::MIME->new($$bref);
+	$mime->each_part($digest_attach, my $mime_cmp = [], 1);
+	$eml = PublicInbox::Eml->new($$bref);
+	$eml->each_part($digest_attach, my $eml_cmp = [], 1);
+	$mime_cmp->[0] = $mime_cmp->[0]->hexdigest; # digest object -> hex
+	$eml_cmp->[0] = $eml_cmp->[0]->hexdigest;
+	# only call is_deeply on mismatch: don't have millions of "ok" lines
+	if (join("\0", @$eml_cmp) ne join("\0", @$mime_cmp)) {
+		diag Dumper([ $oid, eml => $eml_cmp, mime =>$mime_cmp ]);
+		is_deeply($eml_cmp, $mime_cmp, "$inboxdir $oid match");
+	}
+};
+$git->cat_async_begin;
+my $t = timeit(1, sub { # NOTE(review): $t is not read anywhere in this file
+	while (<$fh>) {
+		my ($oid, $type) = split / /; # later fields are discarded
+		next if $type ne 'blob';
+		++$n;
+		$git->cat_async($oid, $git_cb);
+	}
+	$git->cat_async_wait;
+});
+is($m, $n, 'rendered all messages'); # callbacks run == blobs queued
+
+# we'll tolerate minor differences in HTML rendering
+diag "$ndiff_html HTML differences";
+
+done_testing();

^ permalink raw reply related	[relevance 4%]

* [PATCH 00/13] eml: pure-Perl replacement for Email::MIME
@ 2020-05-07 21:05  6% Eric Wong
  2020-05-07 21:05  4% ` [PATCH 11/13] xt: eml comparison tests Eric Wong
  0 siblings, 1 reply; 3+ results
From: Eric Wong @ 2020-05-07 21:05 UTC (permalink / raw)
  To: meta

Eric Wong (13):
  msg_iter: make ->each_part method for PublicInbox::MIME
  msg_iter: pass $idx as a scalar, not array
  filter/rubylang: avoid recursing subparts to strip trailers
  smsg: use capitalization for header retrieval
  eml: pure-Perl replacement for Email::MIME
  switch read-only Email::Simple users to Eml
  replace most uses of PublicInbox::MIME with Eml
  EmlContentFoo: Email::MIME::ContentType replacement
  EmlContentFoo: relax Encode version requirement
  eml: remove dependency on Email::MIME::Encodings
  xt: eml comparison tests
  remove most internal Email::MIME usage
  eml: drop trailing blank line on missing epilogue

 Documentation/mknews.perl          |   4 +-
 INSTALL                            |  26 +-
 MANIFEST                           |   7 +
 Makefile.PL                        |   7 +-
 ci/deps.perl                       |   3 -
 lib/PublicInbox/Admin.pm           |   2 +-
 lib/PublicInbox/Eml.pm             | 421 +++++++++++++++++++++++++++++
 lib/PublicInbox/EmlContentFoo.pm   | 317 ++++++++++++++++++++++
 lib/PublicInbox/Filter/RubyLang.pm |  32 ++-
 lib/PublicInbox/Filter/Vger.pm     |   4 +-
 lib/PublicInbox/Import.pm          |  11 +-
 lib/PublicInbox/Inbox.pm           |   4 +-
 lib/PublicInbox/InboxWritable.pm   |   4 +-
 lib/PublicInbox/MDA.pm             |   1 -
 lib/PublicInbox/MIME.pm            |   6 +
 lib/PublicInbox/Mbox.pm            |  16 +-
 lib/PublicInbox/MboxGz.pm          |   4 +-
 lib/PublicInbox/MsgIter.pm         |  21 +-
 lib/PublicInbox/MsgTime.pm         |   8 +-
 lib/PublicInbox/NNTP.pm            |  19 +-
 lib/PublicInbox/SearchIdx.pm       |   8 +-
 lib/PublicInbox/SearchIdxShard.pm  |   3 +-
 lib/PublicInbox/Smsg.pm            |  24 +-
 lib/PublicInbox/SolverGit.pm       |   4 +-
 lib/PublicInbox/TestCommon.pm      |  11 +-
 lib/PublicInbox/V2Writable.pm      |  17 +-
 lib/PublicInbox/View.pm            |  28 +-
 lib/PublicInbox/WWW.pm             |   8 +-
 lib/PublicInbox/WatchMaildir.pm    |   4 +-
 lib/PublicInbox/WwwAttach.pm       |  15 +-
 script/public-inbox-edit           |   8 +-
 script/public-inbox-learn          |   4 +-
 script/public-inbox-mda            |  16 +-
 script/public-inbox-purge          |   4 +-
 t/altid.t                          |   4 +-
 t/altid_v2.t                       |   4 +-
 t/cgi.t                            |   8 +-
 t/content_id.t                     |   6 +-
 t/convert-compact.t                |   4 +-
 t/edit.t                           |  20 +-
 t/eml.t                            | 363 +++++++++++++++++++++++++
 t/eml_content_disposition.t        | 102 +++++++
 t/eml_content_type.t               | 289 ++++++++++++++++++++
 t/feed.t                           |   6 +-
 t/filter_base.t                    |   4 +-
 t/filter_mirror.t                  |   2 +-
 t/filter_rubylang.t                |   8 +-
 t/filter_subjecttag.t              |   4 +-
 t/filter_vger.t                    |   6 +-
 t/html_index.t                     |   4 +-
 t/httpd.t                          |   4 +-
 t/import.t                         |   6 +-
 t/indexlevels-mirror.t             |   4 +-
 t/mda.t                            |   4 +-
 t/mda_filter_rubylang.t            |   2 +-
 t/mid.t                            |   4 +-
 t/mime.t                           |  82 +++---
 t/msg_iter.t                       |  10 +-
 t/msgtime.t                        |   6 +-
 t/multi-mid.t                      |   6 +-
 t/nntp.t                           |   4 +-
 t/nntpd-tls.t                      |   4 +-
 t/nntpd.t                          |   6 +-
 t/nulsubject.t                     |   2 +-
 t/plack.t                          |  10 +-
 t/precheck.t                       |  10 +-
 t/psgi_attach.t                    |   2 +-
 t/psgi_bad_mids.t                  |   4 +-
 t/psgi_mount.t                     |   4 +-
 t/psgi_multipart_not.t             |   4 +-
 t/psgi_scan_all.t                  |   4 +-
 t/psgi_search.t                    |   8 +-
 t/psgi_text.t                      |   2 +-
 t/psgi_v2.t                        |   6 +-
 t/purge.t                          |   2 +-
 t/replace.t                        |  12 +-
 t/reply.t                          |   4 +-
 t/search-thr-index.t               |   6 +-
 t/search.t                         |  26 +-
 t/solver_git.t                     |   4 +-
 t/spamcheck_spamc.t                |   8 +-
 t/thread-cycle.t                   |   3 +-
 t/time.t                           |   4 +-
 t/v1-add-remove-add.t              |   4 +-
 t/v1reindex.t                      |   4 +-
 t/v2-add-remove-add.t              |   4 +-
 t/v2mda.t                          |   4 +-
 t/v2mirror.t                       |   4 +-
 t/v2reindex.t                      |   8 +-
 t/v2writable.t                     |   8 +-
 t/watch_filter_rubylang.t          |   2 +-
 t/watch_maildir.t                  |   2 +-
 t/watch_maildir_v2.t               |   2 +-
 t/www_altid.t                      |   2 +-
 t/xcpdb-reshard.t                  |   4 +-
 xt/cmp-msgstr.t                    | 108 ++++++++
 xt/cmp-msgview.t                   |  95 +++++++
 xt/msgtime_cmp.t                   |  12 +-
 xt/perf-msgview.t                  |   2 +-
 99 files changed, 2084 insertions(+), 353 deletions(-)
 create mode 100644 lib/PublicInbox/Eml.pm
 create mode 100644 lib/PublicInbox/EmlContentFoo.pm
 create mode 100644 t/eml.t
 create mode 100644 t/eml_content_disposition.t
 create mode 100644 t/eml_content_type.t
 create mode 100644 xt/cmp-msgstr.t
 create mode 100644 xt/cmp-msgview.t


^ permalink raw reply	[relevance 6%]

Results 1-3 of 3 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-05-07 21:05  6% [PATCH 00/13] eml: pure-Perl replacement for Email::MIME Eric Wong
2020-05-07 21:05  4% ` [PATCH 11/13] xt: eml comparison tests Eric Wong
2020-05-08  4:47  7%   ` Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).