user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCH 08/36] mboxreader: new class for reading various mbox formats
  2020-12-31 13:51  7% [PATCH 00/36] another round of lei stuff Eric Wong
@ 2020-12-31 13:51  5% ` Eric Wong
  0 siblings, 0 replies; 2+ results
From: Eric Wong @ 2020-12-31 13:51 UTC (permalink / raw)
  To: meta

This is only lightly-tested against stuff LeiToMail generates
and will need real-world tests to validate.
---
 MANIFEST                      |   2 +
 lib/PublicInbox/MboxReader.pm | 124 ++++++++++++++++++++++++++++++++++
 t/mbox_reader.t               |  76 +++++++++++++++++++++
 3 files changed, 202 insertions(+)
 create mode 100644 lib/PublicInbox/MboxReader.pm
 create mode 100644 t/mbox_reader.t

diff --git a/MANIFEST b/MANIFEST
index d32f064e..1fb1e181 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -176,6 +176,7 @@ lib/PublicInbox/MIME.pm
 lib/PublicInbox/ManifestJsGz.pm
 lib/PublicInbox/Mbox.pm
 lib/PublicInbox/MboxGz.pm
+lib/PublicInbox/MboxReader.pm
 lib/PublicInbox/MiscIdx.pm
 lib/PublicInbox/MiscSearch.pm
 lib/PublicInbox/MsgIter.pm
@@ -334,6 +335,7 @@ t/lei_to_mail.t
 t/lei_xsearch.t
 t/linkify.t
 t/main-bin/spamc
+t/mbox_reader.t
 t/mda-mime.eml
 t/mda.t
 t/mda_filter_rubylang.t
diff --git a/lib/PublicInbox/MboxReader.pm b/lib/PublicInbox/MboxReader.pm
new file mode 100644
index 00000000..e1944aaf
--- /dev/null
+++ b/lib/PublicInbox/MboxReader.pm
@@ -0,0 +1,124 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# reader for mbox variants we support
+package PublicInbox::MboxReader;
+use strict;
+use v5.10.1;
+use Data::Dumper;
+$Data::Dumper::Useqq = 1; # should've been the default, for bad data
+
+my $from_strict =
+	qr/^From \S+ +\S+ \S+ +\S+ [^\n:]+:[^\n:]+:[^\n:]+ [^\n:]+\n/sm;
+
+sub _mbox_from {
+	my ($mbfh, $from_re, $eml_cb, @arg) = @_;
+	my $buf = '';
+	my @raw;
+	while (defined(my $r = read($mbfh, $buf, 65536, length($buf)))) {
+		if ($r == 0) { # close here to check for "curl --fail"
+			close($mbfh) or die "error closing mbox: \$?=$? $!";
+			@raw = ($buf);
+		} else {
+			@raw = split(/$from_strict/mos, $buf, -1);
+			next if scalar(@raw) == 0;
+			$buf = pop(@raw); # last bit may be incomplete
+		}
+		@raw = grep /[^ \t\r\n]/s, @raw; # skip empty messages
+		while (defined(my $raw = shift @raw)) {
+			$raw =~ s/\r?\n\z//s;
+			$raw =~ s/$from_re/$1/gms;
+			my $eml = PublicInbox::Eml->new(\$raw);
+			$eml_cb->($eml, @arg);
+		}
+		return if $r == 0; # EOF
+	}
+	die "error reading mboxo/mboxrd handle: $!";
+}
+
+sub mboxrd {
+	my (undef, $mbfh, $eml_cb, @arg) = @_;
+	_mbox_from($mbfh, qr/^>(>*From )/ms, $eml_cb, @arg);
+}
+
+sub mboxo {
+	my (undef, $mbfh, $eml_cb, @arg) = @_;
+	_mbox_from($mbfh, qr/^>(From )/ms, $eml_cb, @arg);
+}
+
+sub _cl_body {
+	my ($mbfh, $bref, $cl) = @_;
+	my $body = substr($$bref, 0, $cl, '');
+	my $need = $cl - length($body);
+	if ($need > 0) {
+		$mbfh or die "E: needed $need bytes after EOF";
+		defined(my $r = read($mbfh, $body, $need, length($body))) or
+			die "E: read error: $!\n";
+		$r == $need or die "E: read $r of $need bytes\n";
+	}
+	\$body;
+}
+
+sub _extract_hdr {
+	my ($ref) = @_;
+	if (index($$ref, "\r\n") < 0 && (my $pos = index($$ref, "\n\n")) >= 0) {
+		# likely on *nix
+		\substr($$ref, 0, $pos + 2, ''); # sv_chop on $$ref
+	} elsif ($$ref =~ /\r?\n\r?\n/s) {
+		\substr($$ref, 0, $+[0], ''); # sv_chop on $$ref
+	} else {
+		undef
+	}
+}
+
+sub _mbox_cl ($$$;@) {
+	my ($mbfh, $uxs_from, $eml_cb, @arg) = @_;
+	my $buf = '';
+	while (defined(my $r = read($mbfh, $buf, 65536, length($buf)))) {
+		if ($r == 0) { # detect "curl --fail"
+			close($mbfh) or
+				die "error closing mboxcl/mboxcl2: \$?=$? $!";
+			undef $mbfh;
+		}
+		while (my $hdr = _extract_hdr(\$buf)) {
+			$$hdr =~ s/\A[\r\n]*From [^\n]*\n//s or
+				die "E: no 'From ' line in:\n", Dumper($hdr);
+			my $eml = PublicInbox::Eml->new($hdr);
+			my @cl = $eml->header_raw('Content-Length');
+			my $n = scalar(@cl);
+			$n == 0 and die "E: Content-Length missing in:\n",
+					Dumper($eml->as_string);
+			$n == 1 or die "E: multiple ($n) Content-Length in:\n",
+					Dumper($eml->as_string);
+			$cl[0] =~ /\A[0-9]+\z/ or die
+				"E: Content-Length `$cl[0]' invalid\n",
+					Dumper($eml->as_string);
+			if (($eml->{bdy} = _cl_body($mbfh, \$buf, $cl[0]))) {
+				$uxs_from and
+					${$eml->{bdy}} =~ s/^>From /From /sgm;
+			}
+			$eml_cb->($eml, @arg);
+		}
+		if ($r == 0) {
+			$buf =~ /[^ \r\n\t]/ and
+				warn "W: leftover at end of mboxcl/mboxcl2:\n",
+					Dumper(\$buf);
+			return;
+		}
+	}
+	die "error reading mboxcl/mboxcl2 handle: $!";
+}
+
+sub mboxcl {
+	my (undef, $mbfh, $eml_cb, @arg) = @_;
+	_mbox_cl($mbfh, 1, $eml_cb, @arg);
+}
+
+sub mboxcl2 {
+	my (undef, $mbfh, $eml_cb, @arg) = @_;
+	_mbox_cl($mbfh, undef, $eml_cb, @arg);
+}
+
+sub new { bless \(my $x), __PACKAGE__ }
+
+1;
diff --git a/t/mbox_reader.t b/t/mbox_reader.t
new file mode 100644
index 00000000..9391dc24
--- /dev/null
+++ b/t/mbox_reader.t
@@ -0,0 +1,76 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use v5.10.1;
+use Test::More;
+use PublicInbox::TestCommon;
+use List::Util qw(shuffle);
+use PublicInbox::Eml;
+use Fcntl qw(SEEK_SET);
+require_ok 'PublicInbox::MboxReader';
+require_ok 'PublicInbox::LeiToMail';
+my %raw = (
+	hdr_only => "From: header-only\@example.com\n\n",
+	small_from => "From: small-from\@example.com\n\nFrom hell\n",
+	small => "From: small\@example.com\n\nfrom hell\n",
+	big_hdr_only => "From: big-header\@example.com\n" .
+		(('A: '.('a' x 72)."\n") x 1000)."\n",
+	big_body => "From: big-body\@example.com\n\n".
+		(('b: '.('b' x 72)."\n") x 1000) .
+		"From hell\n",
+	big_all => "From: big-all\@example.com\n".
+		(("A: ".('a' x 72)."\n") x 1000). "\n" .
+		(("b: ".('b' x 72)."\n") x 1000) .
+		"From hell\n",
+);
+
+if ($ENV{TEST_EXTRA}) {
+	for my $fn (glob('t/*.eml'), glob('t/*/*.{patch,eml}')) {
+		$raw{$fn} = eml_load($fn)->as_string;
+	}
+}
+
+my $reader = PublicInbox::MboxReader->new;
+my $write_in_full = PublicInbox::LeiToMail->can('write_in_full');
+my $check_fmt = sub {
+	my $fmt = shift;
+	my @order = shuffle(keys %raw);
+	my $eml2mbox = PublicInbox::LeiToMail->can("eml2$fmt");
+	open my $fh, '+>', undef or BAIL_OUT "open: $!";
+	for my $k (@order) {
+		my $eml = PublicInbox::Eml->new($raw{$k});
+		my $buf = $eml2mbox->($eml);
+		$write_in_full->($fh, $buf, undef);
+	}
+	seek($fh, 0, SEEK_SET) or BAIL_OUT "seek: $!";
+	$reader->$fmt($fh, sub {
+		my ($eml) = @_;
+		my $cur = shift @order;
+		my @cl = $eml->header_raw('Content-Length');
+		if ($fmt =~ /\Amboxcl/) {
+			is(scalar(@cl), 1, "Content-Length set $fmt $cur");
+			my $raw = $eml->body_raw;
+			my $adj = 0;
+			if ($fmt eq 'mboxcl') {
+				my @from = ($raw =~ /^(From )/smg);
+				$adj = scalar(@from);
+			}
+			is(length($raw), $cl[0] - $adj,
+				"Content-Length is correct $fmt $cur");
+			# clobber for ->as_string comparison below
+			$eml->header_set('Content-Length');
+		} else {
+			is(scalar(@cl), 0, "Content-Length unset $fmt $cur");
+		}
+		my $orig = PublicInbox::Eml->new($raw{$cur});
+		is($eml->as_string, $orig->as_string,
+			"read back original $fmt $cur");
+	});
+};
+my @mbox = qw(mboxrd mboxo mboxcl mboxcl2);
+for my $fmt (@mbox) { $check_fmt->($fmt) }
+s/\n/\r\n/sg for (values %raw);
+for my $fmt (@mbox) { $check_fmt->($fmt) }
+
+done_testing;

^ permalink raw reply related	[relevance 5%]

* [PATCH 00/36] another round of lei stuff
@ 2020-12-31 13:51  7% Eric Wong
  2020-12-31 13:51  5% ` [PATCH 08/36] mboxreader: new class for reading various mbox formats Eric Wong
  0 siblings, 1 reply; 2+ results
From: Eric Wong @ 2020-12-31 13:51 UTC (permalink / raw)
  To: meta

This is against lei branch @ commit
0c8106d44f317175e122744b43407bf067183175 in
https://public-inbox.org/public-inbox.git

Infrastructure stuff for reading + writing local Maildirs and a
bunch of mbox formats is done (including gz/bz2/xz support)
and its usage should be familiar to mairix(1) users.

Infrastructure for deduplication + augmenting search results
is in place and tested.

Going to skip MH and MMDF for now; IMAP/JMAP might happen
sooner, but deduplication needs low latency.

"extinbox" has been renamed to "external".

Basic infrastructure like PublicInbox::IPC and SharedKV
should've been done and in use ages ago...  I look forward to
using them, at least.

Some DS safety fixes, since lei will use it in stranger ways
than the current code does.

It's bad enough that we have messages with duplicate Message-IDs;
lei will need to deal with Unsent/Drafts messages w/o Message-IDs at all!

Eric Wong (36):
  import: respect init.defaultBranch
  lei_store: use per-machine refname as git HEAD
  revert "lei_store: use per-machine refname as git HEAD"
  lei_to_mail: initial implementation for writing mbox formats
  sharedkv: fork()-friendly key-value store
  sharedkv: split out index_values
  lei_to_mail: start atomic and compressed mbox writing
  mboxreader: new class for reading various mbox formats
  lei_to_mail: start --augment, dedupe, bz2 and xz
  lei: implement various deduplication strategies
  lei_to_mail: lazy-require LeiDedupe
  lei_to_mail: support for non-seekable outputs
  lei_to_mail: support Maildir, fix+test --augment
  ipc: generic IPC dispatch based on Storable
  ipc: support Sereal
  lei_store: add ->set_eml, ->add_eml can return smsg
  lei: rename "extinbox" => "external"
  mid: use defined-or with `push' for uniqueness check
  mid: hoist out mids_in sub
  lei_store: handle messages without Message-ID at all
  ipc: use shutdown(2), base atfork* callback
  lei_to_mail: unlink mboxes if not augmenting
  lei: add --mfolder as an option
  spawn: move run_die here from PublicInbox::Import
  init: remove embedded UnlinkMe package
  t/run.perl: avoid uninitialized var on incomplete test
  gcf2client: reap process on DESTROY
  lei_to_mail: open FIFOs O_WRONLY so we block
  searchidxshard: call DS->Reset at worker start
  t/ipc.t: test for references via `die'
  use PublicInbox::DS for dwaitpid
  syscall: SFD_NONBLOCK can be a constant, again
  lei: avoid Spawn package when starting daemon
  avoid calling waitpid from children in DESTROY
  ds: clobber $in_loop first at reset
  on_destroy: support PID owner guard

 MANIFEST                                      |  12 +-
 lib/PublicInbox/DS.pm                         |  42 +-
 lib/PublicInbox/DSKQXS.pm                     |   4 +-
 lib/PublicInbox/Daemon.pm                     |   4 +-
 lib/PublicInbox/Gcf2Client.pm                 |  18 +-
 lib/PublicInbox/Git.pm                        |   7 +-
 lib/PublicInbox/IPC.pm                        | 165 ++++++++
 lib/PublicInbox/Import.pm                     |  36 +-
 lib/PublicInbox/LEI.pm                        |  44 +--
 lib/PublicInbox/LeiDedupe.pm                  | 100 +++++
 .../{LeiExtinbox.pm => LeiExternal.pm}        |  18 +-
 lib/PublicInbox/LeiStore.pm                   |  32 +-
 lib/PublicInbox/LeiToMail.pm                  | 361 ++++++++++++++++++
 lib/PublicInbox/LeiXSearch.pm                 |   2 +-
 lib/PublicInbox/Lock.pm                       |  17 +-
 lib/PublicInbox/MID.pm                        |  15 +-
 lib/PublicInbox/MboxReader.pm                 | 127 ++++++
 lib/PublicInbox/OnDestroy.pm                  |   5 +
 lib/PublicInbox/OverIdx.pm                    |   2 +
 lib/PublicInbox/ProcessPipe.pm                |  34 +-
 lib/PublicInbox/Qspawn.pm                     |  43 +--
 lib/PublicInbox/SearchIdxShard.pm             |   1 +
 lib/PublicInbox/SharedKV.pm                   | 148 +++++++
 lib/PublicInbox/Sigfd.pm                      |   4 +-
 lib/PublicInbox/Smsg.pm                       |   6 +-
 lib/PublicInbox/Spawn.pm                      |   9 +-
 lib/PublicInbox/Syscall.pm                    |   4 +-
 lib/PublicInbox/TestCommon.pm                 |  25 +-
 lib/PublicInbox/V2Writable.pm                 |  10 +-
 script/lei                                    |  17 +-
 script/public-inbox-init                      |  32 +-
 script/public-inbox-watch                     |   4 +-
 t/convert-compact.t                           |   4 +-
 t/index-git-times.t                           |   3 +-
 t/ipc.t                                       |  80 ++++
 t/lei.t                                       |  22 +-
 t/lei_dedupe.t                                |  59 +++
 t/lei_store.t                                 |  47 ++-
 t/lei_to_mail.t                               | 246 ++++++++++++
 t/lei_xsearch.t                               |   2 +-
 t/mbox_reader.t                               |  75 ++++
 t/on_destroy.t                                |   9 +
 t/plack.t                                     |   4 +-
 t/run.perl                                    |   3 +-
 t/shared_kv.t                                 |  58 +++
 t/sigfd.t                                     |   6 +-
 46 files changed, 1755 insertions(+), 211 deletions(-)
 create mode 100644 lib/PublicInbox/IPC.pm
 create mode 100644 lib/PublicInbox/LeiDedupe.pm
 rename lib/PublicInbox/{LeiExtinbox.pm => LeiExternal.pm} (75%)
 create mode 100644 lib/PublicInbox/LeiToMail.pm
 create mode 100644 lib/PublicInbox/MboxReader.pm
 create mode 100644 lib/PublicInbox/SharedKV.pm
 create mode 100644 t/ipc.t
 create mode 100644 t/lei_dedupe.t
 create mode 100644 t/lei_to_mail.t
 create mode 100644 t/mbox_reader.t
 create mode 100644 t/shared_kv.t


^ permalink raw reply	[relevance 7%]

Results 1-2 of 2 | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2020-12-31 13:51  7% [PATCH 00/36] another round of lei stuff Eric Wong
2020-12-31 13:51  5% ` [PATCH 08/36] mboxreader: new class for reading various mbox formats Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).