user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 04/13] InboxWritable: add mbox/maildir parsing + import logic
Date: Thu, 22 Mar 2018 09:40:06 +0000	[thread overview]
Message-ID: <20180322094015.14422-5-e@80x24.org> (raw)
In-Reply-To: <20180322094015.14422-1-e@80x24.org>

This will make it easier to import mbox and Maildir archives,
as well as support future Filter API users.  It allows
simplifying our ad-hoc import_vger_from_mbox script.
---
 lib/PublicInbox/InboxWritable.pm | 103 +++++++++++++++++++++++++++++++++++++++
 lib/PublicInbox/V2Writable.pm    |   8 +++
 lib/PublicInbox/WatchMaildir.pm  |  20 +++-----
 script/public-inbox-init         |   6 +--
 scripts/import_vger_from_mbox    |  51 +++----------------
 5 files changed, 126 insertions(+), 62 deletions(-)

diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
index 0a976ea..82834f0 100644
--- a/lib/PublicInbox/InboxWritable.pm
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -7,6 +7,8 @@ use strict;
 use warnings;
 use base qw(PublicInbox::Inbox);
 use PublicInbox::Import;
+use PublicInbox::Filter::Base;
+*REJECT = *PublicInbox::Filter::Base::REJECT;
 
 sub new {
 	my ($class, $ibx) = @_;
@@ -54,4 +56,105 @@ sub filter {
 	undef;
 }
 
+sub is_maildir_basename ($) { # returns 1 if $bn is an acceptable Maildir file name, 0 otherwise
+	my ($bn) = @_;
+	return 0 if $bn !~ /\A[a-zA-Z0-9][\-\w:,=\.]+\z/; # must start alphanumeric and use only the allowed characters
+	if ($bn =~ /:2,([A-Z]+)\z/i) { # flag letters following a trailing ":2,", if present
+		my $flags = $1;
+		return 0 if $flags =~ /[DT]/; # no [D]rafts or [T]rashed mail
+	}
+	1;
+}
+
+sub is_maildir_path ($) { # returns 1 if the last component of $path passes is_maildir_basename and $path is a plain file (-f), else 0
+	my ($path) = @_;
+	my @p = split(m!/+!, $path); # split on runs of slashes; only the last component is inspected
+	(is_maildir_basename($p[-1]) && -f $path) ? 1 : 0;
+}
+
+sub maildir_path_load ($) { # reads all of $path and returns PublicInbox::MIME->new on its content; bare return otherwise
+	my ($path) = @_;
+	if (open my $fh, '<', $path) {
+		local $/; # undefined record separator: the next read returns the whole file
+		my $str = <$fh>;
+		$str or return; # false content (e.g. an empty file): nothing to parse
+		return PublicInbox::MIME->new(\$str);
+	} elsif ($!{ENOENT}) {
+		# common with Maildir
+		return;
+	} else {
+		warn "failed to open $path: $!\n";
+		return;
+	}
+}
+
+sub import_maildir { # adds every loadable message in $dir/cur and $dir/new via the importer, applying $self->filter if set
+	my ($self, $dir) = @_;
+	my $im = $self->importer(1);
+	my $filter = $self->filter;
+	foreach my $sub (qw(cur new tmp)) { # all three subdirectories must exist, else die
+		-d "$dir/$sub" or die "$dir is not a Maildir (missing $sub)\n";
+	}
+	foreach my $sub (qw(cur new)) { # tmp is checked above but never imported from
+		opendir my $dh, "$dir/$sub" or die "opendir $dir/$sub: $!\n";
+		while (defined(my $fn = readdir($dh))) {
+			next unless is_maildir_basename($fn);
+			my $mime = maildir_path_load("$dir/$sub/$fn") or next; # $fn is relative to $dir/$sub
+			if ($filter) {
+				my $ret = $filter->scrub($mime) or next; # skip only this message, keep importing
+				next if $ret == REJECT(); # rejected by the filter: skip it, do not abort the import
+				$mime = $ret;
+			}
+			$im->add($mime);
+		}
+	}
+	$im->done; # always reached now, even when some messages were filtered out
+}
+
+# asctime: From example@example.com Fri Jun 23 02:56:55 2000
+my $from_strict = qr/^From \S+ +\S+ \S+ +\S+ [^:]+:[^:]+:[^:]+ [^:]+/;
+
+sub mb_add ($$$$) { # unescapes one raw mbox message ($msg is a scalar ref), parses it, applies $filter if set, then calls $im->add
+	my ($im, $variant, $filter, $msg) = @_;
+	$$msg =~ s/(\r?\n)+\z/$1/s; # collapse trailing newlines down to a single one
+	if ($variant eq 'mboxrd') { # unescape before parsing so the parsed message sees the unescaped text
+		$$msg =~ s/^>(>*From )/$1/gms; # /g: strip one ">" from every escaped line, not just the first
+	} elsif ($variant eq 'mboxo') {
+		$$msg =~ s/^>From /From /gms; # /g: unescape every ">From " line
+	}
+	my $mime = PublicInbox::MIME->new($msg);
+	if ($filter) {
+		my $ret = $filter->scrub($mime) or return; # false result from scrub: do not add
+		return if $ret == REJECT(); # rejected by the filter: do not add
+		$mime = $ret;
+	}
+	$im->add($mime)
+}
+
+sub import_mbox { # reads lines from $fh, splits them into messages at "From " separator lines, and passes each message to mb_add
+	my ($self, $fh, $variant) = @_;
+	if ($variant !~ /\A(?:mboxrd|mboxo)\z/) {
+		die "variant must be 'mboxrd' or 'mboxo'\n";
+	}
+	my $im = $self->importer(1);
+	my $prev = undef; # previously read line; undef until the first line has been seen
+	my $msg = ''; # accumulates the lines of the message currently being read
+	my $filter = $self->filter;
+	while (defined(my $l = <$fh>)) {
+		if ($l =~ /$from_strict/o) {
+			if (!defined($prev) || $prev =~ /^\r?$/) { # separator only counts at start of input or after a blank line
+				mb_add($im, $variant, $filter, \$msg) if $msg;
+				$msg = '';
+				$prev = $l;
+				next; # the separator line itself is not appended to the message
+			}
+			warn "W[$.] $l\n"; # matched the pattern but was not preceded by a blank line: warn, then keep it as message content
+		}
+		$prev = $l;
+		$msg .= $l;
+	}
+	mb_add($im, $variant, $filter, \$msg) if $msg; # flush the last accumulated message
+	$im->done;
+}
+
 1;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index dc96b87..46bfebb 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -65,6 +65,14 @@ sub new {
 	bless $self, $class;
 }
 
+sub init_inbox { # stores $parallel in $self->{parallel}, then calls idx_init, git_init(0) and done in that order
+	my ($self, $parallel) = @_;
+	$self->{parallel} = $parallel;
+	$self->idx_init;
+	$self->git_init(0);
+	$self->done;
+}
+
 # returns undef on duplicate or spam
 # mimics Import::add and wraps it for v2
 sub add {
diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm
index d3ca2a1..7ee29da 100644
--- a/lib/PublicInbox/WatchMaildir.pm
+++ b/lib/PublicInbox/WatchMaildir.pm
@@ -13,6 +13,8 @@ use PublicInbox::MDA;
 use PublicInbox::Spawn qw(spawn);
 use PublicInbox::InboxWritable;
 use File::Temp qw//;
+use PublicInbox::Filter::Base;
+*REJECT = *PublicInbox::Filter::Base::REJECT;
 
 sub new {
 	my ($class, $config) = @_;
@@ -125,7 +127,7 @@ sub _remove_spam {
 			$im->remove($mime, 'spam');
 			if (my $scrub = $ibx->filter) {
 				my $scrubbed = $scrub->scrub($mime) or return;
-				$scrubbed == 100 and return;
+				$scrubbed == REJECT() and return;
 				$im->remove($scrubbed, 'spam');
 			}
 		};
@@ -138,13 +140,7 @@ sub _remove_spam {
 
 sub _try_path {
 	my ($self, $path) = @_;
-	my @p = split(m!/+!, $path);
-	return if $p[-1] !~ /\A[a-zA-Z0-9][\-\w:,=\.]+\z/;
-	if ($p[-1] =~ /:2,([A-Z]+)\z/i) {
-		my $flags = $1;
-		return if $flags =~ /[DT]/; # no [D]rafts or [T]rashed mail
-	}
-	return unless -f $path;
+	return unless PublicInbox::InboxWritable::is_maildir_path($path);
 	if ($path !~ $self->{mdre}) {
 		warn "unrecognized path: $path\n";
 		return;
@@ -166,7 +162,7 @@ sub _try_path {
 	}
 	if (my $scrub = $inbox->filter) {
 		my $ret = $scrub->scrub($mime) or return;
-		$ret == 100 and return;
+		$ret == REJECT() and return;
 		$mime = $ret;
 	}
 
@@ -258,14 +254,14 @@ sub _path_to_mime {
 
 sub _importer_for {
 	my ($self, $ibx) = @_;
-	my $im = $ibx->importer(0);
 	my $importers = $self->{importers};
+	my $im = $importers->{"$ibx"} ||= $ibx->importer(0);
 	if (scalar(keys(%$importers)) > 2) {
-		delete $importers->{"$im"};
+		delete $importers->{"$ibx"};
 		_done_for_now($self);
 	}
 
-	$importers->{"$im"} = $im;
+	$importers->{"$ibx"} = $im;
 }
 
 sub _spamcheck_cb {
diff --git a/script/public-inbox-init b/script/public-inbox-init
index fdad136..86cf8b5 100755
--- a/script/public-inbox-init
+++ b/script/public-inbox-init
@@ -82,11 +82,7 @@ if ($version >= 2) {
 		-primary_address => $address[0],
 	};
 	$ibx = PublicInbox::Inbox->new($ibx);
-	my $v2w = PublicInbox::V2Writable->new($ibx, 1);
-	$v2w->{parallel} = 0;
-	$v2w->idx_init;
-	$v2w->git_init(0);
-	$v2w->done;
+	PublicInbox::V2Writable->new($ibx, 1)->init_inbox(0);
 } elsif ($version == 1) {
 	x(qw(git init -q --bare), $mainrepo);
 
diff --git a/scripts/import_vger_from_mbox b/scripts/import_vger_from_mbox
index 1edb987..369dac7 100644
--- a/scripts/import_vger_from_mbox
+++ b/scripts/import_vger_from_mbox
@@ -5,7 +5,7 @@ use strict;
 use warnings;
 use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/;
 use PublicInbox::MIME;
-use PublicInbox::Inbox;
+use PublicInbox::InboxWritable;
 use PublicInbox::V2Writable;
 use PublicInbox::Import;
 use PublicInbox::MDA;
@@ -30,55 +30,16 @@ my $ibx = {
 	name => $name,
 	version => $version,
 	-primary_address => $email,
+	filter => 'PublicInbox::Filter::Vger',
 };
 $ibx = PublicInbox::Inbox->new($ibx);
-my $im;
 unless ($dry_run) {
 	if ($version >= 2) {
-		$im = PublicInbox::V2Writable->new($ibx, 1);
+		PublicInbox::V2Writable->new($ibx, 1)->init_inbox(0);
 	} else {
-		system(qw(git init --bare -q), $mainrepo);
-		my $git = PublicInbox::Git->new($mainrepo);
-		$im = PublicInbox::Import->new($git, $name, $email, $ibx);
+		system(qw(git init --bare -q), $mainrepo) == 0 or die;
 	}
 }
+$ibx = PublicInbox::InboxWritable->new($ibx);
 binmode STDIN;
-my $msg = '';
-use PublicInbox::Filter::Vger;
-my $vger = PublicInbox::Filter::Vger->new;
-
-sub do_add ($$) {
-	my ($im, $msg) = @_;
-	$$msg =~ s/(\r?\n)+\z/$1/s;
-	my $mime = PublicInbox::MIME->new($msg);
-	if ($variant eq 'mboxrd') {
-		$$msg =~ s/^>(>*From )/$1/sm;
-	} elsif ($variant eq 'mboxo') {
-		$$msg =~ s/^>From /From /sm;
-	}
-	$mime = $vger->scrub($mime);
-	return unless $im;
-	$mime->header_set($_) foreach @PublicInbox::MDA::BAD_HEADERS;
-	$im->add($mime) or
-		warn "duplicate: ",
-			$mime->header_obj->header_raw('Message-ID'), "\n";
-}
-
-# asctime: From example@example.com Fri Jun 23 02:56:55 2000
-my $from_strict = qr/^From \S+ +\S+ \S+ +\S+ [^:]+:[^:]+:[^:]+ [^:]+/;
-my $prev = undef;
-while (defined(my $l = <STDIN>)) {
-	if ($l =~ /$from_strict/o) {
-		if (!defined($prev) || $prev =~ /^\r?$/) {
-			do_add($im, \$msg) if $msg;
-			$msg = '';
-			$prev = $l;
-			next;
-		}
-		warn "W[$.] $l\n";
-	}
-	$prev = $l;
-	$msg .= $l;
-}
-do_add($im, \$msg) if $msg;
-$im->done if $im;
+$ibx->import_mbox(\*STDIN, $variant);
-- 
EW


  parent reply	other threads:[~2018-03-22  9:40 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-22  9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 01/13] content_id: do not take Message-Id into account Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 02/13] introduce InboxWritable class Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 03/13] import: discard all the same headers as MDA Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-03-22  9:40 ` [PATCH 05/13] use both Date: and Received: times Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 06/13] msgmap: add tmp_clone to create an anonymous copy Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 07/13] fix syntax warnings Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 08/13] v2writable: support reindexing Xapian Eric Wong (Contractor, The Linux Foundation)
2018-03-26 20:08   ` Eric Wong
2018-03-22  9:40 ` [PATCH 09/13] t/altid.t: extra tests for mid_set Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 10/13] v2writable: add NNTP article number regeneration support Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 11/13] v2writable: clarify header cleanups Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 12/13] v2writable: DEBUG_DIFF respects $TMPDIR Eric Wong (Contractor, The Linux Foundation)
2018-03-22  9:40 ` [PATCH 13/13] feed: $INBOX/new.atom endpoint supports v2 inboxes Eric Wong (Contractor, The Linux Foundation)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180322094015.14422-5-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).