From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 9EC591FAEC for ; Thu, 15 Feb 2018 11:08:45 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Subject: [WIP 10/17] scripts/import_vger_from_mbox: support --dry-run option Date: Thu, 15 Feb 2018 11:08:33 +0000 Message-Id: <20180215110840.30413-11-e@80x24.org> In-Reply-To: <20180215110840.30413-1-e@80x24.org> References: <20180215105509.GA22409@dcvr> <20180215110840.30413-1-e@80x24.org> List-Id: This can be useful for getting a baseline of performance of just Email::MIME and Date: header parsing. We'll need to do some Date: header parsing for LKML since there are some wonky date formats which cause the git RFC822 parser to choke. --- scripts/import_vger_from_mbox | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/scripts/import_vger_from_mbox b/scripts/import_vger_from_mbox index 9b3afc8..3fa5c77 100644 --- a/scripts/import_vger_from_mbox +++ b/scripts/import_vger_from_mbox @@ -3,16 +3,21 @@ # License: AGPL-3.0+ use strict; use warnings; +use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/; +use Date::Parse qw/str2time/; use Email::MIME; $Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect use PublicInbox::Git; use PublicInbox::Import; my $usage = "usage: $0 NAME EMAIL <MBOX\n"; +my $dry_run; +my %opts = ( 'n|dry-run' => \$dry_run ); +GetOptions(%opts) or die $usage; chomp(my $git_dir = `git rev-parse --git-dir`); my $git = PublicInbox::Git->new($git_dir); my $name = shift or die $usage; # git my $email = shift or die $usage; # git@vger.kernel.org -my $im = PublicInbox::Import->new($git, $name, $email); +my $im = $dry_run ? 
undef : PublicInbox::Import->new($git, $name, $email); binmode STDIN; my $msg = ''; use PublicInbox::Filter::Vger; @@ -22,9 +27,27 @@ sub do_add ($$) { $$msg =~ s/(\r?\n)+\z/$1/s; $msg = Email::MIME->new($$msg); $msg = $vger->scrub($msg); + my $hdr = $msg->header_obj; + my $date = $hdr->header_raw('Date'); + if ($date) { + eval { str2time($date) }; + if ($@) { + warn "bad Date: $date in ", + $hdr->header_raw('Message-ID'), ": $@\n"; + } + } else { + warn "missing Date: $date in ", + $hdr->header_raw('Message-ID'), ": $@\n"; + my $n = 0; + foreach my $r ($hdr->header_raw('Received')) { + warn "$n Received: $r\n"; + } + warn(('-' x 72), "\n"); + } + return unless $im; $im->add($msg) or warn "duplicate: ", - $msg->header_obj->header_raw('Message-ID'), "\n"; + $hdr->header_raw('Message-ID'), "\n"; } # asctime: From example@example.com Fri Jun 23 02:56:55 2000 @@ -44,4 +67,4 @@ while (defined(my $l = <STDIN>)) { $msg .= $l; } do_add($im, \$msg) if $msg; -$im->done; +$im->done if $im; -- EW