From 83470f5e60e44d7f70b378f4b250c6584f42f64e Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 28 Oct 2019 10:45:15 +0000 Subject: learn: support multiple To/Cc headers It's possible to specify these headers multiple times, and PublicInbox::MDA->precheck takes that into account, so -learn should, too. --- script/public-inbox-learn | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/script/public-inbox-learn b/script/public-inbox-learn index c4c4d4b9..8ff1652b 100755 --- a/script/public-inbox-learn +++ b/script/public-inbox-learn @@ -42,9 +42,11 @@ my $mime = PublicInbox::MIME->new(eval { # get all recipients my %dests; foreach my $h (qw(Cc To)) { - my $val = $mime->header($h) or next; - foreach my $email (PublicInbox::Address::emails($val)) { - $dests{lc($email)} = 1; + my @val = $mime->header($h) or next; + for (@val) { + foreach my $email (PublicInbox::Address::emails($_)) { + $dests{lc($email)} = 1; + } } } -- cgit v1.2.3-24-ge0c7 From bc6b44cd7feee7e83ef1dcf26092808f92f757d9 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 28 Oct 2019 10:45:16 +0000 Subject: learn: only map recipient list on "ham" or "rm" It's assumed that "spam" can end up anywhere due to Bcc:, so we need to scan every single inbox. However, "rm" is usually more targeted and "ham" obviously only belongs in some inboxes. 
--- script/public-inbox-learn | 71 +++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 36 deletions(-) diff --git a/script/public-inbox-learn b/script/public-inbox-learn index 8ff1652b..d2d665d5 100755 --- a/script/public-inbox-learn +++ b/script/public-inbox-learn @@ -39,17 +39,7 @@ my $mime = PublicInbox::MIME->new(eval { $data }); -# get all recipients -my %dests; -foreach my $h (qw(Cc To)) { - my @val = $mime->header($h) or next; - for (@val) { - foreach my $email (PublicInbox::Address::emails($_)) { - $dests{lc($email)} = 1; - } - } -} - +# spam is removed from all known inboxes if ($train eq 'spam') { $pi_config->each_inbox(sub { my ($ibx) = @_; @@ -58,36 +48,45 @@ if ($train eq 'spam') { $im->remove($mime, 'spam'); $im->done; }); -} +} else { + require PublicInbox::MDA if $train eq "ham"; -require PublicInbox::MDA if $train eq "ham"; + # get all recipients + my %dests; # address => + for ($mime->header('Cc'), $mime->header('To')) { + foreach my $addr (PublicInbox::Address::emails($_)) { + $addr = lc($addr); + $dests{$addr} //= $pi_config->lookup($addr) // 0; + } + } -# n.b. message may be cross-posted to multiple public-inboxes -foreach my $recipient (keys %dests) { - my $dst = $pi_config->lookup($recipient) or next; - # We do not touch GIT_COMMITTER_* env here so we can track - # who trained the message. - $dst->{name} = $ENV{GIT_COMMITTER_NAME} || $dst->{name}; - $dst->{-primary_address} = $ENV{GIT_COMMITTER_EMAIL} || $recipient; - $dst = PublicInbox::InboxWritable->new($dst); - my $im = $dst->importer(0); + # n.b. message may be cross-posted to multiple public-inboxes + while (my ($addr, $dst) = each %dests) { + next unless ref($dst); + # We do not touch GIT_COMMITTER_* env here so we can track + # who trained the message. 
+ $dst->{name} = $ENV{GIT_COMMITTER_NAME} || $dst->{name}; + $dst->{-primary_address} = $ENV{GIT_COMMITTER_EMAIL} || $addr; + $dst = PublicInbox::InboxWritable->new($dst); + my $im = $dst->importer(0); - if ($train eq "spam" || $train eq "rm") { - # This needs to be idempotent, as my inotify trainer - # may train for each cross-posted message, and this - # script already learns for every list in - # ~/.public-inbox/config - $im->remove($mime, $train); - } else { # $train eq "ham" - # no checking for spam here, we assume the message has - # been reviewed by a human at this point: - PublicInbox::MDA->set_list_headers($mime, $dst); + if ($train eq "rm") { + # This needs to be idempotent, as my inotify trainer + # may train for each cross-posted message, and this + # script already learns for every list in + # ~/.public-inbox/config + $im->remove($mime, $train); + } elsif ($train eq "ham") { + # no checking for spam here, we assume the message has + # been reviewed by a human at this point: + PublicInbox::MDA->set_list_headers($mime, $dst); - # Ham messages are trained when they're marked into - # a SEEN state, so this is idempotent: - $im->add($mime); + # Ham messages are trained when they're marked into + # a SEEN state, so this is idempotent: + $im->add($mime); + } + $im->done; } - $im->done; } if ($err) { -- cgit v1.2.3-24-ge0c7 From ec294cdbc5392fdb136572dbcedb250798023703 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 28 Oct 2019 10:45:17 +0000 Subject: learn: update usage statement Use since that seems to be the favored notation for required command args (taking a hint from git(1) manpage). While we're at it, remove the space after '<' for the redirect to match git.git coding style. 
--- script/public-inbox-learn | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/script/public-inbox-learn b/script/public-inbox-learn index d2d665d5..ad132985 100755 --- a/script/public-inbox-learn +++ b/script/public-inbox-learn @@ -4,7 +4,7 @@ # # Used for training spam (via SpamAssassin) and removing messages from a # public-inbox -my $usage = "$0 (spam|ham) < /path/to/message"; +my $usage = "$0 new(eval { $data }); -# spam is removed from all known inboxes +# spam is removed from all known inboxes since it is often Bcc:-ed if ($train eq 'spam') { $pi_config->each_inbox(sub { my ($ibx) = @_; -- cgit v1.2.3-24-ge0c7 From b1e4d474efcccaa5c5b5e71aac75c8f7836f1d91 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 28 Oct 2019 10:45:18 +0000 Subject: learn: GIT_COMMITTER_ may be "" or "0" Users may be zeroes or blanks. --- script/public-inbox-learn | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/script/public-inbox-learn b/script/public-inbox-learn index ad132985..299f75a0 100755 --- a/script/public-inbox-learn +++ b/script/public-inbox-learn @@ -65,8 +65,8 @@ if ($train eq 'spam') { next unless ref($dst); # We do not touch GIT_COMMITTER_* env here so we can track # who trained the message. - $dst->{name} = $ENV{GIT_COMMITTER_NAME} || $dst->{name}; - $dst->{-primary_address} = $ENV{GIT_COMMITTER_EMAIL} || $addr; + $dst->{name} = $ENV{GIT_COMMITTER_NAME} // $dst->{name}; + $dst->{-primary_address} = $ENV{GIT_COMMITTER_EMAIL} // $addr; $dst = PublicInbox::InboxWritable->new($dst); my $im = $dst->importer(0); -- cgit v1.2.3-24-ge0c7 From 29beb70160d15ffe46cb07e9d9468acfcda34db7 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 28 Oct 2019 10:45:19 +0000 Subject: learn: hoist out remove_or_add subroutine We'll be reusing it for List-ID processing in the next commit. 
--- script/public-inbox-learn | 56 ++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/script/public-inbox-learn b/script/public-inbox-learn index 299f75a0..56739f88 100755 --- a/script/public-inbox-learn +++ b/script/public-inbox-learn @@ -39,6 +39,34 @@ my $mime = PublicInbox::MIME->new(eval { $data }); +sub remove_or_add ($$$) { + my ($ibx, $train, $addr) = @_; + + # We do not touch GIT_COMMITTER_* env here so we can track + # who trained the message. + $ibx->{name} = $ENV{GIT_COMMITTER_NAME} // $ibx->{name}; + $ibx->{-primary_address} = $ENV{GIT_COMMITTER_EMAIL} // $addr; + $ibx = PublicInbox::InboxWritable->new($ibx); + my $im = $ibx->importer(0); + + if ($train eq "rm") { + # This needs to be idempotent, as my inotify trainer + # may train for each cross-posted message, and this + # script already learns for every list in + # ~/.public-inbox/config + $im->remove($mime, $train); + } elsif ($train eq "ham") { + # no checking for spam here, we assume the message has + # been reviewed by a human at this point: + PublicInbox::MDA->set_list_headers($mime, $ibx); + + # Ham messages are trained when they're marked into + # a SEEN state, so this is idempotent: + $im->add($mime); + } + $im->done; +} + # spam is removed from all known inboxes since it is often Bcc:-ed if ($train eq 'spam') { $pi_config->each_inbox(sub { @@ -61,31 +89,9 @@ if ($train eq 'spam') { } # n.b. message may be cross-posted to multiple public-inboxes - while (my ($addr, $dst) = each %dests) { - next unless ref($dst); - # We do not touch GIT_COMMITTER_* env here so we can track - # who trained the message. 
- $dst->{name} = $ENV{GIT_COMMITTER_NAME} // $dst->{name}; - $dst->{-primary_address} = $ENV{GIT_COMMITTER_EMAIL} // $addr; - $dst = PublicInbox::InboxWritable->new($dst); - my $im = $dst->importer(0); - - if ($train eq "rm") { - # This needs to be idempotent, as my inotify trainer - # may train for each cross-posted message, and this - # script already learns for every list in - # ~/.public-inbox/config - $im->remove($mime, $train); - } elsif ($train eq "ham") { - # no checking for spam here, we assume the message has - # been reviewed by a human at this point: - PublicInbox::MDA->set_list_headers($mime, $dst); - - # Ham messages are trained when they're marked into - # a SEEN state, so this is idempotent: - $im->add($mime); - } - $im->done; + while (my ($addr, $ibx) = each %dests) { + next unless ref($ibx); # $ibx may be 0 + remove_or_add($ibx, $train, $addr); } } -- cgit v1.2.3-24-ge0c7 From d1525c3b35e9aa1bb14143996a7fa2b6e34cba3e Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 28 Oct 2019 10:45:20 +0000 Subject: mda: hoist out List-ID handling and reuse in -learn It's now possible to inject false-positive ham into an inbox the same way -mda does via List-ID. --- lib/PublicInbox/MDA.pm | 15 +++++++++++++++ script/public-inbox-learn | 8 +++++++- script/public-inbox-mda | 5 +---- 3 files changed, 23 insertions(+), 5 deletions(-) mode change 100755 => 100644 script/public-inbox-learn diff --git a/lib/PublicInbox/MDA.pm b/lib/PublicInbox/MDA.pm index 9cafda13..ce2c870f 100644 --- a/lib/PublicInbox/MDA.pm +++ b/lib/PublicInbox/MDA.pm @@ -83,4 +83,19 @@ sub set_list_headers { } } +# TODO: deal with multiple List-ID headers? +sub inbox_for_list_id ($$) { + my ($klass, $config, $simple) = @_; + + # newer Email::Simple allows header_raw, as does Email::MIME: + my $list_id = $simple->can('header_raw') ? 
+ $simple->header_raw('List-Id') : + $simple->header('List-Id'); + my $ibx; + if (defined $list_id && $list_id =~ /<[ \t]*(.+)?[ \t]*>/) { + $ibx = $config->lookup_list_id($1); + } + $ibx; +} + 1; diff --git a/script/public-inbox-learn b/script/public-inbox-learn old mode 100755 new mode 100644 index 56739f88..79f3ead5 --- a/script/public-inbox-learn +++ b/script/public-inbox-learn @@ -77,7 +77,7 @@ if ($train eq 'spam') { $im->done; }); } else { - require PublicInbox::MDA if $train eq "ham"; + require PublicInbox::MDA; # get all recipients my %dests; # address => @@ -89,10 +89,16 @@ if ($train eq 'spam') { } # n.b. message may be cross-posted to multiple public-inboxes + my %seen; while (my ($addr, $ibx) = each %dests) { next unless ref($ibx); # $ibx may be 0 + next if $seen{"$ibx"}++; remove_or_add($ibx, $train, $addr); } + my $ibx = PublicInbox::MDA->inbox_for_list_id($pi_config, $mime); + if ($ibx && !$seen{"$ibx"}) { + remove_or_add($ibx, $train, $ibx->{-primary_address}); + } } if ($err) { diff --git a/script/public-inbox-mda b/script/public-inbox-mda index 584218b5..3ff318c9 100755 --- a/script/public-inbox-mda +++ b/script/public-inbox-mda @@ -43,10 +43,7 @@ if (defined $recipient) { $dst = $config->lookup($recipient); # first check } if (!defined $dst) { - my $list_id = $simple->header('List-Id'); - if (defined $list_id && $list_id =~ /<[ \t]*(.+)?[ \t]*>/) { - $dst = $config->lookup_list_id($1); - } + $dst = PublicInbox::MDA->inbox_for_list_id($config, $simple); if (!defined $dst && !defined $recipient) { die "ORIGINAL_RECIPIENT not defined in ENV\n"; } -- cgit v1.2.3-24-ge0c7 From 84d811cfb3a0c8724c52ce485402e4586d5c9e04 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 28 Oct 2019 10:45:21 +0000 Subject: filter/base: remove MAX_MID_SIZE constant We don't need it in the filter, here, since we have one in the MDA package. 
--- lib/PublicInbox/Filter/Base.pm | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/PublicInbox/Filter/Base.pm b/lib/PublicInbox/Filter/Base.pm index 052cd332..7a0c720f 100644 --- a/lib/PublicInbox/Filter/Base.pm +++ b/lib/PublicInbox/Filter/Base.pm @@ -6,7 +6,6 @@ package PublicInbox::Filter::Base; use strict; use warnings; use PublicInbox::MsgIter; -use constant MAX_MID_SIZE => 244; # max term size - 1 in Xapian sub No ($) { "*** We only accept plain-text mail, No $_[0] ***" } -- cgit v1.2.3-24-ge0c7 From 420fddb8b683637cc1fd39727896cac4a459c3b6 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 28 Oct 2019 10:45:22 +0000 Subject: mda: hoist out mda_filter_adjust It makes it easier to document the default -mda behavior is stricter than normal, including "public-inbox-learn ham" --- script/public-inbox-mda | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/script/public-inbox-mda b/script/public-inbox-mda index 3ff318c9..71c5d937 100755 --- a/script/public-inbox-mda +++ b/script/public-inbox-mda @@ -75,13 +75,19 @@ if ($spamc) { my $mime = PublicInbox::MIME->new(\$str); do_exit(0) unless $spam_ok; -my $fcfg = $dst->{filter} || ''; -# -mda defaults to the strict base filter -if ($fcfg eq '') { - $dst->{filter} = 'PublicInbox::Filter::Base'; -} elsif ($fcfg eq 'scrub') { # legacy alias, undocumented, remove? - $dst->{filter} = 'PublicInbox::Filter::Mirror'; +# -mda defaults to the strict base filter which we won't use anywhere else +sub mda_filter_adjust ($) { + my ($ibx) = @_; + my $fcfg = $ibx->{filter} || ''; + if ($fcfg eq '') { + $ibx->{filter} = 'PublicInbox::Filter::Base'; + } elsif ($fcfg eq 'scrub') { # legacy alias, undocumented, remove? 
+ $ibx->{filter} = 'PublicInbox::Filter::Mirror'; + } } + +mda_filter_adjust($dst); + my $filter = $dst->filter; my $ret = $filter->delivery($mime); if (ref($ret) && $ret->isa('Email::MIME')) { # filter altered message -- cgit v1.2.3-24-ge0c7 From 1d9317540bc3fea86dcd512bb54275324ed1b0fa Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 28 Oct 2019 10:45:23 +0000 Subject: mda: skip MIME parsing if spam We don't want to waste cycles parsing the message for MIME bits if it's spam. --- script/public-inbox-mda | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/public-inbox-mda b/script/public-inbox-mda index 71c5d937..69354616 100755 --- a/script/public-inbox-mda +++ b/script/public-inbox-mda @@ -71,9 +71,9 @@ if ($spamc) { my $fh = $emm->fh; read($fh, $str, -s $fh); } +do_exit(0) unless $spam_ok; my $mime = PublicInbox::MIME->new(\$str); -do_exit(0) unless $spam_ok; # -mda defaults to the strict base filter which we won't use anywhere else sub mda_filter_adjust ($) { -- cgit v1.2.3-24-ge0c7 From 6c559dae69e244895fd7e6c5a9ae29f58d03058e Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 28 Oct 2019 10:45:24 +0000 Subject: inboxwritable: add assert_usable_dir sub And use it for mda, since "0" could be a usable directory if somebody insists on using relative paths... 
--- lib/PublicInbox/InboxWritable.pm | 9 ++++++++- lib/PublicInbox/V2Writable.pm | 5 ++--- script/public-inbox-mda | 4 +++- t/import.t | 8 ++++++++ t/v2writable.t | 12 ++++++++++++ 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index ab7b0ed5..9eab394d 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -30,12 +30,19 @@ sub new { $self; } +sub assert_usable_dir { + my ($self) = @_; + my $dir = $self->{inboxdir}; + return $dir if defined($dir) && $dir ne ''; + die "no inboxdir defined for $self->{name}\n"; +} + sub init_inbox { my ($self, $shards, $skip_epoch, $skip_artnum) = @_; # TODO: honor skip_artnum my $v = $self->{version} || 1; if ($v == 1) { - my $dir = $self->{inboxdir} or die "no inboxdir in inbox\n"; + my $dir = assert_usable_dir($self); PublicInbox::Import::init_bare($dir); } else { my $v2w = importer($self); diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index ad2e8e62..1825da2c 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -77,7 +77,8 @@ sub new { # $creat may be any true value, or 0/undef. A hashref is true, # and $creat->{nproc} may be set to an integer my ($class, $v2ibx, $creat) = @_; - my $dir = $v2ibx->{inboxdir} or die "no inboxdir in inbox\n"; + $v2ibx = PublicInbox::InboxWritable->new($v2ibx); + my $dir = $v2ibx->assert_usable_dir; unless (-d $dir) { if ($creat) { require File::Path; @@ -86,8 +87,6 @@ sub new { die "$dir does not exist\n"; } } - - $v2ibx = PublicInbox::InboxWritable->new($v2ibx); $v2ibx->umask_prepare; my $xpfx = "$dir/xap" . 
PublicInbox::Search::SCHEMA_VERSION; diff --git a/script/public-inbox-mda b/script/public-inbox-mda index 69354616..c122984f 100755 --- a/script/public-inbox-mda +++ b/script/public-inbox-mda @@ -49,8 +49,10 @@ if (!defined $dst) { } defined $dst or do_exit(67); # EX_NOUSER 5.1.1 user unknown } -$dst->{inboxdir} or do_exit(67); + $dst = PublicInbox::InboxWritable->new($dst); +eval { $dst->assert_usable_dir }; +do_exit(67) if $@; # pre-check, MDA has stricter rules than an importer might; if ($precheck && !PublicInbox::MDA->precheck($simple, $dst->{address})) { diff --git a/t/import.t b/t/import.t index 4ec3c4f3..d309eec5 100644 --- a/t/import.t +++ b/t/import.t @@ -96,4 +96,12 @@ is(undef, $im->checkpoint, 'checkpoint works before ->done'); $im->done; is(undef, $im->checkpoint, 'checkpoint works after ->done'); $im->checkpoint; + +my $nogit = PublicInbox::Git->new("$dir/non-existent/dir"); +eval { + my $nope = PublicInbox::Import->new($nogit, 'nope', 'no@example.com'); + $nope->add($mime); +}; +ok($@, 'Import->add fails on non-existent dir'); + done_testing(); diff --git a/t/v2writable.t b/t/v2writable.t index c2daac2f..06dafe98 100644 --- a/t/v2writable.t +++ b/t/v2writable.t @@ -260,4 +260,16 @@ EOF $im->done; } +my $tmp = { + inboxdir => "$inboxdir/non-existent/subdir", + name => 'nope', + version => 2, + -primary_address => 'test@example.com', +}; +eval { + my $nope = PublicInbox::V2Writable->new($tmp); + $nope->add($mime); +}; +ok($@, 'V2Writable fails on non-existent dir'); + done_testing(); -- cgit v1.2.3-24-ge0c7 From 4315455300e29e4ef0ea2f2d68bf4e86d261ae1d Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 28 Oct 2019 10:45:25 +0000 Subject: mda: prepare for multiple destinations Multiple List-ID headers will be supported in the next commit --- script/public-inbox-mda | 92 +++++++++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 37 deletions(-) diff --git a/script/public-inbox-mda b/script/public-inbox-mda index 
c122984f..821bd9cc 100755 --- a/script/public-inbox-mda +++ b/script/public-inbox-mda @@ -37,27 +37,39 @@ my $config = PublicInbox::Config->new; my $key = 'publicinboxmda.spamcheck'; my $default = 'PublicInbox::Spamcheck::Spamc'; my $spamc = PublicInbox::Spamcheck::get($config, $key, $default); -my $dst; +my $dests = []; my $recipient = $ENV{ORIGINAL_RECIPIENT}; if (defined $recipient) { - $dst = $config->lookup($recipient); # first check + my $ibx = $config->lookup($recipient); # first check + push @$dests, $ibx if $ibx; } -if (!defined $dst) { - $dst = PublicInbox::MDA->inbox_for_list_id($config, $simple); - if (!defined $dst && !defined $recipient) { +if (!scalar(@$dests)) { + my $ibx = PublicInbox::MDA->inbox_for_list_id($config, $simple); + if (!defined($ibx) && !defined($recipient)) { die "ORIGINAL_RECIPIENT not defined in ENV\n"; } - defined $dst or do_exit(67); # EX_NOUSER 5.1.1 user unknown + defined($ibx) or do_exit(67); # EX_NOUSER 5.1.1 user unknown + push @$dests, $ibx; } -$dst = PublicInbox::InboxWritable->new($dst); -eval { $dst->assert_usable_dir }; -do_exit(67) if $@; +my $err; +@$dests = grep { + my $ibx = PublicInbox::InboxWritable->new($_); + eval { $ibx->assert_usable_dir }; + if ($@) { + warn $@; + $err = 1; + 0; + # pre-check, MDA has stricter rules than an importer might; + } elsif ($precheck) { + !!PublicInbox::MDA->precheck($simple, $ibx->{address}); + } else { + 1; + } +} @$dests; + +do_exit(67) if $err && scalar(@$dests) == 0; -# pre-check, MDA has stricter rules than an importer might; -if ($precheck && !PublicInbox::MDA->precheck($simple, $dst->{address})) { - do_exit(0); -} $simple = undef; my $spam_ok; if ($spamc) { @@ -75,8 +87,6 @@ if ($spamc) { } do_exit(0) unless $spam_ok; -my $mime = PublicInbox::MIME->new(\$str); - # -mda defaults to the strict base filter which we won't use anywhere else sub mda_filter_adjust ($) { my ($ibx) = @_; @@ -88,30 +98,38 @@ sub mda_filter_adjust ($) { } } -mda_filter_adjust($dst); +my @rejects; +for 
my $ibx (@$dests) { + mda_filter_adjust($ibx); + my $filter = $ibx->filter; + my $mime = PublicInbox::MIME->new($str); + my $ret = $filter->delivery($mime); + if (ref($ret) && $ret->isa('Email::MIME')) { # filter altered message + $mime = $ret; + } elsif ($ret == PublicInbox::Filter::Base::IGNORE) { + next; # nothing, keep looping + } elsif ($ret == PublicInbox::Filter::Base::REJECT) { + push @rejects, $filter->err; + next; + } -my $filter = $dst->filter; -my $ret = $filter->delivery($mime); -if (ref($ret) && $ret->isa('Email::MIME')) { # filter altered message - $mime = $ret; -} elsif ($ret == PublicInbox::Filter::Base::IGNORE) { - do_exit(0); # chuck it to emergency -} elsif ($ret == PublicInbox::Filter::Base::REJECT) { - $! = 65; # EX_DATAERR 5.6.0 data format error - die $filter->err, "\n"; -} # else { accept -$filter = undef; + PublicInbox::MDA->set_list_headers($mime, $ibx); + my $im = $ibx->importer(0); + if (defined $im->add($mime)) { + # ->abort is idempotent, no emergency if a single + # destination succeeds + $emm->abort; + } else { # v1-only + my $mid = $mime->header_obj->header_raw('Message-ID'); + # this message is similar to what ssoma-mda shows: + print STDERR "CONFLICT: Message-ID: $mid exists\n"; + } + $im->done; +} -PublicInbox::MDA->set_list_headers($mime, $dst); -my $im = $dst->importer(0); -if (defined $im->add($mime)) { - $emm = $emm->abort; -} else { - # this message is similar to what ssoma-mda shows: - print STDERR "CONFLICT: Message-ID: ", - $mime->header_obj->header_raw('Message-ID'), - " exists\n"; +if (scalar(@rejects) && scalar(@rejects) == scalar(@$dests)) { + $! 
= 65; # EX_DATAERR 5.6.0 data format error + die join("\n", @rejects, ''); } -$im->done; do_exit(0); -- cgit v1.2.3-24-ge0c7 From 74a3206babe0572a1494500d21267a31873af7b0 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 28 Oct 2019 10:45:26 +0000 Subject: mda: support multiple List-ID matches While it's not RFC2919-conformant, mail software can theoretically set multiple List-ID headers. Deliver to all inboxes which match a given List-ID since that's likely what was intended. Cc: Eric W. Biederman Link: https://public-inbox.org/meta/87pniltscf.fsf@x220.int.ebiederm.org/ --- lib/PublicInbox/MDA.pm | 20 +++++++++++++------- script/public-inbox-learn | 5 +++-- script/public-inbox-mda | 7 +++---- t/mda.t | 19 +++++++++++++++++++ 4 files changed, 38 insertions(+), 13 deletions(-) diff --git a/lib/PublicInbox/MDA.pm b/lib/PublicInbox/MDA.pm index ce2c870f..b0dfac45 100644 --- a/lib/PublicInbox/MDA.pm +++ b/lib/PublicInbox/MDA.pm @@ -83,19 +83,25 @@ sub set_list_headers { } } -# TODO: deal with multiple List-ID headers? -sub inbox_for_list_id ($$) { +sub inboxes_for_list_id ($$) { my ($klass, $config, $simple) = @_; # newer Email::Simple allows header_raw, as does Email::MIME: - my $list_id = $simple->can('header_raw') ? + my @list_ids = $simple->can('header_raw') ? 
$simple->header_raw('List-Id') : $simple->header('List-Id'); - my $ibx; - if (defined $list_id && $list_id =~ /<[ \t]*(.+)?[ \t]*>/) { - $ibx = $config->lookup_list_id($1); + my @dests; + for my $list_id (@list_ids) { + $list_id =~ /<[ \t]*(.+)?[ \t]*>/ or next; + if (my $ibx = $config->lookup_list_id($1)) { + push @dests, $ibx; + } + } + if (scalar(@list_ids) > 1) { + warn "W: multiple List-IDs in message:\n"; + warn "W: List-ID: $_\n" for @list_ids } - $ibx; + \@dests; } 1; diff --git a/script/public-inbox-learn b/script/public-inbox-learn index 79f3ead5..3073294a 100644 --- a/script/public-inbox-learn +++ b/script/public-inbox-learn @@ -95,8 +95,9 @@ if ($train eq 'spam') { next if $seen{"$ibx"}++; remove_or_add($ibx, $train, $addr); } - my $ibx = PublicInbox::MDA->inbox_for_list_id($pi_config, $mime); - if ($ibx && !$seen{"$ibx"}) { + my $dests = PublicInbox::MDA->inboxes_for_list_id($pi_config, $mime); + for my $ibx (@$dests) { + next if $seen{"$ibx"}++; remove_or_add($ibx, $train, $ibx->{-primary_address}); } } diff --git a/script/public-inbox-mda b/script/public-inbox-mda index 821bd9cc..dca8a0ea 100755 --- a/script/public-inbox-mda +++ b/script/public-inbox-mda @@ -44,12 +44,11 @@ if (defined $recipient) { push @$dests, $ibx if $ibx; } if (!scalar(@$dests)) { - my $ibx = PublicInbox::MDA->inbox_for_list_id($config, $simple); - if (!defined($ibx) && !defined($recipient)) { + $dests = PublicInbox::MDA->inboxes_for_list_id($config, $simple); + if (!scalar(@$dests) && !defined($recipient)) { die "ORIGINAL_RECIPIENT not defined in ENV\n"; } - defined($ibx) or do_exit(67); # EX_NOUSER 5.1.1 user unknown - push @$dests, $ibx; + scalar(@$dests) or do_exit(67); # EX_NOUSER 5.1.1 user unknown } my $err; diff --git a/t/mda.t b/t/mda.t index 99592b2d..35811ac6 100644 --- a/t/mda.t +++ b/t/mda.t @@ -308,6 +308,25 @@ EOF my $cur = `git --git-dir=$maindir diff HEAD~1..HEAD`; like($cur, qr/this message would not be accepted without --no-precheck/, '--no-precheck delivered 
message anyways'); + + # try a message with multiple List-ID headers + $in = < +List-ID: <$list_id> +Message-ID: <2lids\@example> +Subject: two List-IDs +From: user +To: $addr +Date: Fri, 02 Oct 1993 00:00:00 +0000 + +EOF + ($out, $err) = ('', ''); + IPC::Run::run([$mda], \$in, \$out, \$err); + is($?, 0, 'mda OK with multiple List-Id matches'); + $cur = `git --git-dir=$maindir diff HEAD~1..HEAD`; + like($cur, qr/Message-ID: <2lids\@example>/, + 'multi List-ID match delivered'); + like($err, qr/multiple List-ID/, 'warned about multiple List-ID'); } done_testing(); -- cgit v1.2.3-24-ge0c7 From 4da1c13914d958807fe9ef347d3a6abf7b129b62 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 28 Oct 2019 10:45:28 +0000 Subject: doc: add public-inbox-learn(1) manpage Tools intended for end users need manpages. --- Documentation/include.mk | 1 + Documentation/public-inbox-learn.pod | 86 ++++++++++++++++++++++++++++++++++++ MANIFEST | 1 + 3 files changed, 88 insertions(+) create mode 100644 Documentation/public-inbox-learn.pod diff --git a/Documentation/include.mk b/Documentation/include.mk index d2357ffc..bb622c1a 100644 --- a/Documentation/include.mk +++ b/Documentation/include.mk @@ -41,6 +41,7 @@ m1 += public-inbox-edit m1 += public-inbox-httpd m1 += public-inbox-index m1 += public-inbox-init +m1 += public-inbox-learn m1 += public-inbox-mda m1 += public-inbox-nntpd m1 += public-inbox-watch diff --git a/Documentation/public-inbox-learn.pod b/Documentation/public-inbox-learn.pod new file mode 100644 index 00000000..b8190b59 --- /dev/null +++ b/Documentation/public-inbox-learn.pod @@ -0,0 +1,86 @@ +=head1 NAME + +public-inbox-learn - spam trainer and remover for public-inbox + +=head1 SYNOPSIS + +B EMESSAGE + +=head1 DESCRIPTION + +public-inbox-learn can remove spam or inject ham messages into +an inbox while training a SpamAssassin instance. + +It is intended for users of L or +L, but not users relying on +L to mirror inboxes. 
+ +It reads one message from standard input and operates on it +depending on the command given: + +=head1 COMMANDS + +public-inbox-learn takes one of the following commands as its +first and only argument: + +=over 8 + +=item spam + +Treat the message as spam. This will mark the message as +removed so it becomes inaccessible via NNTP or WWW endpoints +for all configured inboxes. + +The message remains accessible in git history. + +It will also be fed to L for training purposes unless +C is C in L. + +=item ham + +Treat standard input as ham. This is useful for manually injecting +messages into the archives which failed the spam check run by +L or L. + +It relies on the C, C, and C headers +to match configured inbox addresses and C directives. + +It will also be fed to L for training purposes unless +C is C in L. + +=item rm + +This is identical to the C command above, but does +not feed the message to L + +=back + +=head1 ENVIRONMENT + +=over 8 + +=item PI_CONFIG + +Per-user config file parseable by L. +See L. + +Default: ~/.public-inbox/config + +=back + +=head1 CONTACT + +Feedback welcome via plain-text mail to L + +The mail archives are hosted at L +and L + +=head1 COPYRIGHT + +Copyright 2019 all contributors L + +License: AGPL-3.0+ L + +=head1 SEE ALSO + +L, L, L diff --git a/MANIFEST b/MANIFEST index 7d2ac17c..d1b6749a 100644 --- a/MANIFEST +++ b/MANIFEST @@ -22,6 +22,7 @@ Documentation/public-inbox-edit.pod Documentation/public-inbox-httpd.pod Documentation/public-inbox-index.pod Documentation/public-inbox-init.pod +Documentation/public-inbox-learn.pod Documentation/public-inbox-mda.pod Documentation/public-inbox-nntpd.pod Documentation/public-inbox-overview.pod -- cgit v1.2.3-24-ge0c7