diff options
-rw-r--r-- | Documentation/include.mk | 1 | ||||
-rw-r--r-- | Documentation/public-inbox-learn.pod | 86 | ||||
-rw-r--r-- | MANIFEST | 1 | ||||
-rw-r--r-- | lib/PublicInbox/Filter/Base.pm | 1 | ||||
-rw-r--r-- | lib/PublicInbox/InboxWritable.pm | 9 | ||||
-rw-r--r-- | lib/PublicInbox/MDA.pm | 21 | ||||
-rw-r--r-- | lib/PublicInbox/V2Writable.pm | 5 | ||||
-rw-r--r--[-rwxr-xr-x] | script/public-inbox-learn | 76 | ||||
-rwxr-xr-x | script/public-inbox-mda | 110 | ||||
-rw-r--r-- | t/import.t | 8 | ||||
-rw-r--r-- | t/mda.t | 19 | ||||
-rw-r--r-- | t/v2writable.t | 12 |
12 files changed, 269 insertions, 80 deletions
diff --git a/Documentation/include.mk b/Documentation/include.mk index d2357ffc..bb622c1a 100644 --- a/Documentation/include.mk +++ b/Documentation/include.mk @@ -41,6 +41,7 @@ m1 += public-inbox-edit m1 += public-inbox-httpd m1 += public-inbox-index m1 += public-inbox-init +m1 += public-inbox-learn m1 += public-inbox-mda m1 += public-inbox-nntpd m1 += public-inbox-watch diff --git a/Documentation/public-inbox-learn.pod b/Documentation/public-inbox-learn.pod new file mode 100644 index 00000000..b8190b59 --- /dev/null +++ b/Documentation/public-inbox-learn.pod @@ -0,0 +1,86 @@ +=head1 NAME + +public-inbox-learn - spam trainer and remover for public-inbox + +=head1 SYNOPSIS + +B<public-inbox-learn> <spam|ham|rm> E<lt>MESSAGE + +=head1 DESCRIPTION + +public-inbox-learn can remove spam or inject ham messages into +an inbox while training a SpamAssassin instance. + +It is intended for users of L<public-inbox-mda(1)> or +L<public-inbox-watch(1)>, but not users relying on +L<git-fetch(1)> to mirror inboxes. + +It reads one message from standard input and operates on it +depending on the command given: + +=head1 COMMANDS + +public-inbox-learn takes one of the following commands as its +first and only argument: + +=over 8 + +=item spam + +Treat the message as spam. This will mark the message as +removed so it becomes inaccessible via NNTP or WWW endpoints +for all configured inboxes. + +The message remains accessible in git history. + +It will also be fed to L<spamc(1)> for training purposes unless +C<publicinboxmda.spamcheck> is C<none> in L<public-inbox-config(5)>. + +=item ham + +Treat standard input as ham. This is useful for manually injecting +messages into the archives which failed the spam check run by +L<public-inbox-mda(1)> or L<public-inbox-watch(1)>. + +It relies on the C<To:>, C<Cc:>, and C<List-ID:> headers +to match configured inbox addresses and C<listid> directives. + +It will also be fed to L<spamc(1)> for training purposes unless +C<publicinboxmda.spamcheck> is C<none> in L<public-inbox-config(5)>. + +=item rm + +This is identical to the C<spam> command above, but does +not feed the message to L<spamc(1)> + +=back + +=head1 ENVIRONMENT + +=over 8 + +=item PI_CONFIG + +Per-user config file parseable by L<git-config(1)>. +See L<public-inbox-config(5)>. + +Default: ~/.public-inbox/config + +=back + +=head1 CONTACT + +Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> + +The mail archives are hosted at L<https://public-inbox.org/meta/> +and L<http://hjrcffqmbrq6wope.onion/meta/> + +=head1 COPYRIGHT + +Copyright 2019 all contributors L<mailto:meta@public-inbox.org> + +License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt> + +=head1 SEE ALSO + +L<spamc(1)>, L<public-inbox-mda(1)>, L<public-inbox-watch(1)> @@ -22,6 +22,7 @@ Documentation/public-inbox-edit.pod Documentation/public-inbox-httpd.pod Documentation/public-inbox-index.pod Documentation/public-inbox-init.pod +Documentation/public-inbox-learn.pod Documentation/public-inbox-mda.pod Documentation/public-inbox-nntpd.pod Documentation/public-inbox-overview.pod diff --git a/lib/PublicInbox/Filter/Base.pm b/lib/PublicInbox/Filter/Base.pm index 052cd332..7a0c720f 100644 --- a/lib/PublicInbox/Filter/Base.pm +++ b/lib/PublicInbox/Filter/Base.pm @@ -6,7 +6,6 @@ package PublicInbox::Filter::Base; use strict; use warnings; use PublicInbox::MsgIter; -use constant MAX_MID_SIZE => 244; # max term size - 1 in Xapian sub No ($) { "*** We only accept plain-text mail, No $_[0] ***" } diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index ab7b0ed5..9eab394d 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -30,12 +30,19 @@ sub new { $self; } +sub assert_usable_dir { + my ($self) = @_; + my $dir = $self->{inboxdir}; + return $dir if defined($dir) && $dir ne ''; + die "no inboxdir defined for $self->{name}\n"; +} + sub init_inbox { my ($self, $shards, $skip_epoch, $skip_artnum) = @_; # TODO: honor skip_artnum my $v = $self->{version} || 1; if ($v == 1) { - my $dir = $self->{inboxdir} or die "no inboxdir in inbox\n"; + my $dir = assert_usable_dir($self); PublicInbox::Import::init_bare($dir); } else { my $v2w = importer($self); diff --git a/lib/PublicInbox/MDA.pm b/lib/PublicInbox/MDA.pm index 9cafda13..b0dfac45 100644 --- a/lib/PublicInbox/MDA.pm +++ b/lib/PublicInbox/MDA.pm @@ -83,4 +83,25 @@ sub set_list_headers { } } +sub inboxes_for_list_id ($$) { + my ($klass, $config, $simple) = @_; + + # newer Email::Simple allows header_raw, as does Email::MIME: + my @list_ids = $simple->can('header_raw') ? + $simple->header_raw('List-Id') : + $simple->header('List-Id'); + my @dests; + for my $list_id (@list_ids) { + $list_id =~ /<[ \t]*(.+)?[ \t]*>/ or next; + if (my $ibx = $config->lookup_list_id($1)) { + push @dests, $ibx; + } + } + if (scalar(@list_ids) > 1) { + warn "W: multiple List-IDs in message:\n"; + warn "W: List-ID: $_\n" for @list_ids + } + \@dests; +} + 1; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index ad2e8e62..1825da2c 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -77,7 +77,8 @@ sub new { # $creat may be any true value, or 0/undef. A hashref is true, # and $creat->{nproc} may be set to an integer my ($class, $v2ibx, $creat) = @_; - my $dir = $v2ibx->{inboxdir} or die "no inboxdir in inbox\n"; + $v2ibx = PublicInbox::InboxWritable->new($v2ibx); + my $dir = $v2ibx->assert_usable_dir; unless (-d $dir) { if ($creat) { require File::Path; @@ -86,8 +87,6 @@ sub new { die "$dir does not exist\n"; } } - - $v2ibx = PublicInbox::InboxWritable->new($v2ibx); $v2ibx->umask_prepare; my $xpfx = "$dir/xap" . PublicInbox::Search::SCHEMA_VERSION; diff --git a/script/public-inbox-learn b/script/public-inbox-learn index c4c4d4b9..3073294a 100755..100644 --- a/script/public-inbox-learn +++ b/script/public-inbox-learn @@ -4,7 +4,7 @@ # # Used for training spam (via SpamAssassin) and removing messages from a # public-inbox -my $usage = "$0 (spam|ham) < /path/to/message"; +my $usage = "$0 <spam|ham|rm> </path/to/message"; use strict; use warnings; use PublicInbox::Config; @@ -39,47 +39,26 @@ my $mime = PublicInbox::MIME->new(eval { $data }); -# get all recipients -my %dests; -foreach my $h (qw(Cc To)) { - my $val = $mime->header($h) or next; - foreach my $email (PublicInbox::Address::emails($val)) { - $dests{lc($email)} = 1; - } -} - -if ($train eq 'spam') { - $pi_config->each_inbox(sub { - my ($ibx) = @_; - $ibx = PublicInbox::InboxWritable->new($ibx); - my $im = $ibx->importer(0); - $im->remove($mime, 'spam'); - $im->done; - }); -} - -require PublicInbox::MDA if $train eq "ham"; +sub remove_or_add ($$$) { + my ($ibx, $train, $addr) = @_; -# n.b. message may be cross-posted to multiple public-inboxes -foreach my $recipient (keys %dests) { - my $dst = $pi_config->lookup($recipient) or next; # We do not touch GIT_COMMITTER_* env here so we can track # who trained the message. - $dst->{name} = $ENV{GIT_COMMITTER_NAME} || $dst->{name}; - $dst->{-primary_address} = $ENV{GIT_COMMITTER_EMAIL} || $recipient; - $dst = PublicInbox::InboxWritable->new($dst); - my $im = $dst->importer(0); + $ibx->{name} = $ENV{GIT_COMMITTER_NAME} // $ibx->{name}; + $ibx->{-primary_address} = $ENV{GIT_COMMITTER_EMAIL} // $addr; + $ibx = PublicInbox::InboxWritable->new($ibx); + my $im = $ibx->importer(0); - if ($train eq "spam" || $train eq "rm") { + if ($train eq "rm") { # This needs to be idempotent, as my inotify trainer # may train for each cross-posted message, and this # script already learns for every list in # ~/.public-inbox/config $im->remove($mime, $train); - } else { # $train eq "ham" + } elsif ($train eq "ham") { # no checking for spam here, we assume the message has # been reviewed by a human at this point: - PublicInbox::MDA->set_list_headers($mime, $dst); + PublicInbox::MDA->set_list_headers($mime, $ibx); # Ham messages are trained when they're marked into # a SEEN state, so this is idempotent: @@ -88,6 +67,41 @@ foreach my $recipient (keys %dests) { $im->done; } +# spam is removed from all known inboxes since it is often Bcc:-ed +if ($train eq 'spam') { + $pi_config->each_inbox(sub { + my ($ibx) = @_; + $ibx = PublicInbox::InboxWritable->new($ibx); + my $im = $ibx->importer(0); + $im->remove($mime, 'spam'); + $im->done; + }); +} else { + require PublicInbox::MDA; + + # get all recipients + my %dests; # address => <PublicInbox::Inbox|0(false)> + for ($mime->header('Cc'), $mime->header('To')) { + foreach my $addr (PublicInbox::Address::emails($_)) { + $addr = lc($addr); + $dests{$addr} //= $pi_config->lookup($addr) // 0; + } + } + + # n.b. message may be cross-posted to multiple public-inboxes + my %seen; + while (my ($addr, $ibx) = each %dests) { + next unless ref($ibx); # $ibx may be 0 + next if $seen{"$ibx"}++; + remove_or_add($ibx, $train, $addr); + } + my $dests = PublicInbox::MDA->inboxes_for_list_id($pi_config, $mime); + for my $ibx (@$dests) { + next if !$seen{"$ibx"}++; + remove_or_add($ibx, $train, $ibx->{-primary_address}); + } +} + if ($err) { warn $err; exit 1; diff --git a/script/public-inbox-mda b/script/public-inbox-mda index 584218b5..dca8a0ea 100755 --- a/script/public-inbox-mda +++ b/script/public-inbox-mda @@ -37,28 +37,38 @@ my $config = PublicInbox::Config->new; my $key = 'publicinboxmda.spamcheck'; my $default = 'PublicInbox::Spamcheck::Spamc'; my $spamc = PublicInbox::Spamcheck::get($config, $key, $default); -my $dst; +my $dests = []; my $recipient = $ENV{ORIGINAL_RECIPIENT}; if (defined $recipient) { - $dst = $config->lookup($recipient); # first check + my $ibx = $config->lookup($recipient); # first check + push @$dests, $ibx if $ibx; } -if (!defined $dst) { - my $list_id = $simple->header('List-Id'); - if (defined $list_id && $list_id =~ /<[ \t]*(.+)?[ \t]*>/) { - $dst = $config->lookup_list_id($1); - } - if (!defined $dst && !defined $recipient) { +if (!scalar(@$dests)) { + $dests = PublicInbox::MDA->inboxes_for_list_id($config, $simple); + if (!scalar(@$dests) && !defined($recipient)) { die "ORIGINAL_RECIPIENT not defined in ENV\n"; } - defined $dst or do_exit(67); # EX_NOUSER 5.1.1 user unknown + scalar(@$dests) or do_exit(67); # EX_NOUSER 5.1.1 user unknown } -$dst->{inboxdir} or do_exit(67); -$dst = PublicInbox::InboxWritable->new($dst); -# pre-check, MDA has stricter rules than an importer might; -if ($precheck && !PublicInbox::MDA->precheck($simple, $dst->{address})) { - do_exit(0); -} +my $err; +@$dests = grep { + my $ibx = PublicInbox::InboxWritable->new($_); + eval { $ibx->assert_usable_dir }; + if ($@) { + warn $@; + $err = 1; + 0; + # pre-check, MDA has stricter rules than an importer might; + } elsif ($precheck) { + !!PublicInbox::MDA->precheck($simple, $ibx->{address}); + } else { + 1; + } +} @$dests; + +do_exit(67) if $err && scalar(@$dests) == 0; + $simple = undef; my $spam_ok; if ($spamc) { @@ -74,39 +84,51 @@ if ($spamc) { my $fh = $emm->fh; read($fh, $str, -s $fh); } - -my $mime = PublicInbox::MIME->new(\$str); do_exit(0) unless $spam_ok; -my $fcfg = $dst->{filter} || ''; -# -mda defaults to the strict base filter -if ($fcfg eq '') { - $dst->{filter} = 'PublicInbox::Filter::Base'; -} elsif ($fcfg eq 'scrub') { # legacy alias, undocumented, remove? - $dst->{filter} = 'PublicInbox::Filter::Mirror'; +# -mda defaults to the strict base filter which we won't use anywhere else +sub mda_filter_adjust ($) { + my ($ibx) = @_; + my $fcfg = $ibx->{filter} || ''; + if ($fcfg eq '') { + $ibx->{filter} = 'PublicInbox::Filter::Base'; + } elsif ($fcfg eq 'scrub') { # legacy alias, undocumented, remove? + $ibx->{filter} = 'PublicInbox::Filter::Mirror'; + } +} + +my @rejects; +for my $ibx (@$dests) { + mda_filter_adjust($ibx); + my $filter = $ibx->filter; + my $mime = PublicInbox::MIME->new($str); + my $ret = $filter->delivery($mime); + if (ref($ret) && $ret->isa('Email::MIME')) { # filter altered message + $mime = $ret; + } elsif ($ret == PublicInbox::Filter::Base::IGNORE) { + next; # nothing, keep looping + } elsif ($ret == PublicInbox::Filter::Base::REJECT) { + push @rejects, $filter->err; + next; + } + + PublicInbox::MDA->set_list_headers($mime, $ibx); + my $im = $ibx->importer(0); + if (defined $im->add($mime)) { + # ->abort is idempotent, no emergency if a single + # destination succeeds + $emm->abort; + } else { # v1-only + my $mid = $mime->header_obj->header_raw('Message-ID'); + # this message is similar to what ssoma-mda shows: + print STDERR "CONFLICT: Message-ID: $mid exists\n"; + } + $im->done; } -my $filter = $dst->filter; -my $ret = $filter->delivery($mime); -if (ref($ret) && $ret->isa('Email::MIME')) { # filter altered message - $mime = $ret; -} elsif ($ret == PublicInbox::Filter::Base::IGNORE) { - do_exit(0); # chuck it to emergency -} elsif ($ret == PublicInbox::Filter::Base::REJECT) { - $! = 65; # EX_DATAERR 5.6.0 data format error - die $filter->err, "\n"; -} # else { accept -$filter = undef; -PublicInbox::MDA->set_list_headers($mime, $dst); -my $im = $dst->importer(0); -if (defined $im->add($mime)) { - $emm = $emm->abort; -} else { - # this message is similar to what ssoma-mda shows: - print STDERR "CONFLICT: Message-ID: ", - $mime->header_obj->header_raw('Message-ID'), - " exists\n"; +if (scalar(@rejects) && scalar(@rejects) == scalar(@$dests)) { + $! = 65; # EX_DATAERR 5.6.0 data format error + die join("\n", @rejects, ''); } -$im->done; do_exit(0); @@ -96,4 +96,12 @@ is(undef, $im->checkpoint, 'checkpoint works before ->done'); $im->done; is(undef, $im->checkpoint, 'checkpoint works after ->done'); $im->checkpoint; + +my $nogit = PublicInbox::Git->new("$dir/non-existent/dir"); +eval { + my $nope = PublicInbox::Import->new($nogit, 'nope', 'no@example.com'); + $nope->add($mime); +}; +ok($@, 'Import->add fails on non-existent dir'); + done_testing(); @@ -308,6 +308,25 @@ EOF my $cur = `git --git-dir=$maindir diff HEAD~1..HEAD`; like($cur, qr/this message would not be accepted without --no-precheck/, '--no-precheck delivered message anyways'); + + # try a message with multiple List-ID headers + $in = <<EOF; +List-ID: <foo.bar> +List-ID: <$list_id> +Message-ID: <2lids\@example> +Subject: two List-IDs +From: user <user\@example.com> +To: $addr +Date: Fri, 02 Oct 1993 00:00:00 +0000 + +EOF + ($out, $err) = ('', ''); + IPC::Run::run([$mda], \$in, \$out, \$err); + is($?, 0, 'mda OK with multiple List-Id matches'); + $cur = `git --git-dir=$maindir diff HEAD~1..HEAD`; + like($cur, qr/Message-ID: <2lids\@example>/, + 'multi List-ID match delivered'); + like($err, qr/multiple List-ID/, 'warned about multiple List-ID'); } done_testing(); diff --git a/t/v2writable.t b/t/v2writable.t index 2b825768..bfe17d0a 100644 --- a/t/v2writable.t +++ b/t/v2writable.t @@ -276,4 +276,16 @@ EOF $im->done; } +my $tmp = { + inboxdir => "$inboxdir/non-existent/subdir", + name => 'nope', + version => 2, + -primary_address => 'test@example.com', +}; +eval { + my $nope = PublicInbox::V2Writable->new($tmp); + $nope->add($mime); +}; +ok($@, 'V2Writable fails on non-existent dir'); + done_testing(); |