about summary refs log tree commit homepage
diff options
context:
space:
mode:
-rw-r--r--Documentation/dc-dlvr-spam-flow.txt38
-rwxr-xr-xscripts/dc-dlvr12
-rw-r--r--scripts/dc-dlvr.pre12
-rwxr-xr-xscripts/report-spam25
4 files changed, 74 insertions, 13 deletions
diff --git a/Documentation/dc-dlvr-spam-flow.txt b/Documentation/dc-dlvr-spam-flow.txt
new file mode 100644
index 00000000..2cdcefa5
--- /dev/null
+++ b/Documentation/dc-dlvr-spam-flow.txt
@@ -0,0 +1,38 @@
+dc-dlvr spam/ham training system flow
+-------------------------------------
+
+An overview of the Maildir + inotify-based spam training system Eric
+uses on his mail server.  This idea may be implemented for kqueue-based
+systems, too.
+
+The idea is to use inotify (via incron) to watch for new files appearing
+in Maildirs.  We only want to train seen messages as ham, and old (but
+not necessarily seen) messages as spam.  The overall goal of this is to
+allow a user to train their filters without leaving their favorite mail
+user agent.
+
+Every message written to Maildir involves a rename, so we only
+have incron watch for IN_MOVED_TO events.
+
+The generic flow is as follows, all for a single Unix user account:
+
+    incron -> report-spam +-> sendmail -> MTA -> dc-dlvr -> spamc -> spamd
+                          |
+                          V
+                         ...
+
+For public-inbox, Eric uses a separate Unix account ("pi") to add a
+layer of protection from fat-fingering something.  So his report-spam
+script delivers to a second recipient for training, the "pi" user:
+                         ...
+                          |
+                          +-> sendmail -> MTA -> dc-dlvr
+                                                    |
+                                                    V
+                                            ~pi/.dc-dlvr.pre
+                                                    |
+                                                    V
+                                           public-inbox-learn
+
+public-inbox-learn will then internally handle the "spamc -> spamd"
+delivery path as well as calling ssoma-rm on falsely trained messages.
diff --git a/scripts/dc-dlvr b/scripts/dc-dlvr
index 68123f84..ca64505c 100755
--- a/scripts/dc-dlvr
+++ b/scripts/dc-dlvr
@@ -1,6 +1,7 @@
 #!/bin/sh
 # Copyright (C) 2008-2013, Eric Wong <e@80x24.org>
 # License: GPLv3 or later <http://www.gnu.org/licenses/gpl-3.0.txt>
+# This is installed as /etc/dc-dlvr on my system
 # to use with postfix main.cf: mailbox_command = /etc/dc-dlvr "$EXTENSION"
 DELIVER=/usr/lib/dovecot/deliver
 
@@ -11,7 +12,7 @@ catchall) exec $DELIVER ;;
 esac
 
 # change if your spamc/spamd listens elsewhere
-spamc='spamc -U /run/spamd.sock'
+spamc='spamc'
 
 # allow plus addressing to train spam filters, $1 is the $EXTENSION
 # which may be "trainspam" or "trainham".  Only allow spam training
@@ -30,11 +31,14 @@ then
         set -e
         cat > $TMPMSG
         DEFAULT_INBOX=$(. ~/.dc-dlvr.pre)
-        if test xINBOX != x"$DEFAULT_INBOX"
-        then
+        case $DEFAULT_INBOX in
+        '') exec rm -f $rm_list ;;
+        INBOX) ;; # do nothing
+        *)
                 $DELIVER -m $DEFAULT_INBOX < $TMPMSG
                 exec rm -f $rm_list
-        fi
+                ;;
+        esac
         PREMSG=$(mktemp -t dc-dlvr.orig.$USER.XXXXXX || exit 1)
         rm_list="$rm_list $PREMSG"
         set +e
diff --git a/scripts/dc-dlvr.pre b/scripts/dc-dlvr.pre
new file mode 100644
index 00000000..9183a96e
--- /dev/null
+++ b/scripts/dc-dlvr.pre
@@ -0,0 +1,12 @@
+# Copyright (C) 2014, Eric Wong <e@80x24.org>
+# License: AGPLv3 or later <http://www.gnu.org/licenses/agpl-3.0.txt>
+# sourced by /etc/dc-dlvr as ~$PI_USER/.dc-dlvr.pre, this just exits,
+# aborting /etc/dc-dlvr
+export PATH=/usr/local/bin:/usr/bin:/bin
+exec 2>> ~/log/dc-dlvr.pre.err
+trap 'err=$?; set +e; test $err -eq 0 || rm -f $TMPMSG; exit $err' EXIT
+case $1,$CLIENT_ADDRESS in
+pispam,) exec public-inbox-learn spam < $TMPMSG ;;
+piham,) exec public-inbox-learn ham < $TMPMSG ;;
+esac
+exec public-inbox-mda < $TMPMSG
diff --git a/scripts/report-spam b/scripts/report-spam
index 75200431..0015ef0b 100755
--- a/scripts/report-spam
+++ b/scripts/report-spam
@@ -1,12 +1,11 @@
 #!/bin/sh
-# Copyright (C) 2008-2013, Eric Wong <e@80x24.org>
+# Copyright (C) 2008-2014, Eric Wong <e@80x24.org>
 # License: GPLv3 or later <http://www.gnu.org/licenses/gpl-3.0.txt>
 # Usage: report-spam /path/to/message/in/maildir
-# This is intended to be used with incron or similar systems.
+# This is intended for use with incron or similar systems.
 # my incrontab(5) looks like this:
-#  /path/to/.maildir/cur IN_MOVED_TO /path/to/report-spam $@/$#
-#  /path/to/.maildir/.INBOX.good/cur IN_MOVED_TO /path/to/report-spam $@/$#
-#  /path/to/.maildir/.INBOX.spam/cur IN_MOVED_TO /path/to/report-spam $@/$#
+#  /path/to/maildir/.INBOX.good/cur IN_MOVED_TO /path/to/report-spam $@/$#
+#  /path/to/maildir/.INBOX.spam/cur IN_MOVED_TO /path/to/report-spam $@/$#
 
 # gigantic emails tend not to be spam (but they suck anyways...)
 bytes=$(stat -c %s $1)
@@ -21,18 +20,26 @@ fi
 # incrond has no concurrency limits and will fork a new process on
 # every single event, which sucks with rename storms when a client
 # commits folder changes.  The sendmail executable exits quickly and
-# queues up the message for training.  This shoudl also ensure fairness
+# queues up the message for training.  This should also ensure fairness
 # to newly arriving mail.  Instead of installing/configuring
 # another queueing system, I reuse the queue in the MTA.
-# See scripts/dc-dlvr for corresponding trainspam/trainham handlers.
+# See scripts/dc-dlvr for corresponding trainspam/trainham handlers,
+# which are for my personal bayes training, and scripts/dc-dlvr.pre
+# for the pispam/piham handlers for training emails going to public-inbox.
+
+DO_SENDMAIL='/usr/sbin/sendmail -oi'
+PI_USER=pi
+
 case $1 in
 *[/.]spam/cur/*) # non-new messages in spam get trained
-        exec /usr/sbin/sendmail -oem -oi $USER+trainspam < $1
+        $DO_SENDMAIL $PI_USER+pispam < $1
+        exec $DO_SENDMAIL $USER+trainspam < $1
         ;;
 *:2,*S*) # otherwise, seen messages only
         case $1 in
         *:2,*T*) exit 0 ;; # ignore trashed messages
         esac
-        exec /usr/sbin/sendmail -oem -oi $USER+trainham < $1
+        $DO_SENDMAIL $PI_USER+piham < $1
+        exec $DO_SENDMAIL $USER+trainham < $1
         ;;
 esac