diff options
149 files changed, 9570 insertions, 3758 deletions
@@ -1,3 +1,5 @@ +/config.mak +/MANIFEST.gen /Makefile.old /pm_to_blib /MYMETA.* @@ -7,5 +9,6 @@ *.1 *.5 *.7 +*.8 *.html *.gz diff --git a/Documentation/.gitignore b/Documentation/.gitignore index 8ba4186a..107ad36f 100644 --- a/Documentation/.gitignore +++ b/Documentation/.gitignore @@ -1 +1 @@ -/public-inbox-mda.txt +/public-inbox-*.txt diff --git a/Documentation/dc-dlvr-spam-flow.txt b/Documentation/dc-dlvr-spam-flow.txt index 978763ad..81aba766 100644 --- a/Documentation/dc-dlvr-spam-flow.txt +++ b/Documentation/dc-dlvr-spam-flow.txt @@ -35,8 +35,7 @@ script delivers to a second recipient for training, the "pi" user: public-inbox-learn public-inbox-learn will then internally handle the "spamc -> spamd" -delivery path as well as calling ssoma-rm on falsely trained - +delivery path as well as removing the message from the git tree. * incron - run commands based on filesystem events: http://incron.aiken.cz/ @@ -46,7 +45,4 @@ delivery path as well as calling ssoma-rm on falsely trained * spamc / spamd - SpamAssassin: http://spamassassin.apache.org/ * report-spam / dc-dlvr - distributed with public-inbox in the scripts/ - directory: git clone git://80x24.org/public-inbox - -* ssoma-rm - part of ssoma: some sort of mail archiver, a dependency of - public-inbox: git clone git://80x24.org/ssoma + directory: git clone https://public-inbox.org/ public-inbox diff --git a/Documentation/design_www.txt b/Documentation/design_www.txt index 1be4d18e..704f8177 100644 --- a/Documentation/design_www.txt +++ b/Documentation/design_www.txt @@ -1,32 +1,47 @@ -URL naming ----------- +URL and anchor naming +--------------------- ### Unstable endpoints -/$LISTNAME/?r=$GIT_COMMIT -> HTML only -/$LISTNAME/new.atom -> Atom feed +/$INBOX/?r=$GIT_COMMIT -> HTML only +/$INBOX/new.atom -> Atom feed #### Optional, relies on Search::Xapian -/$LISTNAME/$MESSAGE_ID/t/ -> HTML content of thread -/$LISTNAME/$MESSAGE_ID/t.atom -> Atom feed for thread -/$LISTNAME/$MESSAGE_ID/t.mbox.gz -> gzipped mbox of thread +/$INBOX/$MESSAGE_ID/t/ -> HTML content of thread + anchors: + #u location of $MESSAGE_ID in URL + #m<SHA-1> per-message links, where <SHA-1> is of the Message-ID + of each message (stable) + #s<NUM> relative numeric position of message in thread (unstable) + +/$INBOX/$MESSAGE_ID/t.atom -> Atom feed for thread +/$INBOX/$MESSAGE_ID/t.mbox.gz -> gzipped mbox of thread ### Stable endpoints -/$LISTNAME/$MESSAGE_ID/ -> HTML content (short quotes) -/$LISTNAME/$MESSAGE_ID -> 301 to /$LISTNAME/$MESSAGE_ID -/$LISTNAME/$MESSAGE_ID/raw -> raw mbox -/$LISTNAME/$MESSAGE_ID/f/ -> HTML content (full quotes) -/$LISTNAME/$MESSAGE_ID/R/ -> HTML reply instructions +/$INBOX/$MESSAGE_ID/ -> HTML content + anchors: + #r location of the current message in thread skeleton + (requires Xapian search) + #b start of the message body (linked from thread skeleton) + +/$INBOX/$MESSAGE_ID -> 301 to /$INBOX/$MESSAGE_ID/ +/$INBOX/$MESSAGE_ID/raw -> raw mbox +/$INBOX/$MESSAGE_ID/#R -> HTML reply instructions + +# Covering up a pre-1.0 design mistake: +/$INBOX/$MESSAGE_ID/f/ -> 301 to /$INBOX/$MESSAGE_ID/ ### Legacy endpoints (may be ambiguous given Message-IDs with similar suffixes) -/$LISTNAME/m/$MESSAGE_ID/ -> 301 to /$LISTNAME/$MESSAGE_ID/ -/$LISTNAME/m/$MESSAGE_ID.html -> 301 to /$LISTNAME/$MESSAGE_ID/ -/$LISTNAME/m/$MESSAGE_ID.txt -> 301 to /$LISTNAME/$MESSAGE_ID/raw -/$LISTNAME/f/$MESSAGE_ID.html -> 301 to /$LISTNAME/$MESSAGE_ID/f/ -/$LISTNAME/f/$MESSAGE_ID.txt [1] -> 301 to /$LISTNAME/$MESSAGE_ID/raw +/$INBOX/m/$MESSAGE_ID/ -> 301 to /$INBOX/$MESSAGE_ID/ +/$INBOX/m/$MESSAGE_ID.html -> 301 to /$INBOX/$MESSAGE_ID/ +/$INBOX/m/$MESSAGE_ID.txt -> 301 to /$INBOX/$MESSAGE_ID/raw +/$INBOX/f/$MESSAGE_ID.html -> 301 to /$INBOX/$MESSAGE_ID/ +/$INBOX/f/$MESSAGE_ID.txt [1] -> 301 to /$INBOX/$MESSAGE_ID/raw -/$LISTNAME/atom.xml [2] -> identical to /$LISTNAME/new.atom +/$INBOX/atom.xml [2] -> identical to /$INBOX/new.atom -Additionally, we support "git clone" pointed to http://$HOST/$LISTNAME +Additionally, we support git clone/fetch over HTTP (dumb and smart): + + git clone --mirror http://$HOSTNAME/$INBOX FIXME: we must refactor/cleanup/add tests for most of our CGI before adding more endpoints and features. @@ -41,7 +56,8 @@ Encoding notes -------------- Raw HTML and XML should only contain us-ascii characters which render -to UTF-8. +to UTF-8. We must not rely on users having the necessary fonts +installed to render uncommon characters. Plain text (raw message) endpoints display in the original encoding(s) of the original email. @@ -55,17 +71,18 @@ We also set <title> to make window management easier. We favor <pre>-formatted text since public-inbox is intended as a place to share and discuss patches and code. Unfortunately, long paragraphs tends to be less readable with fixed-width serif fonts which GUI -browsers default to. So perhaps we will add different endpoints for -variable-width fonts. +browsers default to. * No graphics, images, or icons at all. We tolerate, but do not encourage the use of GUIs. * No setting colors or font sizes, power to users to decide those. + We will include and document <span class=?> to support colors + for user-supplied CSS. -* Only one font type (fixed or variable) per page. This is for - accessibility, we must not blow certain elements out-of-proportion - when a reader increases font size. +* Only one font type: fixed. This is for accessibility, we must + not blow certain elements out-of-proportion with different + fonts on the page when a reader increases font size. * Bold and underline elements are OK since they should render fine regardless of chosen font and gracefully degrade if a display does @@ -80,7 +97,16 @@ variable-width fonts. * We only use CSS for one reason: wrapping pre-formatted text This is necessary because unfortunate GUI browsers tend to be - prone to layout widening. w3m is fine here without CSS :) + prone to layout widening from unwrapped mailers. + w3m is fine here without CSS :) No other CSS is allowed, especially with scary things like: http://thejh.net/misc/website-terminal-copy-paste + + However, we will try to make it easy for users to supply their + own colors via user-side CSS. + +CSS classes (for user-supplied CSS) +----------------------------------- +span.q - quoted text in email messages +... diff --git a/Documentation/include.mk b/Documentation/include.mk index 4669ac54..28b69aed 100644 --- a/Documentation/include.mk +++ b/Documentation/include.mk @@ -4,60 +4,85 @@ all:: RSYNC = rsync RSYNC_DEST = public-inbox.org:/srv/public-inbox/ -docs := README COPYING INSTALL TODO $(shell git ls-files 'Documentation/*.txt') +docs := README COPYING INSTALL TODO HACKING +docs += $(shell git ls-files 'Documentation/*.txt') INSTALL = install -POD2MAN = pod2man -POD2MAN_OPTS = -v --stderr -d 1994-10-02 -c 'public-inbox user manual' -pod2man = $(POD2MAN) $(POD2MAN_OPTS) -POD2TEXT = pod2text -POD2TEXT_OPTS = --stderr -pod2text = $(POD2TEXT) $(POD2TEXT_OPTS) - +PODMAN = pod2man +PODMAN_OPTS = -v --stderr -d 1993-10-02 -c 'public-inbox user manual' +PODMAN_OPTS += -r public-inbox.git +podman = $(PODMAN) $(PODMAN_OPTS) +PODTEXT = pod2text +PODTEXT_OPTS = --stderr +podtext = $(PODTEXT) $(PODTEXT_OPTS) + +# MakeMaker only seems to support manpage sections 1 and 3... m1 = m1 += public-inbox-mda +m1 += public-inbox-httpd +m1 += public-inbox-nntpd +m1 += public-inbox-watch +m1 += public-inbox-index m5 = +m5 += public-inbox-config m7 = +m7 += public-inbox-overview +m8 = +m8 += public-inbox-daemon man1 := $(addsuffix .1, $(m1)) man5 := $(addsuffix .5, $(m5)) man7 := $(addsuffix .7, $(m7)) +man8 := $(addsuffix .8, $(m8)) all:: man html -man: $(man1) $(man5) $(man7) +man: $(man1) $(man5) $(man7) $(man8) +prefix ?= $(PREFIX) prefix ?= $(HOME) mandir ?= $(prefix)/share/man man1dir = $(mandir)/man1 man5dir = $(mandir)/man5 man7dir = $(mandir)/man7 +man8dir = $(mandir)/man8 install-man: man - test -z "$(man1)" || $(INSTALL) -d -m 755 $(DESTDIR)$(man1dir) - test -z "$(man5)" || $(INSTALL) -d -m 755 $(DESTDIR)$(man5dir) - test -z "$(man7)" || $(INSTALL) -d -m 755 $(DESTDIR)$(man7dir) - test -z "$(man1)" || $(INSTALL) -m 644 $(man1) $(DESTDIR)$(man1dir) - test -z "$(man5)" || $(INSTALL) -m 644 $(man5) $(DESTDIR)$(man5dir) - test -z "$(man7)" || $(INSTALL) -m 644 $(man7) $(DESTDIR)$(man7dir) - -%.1 : Documentation/%.pod - $(pod2man) -s 1 $< $@+ && mv $@+ $@ - -mantxt = $(addprefix Documentation/, $(addsuffix .txt, $(m1))) + $(INSTALL) -d -m 755 $(DESTDIR)$(man1dir) + $(INSTALL) -d -m 755 $(DESTDIR)$(man5dir) + $(INSTALL) -d -m 755 $(DESTDIR)$(man7dir) + $(INSTALL) -d -m 755 $(DESTDIR)$(man8dir) + $(INSTALL) -m 644 $(man1) $(DESTDIR)$(man1dir) + $(INSTALL) -m 644 $(man5) $(DESTDIR)$(man5dir) + $(INSTALL) -m 644 $(man7) $(DESTDIR)$(man7dir) + $(INSTALL) -m 644 $(man8) $(DESTDIR)$(man8dir) + +doc_install :: install-man + +%.1 %.5 %.7 %.8 : Documentation/%.pod + $(podman) -s $(subst .,,$(suffix $@)) $< $@+ && mv $@+ $@ + +manuals := +manuals += $(m1) +manuals += $(m5) +manuals += $(m7) +manuals += $(m8) + +mantxt = $(addprefix Documentation/, $(addsuffix .txt, $(manuals))) docs += $(mantxt) all :: $(mantxt) Documentation/%.txt : Documentation/%.pod - $(pod2text) $< $@+ && mv $@+ $@ + $(podtext) $< $@+ && mv $@+ $@ -txt2pre = ./Documentation/txt2pre < $< > $@+ && touch -r $< $@+ && mv $@+ $@ +txt2pre = $(PERL) -I lib ./Documentation/txt2pre <$< >$@+ && \ + touch -r $< $@+ && mv $@+ $@ txt := INSTALL README COPYING TODO dtxt := design_notes.txt design_www.txt dc-dlvr-spam-flow.txt dtxt := $(addprefix Documentation/, $(dtxt)) $(mantxt) %.html: %.txt - $(txt2pre) + TITLE="$(basename $(<F))" $(txt2pre) %.html: % $(txt2pre) @@ -66,7 +91,7 @@ html: $(docs_html) gz_docs := $(addsuffix .gz, $(docs) $(docs_html)) rsync_docs := $(gz_docs) $(docs) $(txt) $(docs_html) %.gz: % - gzip -9 --rsyncable < $< > $@+ + gzip -9 --rsyncable <$< >$@+ touch -r $< $@+ mv $@+ $@ diff --git a/Documentation/public-inbox-config.pod b/Documentation/public-inbox-config.pod new file mode 100644 index 00000000..00376457 --- /dev/null +++ b/Documentation/public-inbox-config.pod @@ -0,0 +1,151 @@ +=head1 NAME + +public-inbox-config - public-inbox config file description + +=head1 SYNOPSIS + +~/.public-inbox/config + +=head1 DESCRIPTION + +The public-inbox config file is parseable by L<git-config(1)>. +This is a global configuration file for mapping/discovering +all public-inboxes used by a particular user. + +=head1 CONFIGURATION FILE + +=head2 EXAMPLE + + [publicinbox "test"] + mainrepo = /home/user/path/to/test.git + ; multiple addresses are supported + address = test@example.com + ; address = alternate@example.com + url = http://example.com/test + newsgroup = inbox.test + +=head2 VARIABLES + +=over 8 + +=item publicinbox.<name>.address + +The email address of the public-inbox. May be specified +more than once for merging multiple mailing lists (or migrating +to new addresses). This must be specified at least once, +the first value will be considered the primary address for +informational purposes. + +Default: none, required + +=item publicinbox.<name>.mainrepo + +The absolute path to the git repository which hosts the +public-inbox. This must be specified once. + +Default: none, required + +=item publicinbox.<name>.url + +The primary URL for hosting the HTTP/HTTPS archives. +Additional HTTP/HTTPS URLs may be specified via +C<$GIT_DIR/cloneurl> as documented in L<gitweb(1)> + +Default: none, optional + +=item publicinbox.<name>.newsgroup + +The NNTP group name for use with L<public-inbox-nntpd(8)>. This +may be any newsgroup name with hierarchies delimited by '.'. +For example, the newsgroup for L<mailto:meta@public-inbox.org> +is: C<inbox.comp.mail.public-inbox.meta> + +Omitting this for the given inbox will prevent the group from +being read by L<public-inbox-nntpd(1)> + +Default: none, optional + +=item publicinbox.<name>.watch + +A location for L<public-inbox-watch(1)> to watch. Currently, +only C<maildir:> paths are supported: + + [publicinbox "test"] + watch = maildir:/path/to/maildirs/.INBOX.test/ + +Default: none; only for L<public-inbox-watch(1)> users + +=item publicinbox.<name>.watchheader + + [publicinbox "test"] + watchheader = List-Id:<test.example.com> + +Default: none; only for L<public-inbox-watch(1)> users + +=item publicinbox.<name>.nntpmirror + +This may be the full NNTP URL of an independently-run mirror. +For example, the https://public-inbox.org/meta/ inbox is +mirrored by Gmane at +C<nntp://news.gmane.org/gmane.mail.public-inbox.general> + +Default: none + +=item publicinboxwatch.spamcheck + +This may be set to C<spamc> to enable the use of SpamAssassin +L<spamc(1)> for filtering spam before it is imported into git +history. Other spam filtering backends may be supported in +the future. + +Default: none + +=item publicinboxwatch.watchspam + +This may be set to C<spamc> to enable the use of SpamAssassin +L<spamc(1)> for filtering spam before it is imported into git +history. Other spam filtering backends may be supported in +the future. This only affects L<public-inbox-watch(1)>. + +Default: none + +=item publicinbox.nntpserver + +Set this to point to the address of the L<public-inbox-nntpd(1)> +instance. This is used to advertise the existence of the NNTP +presnce in the L<PublicInbox::WWW> HTML interface. + +Multiple values are allowed for servers with multiple +addresses or mirrors. + +Default: none + +=back + +=head1 ENVIRONMENT + +=over 8 + +=item PI_CONFIG + +Used to override the default "~/.public-inbox/config" value. + +=back + +=head1 CONTACT + +Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> + +The mail archives are hosted at L<https://public-inbox.org/meta/> +and L<http://hjrcffqmbrq6wope.onion/meta/> + +=head1 COPYRIGHT + +Copyright 2016 all contributors L<mailto:meta@public-inbox.org> + +License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt> + +=head1 SEE ALSO + +L<git(1)>, L<git-config(1)>, L<public-inbox-daemon(8)>, +L<public-inbox-mda(1)>, L<public-inbox-watch(1)> diff --git a/Documentation/public-inbox-daemon.pod b/Documentation/public-inbox-daemon.pod new file mode 100644 index 00000000..72794a51 --- /dev/null +++ b/Documentation/public-inbox-daemon.pod @@ -0,0 +1,183 @@ +=head1 NAME + +public-inbox-daemon - common usage for public-inbox network daemons + +=head1 SYNOPSIS + + public-inbox-httpd + public-inbox-nntpd + +=head1 DESCRIPTION + +This manual describes common options and behavior for +public-inbox network daemons. Network daemons for public-inbox +provide read-only NNTP and HTTP access to public-inboxes. Write +access to a public-inbox repository will never be required to +run these. + +These daemons are implemented with a common core using +non-blocking sockets and optimized for fairness; even with +thousands of connected clients over slow links. + +They also provide graceful shutdown/upgrade support to avoid +breaking existing connections during software upgrades. + +These daemons may also utilize multiple pre-forked worker +processes to take advantage of multiple CPUs. + +Native TLS (Transport Layer Security) support is planned. + +=head1 OPTIONS + +=over + +=item -l, --listen ADDRESS + +This takes an absolute path to a Unix socket or HOST:PORT +to listen on. For example, to listen to TCP connections on +port 119, use: C<-l 0.0.0.0:119>. This may also point to +a Unix socket (C<-l /path/to/http.sock>) for a reverse proxy +like L<nginx(1)> to use. + +May be specified multiple times to allow listening on multiple +sockets. + +Default: server-dependent unless socket activation is used with +L<systemd(1)> or similar (see L<systemd.socket(5)>). + +=item -1, --stdout PATH + +Specify an appendable path to redirect stdout descriptor (1) to. +Using this is preferable to setting up the redirect externally +(e.g. E<gt>E<gt>/path/to/log in shell) since it allows +SIGUSR1 to be handled (see L<SIGNALS/SIGNALS> below). + +Default: /dev/null + +=item -2, --stderr PATH + +Like C<--stdout>, but for the stderr descriptor (2). + +=item -W, --worker-processes + +Set the number of worker processes. + +Normally, this should match the number of CPUs on the system to +take full advantage of the hardware. However, users of +memory-constrained systems may want to lower this. + +Setting this to zero (C<-W0>) disables the master/worker split; +saving some memory but removing the ability to use SIGTTIN +to increase worker processes or have the worker restarted by +the master on crashes. + +Default: 1 + +=back + +=head1 SIGNALS + +Most of our signal handling behavior is copied from L<nginx(1)> +and/or L<starman(1)>; so it is possible to reuse common scripts +for managing them. + +=over 8 + +=item SIGUSR1 + +Reopens log files pointed to by --stdout and --stderr options. + +=item SIGUSR2 + +Spawn a new process with the intention to replace the running one. +See L</UPGRADING> below. + +=item SIGHUP + +Reload config files associated with the process. +(FIXME: not tested for -httpd, yet) + +=item SIGTTIN + +Increase the number of running workers processes by one. + +=item SIGTTOU + +Decrease the number of running worker processes by one. + +=item SIGWINCH + +Stop all running worker processes. SIGHUP or SIGTTIN +may be used to restart workers. + +=item SIGQUIT + +Gracefully terminate the running process. + +=back + +SIGTTOU, SIGTTIN, SIGWINCH all have no effect when worker +processes are disabled with C<-W0> on the command-line. + +=head1 ENVIRONMENT + +=over 8 + +=item PI_CONFIG + +The default config file, normally "~/.public-inbox/config". +See L<public-inbox-config(5)> + +=item LISTEN_FDS, LISTEN_PID + +Used by systemd (and compatible) installations for socket +activation. See L<systemd.socket(5)> and L<sd_listen_fds(3)>. + +=item PERL_INLINE_DIRECTORY + +Pointing this to point to a writable directory enables the use +of L<Inline> and L<Inline::C> extensions which may provide +platform-specific performance improvements. Currently, this +enables the use of L<vfork(2)> which speeds up subprocess +spawning with the Linux kernel. + +public-inbox will never enable L<Inline::C> automatically without +this environment variable set. See L<Inline> and L<Inline::C> +for more details. + +=back + +=head1 UPGRADING + +There are two ways to upgrade a running process. + +Users of process management systems with socket activation +(L<systemd(1)> or similar) may rely on multiple instances For +systemd, this means using two (or more) '@' instances for each +service (e.g. C<SERVICENAME@INSTANCE>) as documented in +L<systemd.unit(5)>. + +Users of traditional SysV init may use SIGUSR2 to spawn +a replacement process and gracefully terminate the old +process using SIGQUIT. + +In either case, the old process will not truncate running +responses; so responses to expensive requests do not get +interrupted and lost. + +=head1 CONTACT + +Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> + +The mail archives are hosted at L<https://public-inbox.org/meta/> +and L<http://hjrcffqmbrq6wope.onion/meta/> + +=head1 COPYRIGHT + +Copyright 2013-2016 all contributors L<mailto:meta@public-inbox.org> + +License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt> + +=head1 SEE ALSO + +L<public-inbox-httpd(1)>, L<public-inbox-nntpd(1)> diff --git a/Documentation/public-inbox-httpd.pod b/Documentation/public-inbox-httpd.pod new file mode 100644 index 00000000..8605d747 --- /dev/null +++ b/Documentation/public-inbox-httpd.pod @@ -0,0 +1,40 @@ +=head1 NAME + +public-inbox-httpd - PSGI server optimized for public-inbox + +=head1 SYNOPSIS + +B<public-inbox-httpd> [OPTIONS] [/path/to/myapp.psgi] + +=head1 DESCRIPTION + +public-inbox-httpd is a PSGI/Plack server supporting HTTP/1.1 +and HTTP/1.0. It uses options and environment variables common +to all L<public-inbox-daemon(8)> implementations in addition to +the PSGI file. + +If a PSGI file is not specified, L<PublicInbox::WWW> is +loaded with a default middleware stack consisting of +L<Plack::Middleware::Deflater>, +L<Plack::Middleware::ReverseProxy>, and +L<Plack::Middleware::Head> + +This may point to a PSGI file for supporting generic PSGI apps. + +=head1 CONTACT + +Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> + +The mail archives are hosted at L<https://public-inbox.org/meta/> +and L<http://hjrcffqmbrq6wope.onion/meta/> + +=head1 COPYRIGHT + +Copyright 2013-2016 all contributors L<mailto:meta@public-inbox.org> + +License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt> + +=head1 SEE ALSO + +L<git(1)>, L<git-config(1)>, L<public-inbox-daemon(8)>, +L<Plack> diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod new file mode 100644 index 00000000..2871f0da --- /dev/null +++ b/Documentation/public-inbox-index.pod @@ -0,0 +1,120 @@ +=head1 NAME + +public-inbox-index - create and update search indices + +=head1 SYNOPSIS + +public-inbox-index [OPTIONS] GIT_DIR + +=head1 DESCRIPTION + +public-inbox-index creates and updates the search and NNTP +article number database used by the read-only public-inbox HTTP +and NNTP interfaces. Currently, this requires L<Search::Xapian> +and L<DBD::SQlite> and L<DBI> Perl modules. + +Once the initial indices are created by public-inbox-index, +L<public-inbox-mda(1)> and L<public-inbox-watch(1)> will +automatically maintain them. + +Running this manually to update indices is only required if +relying on L<git-fetch(1)> to mirror an existing public-inbox; +or if upgrading to a new version of public-inbox using +the C<--reindex> option. + +Having a search and article number database is essential to +running the NNTP interface, and strongly recommended for the +HTTP interface as it provides thread grouping in addition +to normal search functionality. + +=head1 OPTIONS + +=over + +=item --reindex + +Forces a search engine re-index of all messages in the +repository. This can be used for in-place upgrades while +NNTP/HTTP server processes are utilizing the index. Keep in +mind this roughly doubles the size of the already-large +Xapian database. + +This does not touch the NNTP article number database. + +=back + +=head1 FILES + +All public-inbox-specific files are contained within the +C<$GIT_DIR/public-inbox/> directory. All files are expected to +grow in size as more messages are archived, so using compaction +commands (e.g. L<xapian-compact(1)>) is not recommended unless +the list is no longer active. + +=over + +=item $GIT_DIR/public-inbox/msgmap.sqlite3 + +The stable NNTP article number to Message-ID mapping is +stored in an SQLite3 database. + +This is required for users of L<public-inbox-nntpd(1)>, but +users of the L<PublicInbox::WWW> interface will find it +useful for attempting recovery from copy-paste truncations of +URLs containing long Message-IDs. + +Avoid removing this file and regenerating it; it may cause +existing NNTP readers to lose sync and miss (or see duplicate) +messages. + +This file is relatively small, and typically less than 5% +of the space of the mail stored in a packed git repository. + +=item $GIT_DIR/public-inbox/xapian* + +The database used by L<Search::Xapian>. This directory name is +followed by a number indicating the index schema version this +installation of public-inbox uses. + +These directories may be safely deleted or removed in full +while the NNTP and HTTP interfaces are no longer accessing +them. + +In addition to providing a search interface for the HTTP +interface, the Xapian database is used to group and combine +related messages into threads. For NNTP servers, it also +provides a cache of metadata and header information often +requested by NNTP clients. + +This directory is large, often two to three times the size of +the objects stored in a packed git repository. Using the +C<--reindex> option makes it larger, still. + +=back + +=head1 ENVIRONMENT + +=over 8 + +=item PI_CONFIG + +Used to override the default "~/.public-inbox/config" value. + +=back + +=head1 CONTACT + +Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> + +The mail archives are hosted at L<https://public-inbox.org/meta/> +and L<http://hjrcffqmbrq6wope.onion/meta/> + +=head1 COPYRIGHT + +Copyright 2016 all contributors L<mailto:meta@public-inbox.org> + +License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt> + +=head1 SEE ALSO + +L<Search::Xapian>, L<DBD::SQLite> diff --git a/Documentation/public-inbox-mda.pod b/Documentation/public-inbox-mda.pod index cb174714..3a43a1ce 100644 --- a/Documentation/public-inbox-mda.pod +++ b/Documentation/public-inbox-mda.pod @@ -1,17 +1,16 @@ -% public-inbox-mda(1) public-inbox user manual - =head1 NAME -public-inbox-mda - mail delivery for public-inbox +public-inbox-mda - mail delivery agent for public-inbox =head1 SYNOPSIS -B<public-inbox-mda> E<lt> message +B<public-inbox-mda> E<lt>MESSAGE =head1 DESCRIPTION Mail Delivery Agent (MDA) for public-inbox installations. -Each system user may have their own public-inbox instances +Each system user may have their own public-inbox instances. +This may be invoked via L<procmail(1)> or similar tools. =head1 ENVIRONMENT @@ -19,33 +18,37 @@ Each system user may have their own public-inbox instances =item ORIGINAL_RECIPIENT -the original recipient email address, from Postfix +The original recipient email address, set by the MTA. Postfix +sets it by default, untested on other MTAs. =item PI_CONFIG -config file. default: ~/.public-inbox/config +Per-user config file parseable by L<git-config(1)>. +See L<public-inbox-config(5)>. + +Default: ~/.public-inbox/config =item PI_EMERGENCY -emergency destination. default: ~/.public-inbox/emergency/ +emergency Maildir destination. -=back +Default: ~/.public-inbox/emergency/ -=head1 PI_CONFIG FILE +=back -This is a config file parseable by L<git-config(1)>. =head1 CONTACT -All feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> +Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> -The mail archives are hosted at L<http://public-inbox.org/meta/> +The mail archives are hosted at L<https://public-inbox.org/meta/> +and L<http://hjrcffqmbrq6wope.onion/meta/> =head1 COPYRIGHT Copyright 2013-2016 all contributors L<mailto:meta@public-inbox.org> -License: AGPL-3.0+ L<http://www.gnu.org/licenses/agpl-3.0.txt> +License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt> =head1 SEE ALSO diff --git a/Documentation/public-inbox-nntpd.pod b/Documentation/public-inbox-nntpd.pod new file mode 100644 index 00000000..2f9dbabf --- /dev/null +++ b/Documentation/public-inbox-nntpd.pod @@ -0,0 +1,53 @@ +=head1 NAME + +public-inbox-nntpd - NNTP server for sharing public-inbox + +=head1 SYNOPSIS + +B<public-inbox-nntpd> [OPTIONS] + +=head1 DESCRIPTION + +public-inbox-nntpd provides a read-only NNTP daemon for +public-inbox. It uses options and environment variables common +to all L<public-inbox-daemon(8)> implementations. + +The default configuration will never require write access +tto the directory where the public-inbox is stored, so it +may be run as a different user than the user running +L<public-inbox-watch(1)>, L<public-inbox-mda(1)>, or +L<git-fetch(1)>. + +=head1 CONFIGURATION + +These configuration knobs should be used in the +L<public-inbox-config(5)> + +=over 8 + +=item publicinbox.<name>.newsgroup + +=item publicinbox.nntpserver + +=back + +See L<public-inbox-config(5)> for documentation on them. + +=head1 CONTACT + +Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> + +The mail archives are hosted at L<https://public-inbox.org/meta/>, +L<nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta>, +L<nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta> + +=head1 COPYRIGHT + +Copyright 2013-2016 all contributors L<mailto:meta@public-inbox.org> + +License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt> + +=head1 SEE ALSO + +L<git(1)>, L<git-config(1)>, L<public-inbox-daemon(8)>, +L<public-inbox-config(5)> diff --git a/Documentation/public-inbox-overview.pod b/Documentation/public-inbox-overview.pod new file mode 100644 index 00000000..2c97f876 --- /dev/null +++ b/Documentation/public-inbox-overview.pod @@ -0,0 +1,108 @@ +=head1 NAME + +public-inbox-overview - an overview of public-inbox + +=head1 DESCRIPTION + +public-inbox consists of many parts which may be used +independently or in conjunction of each other for: + +=over 4 + +=item 1 + +Mirroring existing public-inboxes. + +=item 2 + +Mirroring mailing lists directly. + +=item 3 + +Hosting standalone. + +=back + +=head2 Mirroring existing public-inboxes + +Mirroring existing public-inboxes is the easiest way to get +started. Your mirror will remain dependent on the REMOTE_URL +you are mirroring and you only need to use two new commands in +addition to common L<git(1)> commands. + + git clone --mirror REMOTE_URL /path/to/repo.git + + # The following should create the necessary entry in + # ~/.public-inbox/config + public-inbox-init NAME /path/to/repo.git MY_URL LIST_ADDRESS + + # Optional but strongly recommended for hosting HTTP + # (and required for NNTP) + # enable search (requires Search::Xapian and DBD::SQLite) + public-inbox-index /path/to/repo.git + + # Periodically update the repo with the following commands + # to update the git repo and index new messages: + cd /path/to/repo.git && git fetch && public-inbox-index + +See L</"Hosting public-inboxes"> below for info on how to expose +your mirror to other readers. + +=head2 Mirroring mailing lists directly + +Mirroring existing mailing lists may be done by any reader +of a mailing list using L<public-inbox-watch(1)>. + + # This will create a new git repository: + public-inbox-init NAME /path/to/repo.git MY_URL LIST_ADDRESS + +Then, see the L<public-inbox-watch(1)> manual for configuring +C<watch>, C<watchheader>, and the optional C<spamcheck> and +C<watchspam> entries. + +You will need to leave L<public-inbox-watch(1)> running to +keep the mailbox up-to-date as messages are delivered to +the mailing list. + +Running L<public-inbox-index(1)> to create search indices +is recommended. L<public-inbox-watch(1)> will automatically +maintain the indices if they were created by +L<public-inbox-index(1)> + + public-inbox-index /path/to/repo.git + +=head2 Hosting standalone + +Using L<public-inbox-init(1)> to initialize the inbox as in the +other methods is recommended. See L<public-inbox-mda(1)> for +more details; but this also requires MTA-specific knowledge. + +=head2 Hosting public-inboxes + +Since public-inboxes are git repositories, they may be served to +remote clients via L<git-daemon(1)> as well as specialized HTTP +and NNTP daemons distributed with public-inbox. + +See L<public-inbox-httpd(1)> and L<public-inbox-nntpd(1)> +for more information on using these daemons. + +Hosting a public-inbox over HTTP or NNTP will never require +write access to any files in the git repository, including +the search indices or article number map database. + +Users familiar with PSGI and L<Plack> may also use +L<PublicInbox::WWW> with the preferred server instead of +L<public-inbox-httpd(1)> + +=head1 CONTACT + +Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> + +The mail archives are hosted at L<https://public-inbox.org/meta/> +and L<http://hjrcffqmbrq6wope.onion/meta/> + +=head1 COPYRIGHT + +Copyright 2016 all contributors L<mailto:meta@public-inbox.org> + +License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt> diff --git a/Documentation/public-inbox-watch.pod b/Documentation/public-inbox-watch.pod new file mode 100644 index 00000000..a59ba32b --- /dev/null +++ b/Documentation/public-inbox-watch.pod @@ -0,0 +1,127 @@ +=head1 NAME + +public-inbox-watch - mailbox watcher for public-inbox + +=head1 SYNOPSIS + +B<public-inbox-watch> + +In ~/.public-inbox/config: + + [publicinbox "test"] + ; generic public-inbox-config keys: + address = test@example.com + url = http://example.com/test + mainrepo = /path/to/test.example.com.git + + ; config keys specific to public-inbox-watch: + watch = maildir:/path/to/maildirs/.INBOX.test/ + watchheader = List-Id:<test.example.com> + + [publicinboxwatch] + ; optional, enable use of spamc(1) for checking: + spamcheck = spamc + + ; optional, emails marked as read which appear + ; here will be trained as spam and deleted from + ; the mainrepos of any public-inboxes which are + ; configured for watch. + ; This is global for all publicinbox.* sections + watchspam = maildir:/path/to/maildirs/.INBOX.spam + +=head1 DESCRIPTION + +public-inbox-watch allows watching a mailbox (currently only +Maildir) for the arrival of new messages and automatically +importing them into a public-inbox (git) repository. +public-inbox-watch is useful in situations when a user wishes to +mirror an existing mailing list, but has no access to run +L<public-inbox-mda(1)> on a server. Unlike public-inbox-mda +which is invoked once per-message, public-inbox-watch is a +persistent process, making it faster for after-the-fact imports +of large Maildirs. + +Upon startup, it scans the mailbox for new messages to be +imported while it was not running. + +Currently, only Maildirs are supported and the +L<Filesys::Notify::Simple> Perl module is required. + +For now, IMAP users should use tools such as L<mbsync(1)> +or L<offlineimap(1)> to bidirectionally sync their IMAP +folders to Maildirs for public-inbox-watch. + +public-inbox-watch should be run inside a L<screen(1)> session +or as a L<systemd(1)> service. Errors are emitted to stderr. + +=head1 OPTIONS + +public-inbox-watch takes no command-line options. + +=head1 CONFIGURATION + +These configuration knobs should be used in the +L<public-inbox-config(5)> + +=over 8 + +=item publicinbox.<name>.watch + +=item publicinbox.<name>.watchheader + +=item publicinboxwatch.spamcheck + +=item publicinboxwatch.watchspam + +=back + +See L<public-inbox-config(5)> for documentation on them. + +=head1 SIGNALS + +=over 8 + +=item SIGHUP + +Reload the config file (default: ~/.public-inbox/config) + +=item SIGUSR1 + +Rescan all watched mailboxes. This is done automatically after +startup. + +=back + +=head1 ENVIRONMENT + +=over 8 + +=item PI_CONFIG + +config file. default: ~/.public-inbox/config +See L<public-inbox-config(5)> + +=item PERL_INLINE_DIRECTORY + +This may affect any public-inbox processes, but is intended +for long-lived ones such as C<public-inbox-watch> or network +daemons. See L<public-inbox-daemon(8)>. + +=back + +=head1 CONTACT + +Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> + +The mail archives are hosted at L<https://public-inbox.org/meta/> +and L<http://hjrcffqmbrq6wope.onion/meta/> + +=head1 COPYRIGHT + +Copyright 2016 all contributors L<mailto:meta@public-inbox.org> + +License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt> + +=head1 SEE ALSO + +L<public-inbox-config(5)> diff --git a/Documentation/txt2pre b/Documentation/txt2pre index ef6a4f35..2f1799fc 100755 --- a/Documentation/txt2pre +++ b/Documentation/txt2pre @@ -1,28 +1,25 @@ #!/usr/bin/env perl -# Copyright (C) 2014-2015 all contributors <meta@public-inbox.org> -# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) +# Copyright (C) 2014-2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Stupid script to make HTML from preformatted, utf-8 text versions, # only generating links for http(s). Markdown does too much # and requires indentation to output preformatted text. use strict; use warnings; -use CGI qw/escapeHTML/; -use Encode qw/encode/; -my $str = eval { local $/; <> }; -$str = escapeHTML($str); -$str = encode('us-ascii', $str, Encode::HTMLCREF); -my ($title) = ($str =~ /\A([^\n]+)/); - -# temporarily swap > for escape so our s!! to add href works. -# there's probably a way to do this with only a single s!! ... -$str =~ s!>!\e!g; -$str =~ s!\b((nntp|ftp|https?)://[\w+\+\&\?\.\%\;/#-]+)!<a -href="$1"\n>$1</a>!g; +use PublicInbox::Linkify; +use PublicInbox::Hval qw(ascii_html); -$str =~ s!\e!>!g; # swap escapes back to > +my $str = eval { local $/; <> }; +my $title = $ENV{TITLE}; +($title) = ($str =~ /\A([^\n]+)/) unless $title; +$title = ascii_html($title); +my $l = PublicInbox::Linkify->new; +$str = $l->linkify_1($str); +$str = ascii_html($str); +$str = $l->linkify_2($str); print '<html><head>', - '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />', + qq(<meta\nhttp-equiv="Content-Type"\ncontent="text/html; charset=utf-8"\n/>), "<title>$title</title>", - "</head><body>\n<pre>", $str , '</pre></body></html>'; + "</head><body><pre>", $str , '</pre></body></html>'; @@ -1,8 +1,14 @@ hacking public-inbox -------------------- -Send all patches via to our self-hosting inbox at meta@public-inbox.org -It is archived at <http://public-inbox.org/meta/>. +Send all patches and "git request-pull"-formatted emails to our +self-hosting inbox at meta@public-inbox.org +It is archived at: https://public-inbox.org/meta/ +and http://hjrcffqmbrq6wope.onion/meta/ (using Tor) + +Contributions are email-driven, just like contributing to git +itself or the Linux kernel; however anonymous and pseudonymous +contributions will always be welcome. Please consider our goals in mind: @@ -30,12 +36,19 @@ In general, we favor mature and well-tested old things rather than the shiny new. Avoid relying on compiled modules too much. Even if it is Free, -compiled code makes packages more expensive to audit, build, and +compiled code makes packages more expensive to audit, build, distribute and verify. public-inbox itself will only be implemented in scripting languages (currently Perl 5). Performance should be reasonably good for server administrators, too, and we will sacrifice features to achieve predictable performance. +Encouraging folks to self-host will be easier with lower hardware +requirements. + +See design_www.txt and design_notes.txt in the Documentation/ +directory for design decisions made during development. -See design_www.txt and design_notes.txt in the Documentation/ directory -for design decisions made during development. +For now, one may optionally subscribe to the mailing list by +sending an email to: meta+subscribe@public-inbox.org +(and confirming). However, reading over the mailing list is +the least reliable method of reading a public-inbox. @@ -2,7 +2,7 @@ public-inbox (server-side) installation --------------------------------------- This is for folks who want to setup their own public-inbox instance. -Clients should see http://ssoma.public-inbox.org/INSTALL.html instead +Clients should see https://ssoma.public-inbox.org/INSTALL.html instead if they want to import mail into their personal inboxes. TODO: this still needs to be documented better, @@ -19,42 +19,53 @@ standard MakeMaker installation (Perl) make test make install # root permissions may be needed -Requirements (server MDA) -------------------------- +Requirements +------------ * git -* ssoma - http://ssoma.public-inbox.org/INSTALL.html -* SpamAssassin (spamc/spamd) -* MTA - postfix is recommended -* lynx (for converting HTML messages to text) * Perl and several modules: (Debian package name) - Date::Parse libtimedate-perl - - Email::Address libemail-address-perl - - Email::Filter libemail-filter-perl - Email::MIME libemail-mime-perl - Email::MIME::ContentType libemail-mime-contenttype-perl - Encode::MIME::Header perl - - File::Path::Expand libfile-path-expand-perl - - IPC::Run libipc-run-perl -Optional modules: +Optional components: + +* MTA - postfix is recommended (for public-inbox-mda) +* SpamAssassin (spamc/spamd) (for public-inbox-watch/public-inbox-mda) + +Optional Perl modules: - Plack[1] libplack-perl - - Mail::Thread (2.5+)[1] libmail-thread-perl - URI::Escape[1] liburi-perl - - Search::Xapian[3] libsearch-xapian-perl - - IO::Compress::Gzip[3] libio-compress-perl + - Search::Xapian[2][3] libsearch-xapian-perl + - IO::Compress::Gzip[3] perl-modules (or libio-compress-perl) - DBI[3] libdbi-perl - - DBD::SQLite[3] libdbd-sqlite3-perl + - DBD::SQLite[2][3] libdbd-sqlite3-perl - Danga::Socket[4] libdanga-socket-perl - - Net::Server[4] libnet-server-perl + - Net::Server[5] libnet-server-perl + - Filesys::Notify::Simple[6] libfilesys-notify-simple-perl + +[1] - Optional, needed for serving/generating Atom and HTML pages +[2] - Optional, only required for NNTP server +[3] - Optional, needed for gzipped mbox support over HTTP +[4] - Optional, needed for bundled HTTP and NNTP servers +[5] - Optional, needed for standalone daemonization of HTTP+NNTP servers +[6] - Optional, needed for public-inbox-watch Maildir watcher + +When installing Search::Xapian, make sure the underlying Xapian +installation is not affected by an index corruption bug: + + https://bugs.debian.org/808610 + +For Debian 8.x (jessie), this means using Debian 8.5 or later. -[1] - Only required for serving/generating Atom and HTML pages. -[3] - Optional for HTML web interface and HTTP/NNTP servers -[4] - Optional for HTTP and NNTP servers +public-inbox will never store unregeneratable data in Xapian +or any other search database we might use; Xapian corruption +will not destroy critical data. Copyright --------- -Copyright 2013-2015 all contributors <meta@public-inbox.org> -License: AGPLv3 or later <http://www.gnu.org/licenses/agpl-3.0.txt> +Copyright 2013-2016 all contributors <meta@public-inbox.org> +License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> @@ -5,7 +5,14 @@ Documentation/dc-dlvr-spam-flow.txt Documentation/design_notes.txt Documentation/design_www.txt Documentation/include.mk +Documentation/public-inbox-config.pod +Documentation/public-inbox-daemon.pod +Documentation/public-inbox-httpd.pod +Documentation/public-inbox-index.pod Documentation/public-inbox-mda.pod +Documentation/public-inbox-nntpd.pod +Documentation/public-inbox-overview.pod +Documentation/public-inbox-watch.pod Documentation/txt2pre HACKING INSTALL @@ -14,41 +21,97 @@ Makefile.PL README TODO examples/README +examples/README.unsubscribe examples/apache2_cgi.conf examples/apache2_perl.conf examples/apache2_perl_old.conf examples/cgi-webrick.rb examples/cgit-commit-filter.lua +examples/logrotate.conf examples/public-inbox-config +examples/public-inbox-httpd.socket +examples/public-inbox-httpd@.service +examples/public-inbox-nntpd.socket +examples/public-inbox-nntpd@.service examples/public-inbox.psgi +examples/repobrowse.psgi +examples/unsubscribe-milter.socket +examples/unsubscribe-milter@.service +examples/unsubscribe-psgi.socket +examples/unsubscribe-psgi@.service +examples/unsubscribe.milter +examples/unsubscribe.psgi +examples/varnish-4.vcl +lib/PublicInbox/Address.pm +lib/PublicInbox/AltId.pm lib/PublicInbox/Config.pm lib/PublicInbox/Daemon.pm +lib/PublicInbox/Emergency.pm +lib/PublicInbox/EvCleanup.pm lib/PublicInbox/ExtMsg.pm lib/PublicInbox/Feed.pm -lib/PublicInbox/Filter.pm +lib/PublicInbox/Filter/Base.pm +lib/PublicInbox/Filter/Mirror.pm +lib/PublicInbox/Filter/Vger.pm +lib/PublicInbox/GetlineBody.pm lib/PublicInbox/Git.pm lib/PublicInbox/GitHTTPBackend.pm lib/PublicInbox/HTTP.pm +lib/PublicInbox/HTTPD.pm +lib/PublicInbox/HTTPD/Async.pm lib/PublicInbox/Hval.pm +lib/PublicInbox/Import.pm +lib/PublicInbox/Inbox.pm lib/PublicInbox/Linkify.pm lib/PublicInbox/Listener.pm lib/PublicInbox/MDA.pm lib/PublicInbox/MID.pm lib/PublicInbox/Mbox.pm +lib/PublicInbox/MsgIter.pm lib/PublicInbox/Msgmap.pm lib/PublicInbox/NNTP.pm -lib/PublicInbox/NewsGroup.pm +lib/PublicInbox/NNTPD.pm lib/PublicInbox/NewsWWW.pm +lib/PublicInbox/ParentPipe.pm lib/PublicInbox/ProcessPipe.pm +lib/PublicInbox/Qspawn.pm +lib/PublicInbox/Repobrowse.pm +lib/PublicInbox/RepobrowseBase.pm +lib/PublicInbox/RepobrowseConfig.pm +lib/PublicInbox/RepobrowseGit.pm +lib/PublicInbox/RepobrowseGitAtom.pm +lib/PublicInbox/RepobrowseGitBlob.pm +lib/PublicInbox/RepobrowseGitCommit.pm +lib/PublicInbox/RepobrowseGitDiff.pm +lib/PublicInbox/RepobrowseGitDiffCommon.pm +lib/PublicInbox/RepobrowseGitFallback.pm +lib/PublicInbox/RepobrowseGitLog.pm +lib/PublicInbox/RepobrowseGitPatch.pm +lib/PublicInbox/RepobrowseGitPlain.pm +lib/PublicInbox/RepobrowseGitQuery.pm +lib/PublicInbox/RepobrowseGitSnapshot.pm +lib/PublicInbox/RepobrowseGitSummary.pm +lib/PublicInbox/RepobrowseGitTag.pm +lib/PublicInbox/RepobrowseGitTree.pm +lib/PublicInbox/RepobrowseRoot.pm +lib/PublicInbox/SaPlugin/ListMirror.pm lib/PublicInbox/Search.pm lib/PublicInbox/SearchIdx.pm lib/PublicInbox/SearchMsg.pm +lib/PublicInbox/SearchThread.pm lib/PublicInbox/SearchView.pm +lib/PublicInbox/Spamcheck/Spamc.pm lib/PublicInbox/Spawn.pm lib/PublicInbox/SpawnPP.pm -lib/PublicInbox/Thread.pm +lib/PublicInbox/Unsubscribe.pm lib/PublicInbox/View.pm lib/PublicInbox/WWW.pm +lib/PublicInbox/WWW.pod +lib/PublicInbox/WatchMaildir.pm +lib/PublicInbox/WwwAtomStream.pm +lib/PublicInbox/WwwAttach.pm +lib/PublicInbox/WwwStream.pm +lib/PublicInbox/WwwText.pm sa_config/Makefile sa_config/README sa_config/root/etc/spamassassin/public-inbox.pre @@ -59,36 +122,71 @@ script/public-inbox-init script/public-inbox-learn script/public-inbox-mda script/public-inbox-nntpd +script/public-inbox-watch script/public-inbox.cgi scripts/dc-dlvr scripts/dc-dlvr.pre scripts/edit-sa-prefs scripts/import_maildir scripts/import_slrnspool +scripts/import_vger_from_mbox scripts/report-spam scripts/slrnspool2maildir +scripts/ssoma-replay +scripts/xhdr-num2mid +t/address.t +t/altid.t t/cgi.t +t/check-www-inbox.perl t/common.perl t/config.t +t/config_limiter.t +t/emergency.t t/fail-bin/spamc t/feed.t -t/filter.t +t/filter_base.t +t/filter_mirror.t +t/filter_vger.t +t/git-http-backend.psgi +t/git-http-backend.t t/git.fast-import-data t/git.t t/html_index.t t/httpd-corner.psgi t/httpd-corner.t +t/httpd-unix.t t/httpd.t +t/hval.t +t/import.t +t/inbox.t t/init.t t/linkify.t t/main-bin/spamc t/mda.t +t/mid.t +t/msg_iter.t t/msgmap.t t/nntp.t t/nntpd.t t/plack.t t/precheck.t +t/psgi_attach.t +t/psgi_mount.t +t/psgi_text.t +t/qspawn.t +t/repobrowse.t +t/repobrowse_common_git.perl +t/repobrowse_git.t +t/repobrowse_git_atom.t +t/repobrowse_git_commit.t +t/repobrowse_git_httpd.t +t/repobrowse_git_plain.t +t/repobrowse_git_snapshot.t +t/repobrowse_git_tree.t t/search.t +t/spamcheck_spamc.t t/spawn.t +t/thread-cycle.t t/utf8.mbox t/view.t +t/watch_maildir.t diff --git a/Makefile.PL b/Makefile.PL index 904b6adf..0bac7c95 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -14,19 +14,14 @@ WriteMakefile( ABSTRACT => 'public-inbox server infrastructure', EXE_FILES => \@EXE_FILES, PREREQ_PM => { - # note: we use ssoma(1) and spamc(1), NOT the Perl modules - # We also depend on git through ssoma. + # note: we use spamc(1), NOT the Perl modules + # We also depend on git. # Keep this sorted and synced to the INSTALL document 'Date::Parse' => 0, - 'Email::Address' => 0, - 'Email::Filter' => 0, 'Email::MIME' => 0, 'Email::MIME::ContentType' => 0, 'Email::Simple' => 0, 'Encode::MIME::Header' => 0, - 'File::Path::Expand' => 0, - 'IPC::Run' => 0, - 'Mail::Thread' => '2.5', # 2.5+ needed for Email::Simple compat 'Plack' => 0, 'URI::Escape' => 0, # We have more test dependencies, but do not force @@ -36,18 +31,26 @@ WriteMakefile( sub MY::postamble { <<EOF; +# support using eatmydata to speed up tests (apt-get install eatmydata): +# https://www.flamingspork.com/projects/libeatmydata/ +EATMYDATA = +-include config.mak -include Documentation/include.mk -my_syntax := \$(addsuffix .syntax, $PM_FILES \$(EXE_FILES)) +N ?= \$(shell echo \$\$(( \$\$(nproc 2>/dev/null || echo 2) + 1))) +SCRIPTS := scripts/ssoma-replay +my_syntax := \$(addsuffix .syntax, $PM_FILES \$(EXE_FILES) \$(SCRIPTS)) -N := \$(shell echo \$\$(( \$\$(nproc 2>/dev/null || echo 2) + 1))) %.syntax :: @\$(PERL) -I lib -c \$(subst .syntax,,\$@) syntax:: \$(my_syntax) -check:: pure_all - prove -lv -j\$(N) +check-manifest :: MANIFEST + if git ls-files >\$<.gen 2>&1; then diff -u \$< \$<.gen; fi + +check:: pure_all check-manifest + \$(EATMYDATA) prove -lv -j\$(N) EOF } @@ -22,33 +22,31 @@ to run their own instances with minimal overhead. Implementation -------------- -public-inbox uses ssoma[1], Some Sort Of Mail Archiver which implements -no policy of its own. By storing (and optionally) exposing an inbox -via git, it is fast and efficient to host and mirror public-inboxes. +public-inbox stores mail in a git repository keyed by Message-ID +as documented in: https://ssoma.public-inbox.org/ssoma_repository.txt -Traditional mailing lists use the "push" model. For readers, this -requires commitment to subscribe and effort to unsubscribe. New readers -may also have difficulty following existing discussions if archives do -not expose Message-ID headers for responses. List server admins may be -burdened with delivery failures. +By storing (and optionally) exposing an inbox via git, it is +fast and efficient to host and mirror public-inboxes. -public-inbox uses the "pull" model. Casual readers may also follow -the list via NNTP, Atom feed or HTML archives. +Traditional mailing lists use the "push" model. For readers, +that requires commitment to subscribe and effort to unsubscribe. +New readers may also have difficulty following existing +discussions if archives do not expose Message-ID and References +headers. List server admins are also burdened with delivery +failures. -Users of the ssoma[1] command-line tool may import mail into an mbox, -Maildir, or IMAP folder from git repositories periodically. +public-inbox uses the "pull" model. Casual readers may also +follow the list via NNTP, Atom feed or HTML archives. If a reader loses interest, they simply stop syncing. Since we use git, mirrors are easy-to-setup, and lists are -easy-to-relocate to different mail addresses without losing/splitting -archives. +easy-to-relocate to different mail addresses without losing +or splitting archives. _Anybody_ may also setup a delivery-only mailing list server to replay a public-inbox git archive to subscribers via SMTP. -[1] http://ssoma.public-inbox.org/ - Features -------- @@ -60,7 +58,7 @@ Features * uses only well-documented and easy-to-implement data formats -Try it out now, see http://try.public-inbox.org/ +Try it out now, see https://try.public-inbox.org/ Requirements for reading: @@ -82,14 +80,16 @@ Requirements (participant) Requirements (server) --------------------- -See http://public-inbox.org/INSTALL +See https://public-inbox.org/INSTALL Hacking ------- -Source code is available via git: +AGPL source code is available via git: - git clone git://80x24.org/public-inbox + git clone https://public-inbox.org/ public-inbox + git clone git://repo.or.cz/public-inbox + torsocks git clone http://hjrcffqmbrq6wope.onion/public-inbox See below for contact info. @@ -111,15 +111,21 @@ on git@vger.kernel.org). The archives are readable via NNTP or HTTP: nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta - http://public-inbox.org/meta/ + https://public-inbox.org/meta/ + +And as Tor hidden services: + + http://hjrcffqmbrq6wope.onion/meta/ + nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta You may also clone all messages via git: - git clone --mirror git://public-inbox.org/meta.git + git clone --mirror https://public-inbox.org/meta/ + torsocks git clone --mirror http://hjrcffqmbrq6wope.onion/meta/ Or pass the same git repository URL for ssoma using the instructions at: - http://ssoma.public-inbox.org/README.html + https://ssoma.public-inbox.org/README.html Anti-Spam --------- @@ -128,15 +134,15 @@ The maintainer of public-inbox has found SpamAssassin a good tool for filtering his personal mail, and it will be the default spam filtering tool in public-inbox. -See http://public-inbox.org/dc-dlvr-spam-flow.html for more info. +See https://public-inbox.org/dc-dlvr-spam-flow.html for more info. Content Filtering ----------------- -To discourage phishing, web bugs (tracking), viruses and other nuisances, -only plain-text content is allowed and non-text content is stripped. -This saves I/O bandwidth and storage, which is important as -entire mail archives are shared between clients. +To discourage phishing, trackers, exploits and other nuisances, +only plain-text emails are allowed and HTML is rejected. +This improves accessibility, and saves bandwidth and storage +as mail is archived forever. As of the 2010s, successful online social networks and forums are the ones which heavily restrict users formatting options; so public-inbox @@ -145,8 +151,8 @@ aims to preserve the focus on content, and not presentation. Copyright --------- -Copyright 2013-2015 all contributors <meta@public-inbox.org> -License: AGPLv3 or later <http://www.gnu.org/licenses/agpl-3.0.txt> +Copyright 2013-2016 all contributors <meta@public-inbox.org> +License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -159,7 +165,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. +along with this program. If not, see <https://www.gnu.org/licenses/>. Additional permission under GNU GPL version 3 section 7: @@ -1,10 +1,24 @@ TODO items for public-inbox -(Not in any particular order) +(Not in any particular order, and +performance, ease-of-setup, installation, maintainability, etc +all need to be considered for everything we introduce) + +* general performance improvements, but without relying on + XS or compiled code any more than we currently do. + +* mailmap support (same as git) for remapping expired email addresses + +* POP3 server, since some webmail providers support external POP3: + https://public-inbox.org/meta/20160411034104.GA7817@dcvr.yhbt.net/ + +* TLS support for various daemons (including STARTTLS for NNTP and POP3) + +* Combined "super server" for NNTP/HTTP/POP3 to reduce memory overhead * Optional reply-to-list support for mirroring lists that want it :< Reply-to-list encourages the existing list as a single-point-of-failure, - but having an extra mirror using public-inbox.org is nice regardless. + but having an extra mirror using public-inbox code is nice regardless. * Configurable linkification for per-inbox shorthands: "$gmane/123456" could be configured to expand to the @@ -12,24 +26,43 @@ TODO items for public-inbox likewise "[Bug #123456]" could be configured to expand to point to some project's bug tracker at http://example.com/bug/123456 +* Support optional "HTTPS Everywhere" for mapping old HTTP to HTTPS + links if (and only if) the user wants to use HTTPS. We may also + be able to configure redirects for expired URLs. + + Note: message bodies rendered as HTML themselves must NOT change, + the links should point to an anchor tag within the same page, + instead; giving the user options. + * implement RFC 4685 (Atom message threading) -* configurable constants (quote folding, index limits) +* configurable constants (index limits, search results) -* use tags as date-based skiplists for navigating history - (maybe not needed with Xapian support nowadays?) +* handle messages with multiple Message-IDs -* handle Xapian date range queries: - http://mid.gmane.org/20151005222157.GE5880@survex.com +* handle broken double-bracketed References properly (maybe) + and totally broken Message-IDs -* use REQUEST_URI properly for CGI / mod_perl2 compatibility - with Message-IDs which include '%' + cf. https://public-inbox.org/git/20160814012706.GA18784@starla/ + +* portability to FreeBSD (and other Free Software *BSDs) + ugh... https://rt.cpan.org/Ticket/Display.html?id=116615 + +* improve documentation -* more test cases (use git fast-import to speed up creation) +* linkify thread skeletons better + https://public-inbox.org/git/6E3699DEA672430CAEA6DEFEDE6918F4@PhilipOakley/ -* large mbox/Maildir/MH/NNTP spool import (use git fast-import) +* generate sample CSS for use with Stylish/dillo/etc + +* streaming Email::MIME replacement: currently we generate many + allocations/strings for headers we never look at and slurp + entire message bodies into memory. + (this is pie-in-the-sky territory...) + +* use REQUEST_URI properly for CGI / mod_perl2 compatibility + with Message-IDs which include '%' (done?) -* remove dependency on ssoma installation (inline the code) +* more and better test cases (use git fast-import to speed up creation) -* improve + document mlmmj integration, currently only at: - http://bogomips.org/unicorn-public/20140508084301.GA2033%40dcvr.yhbt.net/ +* large mbox/Maildir/MH/NNTP spool import (see PublicInbox::Import) diff --git a/examples/README b/examples/README index 1244cb2c..1d5dcd34 100644 --- a/examples/README +++ b/examples/README @@ -16,4 +16,4 @@ apache2_perl.conf - intended to be the basis of a production config Contact ------- Please send any related feedback to public-inbox: meta@public-inbox.org -Our public-inbox is: git://public-inbox.org/meta +Our public-inbox is: https://public-inbox.org/meta/ diff --git a/examples/README.unsubscribe b/examples/README.unsubscribe new file mode 100644 index 00000000..7c41067c --- /dev/null +++ b/examples/README.unsubscribe @@ -0,0 +1,40 @@ +Unsubscribe endpoints for mlmmj users (and possibly Mailman, too) + +* examples/unsubscribe.milter filters outgoing messages + and appends an HTTPS URL to the List-Unsubscribe header. + This List-Unsubscribe header should point to the PSGI + described below. + Currently, this is only active for a whitelist of test + addresses in /etc/unsubscribe-milter.whitelist + with one email address per line. + +* examples/unsubscribe.psgi is a PSGI which needs to run + as the mlmmj user with permission to run mlmmj-unsub. + This depends on the PublicInbox::Unsubscribe module + which may be extracted from the rest of public-inbox. + It is strongly recommended to NOT run the rest of the + public-inbox WWW code in the same process as this PSGI. + (The public-inbox WWW code will never need write + permissions to anything besides stderr). + +* Both the .milter and .psgi examples are bundled with + systemd service and socket activation examples. + AFAIK no other PSGI server besides public-inbox-httpd + supports systemd socket activation. + +To wire up the milter for postfix, I use the following +in /etc/postfix/main.cf: + + # Milter configuration + milter_default_action = accept + milter_protocol = 2 + + # other milters may be chained here (e.g. opendkim) + # chroot users will need to adjust this path + smtpd_milters = local:/var/spool/postfix/unsubscribe/unsubscribe.sock + + # This is not needed for mlmmj since mlmmj uses SMTP: + # non_smtpd_milters = local:/var/spool/postfix/unsubscribe/unsubscribe.sock + +Copyright (C) 2016 all contributors <meta@public-inbox.org> +License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> diff --git a/examples/logrotate.conf b/examples/logrotate.conf new file mode 100644 index 00000000..4ce08843 --- /dev/null +++ b/examples/logrotate.conf @@ -0,0 +1,24 @@ +# ==> /etc/logrotate.d/public-inbox <== +# +# See the logrotate(8) manpage for more information: +# http://linux.die.net/man/8/logrotate +/var/log/public-inbox/*.log { + weekly + missingok + rotate 52 + compress + delaycompress + notifempty + sharedscripts + dateext + # note the lack of the racy "copytruncate" option in this + # config. public-inbox-*d supports the USR1 signal and + # we send it as our "lastaction": + lastaction + # systemd users do not need PID files, + # only signal the @1 process since the @2 is short-lived + # For systemd users, assuming you use two services + systemctl kill -s SIGUSR1 public-inbox-httpd@1.service + systemctl kill -s SIGUSR1 public-inbox-nntpd@1.service + endscript +} diff --git a/examples/public-inbox-config b/examples/public-inbox-config index 0c1db118..7fcbe0ba 100644 --- a/examples/public-inbox-config +++ b/examples/public-inbox-config @@ -10,4 +10,3 @@ address = meta@public-inbox.org mainrepo = /home/pi/meta-main.git url = http://example.com/meta - atomUrl = http://example.com/meta diff --git a/examples/public-inbox-httpd.socket b/examples/public-inbox-httpd.socket new file mode 100644 index 00000000..1a1ed735 --- /dev/null +++ b/examples/public-inbox-httpd.socket @@ -0,0 +1,10 @@ +# ==> /etc/systemd/system/public-inbox-httpd.socket <== +[Unit] +Description = public-inbox-httpd socket + +[Socket] +ListenStream = 80 +Service = public-inbox-httpd@1.service + +[Install] +WantedBy = sockets.target diff --git a/examples/public-inbox-httpd@.service b/examples/public-inbox-httpd@.service new file mode 100644 index 00000000..56117ef0 --- /dev/null +++ b/examples/public-inbox-httpd@.service @@ -0,0 +1,32 @@ +# ==> /etc/systemd/system/public-inbox-httpd@.service <== +# Since SIGUSR2 upgrades do not work under systemd, this service file +# allows starting two simultaneous services during upgrade time +# (e.g. public-inbox-httpd@1 public-inbox-httpd@2) with the intention +# that they take turns running in-between upgrades. This should +# allow upgrading without downtime. + +[Unit] +Description = public-inbox PSGI server %i +Wants = public-inbox-httpd.socket +After = public-inbox-httpd.socket + +[Service] +Environment = PI_CONFIG=/home/pi/.public-inbox/config \ +PATH=/usr/local/bin:/usr/bin:/bin \ +PERL_INLINE_DIRECTORY=/tmp/.pub-inline + +LimitNOFILE = 30000 +ExecStartPre = /bin/mkdir -p -m 1777 /tmp/.pub-inline +ExecStart = /usr/local/bin/public-inbox-httpd \ +-1 /var/log/public-inbox/httpd.out.log +StandardError = syslog +Sockets = public-inbox-httpd.socket +KillSignal = SIGQUIT +User = nobody +Group = nogroup +ExecReload = /bin/kill -HUP $MAINPID +TimeoutStopSec = 86400 +KillMode = process + +[Install] +WantedBy = multi-user.target diff --git a/examples/public-inbox-nntpd.socket b/examples/public-inbox-nntpd.socket new file mode 100644 index 00000000..eeddf343 --- /dev/null +++ b/examples/public-inbox-nntpd.socket @@ -0,0 +1,10 @@ +# ==> /etc/systemd/system/public-inbox-nntpd.socket <== +[Unit] +Description = public-inbox-nntpd socket + +[Socket] +ListenStream = 119 +Service = public-inbox-nntpd@1.service + +[Install] +WantedBy = sockets.target diff --git a/examples/public-inbox-nntpd@.service b/examples/public-inbox-nntpd@.service new file mode 100644 index 00000000..62202c2f --- /dev/null +++ b/examples/public-inbox-nntpd@.service @@ -0,0 +1,32 @@ +# ==> /etc/systemd/system/public-inbox-nntpd@.service <== +# Since SIGUSR2 upgrades do not work under systemd, this service file +# allows starting two simultaneous services during upgrade time +# (e.g. public-inbox-nntpd@1 public-inbox-nntpd@2) with the intention +# that they take turns running in-between upgrades. This should +# allow upgrading without downtime. + +[Unit] +Description = public-inbox NNTP server %i +Wants = public-inbox-nntpd.socket +After = public-inbox-nntpd.socket + +[Service] +Environment = PI_CONFIG=/home/pi/.public-inbox/config \ +PATH=/usr/local/bin:/usr/bin:/bin \ +PERL_INLINE_DIRECTORY=/tmp/.pub-inline + +LimitNOFILE = 30000 +ExecStartPre = /bin/mkdir -p -m 1777 /tmp/.pub-inline +ExecStart = /usr/local/bin/public-inbox-nntpd \ +-1 /var/log/public-inbox/nntpd.out.log +StandardError = syslog +Sockets = public-inbox-nntpd.socket +KillSignal = SIGQUIT +User = nobody +Group = nogroup +ExecReload = /bin/kill -HUP $MAINPID +TimeoutStopSec = 86400 +KillMode = process + +[Install] +WantedBy = multi-user.target diff --git a/examples/public-inbox.psgi b/examples/public-inbox.psgi index 71592a7a..e97f917f 100644 --- a/examples/public-inbox.psgi +++ b/examples/public-inbox.psgi @@ -11,10 +11,11 @@ use PublicInbox::WWW; PublicInbox::WWW->preload; use Plack::Builder; my $www = PublicInbox::WWW->new; + +# share the public-inbox code itself: +my $src = $ENV{SRC_GIT_DIR}; # '/path/to/public-inbox.git' + builder { - # Chunked middleware conflicts with Starman: - # https://github.com/miyagawa/Starman/issues/23 - # enable 'Chunked'; eval { enable 'Deflater', content_type => [ qw( @@ -28,7 +29,7 @@ builder { # Enable to ensure redirects and Atom feed URLs are generated # properly when running behind a reverse proxy server which - # sets X-Forwarded-For and X-Forwarded-Proto request headers. + # sets the X-Forwarded-Proto request header. # See Plack::Middleware::ReverseProxy documentation for details eval { enable 'ReverseProxy' }; $@ and warn @@ -43,5 +44,15 @@ builder { # format => '%t "%r" %>s %b %D'; enable 'Head'; - sub { $www->call(@_) }; + sub { + my ($env) = @_; + # share public-inbox.git code! + if ($src && $env->{PATH_INFO} =~ + m!\A/(?:public-inbox(?:\.git)?/)? + ($PublicInbox::GitHTTPBackend::ANY)\z!xo) { + PublicInbox::GitHTTPBackend::serve($env, $src, $1); + } else { + $www->call($env); + } + }; } diff --git a/examples/unsubscribe-milter.socket b/examples/unsubscribe-milter.socket new file mode 100644 index 00000000..bfaa97a1 --- /dev/null +++ b/examples/unsubscribe-milter.socket @@ -0,0 +1,10 @@ +# ==> /etc/systemd/system/unsubscribe-milter.socket <== +[Unit] +Description = unsubscribe.milter socket + +[Socket] +ListenStream = /var/spool/postfix/unsubscribe/unsubscribe.sock +Service = unsubscribe-milter@1.service + +[Install] +WantedBy = sockets.target diff --git a/examples/unsubscribe-milter@.service b/examples/unsubscribe-milter@.service new file mode 100644 index 00000000..98e3d478 --- /dev/null +++ b/examples/unsubscribe-milter@.service @@ -0,0 +1,24 @@ +# ==> /etc/systemd/system/unsubscribe-milter@.service <== +# The '@' is to allow multiple simultaneous services to start +# and share the same socket so new code can be cycled in +# without downtime + +[Unit] +Description = unsubscribe milter %i +Wants = unsubscribe-milter.socket +After = unsubscribe-milter.socket + +[Service] +# First 8 bytes is for the key, next 8 bytes is for the IV +# using Blowfish. We want as short URLs as possible to avoid +# copy+paste errors +# umask 077 && dd if=/dev/urandom bs=16 count=1 of=.unsubscribe.key +ExecStart = /usr/local/sbin/unsubscribe.milter /home/mlmmj/.unsubscribe.key +Sockets = unsubscribe-milter.socket + +# the corresponding PSGI app needs permissions to modify the +# mlmmj spool, so we might as well use the same user since +User = mlmmj + +[Install] +WantedBy = multi-user.target diff --git a/examples/unsubscribe-psgi.socket b/examples/unsubscribe-psgi.socket new file mode 100644 index 00000000..e7ab797b --- /dev/null +++ b/examples/unsubscribe-psgi.socket @@ -0,0 +1,11 @@ +# ==> /etc/systemd/system/unsubscribe-psgi.socket <== +[Unit] +Description = unsubscribe PSGI socket + +[Socket] +# Forward to the PSGI using nginx or similar +ListenStream = /run/unsubscribe-psgi.sock +Service = unsubscribe-psgi@1.service + +[Install] +WantedBy = sockets.target diff --git a/examples/unsubscribe-psgi@.service b/examples/unsubscribe-psgi@.service new file mode 100644 index 00000000..acc29e8e --- /dev/null +++ b/examples/unsubscribe-psgi@.service @@ -0,0 +1,21 @@ +# ==> /etc/systemd/system/unsubscribe-psgi@.service <== +# The '@' is to allow multiple simultaneous services to start +# and share the same socket so new code can be cycled in +# without downtime + +[Unit] +Description = unsubscribe PSGI %i +Wants = unsubscribe-psgi.socket +After = unsubscribe-psgi.socket + +[Service] +# any PSGI server ought to work, +# but public-inbox-httpd supports socket activation like unsubscribe.milter +ExecStart = /usr/local/bin/public-inbox-httpd -W0 /etc/unsubscribe.psgi +Sockets = unsubscribe-psgi.socket +# we need to modify the mlmmj spool +User = mlmmj +KillMode = process + +[Install] +WantedBy = multi-user.target diff --git a/examples/unsubscribe.milter b/examples/unsubscribe.milter new file mode 100644 index 00000000..c245a5b8 --- /dev/null +++ b/examples/unsubscribe.milter @@ -0,0 +1,134 @@ +#!/usr/bin/perl -w +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Sendmail::PMilter qw(:all); +use IO::Socket; +use Crypt::CBC; +use MIME::Base64 qw(encode_base64url); + +my $key_file = shift @ARGV or die "Usage: $0 KEY_FILE\n"; +open my $fh, '<', $key_file or die "failed to open $key_file\n"; +my ($key, $iv); +if (read($fh, $key, 8) != 8 || read($fh, $iv, 8) != 8 || + read($fh, my $end, 8) != 0) { + die "KEY_FILE must be 16 bytes\n"; +} + +# these parameters were chosen to generate shorter parameters +# to reduce the possibility of copy+paste errors +my $crypt = Crypt::CBC->new(-key => $key, + -iv => $iv, + -header => 'none', + -cipher => 'Blowfish'); +$fh = $iv = $key = undef; + +my %cbs; +$cbs{connect} = sub { + my ($ctx) = @_; + eval { $ctx->setpriv({ header => {}, envrcpt => {} }) }; + warn $@ if $@; + SMFIS_CONTINUE; +}; + +$cbs{envrcpt} = sub { + my ($ctx, $addr) = @_; + eval { + $addr =~ tr!<>!!d; + $ctx->getpriv->{envrcpt}->{$addr} = 1; + }; + warn $@ if $@; + SMFIS_CONTINUE; +}; + +$cbs{header} = sub { + my ($ctx, $k, $v) = @_; + eval { + my $k_ = lc $k; + if ($k_ eq 'list-unsubscribe') { + my $header = $ctx->getpriv->{header} ||= {}; + my $ary = $header->{$k_} ||= []; + + # we create placeholders in case there are + # multiple headers of the same name + my $cur = []; + push @$ary, $cur; + + # This relies on mlmmj convention: + # $LIST+unsubscribe@$DOMAIN + if ($v =~ /\A<mailto:([^@]+)\+unsubscribe@([^>]+)>\z/) { + @$cur = ($k, $v, $1, $2); + + # Mailman convention: + # $LIST-request@$DOMAIN?subject=unsubscribe + } elsif ($v =~ /\A<mailto:([^@]+)-request@ + ([^\?]+)\?subject=unsubscribe>\z/x) { + # @$cur = ($k, $v, $1, $2); + } + } + }; + warn $@ if $@; + SMFIS_CONTINUE; +}; + +# We don't want people unsubscribing archivers: +sub archive_addr { + my ($addr) = @_; + return 1 if ($addr =~ /\@m\.gmane\.org\z/); + return 1 if ($addr eq 'archive@mail-archive.com'); + 0 +} + +$cbs{eom} = sub { + my ($ctx) = @_; + eval { + my $priv = $ctx->getpriv; + $ctx->setpriv({ header => {}, envrcpt => {} }); + my @rcpt = keys %{$priv->{envrcpt}}; + + # one recipient, one unique HTTP(S) URL + return SMFIS_CONTINUE if @rcpt != 1; + return SMFIS_CONTINUE if archive_addr(lc($rcpt[0])); + + my $unsub = $priv->{header}->{'list-unsubscribe'} || []; + my $n = 0; + foreach my $u (@$unsub) { + # Milter indices are 1-based, + # not 0-based like Perl arrays + my $index = ++$n; + my ($k, $v, $list, $domain) = @$u; + + next unless $k && $v && $list && $domain; + my $u = $crypt->encrypt($rcpt[0]); + $u = encode_base64url($u); + $v .= ",\n <https://$domain/u/$u/$list>"; + + $ctx->chgheader($k, $index, $v); + } + }; + warn $@ if $@; + SMFIS_CONTINUE; +}; + +my $milter = Sendmail::PMilter->new; + +# Try to inherit a socket from systemd or similar: +my $fds = $ENV{LISTEN_FDS}; +if ($fds && (($ENV{LISTEN_PID} || 0) == $$)) { + die "$0 can only listen on one FD\n" if $fds != 1; + my $start_fd = 3; + my $s = IO::Socket->new_from_fd($start_fd, 'r') or + die "inherited bad FD from LISTEN_FDS: $!\n"; + $milter->set_socket($s); +} else { + # fall back to binding a socket: + my $sock = 'unix:/var/spool/postfix/unsubscribe/unsubscribe.sock'; + $milter->set_listen(1024); + my $umask = umask 0000; + $milter->setconn($sock); + umask $umask; +} + +$milter->register('unsubscribe', \%cbs, SMFI_CURR_ACTS); +$milter->main(); diff --git a/examples/unsubscribe.psgi b/examples/unsubscribe.psgi new file mode 100644 index 00000000..5b9b16cc --- /dev/null +++ b/examples/unsubscribe.psgi @@ -0,0 +1,69 @@ +#!/usr/bin/perl -w +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: GPL-3.0+ <https://www.gnu.org/licenses/gpl-3.0.txt> +# This should not require any other PublicInbox code, but may use +# PublicInbox::Config if ~/.public-inbox/config exists or +# PI_CONFIG is pointed to an appropriate location +use strict; +use Plack::Builder; +use PublicInbox::Unsubscribe; +my $app = PublicInbox::Unsubscribe->new( + pi_config => eval { # optional, for pointing out archives + require PublicInbox::Config; + # uses ~/.public-inbox/config by default, + # can override with PI_CONFIG or here since + # I run this .psgi as the mlmmj user while the + # public-inbox-mda code which actually writes to + # the archives runs as a different user. + PublicInbox::Config->new('/home/pi/.public-inbox/config') + }, + # change if you fork + code_url => 'https://public-inbox.org/public-inbox.git', + owner_email => 'BOFH@example.com', + confirm => 0, + + # First 8 bytes is for the key, next 8 bytes is for the IV + # using Blowfish. We want as short URLs as possible to avoid + # copy+paste errors + # umask 077 && dd if=/dev/urandom bs=16 count=1 of=.unsubscribe.key + key_file => '/home/mlmmj/.unsubscribe.key', + + # this runs as whatever user has perms to run /usr/bin/mlmmj-unsub + # users of other mailing lists. Returns '' on success. + unsubscribe => sub { + my ($user_addr, $list_addr) = @_; + + # map list_addr to mlmmj spool, I use: + # /home/mlmmj/spool/$LIST here + my ($list, $domain) = split('@', $list_addr, 2); + my $spool = "/home/mlmmj/spool/$list"; + + return "Invalid list: $list" unless -d $spool; + + # -c to send a confirmation email, -s is important + # in case a user is click-happy and clicks twice. + my @cmd = (qw(/usr/bin/mlmmj-unsub -c -s), + '-L', $spool, '-a', $user_addr); + + # we don't know which version they're subscribed to, + # try both non-digest and digest + my $normal = system(@cmd); + my $digest = system(@cmd, '-d'); + + # success if either succeeds: + return '' if ($normal == 0 || $digest == 0); + + # missing executable or FS error, + # otherwise -s always succeeds, right? + return 'Unknown error, contact admin'; + }, +); + +builder { + mount '/u' => builder { + eval { enable 'Deflater' }; # optional + eval { enable 'ReverseProxy' }; # optional + enable 'Head'; + sub { $app->call(@_) }; + }; +}; diff --git a/examples/varnish-4.vcl b/examples/varnish-4.vcl new file mode 100644 index 00000000..24296032 --- /dev/null +++ b/examples/varnish-4.vcl @@ -0,0 +1,68 @@ +# Example VCL for Varnish 4.0 with public-inbox WWW code +# This is based on what shipped for 3.x a long time ago (I think) +# and I'm hardly an expert in VCL (nor should we expect anybody +# who maintains a public-inbox HTTP interface to be). +# +# It seems to work for providing some protection from traffic +# bursts; but perhaps the public-inbox WWW interface can someday +# provide enough out-of-the-box performance that configuration +# of an extra component is pointless. + +vcl 4.0; +backend default { + # this is where public-inbox-http listens + .host = "127.0.0.1"; + .port = "280"; +} + +sub vcl_recv { + /* pipe POST and any other weird methods directly to backend */ + if (req.method != "GET" && req.method != "HEAD") { + return (pipe); + } + if (req.http.Authorization || req.http.Cookie) { + /* Not cacheable by default */ + return (pass); + } + return (hash); +} + +sub vcl_pipe { + # By default Connection: close is set on all piped requests by varnish, + # but public-inbox-httpd supports persistent connections well :) + unset bereq.http.connection; + return (pipe); +} + +sub vcl_hash { + hash_data(req.url); + if (req.http.host) { + hash_data(req.http.host); + } else { + hash_data(server.ip); + } + /* we generate fully-qualified URLs for Atom feeds and redirects */ + if (req.http.X-Forwarded-Proto) { + hash_data(req.http.X-Forwarded-Proto); + } + return (lookup); +} + +sub vcl_backend_response { + set beresp.grace = 60s; + set beresp.do_stream = true; + if (beresp.ttl <= 0s || + /* no point in caching stuff git already stores on disk */ + beresp.http.Content-Type ~ "application/x-git" || + beresp.http.Set-Cookie || + beresp.http.Vary == "*") { + /* Mark as "Hit-For-Pass" for the next 2 minutes */ + set beresp.ttl = 120 s; + set beresp.uncacheable = true; + return (deliver); + } else { + /* short TTL for up-to-dateness, our PSGI is not that slow */ + set beresp.ttl = 10s; + } + return (deliver); +} diff --git a/lib/PublicInbox/Address.pm b/lib/PublicInbox/Address.pm new file mode 100644 index 00000000..2c0bb040 --- /dev/null +++ b/lib/PublicInbox/Address.pm @@ -0,0 +1,27 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +package PublicInbox::Address; +use strict; +use warnings; + +# very loose regexes, here. We don't need RFC-compliance, +# just enough to make thing sanely displayable and pass to git + +sub emails { + ($_[0] =~ /([\w\.\+=\-]+\@[\w\.\-]+)>?\s*(?:\(.*?\))?(?:,\s*|\z)/g) +} + +sub names { + map { + tr/\r\n\t/ /; + s/\s*<([^<]+)\z//; + my $e = $1; + s/\A['"\s]*//; + s/['"\s]*\z//; + $e = $_ =~ /\S/ ? $_ : $e; + $e =~ s/\@\S+\z//; + $e; + } split(/\@+[\w\.\-]+>?\s*(?:\(.*?\))?(?:,\s*|\z)/, $_[0]); +} + +1; diff --git a/lib/PublicInbox/AltId.pm b/lib/PublicInbox/AltId.pm new file mode 100644 index 00000000..6fdc3a2d --- /dev/null +++ b/lib/PublicInbox/AltId.pm @@ -0,0 +1,38 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +package PublicInbox::AltId; +use strict; +use warnings; +use URI::Escape qw(uri_unescape); + +# spec: TYPE:PREFIX:param1=value1¶m2=value2&... +# Example: serial:gmane:file=/path/to/altmsgmap.sqlite3 +sub new { + my ($class, $inbox, $spec) = @_; + my ($type, $prefix, $query) = split(/:/, $spec, 3); + $type eq 'serial' or die "non-serial not supported, yet\n"; + + require PublicInbox::Msgmap; + + my %params = map { + my ($k, $v) = split(/=/, uri_unescape($_), 2); + $v = '' unless defined $v; + ($k, $v); + } split(/[&;]/, $query); + my $f = $params{file} or die "file: required for $type spec $spec\n"; + unless (index($f, '/') == 0) { + $f = "$inbox->{mainrepo}/public-inbox/$f"; + } + bless { + mm_alt => PublicInbox::Msgmap->new_file($f), + xprefix => 'X'.uc($prefix), + }, $class; +} + +sub mid2alt { + my ($self, $mid) = @_; + $self->{mm_alt}->num_for($mid); +} + +1; diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index f84a9550..8d66cf8c 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -5,21 +5,31 @@ package PublicInbox::Config; use strict; use warnings; -use base qw/Exporter/; -our @EXPORT_OK = qw/try_cat/; -use File::Path::Expand qw/expand_filename/; +require PublicInbox::Inbox; +use PublicInbox::Spawn qw(popen_rd); # returns key-value pairs of config directives in a hash # if keys may be multi-value, the value is an array ref containing all values sub new { my ($class, $file) = @_; $file = default_file() unless defined($file); - bless git_config_dump($file), $class; + $file = ref $file ? $file : git_config_dump($file); + my $self = bless $file, $class; + + # caches + $self->{-by_addr} ||= {}; + $self->{-by_name} ||= {}; + $self->{-by_newsgroup} ||= {}; + $self->{-limiters} ||= {}; + $self; } sub lookup { my ($self, $recipient) = @_; my $addr = lc($recipient); + my $inbox = $self->{-by_addr}->{$addr}; + return $inbox if $inbox; + my $pfx; foreach my $k (keys %$self) { @@ -37,29 +47,65 @@ sub lookup { last; } } - defined $pfx or return; + _fill($self, $pfx); +} - my %rv; - foreach my $k (qw(mainrepo address filter)) { - my $v = $self->{"$pfx.$k"}; - $rv{$k} = $v if defined $v; +sub lookup_name ($$) { + my ($self, $name) = @_; + $self->{-by_name}->{$name} || _fill($self, "publicinbox.$name"); +} + +sub each_inbox { + my ($self, $cb) = @_; + my %seen; + foreach my $k (keys %$self) { + $k =~ /\Apublicinbox\.([A-Z0-9a-z-]+)\.mainrepo\z/ or next; + next if $seen{$1}; + $seen{$1} = 1; + my $ibx = lookup_name($self, $1) or next; + $cb->($ibx); } - my $listname = $pfx; - $listname =~ s/\Apublicinbox\.//; - $rv{listname} = $listname; - my $v = $rv{address}; - $rv{-primary_address} = ref($v) eq 'ARRAY' ? $v->[0] : $v; - \%rv; +} + +sub lookup_newsgroup { + my ($self, $ng) = @_; + $ng = lc($ng); + my $rv = $self->{-by_newsgroup}->{$ng}; + return $rv if $rv; + + foreach my $k (keys %$self) { + $k =~ /\A(publicinbox\.[\w-]+)\.newsgroup\z/ or next; + my $v = $self->{$k}; + my $pfx = $1; + if ($v eq $ng) { + $rv = _fill($self, $pfx); + return $rv; + } + } + undef; +} + +sub limiter { + my ($self, $name) = @_; + $self->{-limiters}->{$name} ||= do { + require PublicInbox::Qspawn; + my $max; + # XXX "limiter.<name>.max" was a historical mistake + foreach my $pfx (qw(publicinboxlimiter limiter)) { + $max ||= $self->{"$pfx.$name.max"}; + } + PublicInbox::Qspawn::Limiter->new($max); + }; } sub get { - my ($self, $listname, $key) = @_; + my ($self, $inbox, $key) = @_; - $self->{"publicinbox.$listname.$key"}; + $self->{"publicinbox.$inbox.$key"}; } -sub config_dir { $ENV{PI_DIR} || expand_filename('~/.public-inbox') } +sub config_dir { $ENV{PI_DIR} || "$ENV{HOME}/.public-inbox" } sub default_file { my $f = $ENV{PI_CONFIG}; @@ -72,9 +118,9 @@ sub git_config_dump { my ($in, $out); my @cmd = (qw/git config/, "--file=$file", '-l'); my $cmd = join(' ', @cmd); - my $pid = open(my $fh, '-|', @cmd); - defined $pid or die "$cmd failed: $!"; + my $fh = popen_rd(\@cmd) or die "popen_rd failed for $file: $!\n"; my %rv; + local $/ = "\n"; foreach my $line (<$fh>) { chomp $line; my ($k, $v) = split(/=/, $line, 2); @@ -90,19 +136,44 @@ sub git_config_dump { $rv{$k} = $v; } } - close $fh or die "failed to close ($cmd) pipe: $!"; - $? and warn "$$ $cmd exited with: ($pid) $?"; + close $fh or die "failed to close ($cmd) pipe: $?"; \%rv; } -sub try_cat { - my ($path) = @_; - my $rv; - if (open(my $fh, '<', $path)) { - local $/; - $rv = <$fh>; +sub _fill { + my ($self, $pfx) = @_; + my $rv = {}; + + foreach my $k (qw(mainrepo address filter url newsgroup + infourl watch watchheader httpbackendmax)) { + my $v = $self->{"$pfx.$k"}; + $rv->{$k} = $v if defined $v; + } + + # TODO: more arrays, we should support multi-value for + # more things to encourage decentralization + foreach my $k (qw(altid nntpmirror)) { + if (defined(my $v = $self->{"$pfx.$k"})) { + $rv->{$k} = ref($v) eq 'ARRAY' ? $v : [ $v ]; + } + } + + return unless $rv->{mainrepo}; + my $name = $pfx; + $name =~ s/\Apublicinbox\.//; + $rv->{name} = $name; + $rv->{-pi_config} = $self; + $rv = PublicInbox::Inbox->new($rv); + my $v = $rv->{address}; + if (ref($v) eq 'ARRAY') { + $self->{-by_addr}->{lc($_)} = $rv foreach @$v; + } else { + $self->{-by_addr}->{lc($v)} = $rv; + } + if (my $ng = $rv->{newsgroup}) { + $self->{-by_newsgroup}->{$ng} = $rv; } - $rv; + $self->{-by_name}->{$name} = $rv; } 1; diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm index c9594a37..37aa4187 100644 --- a/lib/PublicInbox/Daemon.pm +++ b/lib/PublicInbox/Daemon.pm @@ -9,11 +9,13 @@ use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/; use IO::Handle; use IO::Socket; use Cwd qw/abs_path/; +use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); STDOUT->autoflush(1); STDERR->autoflush(1); require Danga::Socket; require POSIX; require PublicInbox::Listener; +require PublicInbox::ParentPipe; my @CMD; my $set_user; my (@cfg_listen, $stdout, $stderr, $group, $user, $pid_file, $daemonize); @@ -101,17 +103,18 @@ sub check_absolute ($$) { } sub daemonize () { - foreach my $i (0..$#ARGV) { - my $arg = $ARGV[$i]; - next unless -e $arg; - $ARGV[$i] = abs_path($arg); - } - check_absolute('stdout', $stdout); - check_absolute('stderr', $stderr); - check_absolute('pid-file', $pid_file); + if ($daemonize) { + foreach my $i (0..$#ARGV) { + my $arg = $ARGV[$i]; + next unless -e $arg; + $ARGV[$i] = abs_path($arg); + } + check_absolute('stdout', $stdout); + check_absolute('stderr', $stderr); + check_absolute('pid-file', $pid_file); - chdir '/' or die "chdir failed: $!"; - open(STDIN, '+<', '/dev/null') or die "redirect stdin failed: $!"; + chdir '/' or die "chdir failed: $!"; + } return unless (defined $pid_file || defined $group || defined $user || $daemonize); @@ -140,15 +143,17 @@ sub daemonize () { }; if ($daemonize) { - my ($pid, $err) = do_fork(); - die "could not fork: $err\n" unless defined $pid; + my $pid = fork; + die "could not fork: $!\n" unless defined $pid; exit if $pid; + open(STDIN, '+<', '/dev/null') or + die "redirect stdin failed: $!\n"; open STDOUT, '>&STDIN' or die "redirect stdout failed: $!\n"; open STDERR, '>&STDIN' or die "redirect stderr failed: $!\n"; POSIX::setsid(); - ($pid, $err) = do_fork(); - die "could not fork: $err\n" unless defined $pid; + $pid = fork; + die "could not fork: $!\n" unless defined $pid; exit if $pid; } if (defined $pid_file) { @@ -161,29 +166,44 @@ sub daemonize () { } } -sub worker_quit () { + +sub worker_quit { + my ($reason) = @_; # killing again terminates immediately: exit unless @listeners; $_->close foreach @listeners; # call Danga::Socket::close @listeners = (); + $reason->close if ref($reason) eq 'PublicInbox::ParentPipe'; - # give slow clients 30s to finish reading/writing whatever - Danga::Socket->AddTimer(30, sub { exit }); - + my $proc_name; + my $warn = 0; # drop idle connections and try to quit gracefully Danga::Socket->SetPostLoopCallback(sub { my ($dmap, undef) = @_; my $n = 0; + my $now = clock_gettime(CLOCK_MONOTONIC); foreach my $s (values %$dmap) { - if ($s->can('busy') && $s->busy) { - $n = 1; + $s->can('busy') or next; + if ($s->busy($now)) { + ++$n; } else { # close as much as possible, early as possible $s->close; } } + if ($n) { + if (($warn + 5) < time) { + warn "$$ quitting, $n client(s) left\n"; + $warn = time; + } + unless (defined $proc_name) { + $proc_name = (split(/\s+/, $0))[0]; + $proc_name =~ s!\A.*?([^/]+)\z!$1!; + } + $0 = "$proc_name quitting, $n client(s) left"; + } $n; # true: loop continues, false: loop breaks }); } @@ -264,9 +284,9 @@ sub upgrade () { $pid_file .= '.oldbin'; write_pid($pid_file); } - my ($pid, $err) = do_fork(); + my $pid = fork; unless (defined $pid) { - warn "fork failed: $err\n"; + warn "fork failed: $!\n"; return; } if ($pid == 0) { @@ -291,17 +311,6 @@ sub kill_workers ($) { } } -sub do_fork () { - my $new = POSIX::SigSet->new; - $new->fillset; - my $old = POSIX::SigSet->new; - POSIX::sigprocmask(&POSIX::SIG_BLOCK, $new, $old) or die "SIG_BLOCK: $!"; - my $pid = fork; - my $err = $!; - POSIX::sigprocmask(&POSIX::SIG_SETMASK, $old) or die "SIG_SETMASK: $!"; - ($pid, $err); -} - sub upgrade_aborted ($) { my ($p) = @_; warn "reexec PID($p) died with: $?\n"; @@ -336,6 +345,7 @@ sub unlink_pid_file_safe_ish ($$) { return unless defined $unlink_pid && $unlink_pid == $$; open my $fh, '<', $file or return; + local $/ = "\n"; defined(my $read_pid = <$fh>) or return; chomp $read_pid; if ($read_pid == $unlink_pid) { @@ -359,6 +369,7 @@ sub master_loop { } reopen_logs(); # main loop + my $quit = 0; while (1) { while (my $s = shift @caught) { if ($s eq 'USR1') { @@ -367,10 +378,16 @@ sub master_loop { } elsif ($s eq 'USR2') { upgrade(); } elsif ($s =~ /\A(?:QUIT|TERM|INT)\z/) { - # drops pipes and causes children to die - exit + exit if $quit++; + kill_workers($s); } elsif ($s eq 'WINCH') { - $worker_processes = 0; + if (-t STDIN || -t STDOUT || -t STDERR) { + warn +"ignoring SIGWINCH since we are not daemonized\n"; + $SIG{WINCH} = 'IGNORE'; + } else { + $worker_processes = 0; + } } elsif ($s eq 'HUP') { $worker_processes = $set_workers; kill_workers($s); @@ -390,6 +407,11 @@ sub master_loop { } my $n = scalar keys %pids; + if ($quit) { + exit if $n == 0; + $set_workers = $worker_processes = $n = 0; + } + if ($n > $worker_processes) { while (my ($k, $v) = each %pids) { kill('TERM', $k) if $v >= $worker_processes; @@ -397,9 +419,9 @@ sub master_loop { $n = $worker_processes; } foreach my $i ($n..($worker_processes - 1)) { - my ($pid, $err) = do_fork(); + my $pid = fork; if (!defined $pid) { - warn "failed to fork worker[$i]: $err\n"; + warn "failed to fork worker[$i]: $!\n"; } elsif ($pid == 0) { $set_user->() if $set_user; return $p0; # run normal work code @@ -419,13 +441,12 @@ sub daemon_loop ($$) { my $parent_pipe; if ($worker_processes > 0) { $refresh->(); # preload by default - $parent_pipe = master_loop(); # returns if in child process - my $fd = fileno($parent_pipe); - Danga::Socket->AddOtherFds($fd => *worker_quit); + my $fh = master_loop(); # returns if in child process + $parent_pipe = PublicInbox::ParentPipe->new($fh, *worker_quit); } else { reopen_logs(); $set_user->() if $set_user; - $SIG{USR2} = sub { worker_quit() if upgrade() }; + $SIG{USR2} = sub { worker_quit('USR2') if upgrade() }; $refresh->(); } $uid = $gid = undef; @@ -433,6 +454,8 @@ sub daemon_loop ($$) { $SIG{QUIT} = $SIG{INT} = $SIG{TERM} = *worker_quit; $SIG{USR1} = *reopen_logs; $SIG{HUP} = $refresh; + $SIG{CHLD} = 'DEFAULT'; + $SIG{$_} = 'IGNORE' for qw(USR2 TTIN TTOU WINCH); # this calls epoll_create: @listeners = map { PublicInbox::Listener->new($_, $post_accept) diff --git a/lib/PublicInbox/Emergency.pm b/lib/PublicInbox/Emergency.pm new file mode 100644 index 00000000..4ee86215 --- /dev/null +++ b/lib/PublicInbox/Emergency.pm @@ -0,0 +1,96 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Emergency Maildir delivery for MDA +package PublicInbox::Emergency; +use strict; +use warnings; +use Fcntl qw(:DEFAULT SEEK_SET); +use Sys::Hostname qw(hostname); +use IO::Handle; + +sub new { + my ($class, $dir) = @_; + + -d $dir or mkdir($dir) or die "failed to mkdir($dir): $!\n"; + foreach (qw(new tmp cur)) { + my $d = "$dir/$_"; + next if -d $d; + -d $d or mkdir($d) or die "failed to mkdir($d): $!\n"; + } + bless { dir => $dir, files => {}, t => 0, cnt => 0 }, $class; +} + +sub _fn_in { + my ($self, $dir) = @_; + my @host = split(/\./, hostname); + my $now = time; + if ($self->{t} != $now) { + $self->{t} = $now; + $self->{cnt} = 0; + } else { + $self->{cnt}++; + } + + my $f; + do { + $f = "$self->{dir}/$dir/$self->{t}.$$"."_$self->{cnt}.$host[0]"; + $self->{cnt}++; + } while (-e $f); + $f; +} + +sub prepare { + my ($self, $strref) = @_; + + die "already in transaction: $self->{tmp}" if $self->{tmp}; + my ($tmp, $fh); + do { + $tmp = _fn_in($self, 'tmp'); + $! = undef; + } while (!sysopen($fh, $tmp, O_CREAT|O_EXCL|O_RDWR) && $!{EEXIST}); + print $fh $$strref or die "write failed: $!"; + $fh->flush or die "flush failed: $!"; + $fh->autoflush(1); + $self->{fh} = $fh; + $self->{tmp} = $tmp; +} + +sub abort { + my ($self) = @_; + delete $self->{fh}; + my $tmp = delete $self->{tmp} or return; + + unlink($tmp) or warn "Failed to unlink $tmp: $!"; + undef; +} + +sub fh { + my ($self) = @_; + my $fh = $self->{fh} or die "{fh} not open!\n"; + seek($fh, 0, SEEK_SET) or die "seek(fh) failed: $!"; + sysseek($fh, 0, SEEK_SET) or die "sysseek(fh) failed: $!"; + $fh; +} + +sub commit { + my ($self) = @_; + + delete $self->{fh}; + my $tmp = delete $self->{tmp} or return; + my $new; + do { + $new = _fn_in($self, 'new'); + } while (!link($tmp, $new) && $!{EEXIST}); + my @sn = stat($new) or die "stat $new failed: $!"; + my @st = stat($tmp) or die "stat $tmp failed: $!"; + if ($st[0] == $sn[0] && $st[1] == $sn[1]) { + unlink($tmp) or warn "Failed to unlink $tmp: $!"; + } else { + warn "stat($new) and stat($tmp) differ"; + } +} + +sub DESTROY { commit($_[0]) } + +1; diff --git a/lib/PublicInbox/EvCleanup.pm b/lib/PublicInbox/EvCleanup.pm new file mode 100644 index 00000000..2b77c617 --- /dev/null +++ b/lib/PublicInbox/EvCleanup.pm @@ -0,0 +1,74 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# event cleanups (currently for Danga::Socket) +package PublicInbox::EvCleanup; +use strict; +use warnings; +use base qw(Danga::Socket); +use fields qw(rd); +my $singleton; +my $asapq = [ [], undef ]; +my $nextq = [ [], undef ]; +my $laterq = [ [], undef ]; + +sub once_init () { + my $self = fields::new('PublicInbox::EvCleanup'); + my ($r, $w); + pipe($r, $w) or die "pipe: $!"; + $self->SUPER::new($w); + $self->{rd} = $r; # never read, since we never write.. + $self; +} + +sub _run_all ($) { + my ($q) = @_; + + my $run = $q->[0]; + $q->[0] = []; + $q->[1] = undef; + $_->() foreach @$run; +} + +sub _run_asap () { _run_all($asapq) } +sub _run_next () { _run_all($nextq) } +sub _run_later () { _run_all($laterq) } + +# Called by Danga::Socket +sub event_write { + my ($self) = @_; + $self->watch_write(0); + _run_asap(); +} + +sub _asap_timer () { + $singleton ||= once_init(); + $singleton->watch_write(1); + 1; +} + +sub asap ($) { + my ($cb) = @_; + push @{$asapq->[0]}, $cb; + $asapq->[1] ||= _asap_timer(); +} + +sub next_tick ($) { + my ($cb) = @_; + push @{$nextq->[0]}, $cb; + $nextq->[1] ||= Danga::Socket->AddTimer(0, *_run_next); +} + +sub later ($) { + my ($cb) = @_; + push @{$laterq->[0]}, $cb; + $laterq->[1] ||= Danga::Socket->AddTimer(60, *_run_later); +} + +END { + _run_asap(); + _run_next(); + _run_later(); +} + +1; diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm index 6356c324..67ce0407 100644 --- a/lib/PublicInbox/ExtMsg.pm +++ b/lib/PublicInbox/ExtMsg.pm @@ -8,79 +8,77 @@ package PublicInbox::ExtMsg; use strict; use warnings; -use URI::Escape qw(uri_escape_utf8); use PublicInbox::Hval; use PublicInbox::MID qw/mid2path/; +use PublicInbox::WwwStream; # TODO: user-configurable our @EXT_URL = ( - 'http://mid.gmane.org/%s', - 'https://lists.debian.org/msgid-search/%s', # leading "//" denotes protocol-relative (http:// or https://) - '//mid.mail-archive.com/%s', '//marc.info/?i=%s', + '//mid.mail-archive.com/%s', + 'http://mid.gmane.org/%s', + 'https://lists.debian.org/msgid-search/%s', + '//docs.FreeBSD.org/cgi/mid.cgi?db=mid&id=%s', + 'https://www.w3.org/mid/%s', + 'http://www.postgresql.org/message-id/%s', + 'https://lists.debconf.org/cgi-lurker/keyword.cgi?'. + 'doc-url=/lurker&format=en.html&query=id:%s' ); sub ext_msg { my ($ctx) = @_; - my $pi_config = $ctx->{pi_config}; - my $listname = $ctx->{listname}; + my $cur = $ctx->{-inbox}; my $mid = $ctx->{mid}; - my $cgi = $ctx->{cgi}; - my $env = $cgi->{env}; eval { require PublicInbox::Search }; my $have_xap = $@ ? 0 : 1; - my (@nox, @pfx); + my (@nox, @ibx, @found); - foreach my $k (keys %$pi_config) { - $k =~ /\Apublicinbox\.([A-Z0-9a-z-]+)\.url\z/ or next; - my $list = $1; - next if $list eq $listname; + $ctx->{www}->{pi_config}->each_inbox(sub { + my ($other) = @_; + return if $other->{name} eq $cur->{name} || !$other->base_url; - my $git_dir = $pi_config->{"publicinbox.$list.mainrepo"}; - defined $git_dir or next; - - my $url = $pi_config->{"publicinbox.$list.url"}; - defined $url or next; - - $url =~ s!/+\z!!; - $url = PublicInbox::Hval::prurl($env, $url); + my $s = $other->search; + if (!$s) { + push @nox, $other; + return; + } # try to find the URL with Xapian to avoid forking - if ($have_xap) { - my $s; - my $doc_id = eval { - $s = PublicInbox::Search->new($git_dir); - $s->find_unique_doc_id('mid', $mid); - }; - if ($@) { - # xapian not configured for this repo - } else { - # maybe we found it! - return r302($url, $mid) if (defined $doc_id); - - # no point in trying the fork fallback if we - # know Xapian is up-to-date but missing the - # message in the current repo - push @pfx, { git_dir => $git_dir, url => $url }; - next; - } + my $doc_id = eval { $s->find_unique_doc_id('mid', $mid) }; + if ($@) { + # xapian not configured properly for this repo + push @nox, $other; + return; } - # queue up for forking after we've tried Xapian on all of them - push @nox, { git_dir => $git_dir, url => $url }; - } + # maybe we found it! + if (defined $doc_id) { + push @found, $other; + } else { + # no point in trying the fork fallback if we + # know Xapian is up-to-date but missing the + # message in the current repo + push @ibx, $other; + } + }); - # Xapian not installed or configured for some repos - my $path = "HEAD:" . mid2path($mid); + return exact($ctx, \@found, $mid) if @found; - foreach my $n (@nox) { - # TODO: reuse existing PublicInbox::Git objects to save forks - my $git = PublicInbox::Git->new($n->{git_dir}); - my (undef, $type, undef) = $git->check($path); - return r302($n->{url}, $mid) if ($type && $type eq 'blob'); + # Xapian not installed or configured for some repos, + # do a full MID check (this is expensive...): + if (@nox) { + my $path = mid2path($mid); + foreach my $other (@nox) { + my (undef, $type, undef) = $other->path_check($path); + + if ($type && $type eq 'blob') { + push @found, $other; + } + } } + return exact($ctx, \@found, $mid) if @found; # fall back to partial MID matching my $n_partial = 0; @@ -88,22 +86,15 @@ sub ext_msg { eval { require PublicInbox::Msgmap }; my $have_mm = $@ ? 0 : 1; - my $base_url = $cgi->base->as_string; if ($have_mm) { my $tmp_mid = $mid; - my $url; again: - $url = $base_url . $listname; - unshift @pfx, { git_dir => $ctx->{git_dir}, url => $url }; - foreach my $pfx (@pfx) { - my $git_dir = delete $pfx->{git_dir} or next; - my $mm = eval { PublicInbox::Msgmap->new($git_dir) }; - - $mm or next; + unshift @ibx, $cur; + foreach my $ibx (@ibx) { + my $mm = $ibx->mm or next; if (my $res = $mm->mid_prefixes($tmp_mid)) { $n_partial += scalar(@$res); - $pfx->{res} = $res; - push @partial, $pfx; + push @partial, [ $ibx, $res ]; } } # fixup common errors: @@ -113,51 +104,76 @@ again: } my $code = 404; - my $h = PublicInbox::Hval->new_msgid($mid, 1); - my $href = $h->as_href; + my $h = PublicInbox::Hval->new_msgid($mid); + my $href = $h->{href}; my $html = $h->as_html; - my $title = "Message-ID <$html> not found"; - my $s = "<html><head><title>$title</title>" . - "</head><body><pre><b>$title</b>\n"; - + my $title = "<$html> not found"; + my $s = "<pre>Message-ID <$html>\nnot found\n"; if ($n_partial) { $code = 300; my $es = $n_partial == 1 ? '' : 'es'; - $s.= "\n$n_partial partial match$es found:\n\n"; - foreach my $pfx (@partial) { - my $u = $pfx->{url}; - foreach my $m (@{$pfx->{res}}) { + $s .= "\n$n_partial partial match$es found:\n\n"; + my $cur_name = $cur->{name}; + foreach my $pair (@partial) { + my ($ibx, $res) = @$pair; + my $env = $ctx->{env} if $ibx->{name} eq $cur_name; + my $u = $ibx->base_url($env) or next; + foreach my $m (@$res) { my $p = PublicInbox::Hval->new_msgid($m); - my $r = $p->as_href; + my $r = $p->{href}; my $t = $p->as_html; - $s .= qq{<a\nhref="$u/$r/">$u/$t/</a>\n}; + $s .= qq{<a\nhref="$u$r/">$u$t/</a>\n}; } } } + my $ext = ext_urls($ctx, $mid, $href, $html); + if ($ext ne '') { + $s .= $ext; + $code = 300; + } + $ctx->{-html_tip} = $s .= '</pre>'; + $ctx->{-title_html} = $title; + $ctx->{-upfx} = '../'; + PublicInbox::WwwStream->response($ctx, $code); +} + +sub ext_urls { + my ($ctx, $mid, $href, $html) = @_; # Fall back to external repos if configured if (@EXT_URL && index($mid, '@') >= 0) { - $code = 300; - $s .= "\nPerhaps try an external site:\n\n"; + my $env = $ctx->{env}; + my $e = "\nPerhaps try an external site:\n\n"; foreach my $url (@EXT_URL) { my $u = PublicInbox::Hval::prurl($env, $url); my $r = sprintf($u, $href); my $t = sprintf($u, $html); - $s .= qq{<a\nhref="$r">$t</a>\n}; + $e .= qq{<a\nhref="$r">$t</a>\n}; } + return $e; } - $s .= '</pre></body></html>'; - - [$code, ['Content-Type'=>'text/html; charset=UTF-8'], [$s]]; + '' } -# Redirect to another public-inbox which is mapped by $pi_config -sub r302 { - my ($url, $mid) = @_; - $url .= '/' . uri_escape_utf8($mid) . '/'; - [ 302, - [ 'Location' => $url, 'Content-Type' => 'text/plain' ], - [ "Redirecting to\n$url\n" ] ] +sub exact { + my ($ctx, $found, $mid) = @_; + my $h = PublicInbox::Hval->new_msgid($mid); + my $href = $h->{href}; + my $html = $h->as_html; + my $title = "<$html> found in "; + my $end = @$found == 1 ? 'another inbox' : 'other inboxes'; + $ctx->{-title_html} = $title . $end; + $ctx->{-upfx} = '../'; + my $ext_urls = ext_urls($ctx, $mid, $href, $html); + my $code = (@$found == 1 && $ext_urls eq '') ? 200 : 300; + $ctx->{-html_tip} = join('', + "<pre>Message-ID: <$html>\nfound in $end:\n\n", + (map { + my $u = $_->base_url; + qq(<a\nhref="$u$href/">$u$html/</a>\n) + } @$found), + $ext_urls, '</pre>'); + PublicInbox::WwwStream->response($ctx, $code); } 1; diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index d014434f..31d82adb 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -5,194 +5,111 @@ package PublicInbox::Feed; use strict; use warnings; -use Email::Address; use Email::MIME; -use Date::Parse qw(strptime); -use PublicInbox::Hval qw/ascii_html/; -use PublicInbox::Git; use PublicInbox::View; -use PublicInbox::MID qw/mid_clean mid2path/; -use POSIX qw/strftime/; +use PublicInbox::WwwAtomStream; use constant { - DATEFMT => '%Y-%m-%dT%H:%M:%SZ', # Atom standard MAX_PER_PAGE => 25, # this needs to be tunable }; # main function sub generate { my ($ctx) = @_; - sub { emit_atom($_[0], $ctx) }; + my @paths; + each_recent_blob($ctx, sub { push @paths, $_[0] }); + return _no_thread() unless @paths; + + my $ibx = $ctx->{-inbox}; + PublicInbox::WwwAtomStream->response($ctx, 200, sub { + while (my $path = shift @paths) { + my $mime = do_cat_mail($ibx, $path) or next; + return $mime; + } + }); } sub generate_thread_atom { my ($ctx) = @_; - sub { emit_atom_thread($_[0], $ctx) }; -} - -sub generate_html_index { - my ($ctx) = @_; - sub { emit_html_index($_[0], $ctx) }; -} - -# private subs - -sub title_tag { - my ($title) = @_; - $title =~ tr/\t\n / /s; # squeeze spaces - # try to avoid the type attribute in title: - $title = ascii_html($title); - my $type = index($title, '&') >= 0 ? "\ntype=\"html\"" : ''; - "<title$type>$title</title>"; -} - -sub atom_header { - my ($feed_opts, $title) = @_; - - $title = title_tag($feed_opts->{description}) unless (defined $title); - - qq(<?xml version="1.0" encoding="us-ascii"?>\n) . - qq{<feed\nxmlns="http://www.w3.org/2005/Atom">} . - qq{$title} . - qq(<link\nrel="alternate"\ntype="text/html") . - qq(\nhref="$feed_opts->{url}"/>) . - qq(<link\nrel="self"\nhref="$feed_opts->{atomurl}"/>) . - qq(<id>mailto:$feed_opts->{id_addr}</id>); -} - -sub emit_atom { - my ($cb, $ctx) = @_; - my $fh = $cb->([ 200, ['Content-Type' => 'application/atom+xml']]); - my $max = $ctx->{max} || MAX_PER_PAGE; - my $feed_opts = get_feedopts($ctx); - my $x = atom_header($feed_opts); - my $git = $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); - each_recent_blob($ctx, sub { - my ($path, undef, $ts) = @_; - if (defined $x) { - $fh->write($x . feed_updated(undef, $ts)); - $x = undef; + my $mid = $ctx->{mid}; + my $res = $ctx->{srch}->get_thread($mid); + return _no_thread() unless $res->{total}; + + my $ibx = $ctx->{-inbox}; + my $html_url = $ibx->base_url($ctx->{env}); + $html_url .= PublicInbox::Hval->new_msgid($mid)->{href}; + $ctx->{-html_url} = $html_url; + my $msgs = $res->{msgs}; + PublicInbox::WwwAtomStream->response($ctx, 200, sub { + while (my $msg = shift @$msgs) { + $msg = $ibx->msg_by_smsg($msg) and + return Email::MIME->new($msg); } - add_to_feed($feed_opts, $fh, $path, $git); }); - end_feed($fh); } -sub _no_thread { - my ($cb) = @_; - my $fh = $cb->([404, ['Content-Type' => 'text/plain']]); - $fh->write("No feed found for thread\n"); - $fh->close; -} - -sub end_feed { - my ($fh) = @_; - Email::Address->purge_cache; - $fh->write('</feed>'); - $fh->close; -} - -sub emit_atom_thread { - my ($cb, $ctx) = @_; - my $res = $ctx->{srch}->get_thread($ctx->{mid}); - return _no_thread($cb) unless $res->{total}; - my $fh = $cb->([200, ['Content-Type' => 'application/atom+xml']]); - my $feed_opts = get_feedopts($ctx); - - my $html_url = $feed_opts->{atomurl} = $ctx->{self_url}; - $html_url =~ s!/t\.atom\z!/!; - $feed_opts->{url} = $html_url; - $feed_opts->{emit_header} = 1; - - my $git = $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); - foreach my $msg (@{$res->{msgs}}) { - add_to_feed($feed_opts, $fh, mid2path($msg->mid), $git); - } - end_feed($fh); -} - -sub emit_html_index { - my ($res, $ctx) = @_; - my $fh = $res->([200,['Content-Type'=>'text/html; charset=UTF-8']]); - - my $max = $ctx->{max} || MAX_PER_PAGE; - my $feed_opts = get_feedopts($ctx); - - my $title = ascii_html($feed_opts->{description} || ''); - my ($footer, $param, $last); - my $state = { ctx => $ctx, seen => {}, anchor_idx => 0, fh => $fh }; - my $srch = $ctx->{srch}; - - my $top = "<b>$title</b> (<a\nhref=\"new.atom\">Atom feed</a>)"; - - if ($srch) { - $top = qq{<form\naction=""><pre>$top} . - qq{ <input\nname=q\ntype=text />} . - qq{<input\ntype=submit\nvalue=search />} . - q{</pre></form><pre>} - } else { - $top = '<pre>' . $top . "\n"; - } - - $fh->write("<html><head><title>$title</title>" . - "<link\nrel=alternate\ntitle=\"Atom feed\"\n". - "href=\"new.atom\"\ntype=\"application/atom+xml\"/>" . - PublicInbox::Hval::STYLE . - "</head><body>$top"); - +sub generate_html_index { + my ($ctx) = @_; # if the 'r' query parameter is given, it is a legacy permalink # which we must continue supporting: - my $cgi = $ctx->{cgi}; - if ($cgi && !$cgi->param('r') && $srch) { - $state->{srch} = $srch; - $last = PublicInbox::View::emit_index_topics($state); - $param = 'o'; - } else { - $last = emit_index_nosrch($ctx, $state); - $param = 'r'; - } - $footer = nav_footer($cgi, $last, $feed_opts, $state, $param); - if ($footer) { - my $list_footer = $ctx->{footer}; - $footer .= "\n\n" . $list_footer if $list_footer; - $footer = "<hr /><pre>$footer</pre>"; + my $qp = $ctx->{qp}; + if ($qp && !$qp->{r} && $ctx->{srch}) { + return PublicInbox::View::index_topics($ctx); } - $fh->write("$footer</body></html>"); - $fh->close; + + my $env = $ctx->{env}; + my $url = $ctx->{-inbox}->base_url($env) . 'new.html'; + my $qs = $env->{QUERY_STRING}; + $url .= "?$qs" if $qs ne ''; + [302, [ 'Location', $url, 'Content-Type', 'text/plain'], + [ "Redirecting to $url\n" ] ]; } -sub emit_index_nosrch { - my ($ctx, $state) = @_; - my $git = $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); +sub new_html { + my ($ctx) = @_; + my @paths; my (undef, $last) = each_recent_blob($ctx, sub { my ($path, $commit, $ts, $u, $subj) = @_; - $state->{first} ||= $commit; - - my $mime = do_cat_mail($git, $path) or return 0; - PublicInbox::View::index_entry($mime, 0, $state); - 1; + $ctx->{first} ||= $commit; + push @paths, $path; }); - Email::Address->purge_cache; - $last; + if (!@paths) { + return [404, ['Content-Type', 'text/plain'], + ["No messages, yet\n"] ]; + } + $ctx->{-html_tip} = '<pre>'; + $ctx->{-upfx} = ''; + $ctx->{-hr} = 1; + PublicInbox::WwwStream->response($ctx, 200, sub { + while (my $path = shift @paths) { + my $m = do_cat_mail($ctx->{-inbox}, $path) or next; + my $more = scalar @paths; + my $s = PublicInbox::View::index_entry($m, $ctx, $more); + return $s; + } + new_html_footer($ctx, $last); + }); +} + +# private subs + +sub _no_thread () { + [404, ['Content-Type', 'text/plain'], ["No feed found for thread\n"]]; } -sub nav_footer { - my ($cgi, $last, $feed_opts, $state, $param) = @_; - $cgi or return ''; - my $old_r = $cgi->param($param); - my $head = ' '; +sub new_html_footer { + my ($ctx, $last) = @_; + my $qp = delete $ctx->{qp} or return; + my $old_r = $qp->{r}; + my $latest = ''; my $next = ' '; - my $first = $state->{first}; - my $anchor = $state->{anchor_idx}; if ($last) { - $next = qq!<a\nhref="?$param=$last">next</a>!; + $next = qq!<a\nhref="?r=$last"\nrel=next>next</a>!; } if ($old_r) { - $head = $cgi->path_info; - $head = qq!<a\nhref="$head">head</a>!; + $latest = qq! <a\nhref='./new.html'>latest</a>!; } - my $atom = "<a\nhref=\"$feed_opts->{atomurl}\">Atom feed</a>"; - "<a\nname=\"s$anchor\">page:</a> $next $head $atom"; + "<hr><pre>page: $next$latest</pre>"; } sub each_recent_blob { @@ -202,11 +119,11 @@ sub each_recent_blob { my $addmsg = qr!^:000000 100644 \S+ \S+ A\t(${hex}{2}/${hex}{38})$!; my $delmsg = qr!^:100644 000000 \S+ \S+ D\t(${hex}{2}/${hex}{38})$!; my $refhex = qr/(?:HEAD|${hex}{4,40})(?:~\d+)?/; - my $cgi = $ctx->{cgi}; + my $qp = $ctx->{qp}; # revision ranges may be specified my $range = 'HEAD'; - my $r = $cgi->param('r') if $cgi; + my $r = $qp->{r} if $qp; if ($r && ($r =~ /\A(?:$refhex\.\.)?$refhex\z/o)) { $range = $r; } @@ -214,9 +131,9 @@ sub each_recent_blob { # get recent messages # we could use git log -z, but, we already know ssoma will not # leave us with filenames with spaces in them.. - my $git = $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); - my $log = $git->popen(qw/log --no-notes --no-color --raw -r - --abbrev-commit/, $git->abbrev, + my $log = $ctx->{-inbox}->git->popen(qw/log + --no-notes --no-color --raw -r + --abbrev=16 --abbrev-commit/, "--format=%h%x00%ct%x00%an%x00%s%x00", $range); my %deleted; # only an optimization at this point @@ -224,11 +141,12 @@ sub each_recent_blob { my $nr = 0; my ($cur_commit, $first_commit, $last_commit); my ($ts, $subj, $u); + local $/ = "\n"; while (defined(my $line = <$log>)) { if ($line =~ /$addmsg/o) { my $add = $1; next if $deleted{$add}; # optimization-only - $nr += $cb->($add, $cur_commit, $ts, $u, $subj); + $cb->($add, $cur_commit, $ts, $u, $subj) and $nr++; if ($nr >= $max) { $last = 1; last; @@ -244,6 +162,7 @@ sub each_recent_blob { } if ($last) { + local $/ = "\n"; while (my $line = <$log>) { if ($line =~ /^(${hex}{7,40})/o) { $last_commit = $1; @@ -256,108 +175,10 @@ sub each_recent_blob { ($first_commit, $last_commit); } -# private functions below -sub get_feedopts { - my ($ctx) = @_; - my $pi_config = $ctx->{pi_config}; - my $listname = $ctx->{listname}; - my $cgi = $ctx->{cgi}; - my %rv; - if (open my $fh, '<', "$ctx->{git_dir}/description") { - chomp($rv{description} = <$fh>); - } else { - $rv{description} = '($GIT_DIR/description missing)'; - } - - if ($pi_config && defined $listname && $listname ne '') { - my $addr = $pi_config->get($listname, 'address') || ""; - $rv{address} = $addr; - $addr = $addr->[0] if ref($addr); - $rv{id_addr} = $addr; - } - $rv{id_addr} ||= 'public-inbox@example.com'; - - my $url_base; - if ($cgi) { - $url_base = $cgi->base->as_string . $listname; - if (my $mid = $ctx->{mid}) { # per-thread feed: - $rv{atomurl} = "$url_base/$mid/t.atom"; - } else { - $rv{atomurl} = "$url_base/new.atom"; - } - } else { - $url_base = "http://example.com"; - $rv{atomurl} = "$url_base/new.atom"; - } - $rv{url} ||= "$url_base/"; - $rv{midurl} = "$url_base/"; - - \%rv; -} - -sub feed_updated { - my ($date, $ts) = @_; - my @t = eval { strptime($date) } if defined $date; - @t = gmtime($ts || time) unless scalar @t; - - '<updated>' . strftime(DATEFMT, @t) . '</updated>'; -} - -# returns 0 (skipped) or 1 (added) -sub add_to_feed { - my ($feed_opts, $fh, $add, $git) = @_; - - my $mime = do_cat_mail($git, $add) or return 0; - my $url = $feed_opts->{url}; - my $midurl = $feed_opts->{midurl}; - - my $header_obj = $mime->header_obj; - my $mid = $header_obj->header_raw('Message-ID'); - defined $mid or return 0; - $mid = PublicInbox::Hval->new_msgid($mid); - my $href = $mid->as_href; - my $content = PublicInbox::View->feed_entry($mime, "$midurl$href/f/"); - defined($content) or return 0; - $mime = undef; - - my $date = $header_obj->header('Date'); - my $updated = feed_updated($date); - - my $title = $header_obj->header('Subject'); - defined $title or return 0; - $title = title_tag($title); - - my $from = $header_obj->header('From') or return 0; - my @from = Email::Address->parse($from) or return 0; - my $name = ascii_html($from[0]->name); - my $email = $from[0]->address; - $email = ascii_html($email); - - if (delete $feed_opts->{emit_header}) { - $fh->write(atom_header($feed_opts, $title) . $updated); - } - $fh->write("<entry><author><name>$name</name><email>$email</email>" . - "</author>$title$updated" . - qq{<content\ntype="xhtml">} . - qq{<div\nxmlns="http://www.w3.org/1999/xhtml">}); - $fh->write($content); - - $add =~ tr!/!!d; - my $h = '[a-f0-9]'; - my (@uuid5) = ($add =~ m!\A($h{8})($h{4})($h{4})($h{4})($h{12})!o); - my $id = 'urn:uuid:' . join('-', @uuid5); - $fh->write(qq!</div></content><link\nhref="$midurl$href/"/>!. - "<id>$id</id></entry>"); - 1; -} - sub do_cat_mail { - my ($git, $path) = @_; - my $mime = eval { - my $str = $git->cat_file("HEAD:$path"); - Email::MIME->new($str); - }; - $@ ? undef : $mime; + my ($ibx, $path) = @_; + my $mime = eval { $ibx->msg_by_path($path) } or return; + Email::MIME->new($mime); } 1; diff --git a/lib/PublicInbox/Filter.pm b/lib/PublicInbox/Filter.pm deleted file mode 100644 index ea6fd33f..00000000 --- a/lib/PublicInbox/Filter.pm +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright (C) 2013-2015 all contributors <meta@public-inbox.org> -# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) -# -# Used to filter incoming mail for -mda and importers -# This only exposes one function: run -# Note: the settings here are highly opinionated. Obviously, this is -# Free Software (AGPLv3), so you may change it if you host yourself. -package PublicInbox::Filter; -use strict; -use warnings; -use Email::MIME; -use Email::MIME::ContentType qw/parse_content_type/; -use Email::Filter; -use IPC::Run; -our $VERSION = '0.0.1'; -use constant NO_HTML => '*** We only accept plain-text email, no HTML ***'; -use constant TEXT_ONLY => '*** We only accept plain-text email ***'; - -# start with the same defaults as mailman -our $BAD_EXT = qr/\.(exe|bat|cmd|com|pif|scr|vbs|cpl|zip)\s*\z/i; -our $MIME_HTML = qr!\btext/x?html\b!i; -our $MIME_TEXT_ANY = qr!\btext/[a-z0-9\+\._-]+\b!i; - -# this is highly opinionated delivery -# returns 0 only if there is nothing to deliver -sub run { - my ($class, $mime, $filter) = @_; - - my $content_type = $mime->header('Content-Type') || 'text/plain'; - - # kill potentially bad/confusing headers - # Note: ssoma already does this, but since we mangle the message, - # we should do this before it gets to ssoma. - # We also kill Mail-{Followup,Reply}-To headers due to - # the nature of public-inbox having no real subscribers. - foreach my $d (qw(status lines content-length - mail-followup-to mail-reply-to)) { - $mime->header_set($d); - } - - if ($content_type =~ m!\btext/plain\b!i) { - return 1; # yay, nothing to do - } elsif ($content_type =~ $MIME_HTML) { - $filter->reject(NO_HTML) if $filter; - # HTML-only, non-multipart - my $body = $mime->body; - my $ct_parsed = parse_content_type($content_type); - dump_html(\$body, $ct_parsed->{attributes}->{charset}); - replace_body($mime, $body); - return 1; - } elsif ($content_type =~ m!\bmultipart/!i) { - return strip_multipart($mime, $content_type, $filter); - } else { - $filter->reject(TEXT_ONLY) if $filter; - replace_body($mime, "$content_type message scrubbed"); - return 0; - } -} - -sub replace_part { - my ($mime, $part, $type) = ($_[0], $_[1], $_[3]); - # don't copy $_[2], that's the body (it may be huge) - - # Email::MIME insists on setting Date:, so just set it consistently - # to avoid conflicts to avoid git merge conflicts in a split brain - # situation. - unless (defined $part->header('Date')) { - my $date = $mime->header('Date') || - 'Thu, 01 Jan 1970 00:00:00 +0000'; - $part->header_set('Date', $date); - } - - $part->charset_set(undef); - $part->name_set(undef); - $part->filename_set(undef); - $part->format_set(undef); - $part->encoding_set('8bit'); - $part->disposition_set(undef); - $part->content_type_set($type); - $part->body_set($_[2]); -} - -# converts one part of a multipart message to text -sub html_part_to_text { - my ($mime, $part) = @_; - my $body = $part->body; - my $ct_parsed = parse_content_type($part->content_type); - dump_html(\$body, $ct_parsed->{attributes}->{charset}); - replace_part($mime, $part, $body, 'text/plain'); -} - -# modifies $_[0] in place -sub dump_html { - my ($body, $charset) = @_; - $charset ||= 'US-ASCII'; - my @cmd = qw(lynx -stdin -stderr -dump); - my $out = ""; - my $err = ""; - - # be careful about remote command injection! - if ($charset =~ /\A([A-Za-z0-9\-]+)\z/) { - push @cmd, "-assume_charset=$charset"; - } - if (IPC::Run::run(\@cmd, $body, \$out, \$err)) { - $out =~ s/\r\n/\n/sg; - $$body = $out; - } else { - # give them an ugly version: - $$body = "public-inbox HTML conversion failed: $err\n" . - $$body . "\n"; - } -} - -# this is to correct old archives during import. -sub strip_multipart { - my ($mime, $content_type, $filter) = @_; - - my (@html, @keep); - my $rejected = 0; - my $ok = 1; - - # scan through all parts once - $mime->walk_parts(sub { - my ($part) = @_; - return if $part->subparts; # walk_parts already recurses - - # some extensions are just bad, reject them outright - my $fn = $part->filename; - if (defined($fn) && $fn =~ $BAD_EXT) { - $filter->reject("Bad file type: $1") if $filter; - $rejected++; - return; - } - - my $part_type = $part->content_type || ''; - if ($part_type =~ m!\btext/plain\b!i) { - push @keep, $part; - } elsif ($part_type =~ $MIME_HTML) { - $filter->reject(NO_HTML) if $filter; - push @html, $part; - } elsif ($part_type =~ $MIME_TEXT_ANY) { - # Give other text attachments the benefit of the doubt, - # here? Could be source code or script the user wants - # help with. - - push @keep, $part; - } elsif ($part_type eq '' || - $part_type =~ m!\bapplication/octet-stream\b!i) { - # unfortunately, some mailers don't set correct types, - # let messages of unknown type through but do not - # change the sender-specified type - if (recheck_type_ok($part)) { - push @keep, $part; - } elsif ($filter) { - $filter->reject("Bad attachment: $part_type ". - TEXT_ONLY); - } else { - $rejected++; - } - } elsif ($part_type =~ m!\bapplication/pgp-signature\b!i) { - # PGP signatures are not huge, we may keep them. - # They can only be valid if it's the last element, - # so we keep them iff the message is unmodified: - if ($rejected == 0 && !@html) { - push @keep, $part; - } - } elsif ($filter) { - $filter->reject("unacceptable mime-type: $part_type ". - TEXT_ONLY); - } else { - # reject everything else, including non-PGP signatures - $rejected++; - } - }); - - if ($content_type =~ m!\bmultipart/alternative\b!i) { - if (scalar @keep == 1) { - return collapse($mime, $keep[0]); - } - } else { # convert HTML parts to plain text - foreach my $part (@html) { - html_part_to_text($mime, $part); - push @keep, $part; - } - } - - if (@keep == 0) { - @keep = (Email::MIME->create( - attributes => { - content_type => 'text/plain', - charset => 'US-ASCII', - encoding => '8bit', - }, - body_str => 'all attachments scrubbed by '. __PACKAGE__ - )); - $ok = 0; - } - if (scalar(@html) || $rejected) { - $mime->parts_set(\@keep); - $mime->body_set($mime->body_raw); - mark_changed($mime); - } # else: no changes - - return $ok; -} - -sub mark_changed { - my ($mime) = @_; - $mime->header_set('X-Content-Filtered-By', __PACKAGE__ ." $VERSION"); -} - -sub collapse { - my ($mime, $part) = @_; - $mime->header_set('Content-Type', $part->content_type); - $mime->body_set($part->body_raw); - my $cte = $part->header('Content-Transfer-Encoding'); - if (defined($cte) && $cte ne '') { - $mime->header_set('Content-Transfer-Encoding', $cte); - } - mark_changed($mime); - return 1; -} - -sub replace_body { - my $mime = $_[0]; - $mime->body_set($_[1]); - $mime->header_set('Content-Type', 'text/plain'); - if ($mime->header('Content-Transfer-Encoding')) { - $mime->header_set('Content-Transfer-Encoding', undef); - } - mark_changed($mime); -} - -# Check for display-able text, no messed up binaries -# Note: we can not rewrite the message with the detected mime type -sub recheck_type_ok { - my ($part) = @_; - my $s = $part->body; - ((length($s) < 0x10000) && ($s =~ /\A([[:print:]\s]+)\z/s)); -} - -1; diff --git a/lib/PublicInbox/Filter/Base.pm b/lib/PublicInbox/Filter/Base.pm new file mode 100644 index 00000000..b2bb1462 --- /dev/null +++ b/lib/PublicInbox/Filter/Base.pm @@ -0,0 +1,110 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# base class for creating per-list or per-project filters +package PublicInbox::Filter::Base; +use strict; +use warnings; +use PublicInbox::MsgIter; +use constant MAX_MID_SIZE => 244; # max term size - 1 in Xapian + +sub No ($) { "*** We only accept plain-text mail, No $_[0] ***" } + +our %DEFAULTS = ( + reject_suffix => [ qw(exe bat cmd com pif scr vbs cpl zip swf swfl) ], + reject_type => [ 'text/html:'.No('HTML'), 'text/xhtml:'.No('HTML'), + 'application/vnd.*:'.No('vendor-specific formats'), + 'image/*:'.No('images'), 'video/*:'.No('video'), + 'audio/*:'.No('audio') ], +); +our $INVALID_FN = qr/\0/; + +sub REJECT () { 100 } +sub ACCEPT { scalar @_ > 1 ? $_[1] : 1 } +sub IGNORE () { 0 } + +my %patmap = ('*' => '.*', '?' => '.', '[' => '[', ']' => ']'); +sub glob2pat { + my ($glob) = @_; + $glob =~ s!(.)!$patmap{$1} || "\Q$1"!ge; + $glob; +} + +sub new { + my ($class, %opts) = @_; + my $self = bless { err => '', %opts }, $class; + foreach my $f (qw(reject_suffix reject_type)) { + # allow undef: + $self->{$f} = $DEFAULTS{$f} unless exists $self->{$f}; + } + if (defined $self->{reject_suffix}) { + my $tmp = $self->{reject_suffix}; + $tmp = join('|', map { glob2pat($_) } @$tmp); + $self->{reject_suffix} = qr/\.($tmp)\s*\z/i; + } + my $rt = []; + if (defined $self->{reject_type}) { + my $tmp = $self->{reject_type}; + @$rt = map { + my ($type, $msg) = split(':', $_, 2); + $type = lc $type; + $msg ||= "Unacceptable Content-Type: $type"; + my $re = glob2pat($type); + [ qr/\b$re\b/i, $msg ]; + } @$tmp; + } + $self->{reject_type} = $rt; + $self; +} + +sub reject ($$) { + my ($self, $reason) = @_; + $self->{err} = $reason; + REJECT; +} + +sub err ($) { $_[0]->{err} } + +# by default, scrub is a no-op, see PublicInbox::Filter::Vger::scrub +# for an example of the override +sub scrub { + my ($self, $mime) = @_; + $self->ACCEPT($mime); +} + +# for MDA +sub delivery { + my ($self, $mime) = @_; + + my $rt = $self->{reject_type}; + my $reject_suffix = $self->{reject_suffix} || $INVALID_FN; + my (%sfx, %type); + + msg_iter($mime, sub { + my ($part, $depth, @idx) = @{$_[0]}; + + my $ct = $part->content_type || 'text/plain'; + foreach my $p (@$rt) { + if ($ct =~ $p->[0]) { + $type{$p->[1]} = 1; + } + } + + my $fn = $part->filename; + if (defined($fn) && $fn =~ $reject_suffix) { + $sfx{$1} = 1; + } + }); + + my @r; + if (keys %type) { + push @r, sort keys %type; + } + if (keys %sfx) { + push @r, 'Rejected suffixes(s): '.join(', ', sort keys %sfx); + } + + @r ? $self->reject(join("\n", @r)) : $self->scrub($mime); +} + +1; diff --git a/lib/PublicInbox/Filter/Mirror.pm b/lib/PublicInbox/Filter/Mirror.pm new file mode 100644 index 00000000..d9940889 --- /dev/null +++ b/lib/PublicInbox/Filter/Mirror.pm @@ -0,0 +1,12 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Dumb filter for blindly accepting everything +package PublicInbox::Filter::Mirror; +use base qw(PublicInbox::Filter::Base); +use strict; +use warnings; + +sub delivery { $_[0]->ACCEPT }; + +1; diff --git a/lib/PublicInbox/Filter/Vger.pm b/lib/PublicInbox/Filter/Vger.pm new file mode 100644 index 00000000..2ffed184 --- /dev/null +++ b/lib/PublicInbox/Filter/Vger.pm @@ -0,0 +1,38 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Filter for vger.kernel.org list trailer +package PublicInbox::Filter::Vger; +use base qw(PublicInbox::Filter::Base); +use strict; +use warnings; + +my $l0 = qr/-+/; # older messages only had one '-' +my $l1 = + qr/To unsubscribe from this list: send the line "unsubscribe [\w-]+" in/; +my $l2 = qr/the body of a message to majordomo\@vger\.kernel\.org/; +my $l3 = + qr!More majordomo info at +http://vger\.kernel\.org/majordomo-info\.html!; + +# only LKML had this, and LKML nowadays has no list trailer since Jan 2016 +my $l4 = qr!Please read the FAQ at +http://www\.tux\.org/lkml/!; + +sub scrub { + my ($self, $mime) = @_; + my $s = $mime->as_string; + + # the vger appender seems to only work on the raw string, + # so in multipart (e.g. GPG-signed) messages, the list trailer + # becomes invisible to MIME-aware email clients. + if ($s =~ s/$l0\n$l1\n$l2\n$l3\n($l4\n)?\z//os) { + $mime = Email::MIME->new(\$s); + } + $self->ACCEPT($mime); +} + +sub delivery { + my ($self, $mime) = @_; + $self->scrub($mime); +} + +1; diff --git a/lib/PublicInbox/GetlineBody.pm b/lib/PublicInbox/GetlineBody.pm new file mode 100644 index 00000000..5f327828 --- /dev/null +++ b/lib/PublicInbox/GetlineBody.pm @@ -0,0 +1,35 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Wrap a pipe or file for PSGI streaming response bodies and calls the +# end callback when the object goes out-of-scope. +# This depends on rpipe being _blocking_ on getline. +package PublicInbox::GetlineBody; +use strict; +use warnings; + +sub new { + my ($class, $rpipe, $end, $buf) = @_; + bless { rpipe => $rpipe, end => $end, buf => $buf }, $class; +} + +# close should always be called after getline returns undef, +# but a client aborting a connection can ruin our day; so lets +# hope our underlying PSGI server does not leak references, here. +sub DESTROY { $_[0]->close } + +sub getline { + my ($self) = @_; + my $buf = delete $self->{buf}; # initial buffer + defined $buf ? $buf : $self->{rpipe}->getline; +} + +sub close { + my ($self) = @_; + my $rpipe = delete $self->{rpipe}; + close $rpipe if $rpipe; + my $end = delete $self->{end}; + $end->() if $end; +} + +1; diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index 2b6782a7..dee027a3 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -3,7 +3,7 @@ # # Used to read files from a git repository without excessive forking. # Used in our web interfaces as well as our -nntpd server. -# This is based on code in Git.pm which is GPLv2, but modified to avoid +# This is based on code in Git.pm which is GPLv2+, but modified to avoid # dependence on environment variables for compatibility with mod_perl. # There are also API changes to simplify our usage and data set. package PublicInbox::Git; @@ -53,7 +53,9 @@ sub _bidi_pipe { my @cmd = ('git', "--git-dir=$self->{git_dir}", qw(cat-file), $batch); my $redir = { 0 => fileno($out_r), 1 => fileno($in_w) }; - $self->{$pid} = spawn(\@cmd, undef, $redir); + my $p = spawn(\@cmd, undef, $redir); + defined $p or fail($self, "spawn failed: $!"); + $self->{$pid} = $p; $out_w->autoflush(1); $self->{$out} = $out_w; $self->{$in} = $in_r; @@ -124,6 +126,8 @@ sub cat_file { $rv; } +sub batch_prepare ($) { _bidi_pipe($_[0], qw(--batch in out pid)) } + sub check { my ($self, $obj) = @_; $self->_bidi_pipe(qw(--batch-check in_c out_c pid_c)); @@ -167,6 +171,8 @@ sub popen { sub qx { my ($self, @cmd) = @_; my $fh = $self->popen(@cmd); + defined $fh or return; + local $/ = "\n"; return <$fh> if wantarray; local $/; <$fh> @@ -181,3 +187,55 @@ sub cleanup { sub DESTROY { cleanup(@_) } 1; +__END__ +=pod + +=head1 NAME + +PublicInbox::Git - git wrapper + +=head1 VERSION + +version 1.0 + +=head1 SYNOPSIS + + use PublicInbox::Git; + chomp(my $git_dir = `git rev-parse --git-dir`); + $git_dir or die "GIT_DIR= must be specified\n"; + my $git = PublicInbox::Git->new($git_dir); + +=head1 DESCRIPTION + +Unstable API outside of the L</new> method. +It requires L<git(1)> to be installed. + +=head1 METHODS + +=cut + +=head2 new + + my $git = PublicInbox::Git->new($git_dir); + +Initialize a new PublicInbox::Git object for use with L<PublicInbox::Import> +This is the only public API method we support. Everything else +in this module is subject to change. + +=head1 SEE ALSO + +L<Git>, L<PublicInbox::Import> + +=head1 CONTACT + +All feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> + +The mail archives are hosted at L<https://public-inbox.org/meta/> + +=head1 COPYRIGHT + +Copyright (C) 2016 all contributors L<mailto:meta@public-inbox.org> + +License: AGPL-3.0+ L<http://www.gnu.org/licenses/agpl-3.0.txt> + +=cut diff --git a/lib/PublicInbox/GitHTTPBackend.pm b/lib/PublicInbox/GitHTTPBackend.pm index d0ce80bc..0275a2a0 100644 --- a/lib/PublicInbox/GitHTTPBackend.pm +++ b/lib/PublicInbox/GitHTTPBackend.pm @@ -7,7 +7,14 @@ package PublicInbox::GitHTTPBackend; use strict; use warnings; use Fcntl qw(:seek); -use PublicInbox::Spawn qw(spawn); +use IO::Handle; +use HTTP::Date qw(time2str); +use HTTP::Status qw(status_message); +use Plack::Util; +use PublicInbox::Qspawn; + +# 32 is same as the git-daemon connection limit +my $default_limiter = PublicInbox::Qspawn::Limiter->new(32); # n.b. serving "description" and "cloneurl" should be innocuous enough to # not cause problems. serving "config" might... @@ -20,77 +27,116 @@ my @binary = qw! objects/pack/pack-[a-f0-9]{40}\.(?:pack|idx) !; -our $ANY = join('|', @binary, @text); +our $ANY = join('|', @binary, @text, 'git-upload-pack'); my $BIN = join('|', @binary); my $TEXT = join('|', @text); -sub r { - [ $_[0] , [qw(Content-Type text/plain Content-Length 0) ], [] ] +my @no_cache = ('Expires', 'Fri, 01 Jan 1980 00:00:00 GMT', + 'Pragma', 'no-cache', + 'Cache-Control', 'no-cache, max-age=0, must-revalidate'); + +sub r ($;$) { + my ($code, $msg) = @_; + $msg ||= status_message($code); + my $len = length($msg); + [ $code, [qw(Content-Type text/plain Content-Length), $len, @no_cache], + [$msg] ] } sub serve { - my ($cgi, $git, $path) = @_; - my $service = $cgi->param('service') || ''; - if ($service =~ /\Agit-\w+-pack\z/ || $path =~ /\Agit-\w+-pack\z/) { - my $ok = serve_smart($cgi, $git, $path); + my ($env, $git, $path) = @_; + + # Documentation/technical/http-protocol.txt in git.git + # requires one and exactly one query parameter: + if ($env->{QUERY_STRING} =~ /\Aservice=git-\w+-pack\z/ || + $path =~ /\Agit-\w+-pack\z/) { + my $ok = serve_smart($env, $git, $path); return $ok if $ok; # fall through to dumb HTTP... } - serve_dumb($cgi, $git, $path); + serve_dumb($env, $git, $path); +} + +sub err ($@) { + my ($env, @msg) = @_; + $env->{'psgi.errors'}->print(@msg, "\n"); +} + +sub drop_client ($) { + if (my $io = $_[0]->{'psgix.io'}) { + $io->close; # this is Danga::Socket::close + } +} + +my $prev = 0; +my $exp; +sub cache_one_year { + my ($h) = @_; + my $t = time + 31536000; + push @$h, 'Expires', $t == $prev ? $exp : ($exp = time2str($prev = $t)), + 'Cache-Control', 'public, max-age=31536000'; } sub serve_dumb { - my ($cgi, $git, $path) = @_; + my ($env, $git, $path) = @_; - # serve dumb HTTP... + my @h; my $type; - if ($path =~ /\A(?:$BIN)\z/o) { - $type = 'application/octet-stream'; + if ($path =~ m!\Aobjects/[a-f0-9]{2}/[a-f0-9]{38}\z!) { + $type = 'application/x-git-loose-object'; + cache_one_year(\@h); + } elsif ($path =~ m!\Aobjects/pack/pack-[a-f0-9]{40}\.pack\z!) { + $type = 'application/x-git-packed-objects'; + cache_one_year(\@h); + } elsif ($path =~ m!\Aobjects/pack/pack-[a-f0-9]{40}\.idx\z!) { + $type = 'application/x-git-packed-objects-toc'; + cache_one_year(\@h); } elsif ($path =~ /\A(?:$TEXT)\z/o) { $type = 'text/plain'; + push @h, @no_cache; } else { return r(404); } - my $f = "$git->{git_dir}/$path"; - return r(404) unless -f $f && -r _; - my @st = stat(_); - my $size = $st[7]; - # TODO: If-Modified-Since and Last-Modified + my $f = (ref $git ? $git->{git_dir} : $git) . '/' . $path; + return r(404) unless -f $f && -r _; # just in case it's a FIFO :P + my $size = -s _; + + # TODO: If-Modified-Since and Last-Modified? open my $in, '<', $f or return r(404); - my $code = 200; my $len = $size; - my @h; - - my $env = $cgi->{env}; - my $range = $env->{HTTP_RANGE}; - if (defined $range && $range =~ /\bbytes=(\d*)-(\d*)\z/) { - ($code, $len) = prepare_range($cgi, $in, \@h, $1, $2, $size); + my $code = 200; + push @h, 'Content-Type', $type; + if (($env->{HTTP_RANGE} || '') =~ /\bbytes=(\d*)-(\d*)\z/) { + ($code, $len) = prepare_range($env, $in, \@h, $1, $2, $size); if ($code == 416) { push @h, 'Content-Range', "bytes */$size"; return [ 416, \@h, [] ]; } } - - push @h, 'Content-Type', $type, 'Content-Length', $len; - sub { - my ($res) = @_; # Plack callback - my $fh = $res->([ $code, \@h ]); - my $buf; - my $n = 8192; - while ($len > 0) { + push @h, 'Content-Length', $len; + my $n = 65536; + [ $code, \@h, Plack::Util::inline_object(close => sub { close $in }, + getline => sub { + return if $len == 0; $n = $len if $len < $n; - my $r = sysread($in, $buf, $n); - last if (!defined($r) || $r <= 0); - $len -= $r; - $fh->write($buf); - } - $fh->close; - } + my $r = sysread($in, my $buf, $n); + if (!defined $r) { + err($env, "$f read error: $!"); + } elsif ($r <= 0) { + err($env, "$f EOF with $len bytes left"); + } else { + $len -= $r; + $n = 8192; + return $buf; + } + drop_client($env); + return; + })] } sub prepare_range { - my ($cgi, $in, $h, $beg, $end, $size) = @_; + my ($env, $in, $h, $beg, $end, $size) = @_; my $code = 200; my $len = $size; if ($beg eq '') { @@ -119,38 +165,27 @@ sub prepare_range { if ($len <= 0) { $code = 416; } else { - seek($in, $beg, SEEK_SET) or return [ 500, [], [] ]; + sysseek($in, $beg, SEEK_SET) or return [ 500, [], [] ]; push @$h, qw(Accept-Ranges bytes Content-Range); push @$h, "bytes $beg-$end/$size"; # FIXME: Plack::Middleware::Deflater bug? - $cgi->{env}->{'psgix.no-compress'} = 1; + $env->{'psgix.no-compress'} = 1; } } ($code, $len); } +# returns undef if 403 so it falls back to dumb HTTP sub serve_smart { - my ($cgi, $git, $path) = @_; - my $env = $cgi->{env}; - - my $input = $env->{'psgi.input'}; - my $buf; - my $in; - my $err = $env->{'psgi.errors'}; - my $fd = eval { fileno($input) }; - if (defined $fd && $fd >= 0) { - $in = $input; - } else { + my ($env, $git, $path) = @_; + my $in = $env->{'psgi.input'}; + my $fd = eval { fileno($in) }; + unless (defined $fd && $fd >= 0) { $in = input_to_file($env) or return r(500); } - my ($rpipe, $wpipe); - unless (pipe($rpipe, $wpipe)) { - $err->print("error creating pipe: $!\n"); - return r(500); - } my %env = %ENV; - # GIT_HTTP_EXPORT_ALL, GIT_COMMITTER_NAME, GIT_COMMITTER_EMAIL + # GIT_COMMITTER_NAME, GIT_COMMITTER_EMAIL # may be set in the server-process and are passed as-is foreach my $name (qw(QUERY_STRING REMOTE_USER REMOTE_ADDR @@ -161,117 +196,129 @@ sub serve_smart { my $val = $env->{$name}; $env{$name} = $val if defined $val; } - my $git_dir = $git->{git_dir}; + my ($git_dir, $limiter); + if (ref $git) { + $limiter = $git->{-httpbackend_limiter} || $default_limiter; + $git_dir = $git->{git_dir}; + } else { + $limiter = $default_limiter; + $git_dir = $git; + } $env{GIT_HTTP_EXPORT_ALL} = '1'; $env{PATH_TRANSLATED} = "$git_dir/$path"; - my %rdr = ( 0 => fileno($in), 1 => fileno($wpipe), - 2 => $git->err_begin ); - my $pid = spawn([qw(git http-backend)], \%env, \%rdr); - unless (defined $pid) { - $err->print("error spawning: $!\n"); - return r(500); - } - $wpipe = $in = undef; - $buf = ''; - my ($vin, $fh, $res); + my %rdr = ( 0 => fileno($in) ); + my $x = PublicInbox::Qspawn->new([qw(git http-backend)], \%env, \%rdr); + my ($fh, $rpipe); my $end = sub { - if ($fh) { - $fh->close; - $fh = undef; - } else { - $res->(r(500)) if $res; - } - if ($rpipe) { - $rpipe->close; # _may_ be Danga::Socket::close - $rpipe = undef; - } - if (defined $pid) { - my $wpid = $pid; - $pid = undef; - return if $wpid == waitpid($wpid, 0); - $err->print("git http-backend ($git_dir): $?\n"); + if (my $err = $x->finish) { + err($env, "git http-backend ($git_dir): $err"); } + $fh->close if $fh; # async-only }; - my $fail = sub { - if ($!{EAGAIN} || $!{EINTR}) { - select($vin, undef, undef, undef) if defined $vin; - # $vin is undef on async, so this is a noop on EAGAIN - return; - } - my $e = $!; - $end->(); - $err->print("git http-backend ($git_dir): $e\n"); + + # Danga::Socket users, we queue up the read_enable callback to + # fire after pending writes are complete: + my $buf = ''; + my $rd_hdr = sub { + my $r = sysread($rpipe, $buf, 1024, length($buf)); + return if !defined($r) && ($!{EINTR} || $!{EAGAIN}); + return r(500, 'http-backend error') unless $r; + $r = parse_cgi_headers(\$buf) or return; # incomplete headers + $r->[0] == 403 ? serve_dumb($env, $git, $path) : $r; }; - my $cb = sub { # read git-http-backend output and stream to client - my $r = $rpipe ? $rpipe->sysread($buf, 8192, length($buf)) : 0; - return $fail->() unless defined $r; - return $end->() if $r == 0; # EOF - if ($fh) { # stream body from git-http-backend to HTTP client - $fh->write($buf); - $buf = ''; - } elsif (defined $res && $buf =~ s/\A(.*?)\r\n\r\n//s) { - # parse headers - my $h = $1; - my $code = 200; - my @h; - foreach my $l (split(/\r\n/, $h)) { - my ($k, $v) = split(/:\s*/, $l, 2); - if ($k =~ /\AStatus\z/i) { - ($code) = ($v =~ /\b(\d+)\b/); - } else { - push @h, $k, $v; - } - } - # incredibly convoluted, ugh... - if ($code == 403) { - my $d = serve_dumb($cgi, $git, $path); - if (ref($d) eq 'ARRAY') { # 404 - $res->($d); - } else { - $d->($res); - } - $res = undef; - $end->(); + my $res; + my $async = $env->{'pi-httpd.async'}; + my $io = $env->{'psgix.io'}; + my $cb = sub { + my $r = $rd_hdr->() or return; + $rd_hdr = undef; + if (scalar(@$r) == 3) { # error: + if ($async) { + $async->close; # calls rpipe->close } else { - # write response header: - $fh = $res->([ $code, \@h ]); - $res = undef; - $fh->write($buf); - $buf = ''; + $rpipe->close; + $end->(); } - } # else { keep reading ... } - }; - if (my $async = $env->{'pi-httpd.async'}) { - $rpipe = $async->($rpipe, $cb); - sub { ($res) = @_ } # let Danga::Socket handle the rest. - } else { # synchronous loop for other PSGI servers - $vin = ''; - vec($vin, fileno($rpipe), 1) = 1; - sub { - ($res) = @_; - while ($rpipe) { $cb->() } + return $res->($r); } - } + if ($async) { + $fh = $res->($r); + return $async->async_pass($io, $fh, \$buf); + } + + # for synchronous PSGI servers + require PublicInbox::GetlineBody; + $r->[2] = PublicInbox::GetlineBody->new($rpipe, $end, $buf); + $res->($r); + }; + sub { + ($res) = @_; + + # hopefully this doesn't break any middlewares, + # holding the input here is a waste of FDs and memory + $env->{'psgi.input'} = undef; + + $x->start($limiter, sub { # may run later, much later... + ($rpipe) = @_; + $in = undef; + if ($async) { + $async = $async->($rpipe, $cb, $end); + } else { # generic PSGI + $cb->() while $rd_hdr; + } + }); + }; } sub input_to_file { my ($env) = @_; - my $in = IO::File->new_tmpfile; + open(my $in, '+>', undef); + unless (defined $in) { + err($env, "could not open temporary file: $!"); + return; + } my $input = $env->{'psgi.input'}; my $buf; while (1) { my $r = $input->read($buf, 8192); unless (defined $r) { - my $err = $env->{'psgi.errors'}; - $err->print("error reading input: $!\n"); + err($env, "error reading input: $!"); return; } - last if ($r == 0); - $in->write($buf); + my $off = 0; + while ($r > 0) { + my $w = syswrite($in, $buf, $r, $off); + if (defined $w) { + $r -= $w; + $off += $w; + } else { + err($env, "error writing temporary file: $!"); + return; + } + } + } + unless (defined(sysseek($in, 0, SEEK_SET))) { + err($env, "error seeking temporary file: $!"); + return; } - $in->flush; - $in->sysseek(0, SEEK_SET); return $in; } +sub parse_cgi_headers { + my ($bref) = @_; + $$bref =~ s/\A(.*?)\r\n\r\n//s or return; + my $h = $1; + my $code = 200; + my @h; + foreach my $l (split(/\r\n/, $h)) { + my ($k, $v) = split(/:\s*/, $l, 2); + if ($k =~ /\AStatus\z/i) { + ($code) = ($v =~ /\b(\d+)\b/); + } else { + push @h, $k, $v; + } + } + [ $code, \@h ] +} + 1; diff --git a/lib/PublicInbox/HTTP.pm b/lib/PublicInbox/HTTP.pm index 68c3b788..cac14be3 100644 --- a/lib/PublicInbox/HTTP.pm +++ b/lib/PublicInbox/HTTP.pm @@ -4,19 +4,20 @@ # Generic PSGI server for convenience. It aims to provide # a consistent experience for public-inbox admins so they don't have # to learn different ways to admin both NNTP and HTTP components. -# There's nothing public-inbox-specific, here. +# There's nothing which depends on public-inbox, here. # Each instance of this class represents a HTTP client socket package PublicInbox::HTTP; use strict; use warnings; use base qw(Danga::Socket); -use fields qw(httpd env rbuf input_left remote_addr remote_port); +use fields qw(httpd env rbuf input_left remote_addr remote_port forward pull); use Fcntl qw(:seek); use Plack::HTTPParser qw(parse_http_request); # XS or pure Perl use HTTP::Status qw(status_message); use HTTP::Date qw(time2str); -use IO::File; +use Scalar::Util qw(weaken); +use IO::Handle; use constant { CHUNK_START => -1, # [a-f0-9]+\r\n CHUNK_END => -2, # \r\n @@ -24,13 +25,25 @@ use constant { CHUNK_MAX_HDR => 256, }; +my $pipelineq = []; +my $pipet; +sub process_pipelineq () { + my $q = $pipelineq; + $pipet = undef; + $pipelineq = []; + foreach (@$q) { + next if $_->{closed}; + rbuf_process($_); + } +} + # Use the same configuration parameter as git since this is primarily # a slow-client sponge for git-http-backend # TODO: support per-respository http.maxRequestBuffer somehow... our $MAX_REQUEST_BUFFER = $ENV{GIT_HTTP_MAX_REQUEST_BUFFER} || (10 * 1024 * 1024); -my $null_io = IO::File->new('/dev/null', '<'); +open(my $null_io, '<', '/dev/null') or die "failed to open /dev/null: $!"; my $http_date; my $prev = 0; sub http_date () { @@ -85,7 +98,7 @@ sub rbuf_process { $self->{rbuf} = substr($self->{rbuf}, $r); my $len = input_prepare($self, \%env); - defined $len or return write_err($self); # EMFILE/ENFILE + defined $len or return write_err($self, undef); # EMFILE/ENFILE $len ? event_read_input($self) : app_dispatch($self); } @@ -105,7 +118,7 @@ sub event_read_input ($) { while ($len > 0) { if ($$rbuf ne '') { my $w = write_in_full($input, $rbuf, $len); - return write_err($self) unless $w; + return write_err($self, $len) unless $w; $len -= $w; die "BUG: $len < 0 (w=$w)" if $len < 0; if ($len == 0) { # next request may be pipelined @@ -118,11 +131,11 @@ sub event_read_input ($) { return recv_err($self, $r, $len) unless $r; # continue looping if $r > 0; } - app_dispatch($self); + app_dispatch($self, $input); } -sub app_dispatch ($) { - my ($self) = @_; +sub app_dispatch { + my ($self, $input) = @_; $self->watch_read(0); my $env = $self->{env}; $env->{REMOTE_ADDR} = $self->{remote_addr}; @@ -131,10 +144,13 @@ sub app_dispatch ($) { $host =~ s/:(\d+)\z// and $env->{SERVER_PORT} = $1; $env->{SERVER_NAME} = $host; } - - sysseek($env->{'psgi.input'}, 0, SEEK_SET) or + if (defined $input) { + sysseek($input, 0, SEEK_SET) or die "BUG: psgi.input seek failed: $!"; - + } + # note: NOT $self->{sock}, we want our close (+ Danga::Socket::close), + # to do proper cleanup: + $env->{'psgix.io'} = $self; # only for ->close my $res = Plack::Util::run_app($self->{httpd}->{app}, $env); eval { if (ref($res) eq 'CODE') { @@ -163,42 +179,131 @@ sub response_header_write { if ($k =~ /\ATransfer-Encoding\z/i && $v =~ /\bchunked\b/i) { $chunked = 1; } - $h .= "$k: $v\r\n"; } my $conn = $env->{HTTP_CONNECTION} || ''; - my $alive = (defined($len) || $chunked) && - (($proto eq 'HTTP/1.1' && $conn !~ /\bclose\b/i) || - ($conn =~ /\bkeep-alive\b/i)); - - $h .= 'Connection: ' . ($alive ? 'keep-alive' : 'close'); - $h .= "\r\nDate: " . http_date() . "\r\n\r\n"; + my $term = defined($len) || $chunked; + my $prot_persist = ($proto eq 'HTTP/1.1') && ($conn !~ /\bclose\b/i); + my $alive; + if (!$term && $prot_persist) { # auto-chunk + $chunked = $alive = 2; + $h .= "Transfer-Encoding: chunked\r\n"; + # no need for "Connection: keep-alive" with HTTP/1.1 + } elsif ($term && ($prot_persist || ($conn =~ /\bkeep-alive\b/i))) { + $alive = 1; + $h .= "Connection: keep-alive\r\n"; + } else { + $alive = 0; + $h .= "Connection: close\r\n"; + } + $h .= 'Date: ' . http_date() . "\r\n\r\n"; if (($len || $chunked) && $env->{REQUEST_METHOD} ne 'HEAD') { more($self, $h); } else { $self->write($h); } - ($alive, $chunked); + $alive; +} + +# middlewares such as Deflater may write empty strings +sub chunked_wcb ($) { + my ($self) = @_; + sub { + return if $_[0] eq ''; + more($self, sprintf("%x\r\n", bytes::length($_[0]))); + more($self, $_[0]); + + # use $self->write("\n\n") if you care about real-time + # streaming responses, public-inbox WWW does not. + more($self, "\r\n"); + } +} + +sub identity_wcb ($) { + my ($self) = @_; + sub { $self->write(\($_[0])) if $_[0] ne '' } +} + +sub next_request ($) { + my ($self) = @_; + $self->watch_write(0); + if ($self->{rbuf} eq '') { # wait for next request + $self->watch_read(1); + } else { # avoid recursion for pipelined requests + push @$pipelineq, $self; + $pipet ||= PublicInbox::EvCleanup::asap(*process_pipelineq); + } +} + +sub response_done ($$) { + my ($self, $alive) = @_; + my $env = $self->{env}; + $self->{env} = undef; + $self->write("0\r\n\r\n") if $alive == 2; + $self->write(sub { $alive ? next_request($self) : $self->close }); +} + +sub getline_cb ($$$) { + my ($self, $write, $close) = @_; + local $/ = \8192; + my $forward = $self->{forward}; + # limit our own running time for fairness with other + # clients and to avoid buffering too much: + if ($forward) { + my $buf = eval { $forward->getline }; + if (defined $buf) { + $write->($buf); # may close in Danga::Socket::write + unless ($self->{closed}) { + my $next = $self->{pull}; + if ($self->{write_buf_size}) { + $self->write($next); + } else { + PublicInbox::EvCleanup::asap($next); + } + return; + } + } elsif ($@) { + err($self, "response ->getline error: $@"); + $forward = undef; + $self->close; + } + } + + $self->{forward} = $self->{pull} = undef; + # avoid recursion + if ($forward) { + eval { $forward->close }; + if ($@) { + err($self, "response ->close error: $@"); + $self->close; # idempotent + } + } + $close->(); +} + +sub getline_response { + my ($self, $body, $write, $close) = @_; + $self->{forward} = $body; + weaken($self); + my $pull = $self->{pull} = sub { getline_cb($self, $write, $close) }; + $pull->(); } sub response_write { my ($self, $env, $res) = @_; - my ($alive, $chunked) = response_header_write($self, $env, $res); - my $write = sub { $self->write($_[0]) }; - my $close = sub { - if ($alive) { - $self->event_write; # watch for readability if done + my $alive = response_header_write($self, $env, $res); + + my $write = $alive == 2 ? chunked_wcb($self) : identity_wcb($self); + my $close = sub { response_done($self, $alive) }; + if (defined(my $body = $res->[2])) { + if (ref $body eq 'ARRAY') { + $write->($_) foreach @$body; + $close->(); } else { - $self->write(sub { $self->close }); + getline_response($self, $body, $write, $close); } - $self->{env} = undef; - }; - - if (defined $res->[2]) { - Plack::Util::foreach($res->[2], $write); - $close->(); } else { # this is returned to the calling application: Plack::Util::inline_object(write => $write, close => $close); @@ -208,6 +313,7 @@ sub response_write { use constant MSG_MORE => ($^O eq 'linux') ? 0x8000 : 0; sub more ($$) { my $self = $_[0]; + return if $self->{closed}; if (MSG_MORE && !$self->{write_buf_size}) { my $n = send($self->{sock}, $_[1], MSG_MORE); if (defined $n) { @@ -220,19 +326,6 @@ sub more ($$) { $self->write($_[1]); } -# overrides existing Danga::Socket method -sub event_write { - my ($self) = @_; - # only continue watching for readability when we are done writing: - return if $self->write(undef) != 1; - - if ($self->{rbuf} eq '') { # wait for next request - $self->watch_read(1); - } else { # avoid recursion for pipelined requests - Danga::Socket->AddTimer(0, sub { rbuf_process($self) }); - } -} - sub input_prepare { my ($self, $env) = @_; my $input = $null_io; @@ -242,16 +335,15 @@ sub input_prepare { quit($self, 413); return; } - $input = IO::File->new_tmpfile; + open($input, '+>', undef); } elsif (env_chunked($env)) { $len = CHUNK_START; - $input = IO::File->new_tmpfile; + open($input, '+>', undef); } # TODO: expire idle clients on ENFILE / EMFILE return unless $input; - binmode $input; $env->{'psgi.input'} = $input; $self->{env} = $env; $self->{input_left} = $len || 0; @@ -259,11 +351,15 @@ sub input_prepare { sub env_chunked { ($_[0]->{HTTP_TRANSFER_ENCODING} || '') =~ /\bchunked\b/i } +sub err ($$) { + eval { $_[0]->{httpd}->{env}->{'psgi.errors'}->print($_[1]."\n") }; +} + sub write_err { - my ($self) = @_; - my $err = $self->{httpd}->{env}->{'psgi.errors'}; + my ($self, $len) = @_; my $msg = $! || '(zero write)'; - $err->print("error buffering to input: $msg\n"); + $msg .= " ($len bytes remaining)" if defined $len; + err($self, "error buffering to input: $msg"); quit($self, 500); } @@ -274,8 +370,7 @@ sub recv_err { $self->{input_left} = $len; return; } - my $err = $self->{httpd}->{env}->{'psgi.errors'}; - $err->print("error reading for input: $! ($len bytes remaining)\n"); + err($self, "error reading for input: $! ($len bytes remaining)"); quit($self, 500); } @@ -303,7 +398,8 @@ sub event_read_input_chunked { # unlikely... while (1) { # chunk start if ($len == CHUNK_ZEND) { - return app_dispatch($self) if $$rbuf =~ s/\A\r\n//s; + $$rbuf =~ s/\A\r\n//s and + return app_dispatch($self, $input); return quit($self, 400) if length($$rbuf) > 2; } if ($len == CHUNK_END) { @@ -337,7 +433,7 @@ sub event_read_input_chunked { # unlikely... until ($len <= 0) { if ($$rbuf ne '') { my $w = write_in_full($input, $rbuf, $len); - return write_err($self) unless $w; + return write_err($self, "$len chunk") if !$w; $len -= $w; if ($len == 0) { # we may have leftover data to parse @@ -371,11 +467,17 @@ sub quit { sub event_hup { $_[0]->close } sub event_err { $_[0]->close } -sub write ($$) : method { - my PublicInbox::HTTP $self = $_[0]; - return 1 if (defined($_[1]) && ref($_[1]) eq '' && $_[1] eq ''); - - $self->SUPER::write($_[1]); +sub close { + my $self = shift; + my $forward = $self->{forward}; + my $env = $self->{env}; + delete $env->{'psgix.io'} if $env; # prevent circular referernces + $self->{pull} = $self->{forward} = $self->{env} = undef; + if ($forward) { + eval { $forward->close }; + err($self, "forward ->close error: $@") if $@; + } + $self->SUPER::close(@_); } # for graceful shutdown in PublicInbox::Daemon: diff --git a/lib/PublicInbox/HTTPD.pm b/lib/PublicInbox/HTTPD.pm new file mode 100644 index 00000000..433d6da7 --- /dev/null +++ b/lib/PublicInbox/HTTPD.pm @@ -0,0 +1,43 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +package PublicInbox::HTTPD; +use strict; +use warnings; +use Plack::Util; +require PublicInbox::HTTPD::Async; +require PublicInbox::Daemon; + +sub pi_httpd_async { PublicInbox::HTTPD::Async->new(@_) } + +sub new { + my ($class, $sock, $app) = @_; + my $n = getsockname($sock) or die "not a socket: $sock $!\n"; + my ($host, $port) = PublicInbox::Daemon::host_with_port($n); + + my %env = ( + SERVER_NAME => $host, + SERVER_PORT => $port, + SCRIPT_NAME => '', + 'psgi.version' => [ 1, 1 ], + 'psgi.errors' => \*STDERR, + 'psgi.url_scheme' => 'http', + 'psgi.nonblocking' => Plack::Util::TRUE, + 'psgi.streaming' => Plack::Util::TRUE, + 'psgi.run_once' => Plack::Util::FALSE, + 'psgi.multithread' => Plack::Util::FALSE, + 'psgi.multiprocess' => Plack::Util::TRUE, + 'psgix.harakiri'=> Plack::Util::FALSE, + 'psgix.input.buffered' => Plack::Util::TRUE, + 'pi-httpd.async' => do { + no warnings 'once'; + *pi_httpd_async + }, + ); + bless { + app => $app, + env => \%env + }, $class; +} + +1; diff --git a/lib/PublicInbox/HTTPD/Async.pm b/lib/PublicInbox/HTTPD/Async.pm new file mode 100644 index 00000000..68514f5a --- /dev/null +++ b/lib/PublicInbox/HTTPD/Async.pm @@ -0,0 +1,78 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# XXX This is a totally unstable API for public-inbox internal use only +# This is exposed via the 'pi-httpd.async' key in the PSGI env hash. +# The name of this key is not even stable! +# Currently is is intended for use with read-only pipes. +package PublicInbox::HTTPD::Async; +use strict; +use warnings; +use base qw(Danga::Socket); +use fields qw(cb cleanup); +use Scalar::Util qw(weaken); +require PublicInbox::EvCleanup; + +sub new { + my ($class, $io, $cb, $cleanup) = @_; + my $self = fields::new($class); + IO::Handle::blocking($io, 0); + $self->SUPER::new($io); + $self->{cb} = $cb; + $self->{cleanup} = $cleanup; + $self->watch_read(1); + $self; +} + +sub restart_read_cb ($) { + my ($self) = @_; + sub { $self->watch_read(1) } +} + +sub async_pass { + my ($self, $io, $fh, $bref) = @_; + # In case the client HTTP connection ($io) dies, it + # will automatically close this ($self) object. + $io->{forward} = $self; + $fh->write($$bref); + my $restart_read = restart_read_cb($self); + weaken($self); + $self->{cb} = sub { + my $r = sysread($self->{sock}, $$bref, 8192); + if ($r) { + $fh->write($$bref); + return if $io->{closed}; + if ($io->{write_buf_size}) { + $self->watch_read(0); + $io->write($restart_read); # D::S::write + } + # stay in watch_read, but let other clients + # get some work done, too. + return; + } elsif (!defined $r) { + return if $!{EAGAIN} || $!{EINTR}; + } + + # Done! Error handling will happen in $fh->close + # called by the {cleanup} handler + $io->{forward} = undef; + $self->close; + } +} + +sub event_read { $_[0]->{cb}->() } +sub event_hup { $_[0]->{cb}->() } +sub event_err { $_[0]->{cb}->() } +sub sysread { shift->{sock}->sysread(@_) } + +sub close { + my $self = shift; + my $cleanup = $self->{cleanup}; + $self->{cleanup} = $self->{cb} = undef; + $self->SUPER::close(@_); + + # we defer this to the next timer loop since close is deferred + PublicInbox::EvCleanup::next_tick($cleanup) if $cleanup; +} + +1; diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm index c0db5667..15b5fd3e 100644 --- a/lib/PublicInbox/Hval.pm +++ b/lib/PublicInbox/Hval.pm @@ -7,8 +7,8 @@ package PublicInbox::Hval; use strict; use warnings; use Encode qw(find_encoding); +use PublicInbox::MID qw/mid_clean mid_escape/; use URI::Escape qw(uri_escape_utf8); -use PublicInbox::MID qw/mid_clean/; use base qw/Exporter/; our @EXPORT_OK = qw/ascii_html utf8_html to_attr from_attr/; @@ -42,9 +42,8 @@ sub new { } sub new_msgid { - my ($class, $msgid, $no_compress) = @_; - $msgid = mid_clean($msgid); - $class->new($msgid, $msgid); + my ($class, $msgid) = @_; + $class->new($msgid, mid_escape($msgid)); } sub new_oneline { @@ -63,10 +62,24 @@ my %xhtml_map = ( '>' => '>', ); +$xhtml_map{chr($_)} = sprintf('\\x%02x', $_) for (0..31); +# some of these overrides are standard C escapes so they're +# easy-to-understand when rendered. +$xhtml_map{"\x00"} = '\\0'; # NUL +$xhtml_map{"\x07"} = '\\a'; # bell +$xhtml_map{"\x08"} = '\\b'; # backspace +$xhtml_map{"\x09"} = "\t"; # obvious to show as-is +$xhtml_map{"\x0a"} = "\n"; # obvious to show as-is +$xhtml_map{"\x0b"} = '\\v'; # vertical tab +$xhtml_map{"\x0c"} = '\\f'; # form feed +$xhtml_map{"\x0d"} = '\\r'; # carriage ret (not preceding \n) +$xhtml_map{"\x1b"} = '^['; # ASCII escape (mutt seems to escape this way) +$xhtml_map{"\x7f"} = '\\x7f'; # DEL + sub ascii_html { my ($s) = @_; $s =~ s/\r\n/\n/sg; # fixup bad line endings - $s =~ s/([<>&'"])/$xhtml_map{$1}/ge; + $s =~ s/([<>&'"\x7f\x00-\x1f])/$xhtml_map{$1}/sge; $enc_ascii->encode($s, Encode::HTMLCREF); } diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm new file mode 100644 index 00000000..1ac112b8 --- /dev/null +++ b/lib/PublicInbox/Import.pm @@ -0,0 +1,371 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# git fast-import-based ssoma-mda MDA replacement +# This is only ever run by public-inbox-mda and public-inbox-learn, +# not the WWW or NNTP code which only requires read-only access. +package PublicInbox::Import; +use strict; +use warnings; +use Fcntl qw(:flock :DEFAULT); +use PublicInbox::Spawn qw(spawn); +use PublicInbox::MID qw(mid_mime mid2path); +use PublicInbox::Address; + +sub new { + my ($class, $git, $name, $email, $inbox) = @_; + bless { + git => $git, + ident => "$name <$email>", + mark => 1, + ref => 'refs/heads/master', + inbox => $inbox, + }, $class +} + +# idempotent start function +sub gfi_start { + my ($self) = @_; + + return ($self->{in}, $self->{out}) if $self->{pid}; + + my ($in_r, $in_w, $out_r, $out_w); + pipe($in_r, $in_w) or die "pipe failed: $!"; + pipe($out_r, $out_w) or die "pipe failed: $!"; + my $git = $self->{git}; + my $git_dir = $git->{git_dir}; + my $lockpath = "$git_dir/ssoma.lock"; + sysopen(my $lockfh, $lockpath, O_WRONLY|O_CREAT) or + die "failed to open lock $lockpath: $!"; + + # wait for other processes to be done + flock($lockfh, LOCK_EX) or die "lock failed: $!\n"; + local $/ = "\n"; + chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $self->{ref})); + + my @cmd = ('git', "--git-dir=$git_dir", qw(fast-import + --quiet --done --date-format=rfc2822)); + my $rdr = { 0 => fileno($out_r), 1 => fileno($in_w) }; + my $pid = spawn(\@cmd, undef, $rdr); + die "spawn fast-import failed: $!" unless defined $pid; + $out_w->autoflush(1); + $self->{in} = $in_r; + $self->{out} = $out_w; + $self->{lockfh} = $lockfh; + $self->{pid} = $pid; + $self->{nchg} = 0; + ($in_r, $out_w); +} + +sub wfail () { die "write to fast-import failed: $!" } + +sub now2822 () { + my @t = gmtime(time); + my $day = qw(Sun Mon Tue Wed Thu Fri Sat)[$t[6]]; + my $mon = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)[$t[4]]; + + sprintf('%s, %2d %s %d %02d:%02d:%02d +0000', + $day, $t[3], $mon, $t[5] + 1900, $t[2], $t[1], $t[0]); +} + +sub norm_body ($) { + my ($mime) = @_; + my $b = $mime->body_raw; + $b =~ s/(\r?\n)+\z//s; + $b +} + +sub _check_path ($$$$) { + my ($r, $w, $tip, $path) = @_; + return if $tip eq ''; + print $w "ls $tip $path\n" or wfail; + local $/ = "\n"; + defined(my $info = <$r>) or die "EOF from fast-import: $!"; + $info =~ /\Amissing / ? undef : $info; +} + +# returns undef on non-existent +# ('MISMATCH', msg) on mismatch +# (:MARK, msg) on success +sub remove { + my ($self, $mime) = @_; # mime = Email::MIME + + my $mid = mid_mime($mime); + my $path = mid2path($mid); + + my ($r, $w) = $self->gfi_start; + my $tip = $self->{tip}; + my $info = _check_path($r, $w, $tip, $path) or return ('MISSING',undef); + $info =~ m!\A100644 blob ([a-f0-9]{40})\t!s or die "not blob: $info"; + my $blob = $1; + + print $w "cat-blob $blob\n" or wfail; + local $/ = "\n"; + $info = <$r>; + defined $info or die "EOF from fast-import / cat-blob: $!"; + $info =~ /\A[a-f0-9]{40} blob (\d+)\n\z/ or + die "unexpected cat-blob response: $info"; + my $left = $1; + my $offset = 0; + my $buf = ''; + my $n; + while ($left > 0) { + $n = read($r, $buf, $left, $offset); + defined($n) or die "read cat-blob failed: $!"; + $n == 0 and die 'fast-export (cat-blob) died'; + $left -= $n; + $offset += $n; + } + $n = read($r, my $lf, 1); + defined($n) or die "read final byte of cat-blob failed: $!"; + die "bad read on final byte: <$lf>" if $lf ne "\n"; + my $cur = Email::MIME->new($buf); + my $cur_s = $cur->header('Subject'); + $cur_s = '' unless defined $cur_s; + my $cur_m = $mime->header('Subject'); + $cur_m = '' unless defined $cur_m; + if ($cur_s ne $cur_m || norm_body($cur) ne norm_body($mime)) { + return ('MISMATCH', $cur); + } + + my $ref = $self->{ref}; + my $commit = $self->{mark}++; + my $parent = $tip =~ /\A:/ ? $tip : undef; + unless ($parent) { + print $w "reset $ref\n" or wfail; + } + my $ident = $self->{ident}; + my $now = now2822(); + print $w "commit $ref\nmark :$commit\n", + "author $ident $now\n", + "committer $ident $now\n", + "data 3\nrm\n\n", + 'from ', ($parent ? $parent : $tip), "\n" or wfail; + print $w "D $path\n\n" or wfail; + $self->{nchg}++; + (($self->{tip} = ":$commit"), $cur); +} + +# returns undef on duplicate +sub add { + my ($self, $mime, $check_cb) = @_; # mime = Email::MIME + + my $from = $mime->header('From'); + my ($email) = PublicInbox::Address::emails($from); + my ($name) = PublicInbox::Address::names($from); + # git gets confused with: + # "'A U Thor <u@example.com>' via foo" <foo@example.com> + # ref: + # <CAD0k6qSUYANxbjjbE4jTW4EeVwOYgBD=bXkSu=akiYC_CB7Ffw@mail.gmail.com> + $name =~ tr/<>//d; + + my $date = $mime->header('Date'); + my $subject = $mime->header('Subject'); + $subject = '(no subject)' unless defined $subject; + my $mid = mid_mime($mime); + my $path = mid2path($mid); + + my ($r, $w) = $self->gfi_start; + my $tip = $self->{tip}; + _check_path($r, $w, $tip, $path) and return; + + # kill potentially confusing/misleading headers + $mime->header_set($_) for qw(bytes lines content-length status); + if ($check_cb) { + $mime = $check_cb->($mime) or return; + } + + $mime = $mime->as_string; + my $blob = $self->{mark}++; + print $w "blob\nmark :$blob\ndata ", length($mime), "\n" or wfail; + print $w $mime, "\n" or wfail; + my $ref = $self->{ref}; + my $commit = $self->{mark}++; + my $parent = $tip =~ /\A:/ ? $tip : undef; + + unless ($parent) { + print $w "reset $ref\n" or wfail; + } + + # quiet down wide character warnings: + binmode $w, ':utf8' or die "binmode :utf8 failed: $!"; + print $w "commit $ref\nmark :$commit\n", + "author $name <$email> $date\n", + "committer $self->{ident} ", now2822(), "\n", + "data ", (bytes::length($subject) + 1), "\n", + $subject, "\n\n" or wfail; + binmode $w, ':raw' or die "binmode :raw failed: $!"; + + if ($tip ne '') { + print $w 'from ', ($parent ? $parent : $tip), "\n" or wfail; + } + print $w "M 100644 :$blob $path\n\n" or wfail; + $self->{nchg}++; + $self->{tip} = ":$commit"; +} + +sub run_die ($$) { + my ($cmd, $env) = @_; + my $pid = spawn($cmd, $env, undef); + defined $pid or die "spawning ".join(' ', @$cmd)." failed: $!"; + waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish'; + $? == 0 or die join(' ', @$cmd) . " failed: $?\n"; +} + +sub done { + my ($self) = @_; + my $w = delete $self->{out} or return; + my $r = delete $self->{in} or die 'BUG: missing {in} when done'; + print $w "done\n" or wfail; + my $pid = delete $self->{pid} or die 'BUG: missing {pid} when done'; + waitpid($pid, 0) == $pid or die 'fast-import did not finish'; + $? == 0 or die "fast-import failed: $?"; + my $nchg = delete $self->{nchg}; + + # for compatibility with existing ssoma installations + # we can probably remove this entirely by 2020 + my $git_dir = $self->{git}->{git_dir}; + # XXX: change the following scope to: if (-e $index) # in 2018 or so.. + my @cmd = ('git', "--git-dir=$git_dir"); + if ($nchg && !$ENV{FAST}) { + my $index = "$git_dir/ssoma.index"; + my $env = { GIT_INDEX_FILE => $index }; + run_die([@cmd, qw(read-tree -m -v -i), $self->{ref}], $env); + } + if ($nchg) { + run_die([@cmd, 'update-server-info'], undef); + eval { + require PublicInbox::SearchIdx; + my $inbox = $self->{inbox} || $git_dir; + my $s = PublicInbox::SearchIdx->new($inbox); + $s->index_sync({ ref => $self->{ref} }); + }; + + eval { run_die([@cmd, qw(gc --auto)], undef) }; + } + + my $lockfh = delete $self->{lockfh} or die "BUG: not locked: $!"; + flock($lockfh, LOCK_UN) or die "unlock failed: $!"; + close $lockfh or die "close lock failed: $!"; +} + +1; +__END__ +=pod + +=head1 NAME + +PublicInbox::Import - message importer for public-inbox + +=head1 VERSION + +version 1.0 + +=head1 SYNOPSYS + + use Email::MIME; + use PublicInbox::Git; + use PublicInbox::Import; + + chomp(my $git_dir = `git rev-parse --git-dir`); + $git_dir or die "GIT_DIR= must be specified\n"; + my $git = PublicInbox::Git->new($git_dir); + my @committer = ('inbox', 'inbox@example.org'); + my $im = PublicInbox::Import->new($git, @committer); + + # to add a message: + my $message = "From: <u\@example.org>\n". + "Subject: test message \n" . + "Date: Thu, 01 Jan 1970 00:00:00 +0000\n" . + "Message-ID: <m\@example.org>\n". + "\ntest message"; + my $parsed = Email::MIME->new($message); + my $ret = $im->add($parsed); + if (!defined $ret) { + warn "duplicate: ", + $parsed->header_obj->header_raw('Message-ID'), "\n"; + } else { + print "imported at mark $ret\n"; + } + $im->done; + + # to remove a message + my $junk = Email::MIME->new($message); + my ($mark, $orig) = $im->remove($junk); + if ($mark eq 'MISSING') { + print "not found\n"; + } elsif ($mark eq 'MISMATCH') { + print "Message exists but does not match\n\n", + $orig->as_string, "\n",; + } else { + print "removed at mark $mark\n\n", + $orig->as_string, "\n"; + } + $im->done; + +=head1 DESCRIPTION + +An importer and remover for public-inboxes which takes L<Email::MIME> +messages as input and stores them in a ssoma repository as +documented in L<https://ssoma.public-inbox.org/ssoma_repository.txt>, +except it does not allow duplicate Message-IDs. + +It requires L<git(1)> and L<git-fast-import(1)> to be installed. + +=head1 METHODS + +=cut + +=head2 new + + my $im = PublicInbox::Import->new($git, @committer); + +Initialize a new PublicInbox::Import object. + +=head2 add + + my $parsed = Email::MIME->new($message); + $im->add($parsed); + +Adds a message to to the git repository. This will acquire +C<$GIT_DIR/ssoma.lock> and start L<git-fast-import(1)> if necessary. + +Messages added will not be visible to other processes until L</done> +is called, but L</remove> may be called on them. + +=head2 remove + + my $junk = Email::MIME->new($message); + my ($code, $orig) = $im->remove($junk); + +Removes a message from the repository. On success, it returns +a ':'-prefixed numeric code representing the git-fast-import +mark and the original messages as an Email::MIME object. +If the message could not be found, the code is "MISSING" +and the original message is undef. If there is a mismatch where +the "Message-ID" is matched but the subject and body do not match, +the returned code is "MISMATCH" and the conflicting message +is returned as orig. + +=head2 done + +Finalizes the L<git-fast-import(1)> and unlocks the repository. +Calling this is required to finalize changes to a repository. + +=head1 SEE ALSO + +L<Email::MIME> + +=head1 CONTACT + +All feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> + +The mail archives are hosted at L<https://public-inbox.org/meta/> + +=head1 COPYRIGHT + +Copyright (C) 2016 all contributors L<mailto:meta@public-inbox.org> + +License: AGPL-3.0+ L<http://www.gnu.org/licenses/agpl-3.0.txt> + +=cut diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm new file mode 100644 index 00000000..8c639082 --- /dev/null +++ b/lib/PublicInbox/Inbox.pm @@ -0,0 +1,235 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Represents a public-inbox (which may have multiple mailing addresses) +package PublicInbox::Inbox; +use strict; +use warnings; +use Scalar::Util qw(weaken isweak); +use PublicInbox::Git; +use PublicInbox::MID qw(mid2path); + +my $weakt; +eval { + $weakt = 'disabled'; + require PublicInbox::EvCleanup; + $weakt = undef; # OK if we get here +}; + +my $WEAKEN = {}; # string(inbox) -> inbox +sub weaken_task () { + $weakt = undef; + _weaken_fields($_) for values %$WEAKEN; + $WEAKEN = {}; +} + +sub _weaken_later ($) { + my ($self) = @_; + $weakt ||= PublicInbox::EvCleanup::later(*weaken_task); + $WEAKEN->{"$self"} = $self; +} + +sub new { + my ($class, $opts) = @_; + my $v = $opts->{address} ||= 'public-inbox@example.com'; + my $p = $opts->{-primary_address} = ref($v) eq 'ARRAY' ? $v->[0] : $v; + $opts->{domain} = ($p =~ /\@(\S+)\z/) ? $1 : 'localhost'; + weaken($opts->{-pi_config}); + bless $opts, $class; +} + +sub _weaken_fields { + my ($self) = @_; + foreach my $f (qw(git mm search)) { + isweak($self->{$f}) or weaken($self->{$f}); + } +} + +sub _set_limiter ($$$) { + my ($self, $git, $pfx) = @_; + my $lkey = "-${pfx}_limiter"; + $git->{$lkey} = $self->{$lkey} ||= eval { + my $mkey = $pfx.'max'; + my $val = $self->{$mkey} or return; + my $lim; + if ($val =~ /\A\d+\z/) { + require PublicInbox::Qspawn; + $lim = PublicInbox::Qspawn::Limiter->new($val); + } elsif ($val =~ /\A[a-z][a-z0-9]*\z/) { + $lim = $self->{-pi_config}->limiter($val); + warn "$mkey limiter=$val not found\n" if !$lim; + } else { + warn "$mkey limiter=$val not understood\n"; + } + $lim; + } +} + +sub git { + my ($self) = @_; + $self->{git} ||= eval { + _weaken_later($self); + my $g = PublicInbox::Git->new($self->{mainrepo}); + _set_limiter($self, $g, 'httpbackend'); + $g; + }; +} + +sub mm { + my ($self) = @_; + $self->{mm} ||= eval { + _weaken_later($self); + PublicInbox::Msgmap->new($self->{mainrepo}); + }; +} + +sub search { + my ($self) = @_; + $self->{search} ||= eval { + _weaken_later($self); + PublicInbox::Search->new($self->{mainrepo}, $self->{altid}); + }; +} + +sub try_cat { + my ($path) = @_; + my $rv = ''; + if (open(my $fh, '<', $path)) { + local $/; + $rv = <$fh>; + } + $rv; +} + +sub description { + my ($self) = @_; + my $desc = $self->{description}; + return $desc if defined $desc; + $desc = try_cat("$self->{mainrepo}/description"); + local $/ = "\n"; + chomp $desc; + $desc =~ s/\s+/ /smg; + $desc = '($GIT_DIR/description missing)' if $desc eq ''; + $self->{description} = $desc; +} + +sub cloneurl { + my ($self) = @_; + my $url = $self->{cloneurl}; + return $url if $url; + $url = try_cat("$self->{mainrepo}/cloneurl"); + my @url = split(/\s+/s, $url); + local $/ = "\n"; + chomp @url; + $self->{cloneurl} = \@url; +} + +sub base_url { + my ($self, $env) = @_; + if ($env) { # PSGI env + my $scheme = $env->{'psgi.url_scheme'}; + my $host_port = $env->{HTTP_HOST} || + "$env->{SERVER_NAME}:$env->{SERVER_PORT}"; + my $url = "$scheme://$host_port". ($env->{SCRIPT_NAME} || '/'); + # for mount in Plack::Builder + $url .= '/' if $url !~ m!/\z!; + $url .= $self->{name} . '/'; + } else { + # either called from a non-PSGI environment (e.g. NNTP/POP3) + $self->{-base_url} ||= do { + my $url = $self->{url} or return undef; + # expand protocol-relative URLs to HTTPS if we're + # not inside a web server + $url = "https:$url" if $url =~ m!\A//!; + $url .= '/' if $url !~ m!/\z!; + $url; + }; + } +} + +sub nntp_url { + my ($self) = @_; + $self->{-nntp_url} ||= do { + # no checking for nntp_usable here, we can point entirely + # to non-local servers or users run by a different user + my $ns = $self->{-pi_config}->{'publicinbox.nntpserver'}; + my $group = $self->{newsgroup}; + my @urls; + if ($ns && $group) { + $ns = [ $ns ] if ref($ns) ne 'ARRAY'; + @urls = map { + my $u = m!\Anntps?://! ? $_ : "nntp://$_"; + $u .= '/' if $u !~ m!/\z!; + $u.$group; + } @$ns; + } + + my $mirrors = $self->{nntpmirror}; + if ($mirrors) { + my @m; + foreach (@$mirrors) { + my $u = m!\Anntps?://! ? $_ : "nntp://$_"; + if ($u =~ m!\Anntps?://[^/]+/?\z!) { + if ($group) { + $u .= '/' if $u !~ m!/\z!; + $u .= $group; + } else { + warn +"publicinbox.$self->{name}.nntpmirror=$_ missing newsgroup name\n"; + } + } + # else: allow full URLs like: + # nntp://news.example.com/alt.example + push @m, $u; + } + my %seen = map { $_ => 1 } @urls; + foreach my $u (@m) { + next if $seen{$u}; + $seen{$u} = 1; + push @urls, $u; + } + } + \@urls; + }; +} + +sub nntp_usable { + my ($self) = @_; + my $ret = $self->mm && $self->search; + $self->{mm} = $self->{search} = undef; + $ret; +} + +sub msg_by_path ($$;$) { + my ($self, $path, $ref) = @_; + # TODO: allow other refs: + my $str = git($self)->cat_file('HEAD:'.$path, $ref); + $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s if $str; + $str; +} + +sub msg_by_smsg ($$;$) { + my ($self, $smsg, $ref) = @_; + + return unless defined $smsg; # ghost + + # backwards compat to fallback to msg_by_mid + # TODO: remove if we bump SCHEMA_VERSION in Search.pm: + defined(my $blob = $smsg->blob) or return msg_by_mid($self, $smsg->mid); + + my $str = git($self)->cat_file($blob, $ref); + $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s if $str; + $str; +} + +sub path_check { + my ($self, $path) = @_; + git($self)->check('HEAD:'.$path); +} + +sub msg_by_mid ($$;$) { + my ($self, $mid, $ref) = @_; + msg_by_path($self, mid2path($mid), $ref); +} + +1; diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm index 4eddedd0..acd2a47e 100644 --- a/lib/PublicInbox/Linkify.pm +++ b/lib/PublicInbox/Linkify.pm @@ -15,21 +15,32 @@ use warnings; use Digest::SHA qw/sha1_hex/; my $SALT = rand; -my $LINK_RE = qr!\b((?:ftp|https?|nntp):// +my $LINK_RE = qr{(\()?\b((?:ftps?|https?|nntps?|gopher):// [\@:\w\.-]+/ - ?[\@\w\+\&\?\.\%\;/#=-]*)!x; + (?:[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]*) + (?:\?[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]+)? + (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)? + )}xi; sub new { bless {}, shift } sub linkify_1 { my ($self, $s) = @_; $s =~ s!$LINK_RE! - my $url = $1; + my $beg = $1 || ''; + my $url = $2; my $end = ''; # it's fairly common to end URLs in messages with - # '.' or ';' to denote the end of a statement. - if ($url =~ s/(\.)\z// || $url =~ s/(;)\z//) { + # '.', ',' or ';' to denote the end of a statement; + # assume the intent was to end the statement/sentence + # in English + # Markdown compatibility: + if ($beg eq '(') { + if ($url =~ s/(\)[\.,;]?)\z//) { + $end = $1; + } + } elsif ($url =~ s/([\.,;])\z//) { $end = $1; } @@ -40,7 +51,7 @@ sub linkify_1 { # only escape ampersands, others do not match LINK_RE $url =~ s/&/&/g; $self->{$key} = $url; - 'PI-LINK-'. $key . $end; + $beg . 'PI-LINK-'. $key . $end; !ge; $s; } diff --git a/lib/PublicInbox/MDA.pm b/lib/PublicInbox/MDA.pm index 003bac65..bcf5358b 100644 --- a/lib/PublicInbox/MDA.pm +++ b/lib/PublicInbox/MDA.pm @@ -6,11 +6,22 @@ package PublicInbox::MDA; use strict; use warnings; use Email::Simple; -use Email::Address; use Date::Parse qw(strptime); use constant MAX_SIZE => 1024 * 500; # same as spamc default, should be tunable use constant MAX_MID_SIZE => 244; # max term size - 1 in Xapian -use constant cmd => qw/ssoma-mda -1/; + +our @BAD_HEADERS = ( + # postfix + qw(delivered-to x-original-to), # prevent training loops + + # The rest are taken from Mailman 2.1.15: + # could contain passwords: + qw(approved approve x-approved x-approve urgent), + # could be used phishing: + qw(return-receipt-to disposition-notification-to x-confirm-reading-to), + # Pegasus mail: + qw(x-pmrqc) +); # drop plus addressing for matching sub __drop_plus { @@ -21,16 +32,17 @@ sub __drop_plus { # do not allow Bcc, only Cc and To if recipient is set sub precheck { - my ($klass, $filter, $address) = @_; - my Email::Simple $simple = $filter->simple; - my $mid = $simple->header("Message-ID"); + my ($klass, $simple, $address) = @_; + my @mid = $simple->header('Message-ID'); + return 0 if scalar(@mid) != 1; + my $mid = $mid[0]; return 0 if (length($mid) > MAX_MID_SIZE); return 0 unless usable_str(length('<m@h>'), $mid) && $mid =~ /\@/; - return 0 unless usable_str(length('u@h'), $filter->from); + return 0 unless usable_str(length('u@h'), $simple->header("From")); return 0 unless usable_str(length(':o'), $simple->header("Subject")); return 0 unless usable_date($simple->header("Date")); return 0 if length($simple->as_string) > MAX_SIZE; - alias_specified($filter, $address); + alias_specified($simple, $address); } sub usable_str { @@ -44,17 +56,17 @@ sub usable_date { } sub alias_specified { - my ($filter, $address) = @_; + my ($simple, $address) = @_; my @address = ref($address) eq 'ARRAY' ? @$address : ($address); my %ok = map { - my @recip = Email::Address->parse($_); - lc(__drop_plus($recip[0]->address)) => 1; + lc(__drop_plus($_)) => 1; } @address; - foreach my $line ($filter->cc, $filter->to) { - foreach my $addr (Email::Address->parse($line)) { - if ($ok{lc(__drop_plus($addr->address))}) { + foreach my $line ($simple->header('Cc'), $simple->header('To')) { + my @addrs = ($line =~ /([^,<\s]+\@[^,>\s]+)/g); + foreach my $addr (@addrs) { + if ($ok{lc(__drop_plus($addr))}) { return 1; } } @@ -64,28 +76,12 @@ sub alias_specified { sub set_list_headers { my ($class, $simple, $dst) = @_; - my $pa = $dst->{-primary_address}; - - $simple->header_set("List-Id", "<$pa>"); # RFC2919 - - # remove Delivered-To: prevent training loops - # The rest are taken from Mailman 2.1.15, some may be used for phishing - foreach my $h (qw(delivered-to approved approve x-approved x-approve - urgent return-receipt-to disposition-notification-to - x-confirm-reading-to x-pmrqc)) { - $simple->header_set($h); + unless (defined $simple->header('List-Id')) { + my $pa = $dst->{-primary_address}; + $simple->header_set("List-Id", "<$pa>"); # RFC2919 } -} - -# returns a 3-element array: name, email, date -sub author_info { - my ($class, $mime) = @_; - my $from = $mime->header('From'); - my @from = Email::Address->parse($from); - my $name = $from[0]->name; - my $email = $from[0]->address; - ($name, $email, $mime->header('Date')); + $simple->header_set($_) foreach @BAD_HEADERS; } 1; diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm index 78952b95..1c2d75cc 100644 --- a/lib/PublicInbox/MID.pm +++ b/lib/PublicInbox/MID.pm @@ -6,7 +6,8 @@ package PublicInbox::MID; use strict; use warnings; use base qw/Exporter/; -our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime/; +our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime mid_escape/; +use URI::Escape qw(uri_escape_utf8); use Digest::SHA qw/sha1_hex/; use constant MID_MAX => 40; # SHA-1 hex length @@ -25,6 +26,7 @@ sub id_compress { my ($id, $force) = @_; if ($force || $id =~ /[^\w\-]/ || length($id) > MID_MAX) { + utf8::encode($id); return sha1_hex($id); } $id; @@ -36,7 +38,9 @@ sub mid2path { unless (defined $x38) { # compatibility with old links (or short Message-IDs :) - $mid = sha1_hex(mid_clean($mid)); + $mid = mid_clean($mid); + utf8::encode($mid); + $mid = sha1_hex($mid); ($x2, $x38) = ($mid =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/); } "$x2/$x38"; @@ -44,4 +48,8 @@ sub mid2path { sub mid_mime ($) { $_[0]->header_obj->header_raw('Message-ID') } +# RFC3986, section 3.3: +sub MID_ESC () { '^A-Za-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@' } +sub mid_escape ($) { uri_escape_utf8($_[0], MID_ESC) } + 1; diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index 0d67981f..2565ea58 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -6,32 +6,18 @@ package PublicInbox::Mbox; use strict; use warnings; -use PublicInbox::MID qw/mid2path mid_clean/; -use URI::Escape qw/uri_escape_utf8/; +use PublicInbox::MID qw/mid_clean mid_escape/; require Email::Simple; -sub thread_mbox { - my ($ctx, $srch, $sfx) = @_; - sub { - my ($response) = @_; # Plack callback - emit_mbox($response, $ctx, $srch, $sfx); - } -} - sub emit1 { - my $simple = Email::Simple->new(pop); - my $ctx = pop; - sub { - my ($response) = @_; - # single message should be easily renderable in browsers - my $fh = $response->([200, ['Content-Type'=>'text/plain']]); - emit_msg($ctx, $fh, $simple); - $fh->close; - } + my ($ctx, $msg) = @_; + $msg = Email::Simple->new($msg); + # single message should be easily renderable in browsers + [200, ['Content-Type', 'text/plain'], [ msg_str($ctx, $msg)] ] } -sub emit_msg { - my ($ctx, $fh, $simple) = @_; # Email::Simple object +sub msg_str { + my ($ctx, $simple) = @_; # Email::Simple object my $header_obj = $simple->header_obj; # drop potentially confusing headers, ssoma already should've dropped @@ -39,74 +25,67 @@ sub emit_msg { foreach my $d (qw(Lines Bytes Content-Length Status)) { $header_obj->header_set($d); } - my $feed_opts = $ctx->{feed_opts}; - unless ($feed_opts) { - require PublicInbox::Feed; # FIXME: gross - $feed_opts = PublicInbox::Feed::get_feedopts($ctx); - $ctx->{feed_opts} = $feed_opts; - } - my $base = $feed_opts->{url}; + my $ibx = $ctx->{-inbox}; + my $base = $ibx->base_url($ctx->{env}); my $mid = mid_clean($header_obj->header('Message-ID')); - $mid = uri_escape_utf8($mid); - my @archived_at = $header_obj->header('Archived-At'); - push @archived_at, "<$base$mid/>"; - $header_obj->header_set('Archived-At', @archived_at); - $header_obj->header_set('List-Archive', "<$base>"); - $header_obj->header_set('List-Post', "<mailto:$feed_opts->{id_addr}>"); - - my $buf = $header_obj->as_string; - unless ($buf =~ /\AFrom /) { - $fh->write("From mboxrd\@z Thu Jan 1 00:00:00 1970\n"); + $mid = mid_escape($mid); + my @append = ( + 'Archived-At', "<$base$mid/>", + 'List-Archive', "<$base>", + 'List-Post', "<mailto:$ibx->{-primary_address}>", + ); + my $crlf = $simple->crlf; + my $buf = "From mboxrd\@z Thu Jan 1 00:00:00 1970\n" . + $header_obj->as_string; + for (my $i = 0; $i < @append; $i += 2) { + my $k = $append[$i]; + my $v = $append[$i + 1]; + my @v = $header_obj->header($k); + foreach (@v) { + if ($v eq $_) { + $v = undef; + last; + } + } + $buf .= "$k: $v$crlf" if defined $v; } - $fh->write($buf .= $simple->crlf); - - $buf = $simple->body; - $simple->body_set(''); + $buf .= $crlf; # mboxrd quoting style # ref: http://www.qmail.org/man/man5/mbox.html - $buf =~ s/^(>*From )/>$1/gm; + my $body = $simple->body; + $body =~ s/^(>*From )/>$1/gm; + $buf .= $body; + $buf .= "\n"; +} - $buf .= "\n" unless $buf =~ /\n\z/s; +sub thread_mbox { + my ($ctx, $srch, $sfx) = @_; + eval { require IO::Compress::Gzip }; + return sub { need_gzip(@_) } if $@; - $fh->write($buf); + my $cb = sub { $srch->get_thread($ctx->{mid}, @_) }; + # http://www.iana.org/assignments/media-types/application/gzip + [200, ['Content-Type' => 'application/gzip'], + PublicInbox::MboxGz->new($ctx, $cb) ]; } -sub emit_mbox { - my ($response, $ctx, $srch, $sfx) = @_; - my $type = 'mbox'; - if ($sfx) { - eval { require IO::Compress::Gzip }; - return need_gzip($response) if $@; - $type = 'gzip'; +sub emit_range { + my ($ctx, $range) = @_; + + eval { require IO::Compress::Gzip }; + return sub { need_gzip(@_) } if $@; + my $query; + if ($range eq 'all') { # TODO: YYYY[-MM] + $query = ''; + } else { + return [404, [qw(Content-Type text/plain)], []]; } + my $cb = sub { $ctx->{srch}->query($query, @_) }; # http://www.iana.org/assignments/media-types/application/gzip - # http://www.iana.org/assignments/media-types/application/mbox - my $fh = $response->([200, ['Content-Type' => "application/$type"]]); - $fh = PublicInbox::MboxGz->new($fh) if $sfx; - - require PublicInbox::Git; - my $mid = $ctx->{mid}; - my $git = $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); - my %opts = (offset => 0); - my $nr; - do { - my $res = $srch->get_thread($mid, \%opts); - my $msgs = $res->{msgs}; - $nr = scalar @$msgs; - while (defined(my $smsg = shift @$msgs)) { - my $msg = eval { - my $p = 'HEAD:'.mid2path($smsg->mid); - Email::Simple->new($git->cat_file($p)); - }; - emit_msg($ctx, $fh, $msg) if $msg; - } - - $opts{offset} += $nr; - } while ($nr > 0); - - $fh->close; + [200, [qw(Content-Type application/gzip)], + PublicInbox::MboxGz->new($ctx, $cb) ]; } sub need_gzip { @@ -123,40 +102,55 @@ EOF 1; -# fh may not be a proper IO, so we wrap the write and close methods -# to prevent IO::Compress::Gzip from complaining package PublicInbox::MboxGz; use strict; use warnings; sub new { - my ($class, $fh) = @_; - my $buf; + my ($class, $ctx, $cb) = @_; + my $buf = ''; bless { buf => \$buf, - gz => IO::Compress::Gzip->new(\$buf), - fh => $fh, + gz => IO::Compress::Gzip->new(\$buf, Time => 0), + cb => $cb, + ctx => $ctx, + msgs => [], + opts => { offset => 0 }, }, $class; } -sub _flush_buf { +# called by Plack::Util::foreach or similar +sub getline { my ($self) = @_; - if (defined ${$self->{buf}}) { - $self->{fh}->write(${$self->{buf}}); - ${$self->{buf}} = undef; - } -} - -sub write { - $_[0]->{gz}->write($_[1]); - _flush_buf($_[0]); + my $ctx = $self->{ctx} or return; + my $res; + my $ibx = $ctx->{-inbox}; + my $gz = $self->{gz}; + do { + while (defined(my $smsg = shift @{$self->{msgs}})) { + my $msg = eval { $ibx->msg_by_smsg($smsg) } or next; + $msg = Email::Simple->new($msg); + $gz->write(PublicInbox::Mbox::msg_str($ctx, $msg)); + my $bref = $self->{buf}; + if (length($$bref) >= 8192) { + my $ret = $$bref; # copy :< + ${$self->{buf}} = ''; + return $ret; + } + + # be fair to other clients on public-inbox-httpd: + return ''; + } + $res = $self->{cb}->($self->{opts}); + $self->{msgs} = $res->{msgs}; + $res = scalar @{$self->{msgs}}; + $self->{opts}->{offset} += $res; + } while ($res); + $gz->close; + delete $self->{ctx}; + ${delete $self->{buf}}; } -sub close { - my ($self) = @_; - $self->{gz}->close; - _flush_buf($self); - $self->{fh}->close; -} +sub close {} # noop 1; diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm new file mode 100644 index 00000000..ef0d209f --- /dev/null +++ b/lib/PublicInbox/MsgIter.pm @@ -0,0 +1,57 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +package PublicInbox::MsgIter; +use strict; +use warnings; +use base qw(Exporter); +our @EXPORT = qw(msg_iter); +use Email::MIME; +use Scalar::Util qw(readonly); + +# Workaround Email::MIME versions without +# commit dcef9be66c49ae89c7a5027a789bbbac544499ce +# ("removing all trailing newlines was too much") +# This is necessary for Debian jessie +my $bad = 1.923; +my $good = 1.935; +my $ver = $Email::MIME::VERSION; +my $extra_nl = 1 if ($ver >= $bad && $ver < $good); + +# Like Email::MIME::walk_parts, but this is: +# * non-recursive +# * passes depth and indices to the iterator callback +sub msg_iter ($$) { + my ($mime, $cb) = @_; + my @parts = $mime->subparts; + if (@parts) { + my $i = 0; + @parts = map { [ $_, 1, ++$i ] } @parts; + while (my $p = shift @parts) { + my ($part, $depth, @idx) = @$p; + my @sub = $part->subparts; + if (@sub) { + $depth++; + $i = 0; + @sub = map { [ $_, $depth, @idx, ++$i ] } @sub; + @parts = (@sub, @parts); + } else { + if ($extra_nl) { + my $lf = $part->{mycrlf}; + my $bref = $part->{body}; + if (readonly($$bref)) { + my $s = $$bref . $lf; + $part->{body} = \$s; + } else { + $$bref .= $lf; + } + } + $cb->($p); + } + } + } else { + $cb->([$mime, 0, 0]); + } +} + +1; diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm index 8fe17a95..3fb3805f 100644 --- a/lib/PublicInbox/Msgmap.pm +++ b/lib/PublicInbox/Msgmap.pm @@ -20,7 +20,12 @@ sub new { my $err = $!; -d $d or die "$d not created: $err"; } - my $f = "$d/msgmap.sqlite3"; + new_file($class, "$d/msgmap.sqlite3", $writable); +} + +sub new_file { + my ($class, $f, $writable) = @_; + my $dbh = DBI->connect("dbi:SQLite:dbname=$f",'','', { AutoCommit => 1, RaiseError => 1, @@ -33,11 +38,14 @@ sub new { if ($writable) { create_tables($dbh); + $dbh->begin_work; $self->created_at(time) unless $self->created_at; + $dbh->commit; } $self; } +# n.b. invoked directly by scripts/xhdr-num2mid sub meta_accessor { my ($self, $key, $value) = @_; use constant { @@ -51,22 +59,14 @@ sub meta_accessor { defined $value or return $dbh->selectrow_array(meta_select, undef, $key); - $dbh->begin_work; - eval { - $prev = $dbh->selectrow_array(meta_select, undef, $key); + $prev = $dbh->selectrow_array(meta_select, undef, $key); - if (defined $prev) { - $dbh->do(meta_update, undef, $value, $key); - } else { - $dbh->do(meta_insert, undef, $key, $value); - } - $dbh->commit; - }; - my $err = $@; - return $prev unless $err; - - $dbh->rollback; - die $err; + if (defined $prev) { + $dbh->do(meta_update, undef, $value, $key); + } else { + $dbh->do(meta_insert, undef, $key, $value); + } + $prev; } sub last_commit { @@ -160,6 +160,7 @@ sub create_tables { 'val VARCHAR(255) NOT NULL)'); } +# used by NNTP.pm sub id_batch { my ($self, $num, $cb) = @_; my $dbh = $self->{dbh}; @@ -173,4 +174,15 @@ sub id_batch { $nr; } +# only used for mapping external serial numbers (e.g. articles from gmane) +# see scripts/xhdr-num2mid for usage +sub mid_set { + my ($self, $num, $mid) = @_; + my $sth = $self->{mid_set} ||= do { + my $sql = 'INSERT INTO msgmap (num, mid) VALUES (?,?)'; + $self->{dbh}->prepare($sql); + }; + $sth->execute($num, $mid); +} + 1; diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index 8740377f..9408ffb9 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -9,12 +9,14 @@ use base qw(Danga::Socket); use fields qw(nntpd article rbuf ng long_res); use PublicInbox::Search; use PublicInbox::Msgmap; +use PublicInbox::MID qw(mid_escape); use PublicInbox::Git; -use PublicInbox::MID qw(mid2path); -use Email::MIME; -use Data::Dumper qw(Dumper); +require PublicInbox::EvCleanup; +use Email::Simple; use POSIX qw(strftime); use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); +use Digest::SHA qw(sha1_hex); +use Time::Local qw(timegm timelocal); use constant { r501 => '501 command syntax error', r221 => '221 Header follows', @@ -36,21 +38,36 @@ my $LIST_HEADERS = join("\r\n", @OVERVIEW, my %DISABLED; # = map { $_ => 1 } qw(xover list_overview_fmt newnews xhdr); my $EXPMAP; # fd -> [ idle_time, $self ] -my $EXPTIMER; +my $expt; our $EXPTIME = 180; # 3 minutes +my $nextt; + +my $nextq = []; +sub next_tick () { + $nextt = undef; + my $q = $nextq; + $nextq = []; + foreach my $nntp (@$q) { + # for request && response protocols, always finish writing + # before finishing reading: + if (my $long_cb = $nntp->{long_res}) { + $nntp->write($long_cb); + } elsif (&Danga::Socket::POLLIN & $nntp->{event_watch}) { + event_read($nntp); + } + } +} sub update_idle_time ($) { my ($self) = @_; - my $tmp = $self->{sock} or return; - $tmp = fileno($tmp); - defined $tmp and $EXPMAP->{$tmp} = [ now(), $self ]; + my $fd = $self->{fd}; + defined $fd and $EXPMAP->{$fd} = [ now(), $self ]; } sub expire_old () { my $now = now(); my $exp = $EXPTIME; my $old = $now - $exp; - my $next = $now + $exp; my $nr = 0; my %new; while (my ($fd, $v) = each %$EXPMAP) { @@ -58,36 +75,31 @@ sub expire_old () { if ($idle_time < $old) { $nntp->close; # idempotent } else { - my $nexp = $idle_time + $exp; - $next = $nexp if ($nexp < $next); ++$nr; $new{$fd} = $v; } } $EXPMAP = \%new; if ($nr) { - $next -= $now; - $next = 0 if $next < 0; - $EXPTIMER = Danga::Socket->AddTimer($next, *expire_old); + $expt = PublicInbox::EvCleanup::later(*expire_old); } else { - $EXPTIMER = undef; - # noop to kick outselves out of the loop so descriptors + $expt = undef; + # noop to kick outselves out of the loop ASAP so descriptors # really get closed - Danga::Socket->AddTimer(0, sub {}); + PublicInbox::EvCleanup::asap(sub {}); } } sub new ($$$) { my ($class, $sock, $nntpd) = @_; my $self = fields::new($class); - binmode $sock, ':utf8'; # RFC 3977 $self->SUPER::new($sock); $self->{nntpd} = $nntpd; res($self, '201 server ready - post via email'); $self->{rbuf} = ''; $self->watch_read(1); update_idle_time($self); - $EXPTIMER ||= Danga::Socket->AddTimer($EXPTIME, *expire_old); + $expt ||= PublicInbox::EvCleanup::later(*expire_old); $self; } @@ -115,7 +127,8 @@ sub process_line ($$) { my $res = eval { $req->($self, @args) }; my $err = $@; if ($err && !$self->{closed}) { - chomp($l = Dumper(\$l)); + local $/ = "\n"; + chomp($l); err($self, 'error from: %s (%s)', $l, $err); $res = '503 program fault - command not performed'; } @@ -153,7 +166,7 @@ sub list_active ($;$) { my ($self, $wildmat) = @_; wildmat2re($wildmat); foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{name} =~ $wildmat or next; + $ng->{newsgroup} =~ $wildmat or next; group_line($self, $ng); } } @@ -162,9 +175,9 @@ sub list_active_times ($;$) { my ($self, $wildmat) = @_; wildmat2re($wildmat); foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{name} =~ $wildmat or next; + $ng->{newsgroup} =~ $wildmat or next; my $c = eval { $ng->mm->created_at } || time; - more($self, "$ng->{name} $c $ng->{address}"); + more($self, "$ng->{newsgroup} $c $ng->{-primary_address}"); } } @@ -172,9 +185,9 @@ sub list_newsgroups ($;$) { my ($self, $wildmat) = @_; wildmat2re($wildmat); foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{name} =~ $wildmat or next; + $ng->{newsgroup} =~ $wildmat or next; my $d = $ng->description; - more($self, "$ng->{name} $d"); + more($self, "$ng->{newsgroup} $d"); } } @@ -226,7 +239,6 @@ sub cmd_listgroup ($;$) { sub parse_time ($$;$) { my ($date, $time, $gmt) = @_; - use Time::Local qw(); my ($hh, $mm, $ss) = unpack('A2A2A2', $time); if (defined $gmt) { $gmt =~ /\A(?:UTC|GMT)\z/i or die "GM invalid: $gmt"; @@ -238,22 +250,22 @@ sub parse_time ($$;$) { ($YYYY, $MM, $DD) = unpack('A4A2A2', $date); } else { # legacy clients send YYMMDD ($YYYY, $MM, $DD) = unpack('A2A2A2', $date); - if ($YYYY > strftime('%y', @now)) { - my $cur_year = $now[5] + 1900; + my $cur_year = $now[5] + 1900; + if ($YYYY > $cur_year) { $YYYY += int($cur_year / 1000) * 1000 - 100; } } if ($gmt) { - Time::Local::timegm($ss, $mm, $hh, $DD, $MM - 1, $YYYY); + timegm($ss, $mm, $hh, $DD, $MM - 1, $YYYY); } else { - Time::Local::timelocal($ss, $mm, $hh, $DD, $MM - 1, $YYYY); + timelocal($ss, $mm, $hh, $DD, $MM - 1, $YYYY); } } sub group_line ($$) { my ($self, $ng) = @_; my ($min, $max) = $ng->mm->minmax; - more($self, "$ng->{name} $max $min n") if defined $min && defined $max; + more($self, "$ng->{newsgroup} $max $min n") if defined $min && defined $max; } sub cmd_newgroups ($$$;$$) { @@ -275,7 +287,6 @@ sub wildmat2re (;$) { return $_[0] = qr/.*/ if (!defined $_[0] || $_[0] eq '*'); my %keep; my $salt = rand; - use Digest::SHA qw(sha1_hex); my $tmp = $_[0]; $tmp =~ s#(?<!\\)\[(.+)(?<!\\)\]# @@ -313,8 +324,8 @@ sub cmd_newnews ($$$$;$$) { ngpat2re($skip); my @srch; foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{name} =~ $keep or next; - $ng->{name} =~ $skip and next; + $ng->{newsgroup} =~ $keep or next; + $ng->{newsgroup} =~ $skip and next; my $srch = $ng->search or next; push @srch, $srch; }; @@ -382,7 +393,8 @@ sub cmd_last ($) { article_adj($_[0], -1) } sub cmd_post ($) { my ($self) = @_; my $ng = $self->{ng}; - $ng ? "440 mailto:$ng->{address} to post" : '440 posting not allowed' + $ng ? "440 mailto:$ng->{-primary_address} to post" + : '440 posting not allowed' } sub cmd_quit ($) { @@ -392,6 +404,29 @@ sub cmd_quit ($) { undef; } +sub header_append ($$$) { + my ($hdr, $k, $v) = @_; + my @v = $hdr->header($k); + foreach (@v) { + return if $v eq $_; + } + $hdr->header_set($k, @v, $v); +} + +sub set_nntp_headers { + my ($hdr, $ng, $n, $mid) = @_; + + # clobber some + $hdr->header_set('Newsgroups', $ng->{newsgroup}); + $hdr->header_set('Xref', xref($ng, $n)); + header_append($hdr, 'List-Post', "<mailto:$ng->{-primary_address}>"); + if (my $url = $ng->base_url) { + $mid = mid_escape($mid); + header_append($hdr, 'Archived-At', "<$url$mid/>"); + header_append($hdr, 'List-Archive', "<$url>"); + } +} + sub art_lookup ($$$) { my ($self, $art, $set_headers) = @_; my $ng = $self->{ng}; @@ -428,14 +463,12 @@ find_mid: defined $mid or return $err; } found: - my $o = 'HEAD:' . mid2path($mid); my $bytes; - my $s = eval { Email::MIME->new($ng->gcf->cat_file($o, \$bytes)) }; - return $err unless $s; + my $s = eval { $ng->msg_by_mid($mid, \$bytes) } or return $err; + $s = Email::Simple->new($s); my $lines; if ($set_headers) { - $s->header_set('Newsgroups', $ng->{name}); - $s->header_set('Xref', xref($ng, $n)); + set_nntp_headers($s->header_obj, $ng, $n, $mid); $lines = $s->body =~ tr!\n!\n!; # must be last @@ -460,6 +493,12 @@ sub set_art { $self->{article} = $art if defined $art && $art =~ /\A\d+\z/; } +sub _header ($) { + my $hdr = $_[0]->header_obj->as_string; + utf8::encode($hdr); + $hdr +} + sub cmd_article ($;$) { my ($self, $art) = @_; my $r = art_lookup($self, $art, 1); @@ -467,7 +506,7 @@ sub cmd_article ($;$) { my ($n, $mid, $s) = @$r; set_art($self, $art); more($self, "220 $n <$mid> article retrieved - head and body follow"); - do_more($self, $s->header_obj->as_string); + do_more($self, _header($s)); do_more($self, "\r\n"); simple_body_write($self, $s); } @@ -479,7 +518,7 @@ sub cmd_head ($;$) { my ($n, $mid, $s) = @$r; set_art($self, $art); more($self, "221 $n <$mid> article retrieved - head follows"); - do_more($self, $s->header_obj->as_string); + do_more($self, _header($s)); '.' } @@ -533,16 +572,6 @@ sub get_range ($$) { [ $beg, $end ]; } -sub hdr_val ($$) { - my ($r, $header) = @_; - return $r->[3] if $header =~ /\A:?bytes\z/i; - return $r->[4] if $header =~ /\A:?lines\z/i; - $r = $r->[2]->header_obj->header($header); - defined $r or return; - $r =~ s/[\r\n\t]+/ /sg; - $r; -} - sub long_response ($$$$) { my ($self, $beg, $end, $cb) = @_; die "BUG: nested long response" if $self->{long_res}; @@ -584,9 +613,9 @@ sub long_response ($$$$) { # no recursion, schedule another call ASAP # but only after all pending writes are done update_idle_time($self); - Danga::Socket->AddTimer(0, sub { - $self->write($self->{long_res}); - }); + + push @$nextq, $self; + $nextt ||= PublicInbox::EvCleanup::asap(*next_tick); } else { # all done! $self->{long_res} = undef; $self->watch_read(1); @@ -622,7 +651,7 @@ sub hdr_message_id ($$$) { # optimize XHDR Message-ID [range] for slrnpull. sub xref ($$) { my ($ng, $n) = @_; - "$ng->{domain} $ng->{name}:$n" + "$ng->{domain} $ng->{newsgroup}:$n" } sub mid_lookup ($$) { @@ -665,8 +694,7 @@ sub hdr_xref ($$$) { # optimize XHDR Xref [range] for rtin sub search_header_for { my ($srch, $mid, $field) = @_; - my $smsg = $srch->lookup_message($mid) or return; - $smsg = PublicInbox::SearchMsg->load_doc($smsg->{doc}); + my $smsg = $srch->lookup_mail($mid) or return; $smsg->$field; } @@ -696,6 +724,7 @@ sub hdr_searchmsg ($$$$) { foreach my $s (@$msgs) { $tmp .= $s->num . ' ' . $s->$field . "\r\n"; } + utf8::encode($tmp); do_more($self, $tmp); # -1 to adjust for implicit increment in long_response $$i = $nr ? $$i + $nr - 1 : long_response_limit; @@ -784,7 +813,7 @@ sub over_line ($$) { my ($num, $smsg) = @_; # n.b. field access and procedural calls can be # 10%-15% faster than OO method calls: - join("\t", $num, + my $s = join("\t", $num, $smsg->{subject}, $smsg->{from}, PublicInbox::SearchMsg::date($smsg), @@ -792,16 +821,17 @@ sub over_line ($$) { $smsg->{references}, PublicInbox::SearchMsg::bytes($smsg), PublicInbox::SearchMsg::lines($smsg)); + utf8::encode($s); + $s } sub cmd_over ($;$) { my ($self, $range) = @_; if ($range && $range =~ /\A<(.+)>\z/) { my ($ng, $n) = mid_lookup($self, $1); - my $smsg = $ng->search->lookup_message($range) or + my $smsg = $ng->search->lookup_mail($range) or return '430 No article with that message-id'; more($self, '224 Overview information follows (multi-line)'); - $smsg = PublicInbox::SearchMsg->load_doc($smsg->{doc}); # Only set article number column if it's the current group my $self_ng = $self->{ng}; @@ -846,7 +876,7 @@ sub cmd_xpath ($$) { my @paths; foreach my $ng (values %{$self->{nntpd}->{groups}}) { my $n = $ng->mm->num_for($mid); - push @paths, "$ng->{name}/$n" if defined $n; + push @paths, "$ng->{newsgroup}/$n" if defined $n; } return '430 no such article on server' unless @paths; '223 '.join(' ', @paths); @@ -865,7 +895,7 @@ sub more ($$) { sub do_write ($$) { my ($self, $data) = @_; my $done = $self->write($data); - die if $self->{closed}; + return if $self->{closed}; # Do not watch for readability if we have data in the queue, # instead re-enable watching for readability when we can @@ -922,6 +952,7 @@ sub event_read { $self->{rbuf} .= $$buf; while ($r > 0 && $self->{rbuf} =~ s/\A\s*([^\r\n]+)\r?\n//) { my $line = $1; + return $self->close if $line =~ /[[:cntrl:]]/s; my $t0 = now(); my $fd = $self->{fd}; $r = eval { process_line($self, $line) }; @@ -945,19 +976,25 @@ sub watch_read { # and we must double-check again by the time the timer fires # in case we really did dispatch a read event and started # another long response. - Danga::Socket->AddTimer(0, sub { - if (&Danga::Socket::POLLIN & $self->{event_watch}) { - event_read($self); - } - }); + push @$nextq, $self; + $nextt ||= PublicInbox::EvCleanup::asap(*next_tick); } $rv; } +sub not_idle_long ($$) { + my ($self, $now) = @_; + defined(my $fd = $self->{fd}) or return; + my $ary = $EXPMAP->{$fd} or return; + my $exp_at = $ary->[0] + $EXPTIME; + $exp_at > $now; +} + # for graceful shutdown in PublicInbox::Daemon: -sub busy () { - my ($self) = @_; - ($self->{rbuf} ne '' || $self->{long_res} || $self->{write_buf_size}); +sub busy { + my ($self, $now) = @_; + ($self->{rbuf} ne '' || $self->{long_res} || $self->{write_buf_size} || + not_idle_long($self, $now)); } 1; diff --git a/lib/PublicInbox/NNTPD.pm b/lib/PublicInbox/NNTPD.pm new file mode 100644 index 00000000..eb43a2bf --- /dev/null +++ b/lib/PublicInbox/NNTPD.pm @@ -0,0 +1,44 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# represents an NNTPD (currently a singleton), +# see script/public-inbox-nntpd for how it is used +package PublicInbox::NNTPD; +use strict; +use warnings; +require PublicInbox::Config; + +sub new { + my ($class) = @_; + bless { + groups => {}, + err => \*STDERR, + out => \*STDOUT, + grouplist => [], + }, $class; +} + +sub refresh_groups () { + my ($self) = @_; + my $pi_config = PublicInbox::Config->new; + my $new = {}; + my @list; + $pi_config->each_inbox(sub { + my ($ng) = @_; + my $ngname = $ng->{newsgroup} or return; + if (ref $ngname) { + warn 'multiple newsgroups not supported: '. + join(', ', @$ngname). "\n"; + } elsif ($ng->nntp_usable) { + # Only valid if msgmap and search works + $new->{$ngname} = $ng; + push @list, $ng; + } + }); + @list = sort { $a->{newsgroup} cmp $b->{newsgroup} } @list; + $self->{grouplist} = \@list; + # this will destroy old groups that got deleted + %{$self->{groups}} = %$new; +} + +1; diff --git a/lib/PublicInbox/NewsGroup.pm b/lib/PublicInbox/NewsGroup.pm deleted file mode 100644 index b20180e6..00000000 --- a/lib/PublicInbox/NewsGroup.pm +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (C) 2015 all contributors <meta@public-inbox.org> -# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) -# -# Used only by the NNTP server to represent a public-inbox git repository -# as a newsgroup -package PublicInbox::NewsGroup; -use strict; -use warnings; -use Scalar::Util qw(weaken); -require Danga::Socket; -require PublicInbox::Msgmap; -require PublicInbox::Search; -require PublicInbox::Git; - -sub new { - my ($class, $name, $git_dir, $address) = @_; - $address = $address->[0] if ref($address); - my $self = bless { - name => $name, - git_dir => $git_dir, - address => $address, - }, $class; - $self->{domain} = ($address =~ /\@(\S+)\z/) ? $1 : 'localhost'; - $self; -} - -sub defer_weaken { - my ($self, $field) = @_; - Danga::Socket->AddTimer(30, sub { weaken($self->{$field}) }); -} - -sub gcf { - my ($self) = @_; - $self->{gcf} ||= eval { - my $gcf = PublicInbox::Git->new($self->{git_dir}); - - # git repos may be repacked and old packs unlinked - defer_weaken($self, 'gcf'); - $gcf; - }; -} - -sub usable { - my ($self) = @_; - eval { - PublicInbox::Msgmap->new($self->{git_dir}); - PublicInbox::Search->new($self->{git_dir}); - }; -} - -sub mm { - my ($self) = @_; - $self->{mm} ||= eval { - my $mm = PublicInbox::Msgmap->new($self->{git_dir}); - - # may be needed if we run low on handles - defer_weaken($self, 'mm'); - $mm; - }; -} - -sub search { - my ($self) = @_; - $self->{search} ||= eval { - my $search = PublicInbox::Search->new($self->{git_dir}); - - # may be needed if we run low on handles - defer_weaken($self, 'search'); - $search; - }; -} - -sub description { - my ($self) = @_; - open my $fh, '<', "$self->{git_dir}/description" or return ''; - my $desc = eval { local $/; <$fh> }; - chomp $desc; - $desc =~ s/\s+/ /smg; - $desc; -} - -sub update { - my ($self, $new) = @_; - $self->{address} = $new->{address}; - $self->{domain} = $new->{domain}; - if ($self->{git_dir} ne $new->{git_dir}) { - # new git_dir requires a new mm and gcf - $self->{mm} = $self->{gcf} = undef; - $self->{git_dir} = $new->{git_dir}; - } -} - -1; diff --git a/lib/PublicInbox/NewsWWW.pm b/lib/PublicInbox/NewsWWW.pm index dfc00217..b4d74763 100644 --- a/lib/PublicInbox/NewsWWW.pm +++ b/lib/PublicInbox/NewsWWW.pm @@ -2,14 +2,14 @@ # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Plack app redirector for mapping /$NEWSGROUP requests to -# the appropriate /$LISTNAME in PublicInbox::WWW because some +# the appropriate /$INBOX in PublicInbox::WWW because some # auto-linkifiers cannot handle nntp:// redirects properly. # This is also used directly by PublicInbox::WWW package PublicInbox::NewsWWW; use strict; use warnings; use PublicInbox::Config; -use URI::Escape qw(uri_escape_utf8); +use PublicInbox::MID qw(mid_escape); sub new { my ($class, $pi_config) = @_; @@ -19,7 +19,6 @@ sub new { sub call { my ($self, $env) = @_; - my $ng_map = $self->newsgroup_map; my $path = $env->{PATH_INFO}; $path =~ s!\A/+!!; $path =~ s!/+\z!!; @@ -27,54 +26,24 @@ sub call { # some links may have the article number in them: # /inbox.foo.bar/123456 my ($ng, $article) = split(m!/+!, $path, 2); - if (my $info = $ng_map->{$ng}) { - my $url = PublicInbox::Hval::prurl($env, $info->{url}); + if (my $inbox = $self->{pi_config}->lookup_newsgroup($ng)) { + my $url = PublicInbox::Hval::prurl($env, $inbox->{url}); my $code = 301; - my $h = [ Location => $url, 'Content-Type' => 'text/plain' ]; if (defined $article && $article =~ /\A\d+\z/) { - my $mid = eval { ng_mid_for($ng, $info, $article) }; + my $mid = eval { $inbox->mm->mid_for($article) }; if (defined $mid) { # article IDs are not stable across clones, # do not encourage caching/bookmarking them $code = 302; - $url .= uri_escape_utf8($mid) . '/'; + $url .= mid_escape($mid) . '/'; } } - return [ $code, $h, [ "Redirecting to $url\n" ] ] - } - [ 404, [ 'Content-Type' => 'text/plain' ], [] ]; -} - -sub ng_mid_for { - my ($ng, $info, $article) = @_; - # may fail due to lack of Danga::Socket - # for defer_weaken: - require PublicInbox::NewsGroup; - $ng = $info->{ng} ||= - PublicInbox::NewsGroup->new($ng, $info->{git_dir}, ''); - $ng->mm->mid_for($article); -} - -sub newsgroup_map { - my ($self) = @_; - my $rv; - $rv = $self->{ng_map} and return $rv; - my $pi_config = $self->{pi_config}; - my %ng_map; - foreach my $k (keys %$pi_config) { - $k =~ /\Apublicinbox\.([^\.]+)\.mainrepo\z/ or next; - my $listname = $1; - my $git_dir = $pi_config->{"publicinbox.$listname.mainrepo"}; - my $url = $pi_config->{"publicinbox.$listname.url"}; - defined $url or next; - my $ng = $pi_config->{"publicinbox.$listname.newsgroup"}; - next if (!defined $ng) || ($ng eq ''); # disabled + my $h = [ Location => $url, 'Content-Type' => 'text/plain' ]; - $url =~ m!/\z! or $url .= '/'; - $ng_map{$ng} = { url => $url, git_dir => $git_dir }; + return [ $code, $h, [ "Redirecting to $url\n" ] ] } - $self->{ng_map} = \%ng_map; + [ 404, [ 'Content-Type' => 'text/plain' ], [ "404 Not Found\n" ] ]; } 1; diff --git a/lib/PublicInbox/ParentPipe.pm b/lib/PublicInbox/ParentPipe.pm new file mode 100644 index 00000000..d2d054ce --- /dev/null +++ b/lib/PublicInbox/ParentPipe.pm @@ -0,0 +1,21 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# only for PublicInbox::Daemon +package PublicInbox::ParentPipe; +use strict; +use warnings; +use base qw(Danga::Socket); +use fields qw(cb); + +sub new ($$$) { + my ($class, $pipe, $cb) = @_; + my $self = fields::new($class); + $self->SUPER::new($pipe); + $self->{cb} = $cb; + $self->watch_read(1); + $self; +} + +sub event_read { $_[0]->{cb}->($_[0]) } + +1; diff --git a/lib/PublicInbox/Qspawn.pm b/lib/PublicInbox/Qspawn.pm new file mode 100644 index 00000000..697c55a1 --- /dev/null +++ b/lib/PublicInbox/Qspawn.pm @@ -0,0 +1,71 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Limits the number of processes spawned +# This does not depend on Danga::Socket or any other external +# scheduling mechanism, you just need to call start and finish +# appropriately +package PublicInbox::Qspawn; +use strict; +use warnings; +use PublicInbox::Spawn qw(popen_rd); + +sub new ($$$;) { + my ($class, $cmd, $env, $opt) = @_; + bless { args => [ $cmd, $env, $opt ] }, $class; +} + +sub _do_spawn { + my ($self, $cb) = @_; + my $err; + + ($self->{rpipe}, $self->{pid}) = popen_rd(@{$self->{args}}); + if (defined $self->{pid}) { + $self->{limiter}->{running}++; + } else { + $self->{err} = $!; + } + $cb->($self->{rpipe}); +} + +sub finish ($) { + my ($self) = @_; + my $limiter = $self->{limiter}; + if (delete $self->{rpipe}) { + my $pid = delete $self->{pid}; + $self->{err} = $pid == waitpid($pid, 0) ? $? : + "PID:$pid still running?"; + $limiter->{running}--; + } + if (my $next = shift @{$limiter->{run_queue}}) { + _do_spawn(@$next); + } + $self->{err}; +} + +sub start { + my ($self, $limiter, $cb) = @_; + $self->{limiter} = $limiter; + + if ($limiter->{running} < $limiter->{max}) { + _do_spawn($self, $cb); + } else { + push @{$limiter->{run_queue}}, [ $self, $cb ]; + } +} + +package PublicInbox::Qspawn::Limiter; +use strict; +use warnings; + +sub new { + my ($class, $max) = @_; + bless { + # 32 is same as the git-daemon connection limit + max => $max || 32, + running => 0, + run_queue => [], + }, $class; +} + +1; diff --git a/lib/PublicInbox/Repobrowse.pm b/lib/PublicInbox/Repobrowse.pm index 0a812f72..cdd708e9 100644 --- a/lib/PublicInbox/Repobrowse.pm +++ b/lib/PublicInbox/Repobrowse.pm @@ -96,6 +96,7 @@ sub call { extra => \@extra, # path cgi => $cgi, rconfig => $rconfig, + env => $env, }; my $tslash = 0; my $cmd = shift @extra; diff --git a/lib/PublicInbox/RepobrowseConfig.pm b/lib/PublicInbox/RepobrowseConfig.pm index 77ef46bb..a08c6cec 100644 --- a/lib/PublicInbox/RepobrowseConfig.pm +++ b/lib/PublicInbox/RepobrowseConfig.pm @@ -3,7 +3,8 @@ package PublicInbox::RepobrowseConfig; use strict; use warnings; -use PublicInbox::Config qw/try_cat/; +use PublicInbox::Inbox; +use PublicInbox::Config; require PublicInbox::Hval; sub new { @@ -52,7 +53,7 @@ sub lookup { # gitweb compatibility foreach my $key (qw(description cloneurl)) { - $rv->{$key} = try_cat("$path/$key"); + $rv->{$key} = PublicInbox::Inbox::try_cat("$path/$key"); } $rv->{desc_html} = diff --git a/lib/PublicInbox/RepobrowseGitFallback.pm b/lib/PublicInbox/RepobrowseGitFallback.pm index 696e5b94..38640139 100644 --- a/lib/PublicInbox/RepobrowseGitFallback.pm +++ b/lib/PublicInbox/RepobrowseGitFallback.pm @@ -15,8 +15,7 @@ sub call { my $expath = $req->{expath}; return if index($expath, '..') >= 0; # prevent path traversal my $git = $req->{repo_info}->{git}; - my $cgi = $req->{cgi}; - PublicInbox::GitHTTPBackend::serve($cgi, $git, $expath); + PublicInbox::GitHTTPBackend::serve($req->{env}, $git, $expath); } 1; diff --git a/lib/PublicInbox/SaPlugin/ListMirror.pm b/lib/PublicInbox/SaPlugin/ListMirror.pm new file mode 100644 index 00000000..3808196c --- /dev/null +++ b/lib/PublicInbox/SaPlugin/ListMirror.pm @@ -0,0 +1,109 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Rules useful for running a mailing list mirror. We want to: +# * ensure Received: headers are really from the list mail server +# users expect. This is to prevent malicious users from +# injecting spam into mirrors without going through the expected +# server +# * flag messages where the mailing list is Bcc:-ed since it is +# common for spam to have wrong or non-existent To:/Cc: headers. + +package PublicInbox::SaPlugin::ListMirror; +use strict; +use warnings; +use base qw(Mail::SpamAssassin::Plugin); + +# constructor: register the eval rules +sub new { + my ($class, $mail) = @_; + + # some boilerplate... + $class = ref($class) || $class; + my $self = $class->SUPER::new($mail); + bless $self, $class; + $mail->{conf}->{list_mirror_check} = []; + $self->register_eval_rule('check_list_mirror_received'); + $self->register_eval_rule('check_list_mirror_bcc'); + $self->set_config($mail->{conf}); + $self; +} + +sub check_list_mirror_received { + my ($self, $pms) = @_; + my $recvd = $pms->get('Received') || ''; + $recvd =~ s/\n.*\z//s; + + foreach my $cfg (@{$pms->{conf}->{list_mirror_check}}) { + my ($hdr, $hval, $host_re, $addr_re) = @$cfg; + my $v = $pms->get($hdr) or next; + local $/ = "\n"; + chomp $v; + next if $v ne $hval; + return 1 if $recvd !~ $host_re; + } + + 0; +} + +sub check_list_mirror_bcc { + my ($self, $pms) = @_; + my $tocc = $pms->get('ToCc'); + + foreach my $cfg (@{$pms->{conf}->{list_mirror_check}}) { + my ($hdr, $hval, $host_re, $addr_re) = @$cfg; + defined $addr_re or next; + my $v = $pms->get($hdr) or next; + local $/ = "\n"; + chomp $v; + next if $v ne $hval; + return 1 if !$tocc || $tocc !~ $addr_re; + } + + 0; +} + +# list_mirror HEADER HEADER_VALUE HOSTNAME_GLOB [LIST_ADDRESS] +# list_mirror X-Mailing-List git@vger.kernel.org *.kernel.org +# list_mirror List-Id <foo.example.org> *.example.org foo@example.org +sub config_list_mirror { + my ($self, $key, $value, $line) = @_; + + defined $value or + return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE; + + my ($hdr, $hval, $host_glob, @extra) = split(/\s+/, $value); + my $addr = shift @extra; + + if (defined $addr) { + $addr !~ /\@/ and + return $Mail::SpamAssassin::Conf::INVALID_VALUE; + $addr = join('|', map { quotemeta } split(/,/, $addr)); + $addr = qr/\b$addr\b/i; + } + + @extra and return $Mail::SpamAssassin::Conf::INVALID_VALUE; + + defined $host_glob or + return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE; + + my %patmap = ('*' => '\S+', '?' => '.', '[' => '[', ']' => ']'); + $host_glob =~ s!(.)!$patmap{$1} || "\Q$1"!ge; + my $host_re = qr/\A\s*from\s+$host_glob(?:\s|$)/si; + + push @{$self->{list_mirror_check}}, [ $hdr, $hval, $host_re, $addr ]; +} + +sub set_config { + my ($self, $conf) = @_; + my @cmds; + push @cmds, { + setting => 'list_mirror', + default => '', + type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING, + code => *config_list_mirror, + }; + $conf->{parser}->register_commands(\@cmds); +} + +1; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 0f7815fb..24cb2667 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -12,6 +12,7 @@ use constant TS => 0; # timestamp use constant NUM => 1; # NNTP article number use constant BYTES => 2; # :bytes as defined in RFC 3977 use constant LINES => 3; # :lines as defined in RFC 3977 +use constant YYYYMMDD => 4; # for searching in the WWW UI use Search::Xapian qw/:standard/; use PublicInbox::SearchMsg; @@ -36,7 +37,9 @@ use constant { # 8 - remove redundant/unneeded document data # 9 - disable Message-ID compression (SHA-1) # 10 - optimize doc for NNTP overviews - SCHEMA_VERSION => 10, + # 11 - merge threads when vivifying ghosts + # 12 - change YYYYMMDD value column to numeric + SCHEMA_VERSION => 12, # n.b. FLAG_PURE_NOT is expensive not suitable for a public website # as it could become a denial-of-service vector @@ -50,16 +53,57 @@ my %bool_pfx_internal = ( ); my %bool_pfx_external = ( + # do we still need these? probably not.. path => 'XPATH', mid => 'Q', # uniQue id (Message-ID) ); my %prob_prefix = ( - subject => 'S', - s => 'S', # for mairix compatibility - m => 'Q', # 'mid' is exact, 'm' can do partial + # for mairix compatibility + s => 'S', + m => 'XMID', # 'mid:' (bool) is exact, 'm:' (prob) can do partial + f => 'A', + t => 'XTO', + tc => 'XTO XCC', + c => 'XCC', + tcf => 'XTO XCC A', + a => 'XTO XCC A', + b => 'XNQ XQUOT', + bs => 'XNQ XQUOT S', + n => 'XFN', + + q => 'XQUOT', + nq => 'XNQ', + + # default: + '' => 'XMID S A XNQ XQUOT XFN', ); +# not documenting m: and mid: for now, the using the URLs works w/o Xapian +our @HELP = ( + 's:' => 'match within Subject e.g. s:"a quick brown fox"', + 'd:' => <<EOF, +date range as YYYYMMDD e.g. d:19931002..20101002 +Open-ended ranges such as d:19931002.. and d:..20101002 +are also supported +EOF + 'b:' => 'match within message body, including text attachments', + 'nq:' => 'match non-quoted text within message body', + 'quot:' => 'match quoted text within message body', + 'n:' => 'match filename of attachment(s)', + 't:' => 'match within the To header', + 'c:' => 'match within the Cc header', + 'f:' => 'match within the From header', + 'a:' => 'match within the To, Cc, and From headers', + 'tc:' => 'match within the To and Cc headers', + 'bs:' => 'match within the Subject and body', +); +chomp @HELP; +# TODO: +# df (filenames from diff) +# da (diff a/ removed lines) +# db (diff b/ added lines) + my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix); sub xpfx { $all_pfx{$_[0]} } @@ -78,10 +122,10 @@ sub xdir { } sub new { - my ($class, $git_dir) = @_; + my ($class, $git_dir, $altid) = @_; my $dir = $class->xdir($git_dir); my $db = Search::Xapian::Database->new($dir); - bless { xdb => $db, git_dir => $git_dir }, $class; + bless { xdb => $db, git_dir => $git_dir, altid => $altid }, $class; } sub reopen { $_[0]->{xdb}->reopen } @@ -97,7 +141,7 @@ sub query { $opts->{relevance} = 1 unless exists $opts->{relevance}; } - $self->do_enquire($query, $opts); + _do_enquire($self, $query, $opts); } sub get_thread { @@ -106,13 +150,44 @@ sub get_thread { return { total => 0, msgs => [] } unless $smsg; my $qtid = Search::Xapian::Query->new(xpfx('thread').$smsg->thread_id); - my $path = id_compress($smsg->path); - my $qsub = Search::Xapian::Query->new(xpfx('path').$path); - my $query = Search::Xapian::Query->new(OP_OR, $qtid, $qsub); - $self->do_enquire($query, $opts); + my $path = $smsg->path; + if (defined $path && $path ne '') { + my $path = id_compress($smsg->path); + my $qsub = Search::Xapian::Query->new(xpfx('path').$path); + $qtid = Search::Xapian::Query->new(OP_OR, $qtid, $qsub); + } + $opts ||= {}; + $opts->{limit} ||= 1000; + + # always sort threads by timestamp, this makes life easier + # for the threading algorithm (in SearchThread.pm) + $opts->{asc} = 1; + + _do_enquire($self, $qtid, $opts); +} + +sub retry_reopen { + my ($self, $cb) = @_; + my $ret; + for (1..10) { + eval { $ret = $cb->() }; + return $ret unless $@; + # Exception: The revision being read has been discarded - + # you should call Xapian::Database::reopen() + if (ref($@) eq 'Search::Xapian::DatabaseModifiedError') { + reopen($self); + } else { + die; + } + } +} + +sub _do_enquire { + my ($self, $query, $opts) = @_; + retry_reopen($self, sub { _enquire_once($self, $query, $opts) }); } -sub do_enquire { +sub _enquire_once { my ($self, $query, $opts) = @_; my $enquire = $self->enquire; if (defined $query) { @@ -125,6 +200,8 @@ sub do_enquire { my $desc = !$opts->{asc}; if ($opts->{relevance}) { $enquire->set_sort_by_relevance_then_value(TS, $desc); + } elsif ($opts->{num}) { + $enquire->set_sort_by_value(NUM, 0); } else { $enquire->set_sort_by_value_then_relevance(TS, $desc); } @@ -155,28 +232,37 @@ sub qp { $qp->set_database($self->{xdb}); $qp->set_stemmer($self->stemmer); $qp->set_stemming_strategy(STEM_SOME); - $qp->add_valuerangeprocessor($self->ts_range_processor); - $qp->add_valuerangeprocessor($self->date_range_processor); + $qp->add_valuerangeprocessor( + Search::Xapian::NumberValueRangeProcessor->new(YYYYMMDD, 'd:')); while (my ($name, $prefix) = each %bool_pfx_external) { $qp->add_boolean_prefix($name, $prefix); } + # we do not actually create AltId objects, + # just parse the spec to avoid the extra DB handles for now. + if (my $altid = $self->{altid}) { + my $user_pfx = $self->{-user_pfx} ||= []; + for (@$altid) { + # $_ = 'serial:gmane:/path/to/gmane.msgmap.sqlite3' + /\Aserial:(\w+):/ or next; + my $pfx = $1; + push @$user_pfx, "$pfx:", <<EOF; +alternate serial number e.g. $pfx:12345 (boolean) +EOF + # gmane => XGMANE + $qp->add_boolean_prefix($pfx, 'X'.uc($pfx)); + } + chomp @$user_pfx; + } + while (my ($name, $prefix) = each %prob_prefix) { - $qp->add_prefix($name, $prefix); + $qp->add_prefix($name, $_) foreach split(/ /, $prefix); } $self->{query_parser} = $qp; } -sub ts_range_processor { - $_[0]->{tsrp} ||= Search::Xapian::NumberValueRangeProcessor->new(TS); -} - -sub date_range_processor { - $_[0]->{drp} ||= Search::Xapian::DateValueRangeProcessor->new(TS); -} - sub num_range_processor { $_[0]->{nrp} ||= Search::Xapian::NumberValueRangeProcessor->new(NUM); } @@ -184,21 +270,12 @@ sub num_range_processor { # only used for NNTP server sub query_xover { my ($self, $beg, $end, $offset) = @_; - my $enquire = $self->enquire; my $qp = Search::Xapian::QueryParser->new; $qp->set_database($self->{xdb}); $qp->add_valuerangeprocessor($self->num_range_processor); my $query = $qp->parse_query("$beg..$end", QP_FLAGS); - $query = Search::Xapian::Query->new(OP_AND, $mail_query, $query); - $enquire->set_query($query); - $enquire->set_sort_by_value(NUM, 0); - my $limit = 200; - my $mset = $enquire->get_mset($offset, $limit); - my @msgs = map { - PublicInbox::SearchMsg->load_doc($_->get_document); - } $mset->items; - { total => $mset->get_matches_estimated, msgs => \@msgs } + _do_enquire($self, $query, {num => 1, limit => 200, offset => $offset}); } sub lookup_message { @@ -216,6 +293,12 @@ sub lookup_message { $smsg; } +sub lookup_mail { # no ghosts! + my ($self, $mid) = @_; + my $smsg = lookup_message($self, $mid) or return; + PublicInbox::SearchMsg->load_doc($smsg->{doc}); +} + sub find_unique_doc_id { my ($self, $term, $value) = @_; @@ -295,4 +378,14 @@ sub enquire { $self->{enquire} ||= Search::Xapian::Enquire->new($self->{xdb}); } +sub help { + my ($self) = @_; + $self->qp; # parse altids + my @ret = @HELP; + if (my $user_pfx = $self->{-user_pfx}) { + push @ret, @$user_pfx; + } + \@ret; +} + 1; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 63be6810..832d1cbf 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -4,13 +4,20 @@ # # Indexes mail with Xapian and our (SQLite-based) ::Msgmap for use # with the web and NNTP interfaces. This index maintains thread -# relationships for use by Mail::Thread. This writes to the search -# index. +# relationships for use by PublicInbox::SearchThread. +# This writes to the search index. package PublicInbox::SearchIdx; use strict; use warnings; +use Fcntl qw(:flock :DEFAULT); +use Email::MIME; +use Email::MIME::ContentType; +$Email::MIME::ContentType::STRICT_PARAMS = 0; use base qw(PublicInbox::Search); use PublicInbox::MID qw/mid_clean id_compress mid_mime/; +use PublicInbox::MsgIter; +use Carp qw(croak); +use POSIX qw(strftime); require PublicInbox::Git; *xpfx = *PublicInbox::Search::xpfx; @@ -24,135 +31,206 @@ use constant { }; sub new { - my ($class, $git_dir, $writable) = @_; - my $dir = $class->xdir($git_dir); + my ($class, $inbox, $creat) = @_; + my $git_dir = $inbox; + my $altid; + if (ref $inbox) { + $git_dir = $inbox->{mainrepo}; + $altid = $inbox->{altid}; + if ($altid) { + require PublicInbox::AltId; + $altid = [ map { + PublicInbox::AltId->new($inbox, $_); + } @$altid ]; + } + } require Search::Xapian::WritableDatabase; - my $flag = Search::Xapian::DB_OPEN; - my $self = bless { git_dir => $git_dir }, $class; + my $self = bless { git_dir => $git_dir, -altid => $altid }, $class; my $perm = $self->_git_config_perm; my $umask = _umask_for($perm); $self->{umask} = $umask; - $self->{xdb} = $self->with_umask(sub { - if ($writable == 1) { - require File::Path; - File::Path::mkpath($dir); - $flag = Search::Xapian::DB_CREATE_OR_OPEN; - } - Search::Xapian::WritableDatabase->new($dir, $flag); - }); + $self->{lock_path} = "$git_dir/ssoma.lock"; + $self->{git} = PublicInbox::Git->new($git_dir); + $self->{creat} = ($creat || 0) == 1; $self; } -sub add_val { +sub _xdb_release { + my ($self) = @_; + my $xdb = delete $self->{xdb} or croak 'not acquired'; + $xdb->close; + _lock_release($self) if $self->{creat}; + undef; +} + +sub _xdb_acquire { + my ($self) = @_; + croak 'already acquired' if $self->{xdb}; + my $dir = PublicInbox::Search->xdir($self->{git_dir}); + my $flag = Search::Xapian::DB_OPEN; + if ($self->{creat}) { + require File::Path; + _lock_acquire($self); + File::Path::mkpath($dir); + $self->{batch_size} = 100; + $flag = Search::Xapian::DB_CREATE_OR_OPEN; + } + $self->{xdb} = Search::Xapian::WritableDatabase->new($dir, $flag); +} + +# we only acquire the flock if creating or reindexing; +# PublicInbox::Import already has the lock on its own. +sub _lock_acquire { + my ($self) = @_; + croak 'already locked' if $self->{lockfh}; + sysopen(my $lockfh, $self->{lock_path}, O_WRONLY|O_CREAT) or + die "failed to open lock $self->{lock_path}: $!\n"; + flock($lockfh, LOCK_EX) or die "lock failed: $!\n"; + $self->{lockfh} = $lockfh; +} + +sub _lock_release { + my ($self) = @_; + my $lockfh = delete $self->{lockfh} or croak 'not locked'; + flock($lockfh, LOCK_UN) or die "unlock failed: $!\n"; + close $lockfh or die "close failed: $!\n"; +} + +sub add_val ($$$) { my ($doc, $col, $num) = @_; $num = Search::Xapian::sortable_serialise($num); $doc->add_value($col, $num); } +sub add_values ($$$) { + my ($smsg, $bytes, $num) = @_; + + my $ts = $smsg->ts; + my $doc = $smsg->{doc}; + add_val($doc, &PublicInbox::Search::TS, $ts); + + defined($num) and add_val($doc, &PublicInbox::Search::NUM, $num); + + defined($bytes) and add_val($doc, &PublicInbox::Search::BYTES, $bytes); + + add_val($doc, &PublicInbox::Search::LINES, + $smsg->{mime}->body_raw =~ tr!\n!\n!); + + my $yyyymmdd = strftime('%Y%m%d', gmtime($ts)); + add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd); +} + +sub index_users ($$) { + my ($tg, $smsg) = @_; + + my $from = $smsg->from; + my $to = $smsg->to; + my $cc = $smsg->cc; + + $tg->index_text($from, 1, 'A'); # A - author + $tg->increase_termpos; + $tg->index_text($to, 1, 'XTO') if $to ne ''; + $tg->increase_termpos; + $tg->index_text($cc, 1, 'XCC') if $cc ne ''; + $tg->increase_termpos; +} + +sub index_body ($$$) { + my ($tg, $lines, $inc) = @_; + $tg->index_text(join("\n", @$lines), $inc, $inc ? 'XNQ' : 'XQUOT'); + @$lines = (); + $tg->increase_termpos; +} + sub add_message { - my ($self, $mime, $bytes, $num) = @_; # mime = Email::MIME object + my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object my $db = $self->{xdb}; - my $doc_id; + my ($doc_id, $old_tid); my $mid = mid_clean(mid_mime($mime)); - my $was_ghost = 0; - my $ct_msg = $mime->header('Content-Type') || 'text/plain'; eval { die 'Message-ID too long' if length($mid) > MAX_MID_SIZE; my $smsg = $self->lookup_message($mid); - my $doc; - if ($smsg) { - $smsg->ensure_metadata; # convert a ghost to a regular message # it will also clobber any existing regular message - $smsg->mime($mime); - $doc = $smsg->{doc}; - - my $type = xpfx('type'); - eval { - $doc->remove_term($type . 'ghost'); - $was_ghost = 1; - }; - - # probably does not exist: - eval { $doc->remove_term($type . 'mail') }; - $doc->add_term($type . 'mail'); - } else { - $smsg = PublicInbox::SearchMsg->new($mime); - $doc = $smsg->{doc}; - $doc->add_term(xpfx('mid') . $mid); + $doc_id = $smsg->doc_id; + $old_tid = $smsg->thread_id; } + $smsg = PublicInbox::SearchMsg->new($mime); + my $doc = $smsg->{doc}; + $doc->add_term(xpfx('mid') . $mid); my $subj = $smsg->subject; - if ($subj ne '') { my $path = $self->subject_path($subj); $doc->add_term(xpfx('path') . id_compress($path)); } - add_val($doc, &PublicInbox::Search::TS, $smsg->ts); - - defined($num) and - add_val($doc, &PublicInbox::Search::NUM, $num); - - defined($bytes) and - add_val($doc, &PublicInbox::Search::BYTES, $bytes); - - add_val($doc, &PublicInbox::Search::LINES, - $mime->body_raw =~ tr!\n!\n!); + add_values($smsg, $bytes, $num); my $tg = $self->term_generator; $tg->set_document($doc); $tg->index_text($subj, 1, 'S') if $subj; $tg->increase_termpos; - $tg->index_text($subj) if $subj; - $tg->increase_termpos; - $tg->index_text($smsg->from); - $tg->increase_termpos; + index_users($tg, $smsg); - $mime->walk_parts(sub { - my ($part) = @_; - return if $part->subparts; # walk_parts already recurses - my $ct = $part->content_type || $ct_msg; + msg_iter($mime, sub { + my ($part, $depth, @idx) = @{$_[0]}; + my $ct = $part->content_type || 'text/plain'; + my $fn = $part->filename; + if (defined $fn && $fn ne '') { + $tg->index_text($fn, 1, 'XFN'); + } - # account for filter bugs... - $ct =~ m!\btext/plain\b!i or return; + return if $ct =~ m!\btext/x?html\b!i; + + my $s = eval { $part->body_str }; + if ($@) { + if ($ct =~ m!\btext/plain\b!i) { + # Try to assume UTF-8 because Alpine + # seems to do wacky things and set + # charset=X-UNKNOWN + $part->charset_set('UTF-8'); + $s = eval { $part->body_str }; + $s = $part->body if $@; + } + } + defined $s or return; my (@orig, @quot); my $body = $part->body; - $part->body_set(''); my @lines = split(/\n/, $body); while (defined(my $l = shift @lines)) { - if ($l =~ /^\s*>/) { + if ($l =~ /^>/) { + index_body($tg, \@orig, 1) if @orig; push @quot, $l; } else { + index_body($tg, \@quot, 0) if @quot; push @orig, $l; } } - if (@quot) { - $tg->index_text(join("\n", @quot), 0); - @quot = (); - $tg->increase_termpos; - } - if (@orig) { - $tg->index_text(join("\n", @orig)); - @orig = (); - $tg->increase_termpos; - } + index_body($tg, \@quot, 0) if @quot; + index_body($tg, \@orig, 1) if @orig; }); - if ($was_ghost) { - $doc_id = $smsg->doc_id; - $self->link_message($smsg, 0); - $doc->set_data($smsg->to_doc_data); + link_message($self, $smsg, $old_tid); + $tg->index_text($mid, 1, 'XMID'); + $doc->set_data($smsg->to_doc_data($blob)); + + if (my $altid = $self->{-altid}) { + foreach my $alt (@$altid) { + my $id = $alt->mid2alt($mid); + next unless defined $id; + $doc->add_term($alt->{xprefix} . $id); + } + } + if (defined $doc_id) { $db->replace_document($doc_id, $doc); } else { - $self->link_message($smsg, 0); - $doc->set_data($smsg->to_doc_data); $doc_id = $db->add_document($doc); } }; @@ -208,27 +286,17 @@ sub next_thread_id { } sub link_message { - my ($self, $smsg, $is_ghost) = @_; - - if ($is_ghost) { - $smsg->ensure_metadata; - } else { - $self->link_message_to_parents($smsg); - } -} - -sub link_message_to_parents { - my ($self, $smsg) = @_; + my ($self, $smsg, $old_tid) = @_; my $doc = $smsg->{doc}; my $mid = $smsg->mid; my $mime = $smsg->mime; my $hdr = $mime->header_obj; my $refs = $hdr->header_raw('References'); my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : (); - if (my $irt = $hdr->header_raw('In-Reply-To')) { - # last References should be $irt - # we will de-dupe later - push @refs, mid_clean($irt); + my $irt = $hdr->header_raw('In-Reply-To'); + if (defined $irt) { + $irt = mid_clean($irt); + $irt = undef if $mid eq $irt; } my $tid; @@ -237,6 +305,15 @@ sub link_message_to_parents { my @orig_refs = @refs; @refs = (); + if (defined $irt) { + # to check MAX_MID_SIZE + push @orig_refs, $irt; + + # below, we will ensure IRT (if specified) + # is the last References + $uniq{$irt} = 1; + } + # prevent circular references via References: here: foreach my $ref (@orig_refs) { if (length($ref) > MAX_MID_SIZE) { @@ -247,6 +324,11 @@ sub link_message_to_parents { push @refs, $ref; } } + + # last References should be IRT, but some mail clients do things + # out of order, so trust IRT over References iff IRT exists + push @refs, $irt if defined $irt; + if (@refs) { $smsg->{references} = '<'.join('> <', @refs).'>'; @@ -254,13 +336,12 @@ sub link_message_to_parents { # but we can never trust clients to do the right thing my $ref = shift @refs; $tid = $self->_resolve_mid_to_tid($ref); + $self->merge_threads($tid, $old_tid) if defined $old_tid; # the rest of the refs should point to this tid: foreach $ref (@refs) { my $ptid = $self->_resolve_mid_to_tid($ref); - if ($tid ne $ptid) { - $self->merge_threads($tid, $ptid); - } + merge_threads($self, $tid, $ptid); } } else { $tid = $self->next_thread_id; @@ -269,135 +350,196 @@ sub link_message_to_parents { } sub index_blob { - my ($self, $git, $mime, $bytes, $num) = @_; - $self->add_message($mime, $bytes, $num); + my ($self, $mime, $bytes, $num, $blob) = @_; + $self->add_message($mime, $bytes, $num, $blob); } sub unindex_blob { - my ($self, $git, $mime) = @_; + my ($self, $mime) = @_; my $mid = eval { mid_clean(mid_mime($mime)) }; $self->remove_message($mid) if defined $mid; } sub index_mm { - my ($self, $git, $mime) = @_; + my ($self, $mime) = @_; $self->{mm}->mid_insert(mid_clean(mid_mime($mime))); } sub unindex_mm { - my ($self, $git, $mime) = @_; + my ($self, $mime) = @_; $self->{mm}->mid_delete(mid_clean(mid_mime($mime))); } sub index_mm2 { - my ($self, $git, $mime, $bytes) = @_; + my ($self, $mime, $bytes, $blob) = @_; my $num = $self->{mm}->num_for(mid_clean(mid_mime($mime))); - index_blob($self, $git, $mime, $bytes, $num); + index_blob($self, $mime, $bytes, $num, $blob); } sub unindex_mm2 { - my ($self, $git, $mime) = @_; + my ($self, $mime) = @_; $self->{mm}->mid_delete(mid_clean(mid_mime($mime))); - unindex_blob($self, $git, $mime); + unindex_blob($self, $mime); } sub index_both { - my ($self, $git, $mime, $bytes) = @_; - my $num = index_mm($self, $git, $mime); - index_blob($self, $git, $mime, $bytes, $num); + my ($self, $mime, $bytes, $blob) = @_; + my $num = index_mm($self, $mime); + index_blob($self, $mime, $bytes, $num, $blob); } sub unindex_both { - my ($self, $git, $mime) = @_; - unindex_blob($self, $git, $mime); - unindex_mm($self, $git, $mime); + my ($self, $mime) = @_; + unindex_blob($self, $mime); + unindex_mm($self, $mime); } sub do_cat_mail { my ($git, $blob, $sizeref) = @_; my $mime = eval { my $str = $git->cat_file($blob, $sizeref); + # fixup bugs from import: + $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; Email::MIME->new($str); }; $@ ? undef : $mime; } sub index_sync { - my ($self, $head) = @_; - $self->with_umask(sub { $self->_index_sync($head) }); + my ($self, $opts) = @_; + with_umask($self, sub { $self->_index_sync($opts) }); } sub rlog { - my ($self, $range, $add_cb, $del_cb) = @_; + my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_; my $hex = '[a-f0-9]'; my $h40 = $hex .'{40}'; my $addmsg = qr!^:000000 100644 \S+ ($h40) A\t${hex}{2}/${hex}{38}$!; my $delmsg = qr!^:100644 000000 ($h40) \S+ D\t${hex}{2}/${hex}{38}$!; - my $git = PublicInbox::Git->new($self->{git_dir}); - my $log = $git->popen(qw/log --reverse --no-notes --no-color - --raw -r --no-abbrev/, $range); + my $git = $self->{git}; my $latest; my $bytes; - while (defined(my $line = <$log>)) { + my $max = $self->{batch_size}; # may be undef + local $/ = "\n"; + my $line; + while (defined($line = <$log>)) { if ($line =~ /$addmsg/o) { - my $mime = do_cat_mail($git, $1, \$bytes) or next; - $add_cb->($self, $git, $mime, $bytes); + my $blob = $1; + my $mime = do_cat_mail($git, $blob, \$bytes) or next; + $add_cb->($self, $mime, $bytes, $blob); } elsif ($line =~ /$delmsg/o) { - my $mime = do_cat_mail($git, $1) or next; - $del_cb->($self, $git, $mime); + my $blob = $1; + my $mime = do_cat_mail($git, $blob) or next; + $del_cb->($self, $mime); } elsif ($line =~ /^commit ($h40)/o) { + if (defined $max && --$max <= 0) { + $max = $self->{batch_size}; + $batch_cb->($latest, 1); + } $latest = $1; } } - $latest; + $batch_cb->($latest, 0); } -# indexes all unindexed messages -sub _index_sync { - my ($self, $head) = @_; - my $db = $self->{xdb}; - $head ||= 'HEAD'; - my $mm = $self->{mm} = eval { +sub _msgmap_init { + my ($self) = @_; + $self->{mm} = eval { require PublicInbox::Msgmap; PublicInbox::Msgmap->new($self->{git_dir}, 1); }; +} + +sub _git_log { + my ($self, $range) = @_; + $self->{git}->popen(qw/log --reverse --no-notes --no-color + --raw -r --no-abbrev/, $range); +} + +# indexes all unindexed messages +sub _index_sync { + my ($self, $opts) = @_; + my $tip = $opts->{ref} || 'HEAD'; + my $reindex = $opts->{reindex}; + my ($mkey, $last_commit, $lx, $xlog); + $self->{git}->batch_prepare; + my $xdb = _xdb_acquire($self); + $xdb->begin_transaction; + do { + $xlog = undef; + $mkey = 'last_commit'; + $last_commit = $xdb->get_metadata('last_commit'); + $lx = $last_commit; + if ($reindex) { + $lx = ''; + $mkey = undef if $last_commit ne ''; + } + $xdb->cancel_transaction; + $xdb = _xdb_release($self); + + # ensure we leak no FDs to "git log" + my $range = $lx eq '' ? $tip : "$lx..$tip"; + $xlog = _git_log($self, $range); + + $xdb = _xdb_acquire($self); + $xdb->begin_transaction; + } while ($xdb->get_metadata('last_commit') ne $last_commit); + + my $mm = _msgmap_init($self); + my $dbh = $mm->{dbh} if $mm; + my $mm_only; + my $cb = sub { + my ($commit, $more) = @_; + if ($dbh) { + $mm->last_commit($commit) if $commit; + $dbh->commit; + } + if (!$mm_only) { + $xdb->set_metadata($mkey, $commit) if $mkey && $commit; + $xdb->commit_transaction; + $xdb = _xdb_release($self); + } + # let another process do some work... < + if ($more) { + if (!$mm_only) { + $xdb = _xdb_acquire($self); + $xdb->begin_transaction; + } + $dbh->begin_work if $dbh; + } + }; - $db->begin_transaction; - my $lx = $db->get_metadata('last_commit'); - my $range = $lx eq '' ? $head : "$lx..$head"; if ($mm) { - $mm->{dbh}->begin_work; + $dbh->begin_work; my $lm = $mm->last_commit || ''; if ($lm eq $lx) { # Common case is the indexes are synced, # we only need to run git-log once: - $lx = $self->rlog($range, *index_both, *unindex_both); - $mm->{dbh}->commit; - if (defined $lx) { - $db->set_metadata('last_commit', $lx); - $mm->last_commit($lx); - } + rlog($self, $xlog, *index_both, *unindex_both, $cb); } else { - # dumb case, msgmap and xapian are out-of-sync - # do not care for performance: - my $r = $lm eq '' ? $head : "$lm..$head"; - $lm = $self->rlog($r, *index_mm, *unindex_mm); - $mm->{dbh}->commit; - $mm->last_commit($lm) if defined $lm; - - $lx = $self->rlog($range, *index_mm2, *unindex_mm2); - $db->set_metadata('last_commit', $lx) if defined $lx; + # Uncommon case, msgmap and xapian are out-of-sync + # do not care for performance (but git is fast :>) + # This happens if we have to reindex Xapian since + # msgmap is a frozen format and our Xapian format + # is evolving. + my $r = $lm eq '' ? $tip : "$lm..$tip"; + + # first, ensure msgmap is up-to-date: + my $mkey_prev = $mkey; + $mkey = undef; # ignore xapian, for now + my $mlog = _git_log($self, $r); + $mm_only = 1; + rlog($self, $mlog, *index_mm, *unindex_mm, $cb); + $mm_only = $mlog = undef; + + # now deal with Xapian + $mkey = $mkey_prev; + $dbh = undef; + rlog($self, $xlog, *index_mm2, *unindex_mm2, $cb); } } else { # user didn't install DBD::SQLite and DBI - $lx = $self->rlog($range, *index_blob, *unindex_blob); - $db->set_metadata('last_commit', $lx) if defined $lx; - } - if ($@) { - $db->cancel_transaction; - $mm->{dbh}->rollback if $mm; - } else { - $db->commit_transaction; + rlog($self, $xlog, *index_blob, *unindex_blob, $cb); } } @@ -410,17 +552,15 @@ sub _resolve_mid_to_tid { } sub create_ghost { - my ($self, $mid, $tid) = @_; - - $tid = $self->next_thread_id unless defined $tid; + my ($self, $mid) = @_; + my $tid = $self->next_thread_id; my $doc = Search::Xapian::Document->new; $doc->add_term(xpfx('mid') . $mid); $doc->add_term(xpfx('thread') . $tid); $doc->add_term(xpfx('type') . 'ghost'); my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid); - $self->link_message($smsg, 1); $self->{xdb}->add_document($doc); $smsg; @@ -428,6 +568,7 @@ sub create_ghost { sub merge_threads { my ($self, $winner_tid, $loser_tid) = @_; + return if $winner_tid == $loser_tid; my ($head, $tail) = $self->find_doc_ids('thread', $loser_tid); my $thread_pfx = xpfx('thread'); my $db = $self->{xdb}; @@ -445,6 +586,7 @@ sub _read_git_config_perm { my ($self) = @_; my @cmd = qw(config core.sharedRepository); my $fh = PublicInbox::Git->new($self->{git_dir})->popen(@cmd); + local $/ = "\n"; my $perm = <$fh>; chomp $perm if defined $perm; $perm; @@ -496,4 +638,10 @@ sub with_umask { $rv; } +sub DESTROY { + # order matters for unlocking + $_[0]->{xdb} = undef; + $_[0]->{lockfh} = undef; +} + 1; diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index a0899159..5779d1e2 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -7,14 +7,11 @@ package PublicInbox::SearchMsg; use strict; use warnings; use Search::Xapian; -use Email::Address qw//; use POSIX qw//; use Date::Parse qw/str2time/; use PublicInbox::MID qw/mid_clean/; -use Encode qw/find_encoding/; -my $enc_utf8 = find_encoding('UTF-8'); +use PublicInbox::Address; our $PFX2TERM_RE = undef; -use constant EPOCH_822 => 'Thu, 01 Jan 1970 00:00:00 +0000'; use POSIX qw(strftime); sub new { @@ -37,10 +34,10 @@ sub get_val ($$) { sub load_doc { my ($class, $doc) = @_; - my $data = $doc->get_data; + my $data = $doc->get_data or return; my $ts = get_val($doc, &PublicInbox::Search::TS); - $data = $enc_utf8->decode($data); - my ($subj, $from, $refs, $to, $cc) = split(/\n/, $data); + utf8::decode($data); + my ($subj, $from, $refs, $to, $cc, $blob) = split(/\n/, $data); bless { doc => $doc, subject => $subj, @@ -49,6 +46,7 @@ sub load_doc { references => $refs, to => $to, cc => $cc, + blob => $blob, }, $class; } @@ -80,16 +78,15 @@ sub date ($) { return $date if defined $date; my $ts = $self->{ts}; return unless defined $ts; - $self->{date} = strftime('%a, %d %b %Y %T %z', gmtime($ts)); + $self->{date} = strftime('%a, %d %b %Y %T +0000', gmtime($ts)); } sub from ($) { my ($self) = @_; my $from = __hdr($self, 'from'); if (defined $from && !defined $self->{from_name}) { - $from =~ tr/\t\r\n/ /; - my @from = Email::Address->parse($from); - $self->{from_name} = $from[0]->name; + my @n = PublicInbox::Address::names($from); + $self->{from_name} = join(', ', @n); } $from; } @@ -108,9 +105,11 @@ sub ts { } sub to_doc_data { - my ($self) = @_; - join("\n", $self->subject, $self->from, $self->references, - $self->to, $self->cc); + my ($self, $blob) = @_; + my @rows = ($self->subject, $self->from, $self->references, + $self->to, $self->cc); + push @rows, $blob if defined $blob; + join("\n", @rows); } sub references { @@ -144,38 +143,6 @@ sub ensure_metadata { } } -# for threading only -sub mini_mime { - my ($self) = @_; - $self->ensure_metadata; - my @hs = ( - 'Subject' => $self->subject, - 'X-PI-From' => $self->from_name, - ); - - my @h = ( - # prevent Email::Simple::Creator from running, - # this header is useless for threading as we use X-PI-TS - # for sorting and display: - 'Date' => EPOCH_822, - 'Message-ID' => "<$self->{mid}>", - 'X-PI-TS' => $self->ts, - ); - if (my $refs = $self->{references}) { - push @h, References => $refs; - } - my $mime = Email::MIME->create(header_str => \@hs, header => \@h); - my $h = $mime->header_obj; - - # set these headers manually since Encode::encode('MIME-Q', ...) - # will add spaces to long values when using header_str above. - - # drop useless headers Email::MIME set for us - $h->header_set('Date'); - $h->header_set('MIME-Version'); - $mime; -} - sub mid ($;$) { my ($self, $mid) = @_; @@ -191,6 +158,15 @@ sub mid ($;$) { sub _extract_mid { mid_clean(mid_mime($_[0]->mime)) } +sub blob { + my ($self, $x40) = @_; + if (defined $x40) { + $self->{blob} = $x40; + } else { + $self->{blob}; + } +} + sub mime { my ($self, $mime) = @_; if (defined $mime) { diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm new file mode 100644 index 00000000..601a84b0 --- /dev/null +++ b/lib/PublicInbox/SearchThread.pm @@ -0,0 +1,163 @@ +# This library is free software; you can redistribute it and/or modify +# it under the same terms as Perl itself. +# +# This license differs from the rest of public-inbox +# +# Our own jwz-style threading class based on Mail::Thread from CPAN. +# Mail::Thread is unmaintained and unavailable on some distros. +# We also do not want pruning or subject grouping, since we want +# to encourage strict threading and hopefully encourage people +# to use proper In-Reply-To. +# +# This includes fixes from several open bugs for Mail::Thread +# +# Avoid circular references +# - https://rt.cpan.org/Public/Bug/Display.html?id=22817 +# +# And avoid recursion in recurse_down: +# - https://rt.cpan.org/Ticket/Display.html?id=116727 +# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=833479 +package PublicInbox::SearchThread; +use strict; +use warnings; + +sub new { + return bless { + messages => $_[1], + id_table => {}, + rootset => [] + }, $_[0]; +} + +sub thread { + my $self = shift; + _add_message($self, $_) foreach @{$self->{messages}}; + my $id_table = delete $self->{id_table}; + $self->{rootset} = [ grep { + !delete($_->{parent}) && $_->visible } values %$id_table ]; +} + +sub _get_cont_for_id ($$) { + my ($self, $mid) = @_; + $self->{id_table}{$mid} ||= PublicInbox::SearchThread::Msg->new($mid); +} + +sub _add_message ($$) { + my ($self, $smsg) = @_; + + # A. if id_table... + my $this = _get_cont_for_id($self, $smsg->{mid}); + $this->{smsg} = $smsg; + + # B. For each element in the message's References field: + defined(my $refs = $smsg->{references}) or return; + + # This loop exists to help fill in gaps left from missing + # messages. It is not needed in a perfect world where + # everything is perfectly referenced, only the last ref + # matters. + my $prev; + foreach my $ref ($refs =~ m/<([^>]+)>/g) { + # Find a Container object for the given Message-ID + my $cont = _get_cont_for_id($self, $ref); + + # Link the References field's Containers together in + # the order implied by the References header + # + # * If they are already linked don't change the + # existing links + # * Do not add a link if adding that link would + # introduce a loop... + if ($prev && + !$cont->{parent} && # already linked + !$cont->has_descendent($prev) # would loop + ) { + $prev->add_child($cont); + } + $prev = $cont; + } + + # C. Set the parent of this message to be the last element in + # References. + $prev->add_child($this) if defined $prev; +} + +sub order { + my ($self, $ordersub) = @_; + my $rootset = $ordersub->($self->{rootset}); + $self->{rootset} = $rootset; + $_->order_children($ordersub) for @$rootset; +} + +package PublicInbox::SearchThread::Msg; +use strict; +use warnings; +use Carp qw(croak); + +sub new { + bless { + id => $_[1], + children => {}, # becomes an array when sorted by ->order(...) + }, $_[0]; +} + +sub topmost { + my ($self) = @_; + my @q = ($self); + while (my $cont = shift @q) { + return $cont if $cont->{smsg}; + push @q, values %{$cont->{children}}; + } + undef; +} + +sub add_child { + my ($self, $child) = @_; + croak "Cowardly refusing to become my own parent: $self" + if $self == $child; + + my $cid = $child->{id}; + + # reparenting: + if (defined(my $parent = $child->{parent})) { + delete $parent->{children}->{$cid}; + } + + $self->{children}->{$cid} = $child; + $child->{parent} = $self; +} + +sub has_descendent { + my ($self, $child) = @_; + my %seen; # loop prevention XXX may not be necessary + while ($child) { + return 1 if $self == $child || $seen{$child}++; + $child = $child->{parent}; + } + 0; +} + +# Do not show/keep ghosts iff they have no children. Sometimes +# a ghost Message-ID is the result of a long header line +# being folded/mangled by a MUA, and not a missing message. +sub visible ($) { + my ($self) = @_; + $self->{smsg} || scalar values %{$self->{children}}; +} + +sub order_children { + my ($cur, $ordersub) = @_; + + my %seen = ($cur => 1); # self-referential loop prevention + my @q = ($cur); + while (defined($cur = shift @q)) { + my $c = $cur->{children}; # The hashref here... + + $c = [ grep { !$seen{$_}++ && visible($_) } values %$c ]; + $c = $ordersub->($c) if scalar @$c > 1; + $cur->{children} = $c; # ...becomes an arrayref + push @q, @$c; + } +} + +1; diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index ab0ff19a..50a2c01c 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -8,14 +8,18 @@ use warnings; use PublicInbox::SearchMsg; use PublicInbox::Hval qw/ascii_html/; use PublicInbox::View; -use PublicInbox::MID qw(mid2path mid_clean mid_mime); +use PublicInbox::WwwAtomStream; +use PublicInbox::MID qw(mid2path mid_mime mid_clean mid_escape); use Email::MIME; require PublicInbox::Git; +require PublicInbox::SearchThread; our $LIM = 50; +sub noop {} + sub sres_top_html { my ($ctx) = @_; - my $q = PublicInbox::SearchQuery->new($ctx->{cgi}); + my $q = PublicInbox::SearchQuery->new($ctx->{qp}); my $code = 200; # double the limit for expanded views: @@ -26,72 +30,92 @@ sub sres_top_html { relevance => $q->{r}, }; my ($mset, $total); - eval { - $mset = $ctx->{srch}->query($q->{q}, $opts); + $mset = $ctx->{srch}->query($q->{'q'}, $opts); $total = $mset->get_matches_estimated; }; my $err = $@; - my $res = html_start($q, $ctx) . '<pre>'; + ctx_prepare($q, $ctx); + my $cb; if ($err) { $code = 400; - $res .= err_txt($ctx, $err) . "</pre><hr /><pre>" . foot($ctx); + $ctx->{-html_tip} = '<pre>'.err_txt($ctx, $err).'</pre><hr>'; + $cb = *noop; } elsif ($total == 0) { $code = 404; - $res .= "\n\n[No results found]</pre><hr /><pre>".foot($ctx); + $ctx->{-html_tip} = "<pre>\n[No results found]</pre><hr>"; + $cb = *noop; } else { my $x = $q->{x}; - return sub { adump($_[0], $mset, $q, $ctx) } if ($x eq 'A'); + return adump($_[0], $mset, $q, $ctx) if $x eq 'A'; - $res .= search_nav_top($mset, $q) . "\n\n"; + $ctx->{-html_tip} = search_nav_top($mset, $q) . "\n\n"; if ($x eq 't') { - return sub { tdump($_[0], $res, $mset, $q, $ctx) }; + $cb = mset_thread($ctx, $mset, $q); + } else { + $cb = mset_summary($ctx, $mset, $q); } - dump_mset(\$res, $mset); - $res .= '</pre>' . search_nav_bot($mset, $q) . - "\n\n" . foot($ctx); } + PublicInbox::WwwStream->response($ctx, $code, $cb); +} + +# allow undef for individual doc loads... +sub load_doc_retry { + my ($srch, $mitem) = @_; - $res .= "</pre></body></html>"; - [$code, ['Content-Type'=>'text/html; charset=UTF-8'], [$res]]; + eval { + $srch->retry_reopen(sub { + PublicInbox::SearchMsg->load_doc($mitem->get_document) + }); + } } # display non-threaded search results similar to what users expect from # regular WWW search engines: -sub dump_mset { - my ($res, $mset) = @_; +sub mset_summary { + my ($ctx, $mset, $q) = @_; my $total = $mset->get_matches_estimated; my $pad = length("$total"); my $pfx = ' ' x $pad; + my $res = \($ctx->{-html_tip}); + my $srch = $ctx->{srch}; foreach my $m ($mset->items) { my $rank = sprintf("%${pad}d", $m->get_rank + 1); my $pct = $m->get_percent; - my $smsg = PublicInbox::SearchMsg->load_doc($m->get_document); + my $smsg = load_doc_retry($srch, $m); + unless ($smsg) { + eval { + $m = "$m ".$m->get_docid . " expired\n"; + $ctx->{env}->{'psgi.errors'}->print($m); + }; + next; + } my $s = ascii_html($smsg->subject); my $f = ascii_html($smsg->from_name); my $ts = PublicInbox::View::fmt_ts($smsg->ts); - my $mid = PublicInbox::Hval->new_msgid($smsg->mid)->as_href; + my $mid = PublicInbox::Hval->new_msgid($smsg->mid)->{href}; $$res .= qq{$rank. <b><a\nhref="$mid/">}. $s . "</a></b>\n"; $$res .= "$pfx - by $f @ $ts UTC [$pct%]\n\n"; } + $$res .= search_nav_bot($mset, $q); + *noop; } sub err_txt { my ($ctx, $err) = @_; - my $u = '//xapian.org/docs/queryparser.html'; - $u = PublicInbox::Hval::prurl($ctx->{cgi}->{env}, $u); + my $u = $ctx->{-inbox}->base_url($ctx->{env}) . '_/text/help/'; $err =~ s/^\s*Exception:\s*//; # bad word to show users :P $err = ascii_html($err); - "\n\nBad query: <b>$err</b>\n" . - qq{See <a\nhref="$u">$u</a> for Xapian query syntax}; + "\nBad query: <b>$err</b>\n" . + qq{See <a\nhref="$u">$u</a> for help on using search}; } sub search_nav_top { my ($mset, $q) = @_; - my $rv = "Search results ordered by ["; + my $rv = "<pre>Search results ordered by ["; if ($q->{r}) { my $d = $q->qs_html(r => 0); $rv .= qq{<a\nhref="?$d">date</a>|<b>relevance</b>}; @@ -121,156 +145,121 @@ sub search_nav_bot { my $o = $q->{o}; my $end = $o + $nr; my $beg = $o + 1; - my $rv = "<hr /><pre>Results $beg-$end of $total"; + my $rv = "</pre><hr><pre>Results $beg-$end of $total"; my $n = $o + $LIM; if ($n < $total) { my $qs = $q->qs_html(o => $n); - $rv .= qq{, <a\nhref="?$qs">next</a>} + $rv .= qq{, <a\nhref="?$qs"\nrel=next>next</a>} } if ($o > 0) { $rv .= $n < $total ? '/' : ', '; my $p = $o - $LIM; my $qs = $q->qs_html(o => ($p > 0 ? $p : 0)); - $rv .= qq{<a\nhref="?$qs">prev</a>}; + $rv .= qq{<a\nhref="?$qs"\nrel=prev>prev</a>}; } - $rv; + $rv .= '</pre>'; } -sub tdump { - my ($cb, $res, $mset, $q, $ctx) = @_; - my $fh = $cb->([200, ['Content-Type'=>'text/html; charset=UTF-8']]); - $fh->write($res .= '</pre>'); +sub mset_thread { + my ($ctx, $mset, $q) = @_; my %pct; - my @m = map { + my $msgs = $ctx->{srch}->retry_reopen(sub { [ map { my $i = $_; - my $m = PublicInbox::SearchMsg->load_doc($i->get_document); - $pct{$m->mid} = $i->get_percent; - $m = $m->mini_mime; - $m; - } ($mset->items); + my $smsg = PublicInbox::SearchMsg->load_doc($i->get_document); + $pct{$smsg->mid} = $i->get_percent; + $smsg; + } ($mset->items) ]}); - require PublicInbox::Thread; - my $th = PublicInbox::Thread->new(@m); - { - no warnings 'once'; - $Mail::Thread::nosubject = 0; - } + my $th = PublicInbox::SearchThread->new($msgs); $th->thread; - if ($q->{r}) { + if ($q->{r}) { # order by relevance $th->order(sub { - sort { (eval { $pct{$b->topmost->messageid} } || 0) + [ sort { (eval { $pct{$b->topmost->{id}} } || 0) <=> - (eval { $pct{$a->topmost->messageid} } || 0) - } @_; + (eval { $pct{$a->topmost->{id}} } || 0) + } @{$_[0]} ]; }); - } else { - no warnings 'once'; - $th->order(*PublicInbox::View::rsort_ts); + } else { # order by time (default for threaded view) + $th->order(*PublicInbox::View::sort_ts); } - - my $git = $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); - my $state = { - ctx => $ctx, - anchor_idx => 0, - pct => \%pct, - cur_level => 0, - fh => $fh, + my $skel = search_nav_bot($mset, $q). "<pre>"; + my $inbox = $ctx->{-inbox}; + $ctx->{-upfx} = ''; + $ctx->{anchor_idx} = 1; + $ctx->{cur_level} = 0; + $ctx->{dst} = \$skel; + $ctx->{mapping} = {}; + $ctx->{pct} = \%pct; + $ctx->{prev_attr} = ''; + $ctx->{prev_level} = 0; + $ctx->{seen} = {}; + $ctx->{s_nr} = scalar(@$msgs).'+ results'; + + PublicInbox::View::walk_thread($th, $ctx, + *PublicInbox::View::pre_thread); + + my $mime; + sub { + return unless $msgs; + while ($mime = shift @$msgs) { + $mime = $inbox->msg_by_smsg($mime) and last; + } + if ($mime) { + $mime = Email::MIME->new($mime); + return PublicInbox::View::index_entry($mime, $ctx, + scalar @$msgs); + } + $msgs = undef; + $skel .= "\n</pre>"; }; - $ctx->{searchview} = 1; - tdump_ent($git, $state, $_, 0) for $th->rootset; - PublicInbox::View::thread_adj_level($state, 0); - Email::Address->purge_cache; - - $fh->write(search_nav_bot($mset, $q). "\n\n" . - foot($ctx). '</pre></body></html>'); - - $fh->close; } -sub tdump_ent { - my ($git, $state, $node, $level) = @_; - return unless $node; - my $mime = $node->message; - - if ($mime) { - # lazy load the full message from mini_mime: - my $mid = mid_mime($mime); - $mime = eval { - my $path = mid2path(mid_clean($mid)); - Email::MIME->new($git->cat_file('HEAD:'.$path)); - }; - } - if ($mime) { - my $end = PublicInbox::View::thread_adj_level($state, $level); - PublicInbox::View::index_entry($mime, $level, $state); - $state->{fh}->write($end) if $end; - } else { - my $mid = $node->messageid; - PublicInbox::View::ghost_flush($state, '', $mid, $level); - } - tdump_ent($git, $state, $node->child, $level + 1); - tdump_ent($git, $state, $node->next, $level); -} - -sub foot { - my ($ctx) = @_; - my $foot = $ctx->{footer} || ''; - qq{Back to <a\nhref=".">index</a>.\n$foot}; -} - -sub html_start { +sub ctx_prepare { my ($q, $ctx) = @_; my $qh = ascii_html($q->{'q'}); - my $A = $q->qs_html(x => 'A', r => undef); - my $res = '<html><head>' . PublicInbox::Hval::STYLE . - "<title>$qh - search results</title>" . - qq{<link\nrel=alternate\ntitle="Atom feed"\n} . - qq!href="?$A"\ntype="application/atom+xml"/></head>! . - qq{<body><form\naction="">} . - qq{<input\nname=q\nvalue="$qh"\ntype=text />}; - - $res .= qq{<input\ntype=hidden\nname=r />} if $q->{r}; + $ctx->{-q_value_html} = $qh; + $ctx->{-atom} = '?'.$q->qs_html(x => 'A', r => undef); + $ctx->{-title_html} = "$qh - search results"; + my $extra = ''; + $extra .= qq{<input\ntype=hidden\nname=r />} if $q->{r}; if (my $x = $q->{x}) { $x = ascii_html($x); - $res .= qq{<input\ntype=hidden\nname=x\nvalue="$x" />}; + $extra .= qq{<input\ntype=hidden\nname=x\nvalue="$x" />}; } - - $res .= qq{<input\ntype=submit\nvalue=search /></form>}; + $ctx->{-extra_form_html} = $extra; } sub adump { my ($cb, $mset, $q, $ctx) = @_; - my $fh = $cb->([ 200, ['Content-Type' => 'application/atom+xml']]); - my $git = $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); - my $feed_opts = PublicInbox::Feed::get_feedopts($ctx); - my $x = ascii_html($q->{'q'}); - $x = qq{$x - search results}; - $feed_opts->{atomurl} = $feed_opts->{url} . '?'. $q->qs_html; - $feed_opts->{url} .= '?'. $q->qs_html(x => undef); - $x = PublicInbox::Feed::atom_header($feed_opts, $x); - $fh->write($x. PublicInbox::Feed::feed_updated()); - - for ($mset->items) { - $x = PublicInbox::SearchMsg->load_doc($_->get_document)->mid; - $x = mid2path($x); - PublicInbox::Feed::add_to_feed($feed_opts, $fh, $x, $git); - } - PublicInbox::Feed::end_feed($fh); + my $ibx = $ctx->{-inbox}; + my @items = $mset->items; + $ctx->{search_query} = $q; + my $srch = $ctx->{srch}; + PublicInbox::WwwAtomStream->response($ctx, 200, sub { + while (my $x = shift @items) { + $x = load_doc_retry($srch, $x); + $x = $ibx->msg_by_smsg($x) and + return Email::MIME->new($x); + } + return undef; + }); } package PublicInbox::SearchQuery; use strict; use warnings; use PublicInbox::Hval; +use PublicInbox::MID qw(mid_escape); sub new { - my ($class, $cgi) = @_; - my $r = $cgi->param('r'); + my ($class, $qp) = @_; + + my $r = $qp->{r}; bless { - q => $cgi->param('q'), - x => $cgi->param('x') || '', - o => int($cgi->param('o') || 0) || 0, + q => $qp->{'q'}, + x => $qp->{x} || '', + o => (($qp->{o} || '0') =~ /(\d+)/), r => (defined $r && $r ne '0'), }, $class; } @@ -286,7 +275,7 @@ sub qs_html { $self = $tmp; } - my $q = PublicInbox::Hval->new($self->{'q'})->as_href; + my $q = mid_escape($self->{'q'}); $q =~ s/%20/+/g; # improve URL readability my $qs = "q=$q"; diff --git a/lib/PublicInbox/Spamcheck/Spamc.pm b/lib/PublicInbox/Spamcheck/Spamc.pm new file mode 100644 index 00000000..30eec95c --- /dev/null +++ b/lib/PublicInbox/Spamcheck/Spamc.pm @@ -0,0 +1,93 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +package PublicInbox::Spamcheck::Spamc; +use strict; +use warnings; +use PublicInbox::Spawn qw(popen_rd spawn); +use IO::Handle; +use Fcntl qw(:DEFAULT SEEK_SET); + +sub new { + my ($class) = @_; + bless { + checkcmd => [qw(spamc -E --headers)], + hamcmd => [qw(spamc -L ham)], + spamcmd => [qw(spamc -L spam)], + }, $class; +} + +sub spamcheck { + my ($self, $msg, $out) = @_; + + my $tmp; + my $fd = _msg_to_fd($self, $msg, \$tmp); + my $rdr = { 0 => $fd }; + my ($fh, $pid) = popen_rd($self->{checkcmd}, undef, $rdr); + defined $pid or die "failed to popen_rd spamc: $!\n"; + my $r; + unless (ref $out) { + my $buf = ''; + $out = \$buf; + } + do { + $r = sysread($fh, $$out, 65536, length($$out)); + } while (defined($r) && $r != 0); + defined $r or die "read failed: $!"; + close $fh or die "close failed: $!"; + waitpid($pid, 0); + ($? || $$out eq '') ? 0 : 1; +} + +sub hamlearn { + my ($self, $msg, $rdr) = @_; + _learn($self, $msg, $rdr, 'hamcmd'); +} + +sub spamlearn { + my ($self, $msg, $rdr) = @_; + _learn($self, $msg, $rdr, 'spamcmd'); +} + +sub _learn { + my ($self, $msg, $rdr, $field) = @_; + $rdr ||= {}; + $rdr->{1} ||= $self->_devnull; + $rdr->{2} ||= $self->_devnull; + my $tmp; + $rdr->{0} = _msg_to_fd($self, $msg, \$tmp); + my $pid = spawn($self->{$field}, undef, $rdr); + waitpid($pid, 0); + !$?; +} + +sub _devnull { + my ($self) = @_; + my $fd = $self->{-devnullfd}; + return $fd if defined $fd; + open my $fh, '+>', '/dev/null' or + die "failed to open /dev/null: $!"; + $self->{-devnull} = $fh; + $self->{-devnullfd} = fileno($fh); +} + +sub _msg_to_fd { + my ($self, $msg, $tmpref) = @_; + my $fd; + if (my $ref = ref($msg)) { + my $fileno = eval { fileno($msg) }; + return $fileno if defined $fileno; + + open(my $tmpfh, '+>', undef) or die "failed to open: $!"; + $tmpfh->autoflush(1); + $msg = \($msg->as_string) if $ref ne 'SCALAR'; + print $tmpfh $$msg or die "failed to print: $!"; + sysseek($tmpfh, 0, SEEK_SET) or + die "sysseek(fh) failed: $!"; + $$tmpref = $tmpfh; + + return fileno($tmpfh); + } + $msg; +} + +1; diff --git a/lib/PublicInbox/Spawn.pm b/lib/PublicInbox/Spawn.pm index 23f303fb..41b08a33 100644 --- a/lib/PublicInbox/Spawn.pm +++ b/lib/PublicInbox/Spawn.pm @@ -24,6 +24,8 @@ my $vfork_spawn = <<'VFORK_SPAWN'; #include <sys/uio.h> #include <unistd.h> #include <alloca.h> +#include <signal.h> +#include <assert.h> #define AV_ALLOCA(av, max) alloca((max = (av_len((av)) + 1)) * sizeof(char *)) @@ -81,6 +83,8 @@ int public_inbox_fork_exec(int in, int out, int err, pid_t pid; char **argv, **envp; I32 max; + sigset_t set, old; + int ret, errnum; argv = AV_ALLOCA(cmd, max); av2c_copy(argv, cmd, max); @@ -88,14 +92,30 @@ int public_inbox_fork_exec(int in, int out, int err, envp = AV_ALLOCA(env, max); av2c_copy(envp, env, max); + ret = sigfillset(&set); + assert(ret == 0 && "BUG calling sigfillset"); + ret = sigprocmask(SIG_SETMASK, &set, &old); + assert(ret == 0 && "BUG calling sigprocmask to block"); pid = vfork(); if (pid == 0) { + int sig; + REDIR(in, 0); REDIR(out, 1); REDIR(err, 2); + for (sig = 1; sig < NSIG; sig++) + signal(sig, SIG_DFL); /* ignore errors on signals */ + /* + * don't bother unblocking, we don't want signals + * to the group taking out a subprocess + */ execve(filename, argv, envp); xerr("execve failed"); } + errnum = errno; + ret = sigprocmask(SIG_SETMASK, &old, NULL); + assert(ret == 0 && "BUG calling sigprocmask to restore"); + errno = errnum; return (int)pid; } @@ -111,7 +131,7 @@ if (defined $vfork_spawn) { my $f = "$inline_dir/.public-inbox.lock"; open my $fh, '>', $f or die "failed to open $f: $!\n"; flock($fh, LOCK_EX) or die "LOCK_EX failed on $f: $!\n"; - eval 'use Inline C => $vfork_spawn'; + eval 'use Inline C => $vfork_spawn'; #, BUILD_NOISY => 1'; my $err = $@; flock($fh, LOCK_UN) or die "LOCK_UN failed on $f: $!\n"; die $err if $err; @@ -128,6 +148,7 @@ unless (defined $vfork_spawn) { *public_inbox_fork_exec = *PublicInbox::SpawnPP::public_inbox_fork_exec } +# n.b. we never use absolute paths with this sub which ($) { my ($file) = @_; foreach my $p (split(':', $ENV{PATH})) { @@ -161,7 +182,8 @@ sub spawn ($;$$) { my $in = $opts->{0} || 0; my $out = $opts->{1} || 1; my $err = $opts->{2} || 2; - public_inbox_fork_exec($in, $out, $err, $f, $cmd, \@env); + my $pid = public_inbox_fork_exec($in, $out, $err, $f, $cmd, \@env); + $pid < 0 ? undef : $pid; } sub popen_rd { @@ -172,6 +194,7 @@ sub popen_rd { IO::Handle::blocking($r, $blocking) if defined $blocking; $opts->{1} = fileno($w); my $pid = spawn($cmd, $env, $opts); + return unless defined $pid; return ($r, $pid) if wantarray; my $ret = gensym; tie *$ret, 'PublicInbox::ProcessPipe', $pid, $r; diff --git a/lib/PublicInbox/SpawnPP.pm b/lib/PublicInbox/SpawnPP.pm index dc2ef364..179aba5e 100644 --- a/lib/PublicInbox/SpawnPP.pm +++ b/lib/PublicInbox/SpawnPP.pm @@ -3,12 +3,21 @@ package PublicInbox::SpawnPP; use strict; use warnings; -use POSIX qw(dup2); +use POSIX qw(dup2 :signal_h); # Pure Perl implementation for folks that do not use Inline::C sub public_inbox_fork_exec ($$$$$$) { my ($in, $out, $err, $f, $cmd, $env) = @_; + my $old = POSIX::SigSet->new(); + my $set = POSIX::SigSet->new(); + $set->fillset or die "fillset failed: $!"; + sigprocmask(SIG_SETMASK, $set, $old) or die "can't block signals: $!"; + my $syserr; my $pid = fork; + unless (defined $pid) { # compat with Inline::C version + $syserr = $!; + $pid = -1; + } if ($pid == 0) { if ($in != 0) { dup2($in, 0) or die "dup2 failed for stdin: $!"; @@ -19,9 +28,18 @@ sub public_inbox_fork_exec ($$$$$$) { if ($err != 2) { dup2($err, 2) or die "dup2 failed for stderr: $!"; } - exec qw(env -i), @$env, @$cmd; - die "exec env -i ... $cmd->[0] failed: $!\n"; + + if ($ENV{MOD_PERL}) { + exec qw(env -i), @$env, @$cmd; + die "exec env -i ... $cmd->[0] failed: $!\n"; + } else { + local %ENV = map { split(/=/, $_, 2) } @$env; + exec @$cmd; + die "exec $cmd->[0] failed: $!\n"; + } } + sigprocmask(SIG_SETMASK, $old) or die "can't unblock signals: $!"; + $! = $syserr; $pid; } diff --git a/lib/PublicInbox/Thread.pm b/lib/PublicInbox/Thread.pm deleted file mode 100644 index 781fffff..00000000 --- a/lib/PublicInbox/Thread.pm +++ /dev/null @@ -1,42 +0,0 @@ -# subclass Mail::Thread and use this to workaround a memory leak -# Based on the patch in: https://rt.cpan.org/Public/Bug/Display.html?id=22817 -# -# Additionally, workaround for a bug where $walk->topmost returns undef: -# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=795913 -# - https://rt.cpan.org/Ticket/Display.html?id=106498 -# -# License differs from the rest of public-inbox (but is compatible): -# This library is free software; you can redistribute it and/or modify -# it under the same terms as Perl itself. -package PublicInbox::Thread; -use strict; -use warnings; -use base qw(Mail::Thread); - -if ($Mail::Thread::VERSION <= 2.55) { - eval q(sub _container_class { 'PublicInbox::Thread::Container' }); -} - -package PublicInbox::Thread::Container; -use strict; -use warnings; -use base qw(Mail::Thread::Container); -use Scalar::Util qw(weaken); -sub parent { @_ == 2 ? weaken($_[0]->{parent} = $_[1]) : $_[0]->{parent} } - -sub topmost { - $_[0]->SUPER::topmost || PublicInbox::Thread::CPANRTBug106498->new; -} - -# ref: -# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=795913 -# - https://rt.cpan.org/Ticket/Display.html?id=106498 -package PublicInbox::Thread::CPANRTBug106498; -use strict; -use warnings; - -sub new { bless {}, $_[0] } - -sub simple_subject {} - -1; diff --git a/lib/PublicInbox/Unsubscribe.pm b/lib/PublicInbox/Unsubscribe.pm new file mode 100644 index 00000000..fca300e5 --- /dev/null +++ b/lib/PublicInbox/Unsubscribe.pm @@ -0,0 +1,180 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Standalone PSGI app to handle HTTP(s) unsubscribe links generated +# by milters like examples/unsubscribe.milter to mailing lists. +# +# This does not depend on any other modules in the PublicInbox::* +# and ought to be usable with any mailing list software. +package PublicInbox::Unsubscribe; +use strict; +use warnings; +use Crypt::CBC; +use Plack::Util; +use MIME::Base64 qw(decode_base64url); +my $CODE_URL = 'https://public-inbox.org/public-inbox.git'; +my @CT_HTML = ('Content-Type', 'text/html; charset=UTF-8'); + +sub new { + my ($class, %opt) = @_; + my $key_file = $opt{key_file}; + defined $key_file or die "`key_file' needed"; + open my $fh, '<', $key_file or die + "failed to open key_file=$key_file: $!\n"; + my ($key, $iv); + if (read($fh, $key, 8) != 8 || read($fh, $iv, 8) != 8 || + read($fh, my $end, 8) != 0) { + die "key_file must be 16 bytes\n"; + } + + # these parameters were chosen to generate shorter parameters + # to reduce the possibility of copy+paste errors + my $cipher = Crypt::CBC->new(-key => $key, + -iv => $iv, + -header => 'none', + -cipher => 'Blowfish'); + + my $e = $opt{owner_email} or die "`owner_email' not specified\n"; + my $unsubscribe = $opt{unsubscribe} or + die "`unsubscribe' callback not given\n"; + + bless { + pi_config => $opt{pi_config}, # PublicInbox::Config + owner_email => $opt{owner_email}, + cipher => $cipher, + unsubscribe => $unsubscribe, + contact => qq(<a\nhref="mailto:$e">$e</a>), + code_url => $opt{code_url} || $CODE_URL, + confirm => $opt{confirm}, + }, $class; +} + +# entry point for PSGI +sub call { + my ($self, $env) = @_; + my $m = $env->{REQUEST_METHOD}; + if ($m eq 'GET' || $m eq 'HEAD') { + $self->{confirm} ? confirm_prompt($self, $env) + : finalize_unsub($self, $env); + } elsif ($m eq 'POST') { + finalize_unsub($self, $env); + } else { + r($self, 405, + Plack::Util::encode_html($m).' method not allowed'); + } +} + +sub _user_list_addr { + my ($self, $env) = @_; + my ($blank, $u, $list) = split('/', $env->{PATH_INFO}); + + if (!defined $u || $u eq '') { + return r($self, 400, 'Bad request', + 'Missing encrypted email address in path component'); + } + if (!defined $list && $list eq '') { + return r($self, 400, 'Bad request', + 'Missing mailing list name in path component'); + } + my $user = eval { $self->{cipher}->decrypt(decode_base64url($u)) }; + if (!defined $user || index($user, '@') < 1) { + my $err = quotemeta($@); + my $errors = $env->{'psgi.errors'}; + $errors->print("error decrypting: $u\n"); + $errors->print("$_\n") for split("\n", $err); + $u = Plack::Util::encode_html($u); + return r($self, 400, 'Bad request', "Failed to decrypt: $u"); + } + + # The URLs are too damn long if we have the encrypted domain + # name in the PATH_INFO + if (index($list, '@') < 0) { + my $host = (split(':', $env->{HTTP_HOST}))[0]; + $list .= '@'.$host; + } + ($user, $list); +} + +sub confirm_prompt { # on GET + my ($self, $env) = @_; + my ($user_addr, $list_addr) = _user_list_addr($self, $env); + return $user_addr if ref $user_addr; + + my $xl = Plack::Util::encode_html($list_addr); + my $xu = Plack::Util::encode_html($user_addr); + my @body = ( + "Confirmation required to remove", '', + "\t$xu", '', + "from the mailing list at", '', + "\t$xl", '', + 'You will get one last email once you hit "Confirm" below:', + qq(</pre><form\nmethod=post\naction="">) . + qq(<input\ntype=submit\nvalue="Confirm" />) . + '</form><pre>'); + + push @body, archive_info($self, $env, $list_addr); + + r($self, 200, "Confirm unsubscribe for $xl", @body); +} + +sub finalize_unsub { # on POST + my ($self, $env) = @_; + my ($user_addr, $list_addr) = _user_list_addr($self, $env); + return $user_addr if ref $user_addr; + + my @archive = archive_info($self, $env, $list_addr); + if (my $err = $self->{unsubscribe}->($user_addr, $list_addr)) { + return r($self, 500, Plack::Util::encode_html($err), @archive); + } + + my $xl = Plack::Util::encode_html($list_addr); + r($self, 200, "Unsubscribed from $xl", + 'You may get one final goodbye message', @archive); +} + +sub r { + my ($self, $code, $title, @body) = @_; + [ $code, [ @CT_HTML ], [ + "<html><head><title>$title</title></head><body><pre>". + join("\n", "<b>$title</b>\n", @body) . '</pre><hr>'. + "<pre>This page is available under AGPL-3.0+\n" . + "git clone $self->{code_url}\n" . + qq(Email $self->{contact} if you have any questions). + '</pre></body></html>' + ] ]; +} + +sub archive_info { + my ($self, $env, $list_addr) = @_; + my $archive_url = $self->{archive_urls}->{$list_addr}; + + unless ($archive_url) { + if (my $config = $self->{pi_config}) { + # PublicInbox::Config::lookup + my $inbox = $config->lookup($list_addr); + # PublicInbox::Inbox::base_url + $archive_url = $inbox->base_url if $inbox; + } + } + + # protocol-relative URL: "//example.com/" => "https://example.com/" + if ($archive_url && $archive_url =~ m!\A//!) { + $archive_url = "$env->{'psgi.url_scheme'}:$archive_url"; + } + + # maybe there are other places where we could map + # list_addr => archive_url without ~/.public-inbox/config + if ($archive_url) { + $archive_url = Plack::Util::encode_html($archive_url); + ('', + 'HTML and git clone-able archives are available at:', + qq(<a\nhref="$archive_url">$archive_url</a>)) + } else { + ('', + 'There ought to be archives for this list,', + 'but unfortunately the admin did not configure '. + __PACKAGE__. ' to show you the URL'); + } +} + +1; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index a4047aa2..fa47a16a 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -8,86 +8,93 @@ use strict; use warnings; use URI::Escape qw/uri_escape_utf8/; use Date::Parse qw/str2time/; -use Encode qw/find_encoding/; -use Encode::MIME::Header; -use Email::MIME::ContentType qw/parse_content_type/; use PublicInbox::Hval qw/ascii_html/; use PublicInbox::Linkify; -use PublicInbox::MID qw/mid_clean id_compress mid2path mid_mime/; +use PublicInbox::MID qw/mid_clean id_compress mid_mime mid_escape/; +use PublicInbox::MsgIter; +use PublicInbox::Address; +use PublicInbox::WwwStream; require POSIX; -# TODO: make these constants tunable -use constant MAX_INLINE_QUOTED => 12; # half an 80x24 terminal -use constant MAX_TRUNC_LEN => 72; -use constant T_ANCHOR => '#u'; use constant INDENT => ' '; +use constant TCHILD => '` '; +sub th_pfx ($) { $_[0] == 0 ? '' : TCHILD }; -my $enc_utf8 = find_encoding('UTF-8'); - -# public functions: +# public functions: (unstable) sub msg_html { - my ($ctx, $mime, $full_pfx, $footer) = @_; - $footer = defined($footer) ? "\n$footer" : ''; + my ($ctx, $mime) = @_; my $hdr = $mime->header_obj; - headers_to_html_header($hdr, $full_pfx, $ctx) . - multipart_text_as_html($mime, $full_pfx) . - '</pre><hr /><pre>' . - html_footer($hdr, 1, $full_pfx, $ctx) . - $footer . - '</pre></body></html>'; + my $tip = _msg_html_prepare($hdr, $ctx); + PublicInbox::WwwStream->response($ctx, 200, sub { + my ($nr, undef) = @_; + if ($nr == 1) { + $tip . multipart_text_as_html($mime, '') . '</pre><hr>' + } elsif ($nr == 2) { + # fake an EOF if generating the footer fails; + # we want to at least show the message if something + # here crashes: + eval { + '<pre>' . html_footer($hdr, 1, $ctx) . + '</pre>' . msg_reply($ctx, $hdr) + }; + } else { + undef + } + }); } -# /$LISTNAME/$MESSAGE_ID/R/ +# /$INBOX/$MESSAGE_ID/#R sub msg_reply { - my ($ctx, $hdr, $footer) = @_; - my $s = $hdr->header('Subject'); - $s = '(no subject)' if (!defined $s) || ($s eq ''); - my $f = $hdr->header('From'); - $f = '' unless defined $f; - my $mid = $hdr->header_raw('Message-ID'); - $mid = PublicInbox::Hval->new_msgid($mid); - my $t = ascii_html($s); + my ($ctx, $hdr) = @_; my $se_url = 'https://kernel.org/pub/software/scm/git/docs/git-send-email.html'; + my $p_url = + 'https://en.wikipedia.org/wiki/Posting_style#Interleaved_style'; + + my $info = ''; + if (my $url = $ctx->{-inbox}->{infourl}) { + $url = PublicInbox::Hval::prurl($ctx->{env}, $url); + $info = qq(\n List information: <a\nhref="$url">$url</a>\n); + } my ($arg, $link) = mailto_arg_link($hdr); push @$arg, '/path/to/YOUR_REPLY'; + $arg = ascii_html(join(" \\\n ", '', @$arg)); + <<EOF +<hr><pre +id=R><b>Reply instructions:</b> + +You may reply publically to <a +href=#t>this message</a> via plain-text email +using any one of the following methods: + +* Save the following mbox file, import it into your mail client, + and reply-to-all from there: <a +href=raw>mbox</a> + + Avoid top-posting and favor interleaved quoting: + <a +href="$p_url">$p_url</a> +$info +* Reply to all the recipients using the <b>--to</b>, <b>--cc</b>, + and <b>--in-reply-to</b> switches of git-send-email(1): + + git send-email$arg - "<html><head><title>replying to \"$t\"</title></head><body><pre>" . - "replying to message:\n\n" . - "Subject: <b>$t</b>\n" . - "From: ". ascii_html($f) . - "\nDate: " . ascii_html($hdr->header('Date')) . - "\nMessage-ID: <" . $mid->as_html . ">\n\n" . - "There are multiple ways to reply:\n\n" . - "* Save the following mbox file, import it into your mail client,\n" . - " and reply-to-all from there: <a\nhref=../raw>mbox</a>\n\n" . - "* Reply to all the recipients using the <b>--to</b>, <b>--cc</b>,\n" . - " and <b>--in-reply-to</b> switches of git-send-email(1):\n\n" . - "\tgit send-email \\\n\t\t" . - join(" \\ \n\t\t", @$arg ). "\n\n" . - qq( <a\nhref="$se_url">$se_url</a>\n\n) . - "* If your mail client supports setting the <b>In-Reply-To</b>" . - " header\n via mailto: links, try the " . - qq(<a\nhref="$link">mailto: link</a>\n) . - "\nFor context, the original <a\nhref=../>message</a> or " . - qq(<a\nhref="../t/#u">thread</a>) . - '</pre><hr /><pre>' . $footer . '</pre></body></html>'; -} - -sub feed_entry { - my ($class, $mime, $full_pfx) = @_; - - # no <head> here for <style>... - qq(<pre\nstyle="white-space:pre-wrap">) . - multipart_text_as_html($mime, $full_pfx) . '</pre>'; + <a +href="$se_url">$se_url</a> + +* If your mail client supports setting the <b>In-Reply-To</b> header + via mailto: links, try the <a +href="$link">mailto: link</a></pre> +EOF } sub in_reply_to { my ($hdr) = @_; my $irt = $hdr->header_raw('In-Reply-To'); - return mid_clean($irt) if (defined $irt); + return mid_clean($irt) if defined $irt && $irt ne ''; my $refs = $hdr->header_raw('References'); if ($refs && $refs =~ /<([^>]+)>\s*\z/s) { @@ -96,276 +103,385 @@ sub in_reply_to { undef; } +sub _hdr_names ($$) { + my ($hdr, $field) = @_; + my $val = $hdr->header($field) or return ''; + ascii_html(join(', ', PublicInbox::Address::names($val))); +} + +sub nr_to_s ($$$) { + my ($nr, $singular, $plural) = @_; + return "0 $plural" if $nr == 0; + $nr == 1 ? "$nr $singular" : "$nr $plural"; +} + # this is already inside a <pre> sub index_entry { - my ($mime, $level, $state) = @_; - my $midx = $state->{anchor_idx}++; - my $ctx = $state->{ctx}; + my ($mime, $ctx, $more) = @_; my $srch = $ctx->{srch}; - my $part_nr = 0; my $hdr = $mime->header_obj; - my $enc = enc_for($hdr->header("Content-Type")); my $subj = $hdr->header('Subject'); my $mid_raw = mid_clean(mid_mime($mime)); - my $id = anchor_for($mid_raw); - my $seen = $state->{seen}; - $seen->{$id} = "#$id"; # save the anchor for children, later - - my $mid = PublicInbox::Hval->new_msgid($mid_raw); - my $from = $hdr->header('From'); - my @from = Email::Address->parse($from); - $from = $from[0]->name; - - my $root_anchor = $state->{root_anchor} || ''; - my $path = $root_anchor ? '../../' : ''; - my $href = $mid->as_href; - my $irt = in_reply_to($hdr); - my $parent_anchor = $seen->{anchor_for($irt)} if defined $irt; - - $from = ascii_html($from); - $subj = ascii_html($subj); - $subj = "<a\nhref=\"${path}$href/\">$subj</a>"; - $subj = "<u\nid=u>$subj</u>" if $root_anchor eq $id; - - my $ts = _msg_date($hdr); - my $rv = "<pre\nid=s$midx>"; - $rv .= "<b\nid=$id>$subj</b>\n"; - my $txt = "${path}$href/raw"; - my $fh = $state->{fh}; - $fh->write($rv .= "- $from @ $ts UTC (<a\nhref=\"$txt\">raw</a>)\n\n"); - - my $fhref; - my $mhref = "${path}$href/"; - - # show full message if it's our root message - my $neq = $root_anchor ne $id; - if ($neq || ($neq && $level != 0 && !$ctx->{flat})) { - $fhref = "${path}$href/f/"; + my $id = id_compress($mid_raw, 1); + my $id_m = 'm'.$id; + + my $root_anchor = $ctx->{root_anchor} || ''; + my $irt; + + my $rv = "<a\nhref=#e$id\nid=m$id>*</a> "; + $subj = '<b>'.ascii_html($subj).'</b>'; + $subj = "<u\nid=u>$subj</u>" if $root_anchor eq $id_m; + $rv .= $subj . "\n"; + $rv .= _th_index_lite($mid_raw, \$irt, $id, $ctx); + my @tocc; + foreach my $f (qw(To Cc)) { + my $dst = _hdr_names($hdr, $f); + push @tocc, "$f: $dst" if $dst ne ''; + } + $rv .= "From: "._hdr_names($hdr, 'From').' @ '._msg_date($hdr)." UTC"; + my $upfx = $ctx->{-upfx}; + my $mhref = $upfx . mid_escape($mid_raw) . '/'; + $rv .= qq{ (<a\nhref="$mhref">permalink</a> / }; + $rv .= qq{<a\nhref="${mhref}raw">raw</a>)\n}; + $rv .= ' '.join('; +', @tocc) . "\n" if @tocc; + + my $mapping = $ctx->{mapping}; + if (!$mapping && (defined($irt) || defined($irt = in_reply_to($hdr)))) { + my $mirt = PublicInbox::Hval->new_msgid($irt); + my $href = $upfx . $mirt->{href}. '/'; + my $html = $mirt->as_html; + $rv .= qq(In-Reply-To: <<a\nhref="$href">$html</a>>\n) } - # scan through all parts, looking for displayable text - $mime->walk_parts(sub { - index_walk($fh, $_[0], $enc, \$part_nr, $fhref); - }); - $mime->body_set(''); - $rv = "\n" . html_footer($hdr, 0, undef, $ctx, $mhref); + $rv .= "\n"; - if (defined $irt) { - unless (defined $parent_anchor) { - my $v = PublicInbox::Hval->new_msgid($irt, 1); - $v = $v->as_href; - $parent_anchor = "${path}$v/"; - } - $rv .= " <a\nhref=\"$parent_anchor\">parent</a>"; - } - if (my $pct = $state->{pct}) { # used by SearchView.pm - $rv .= " [relevance $pct->{$mid_raw}%]"; - } elsif ($srch) { + # scan through all parts, looking for displayable text + msg_iter($mime, sub { $rv .= add_text_body($mhref, $_[0]) }); + + # add the footer + $rv .= "\n<a\nhref=#$id_m\nid=e$id>^</a> ". + "<a\nhref=\"$mhref\">permalink</a>" . + " <a\nhref=\"${mhref}raw\">raw</a>" . + " <a\nhref=\"${mhref}#R\">reply</a>"; + + my $hr; + if (my $pct = $ctx->{pct}) { # used by SearchView.pm + $rv .= "\t[relevance $pct->{$mid_raw}%]"; + $hr = 1; + } elsif ($mapping) { my $threaded = 'threaded'; my $flat = 'flat'; + my $end = ''; if ($ctx->{flat}) { + $hr = 1; $flat = "<b>$flat</b>"; } else { $threaded = "<b>$threaded</b>"; } - $rv .= " [<a\nhref=\"${path}$href/t/#u\">$threaded</a>"; - $rv .= "|<a\nhref=\"${path}$href/T/#u\">$flat</a>]"; + $rv .= "\t[<a\nhref=\"${mhref}T/#u\">$flat</a>"; + $rv .= "|<a\nhref=\"${mhref}t/#u\">$threaded</a>]"; + $rv .= " <a\nhref=#r$id>$ctx->{s_nr}</a>"; + } else { + $hr = $ctx->{-hr}; } - $fh->write($rv .= '</pre>'); -} -sub thread_html { - my ($ctx, $foot, $srch) = @_; - # $_[0] in sub is the Plack callback - sub { emit_thread_html($_[0], $ctx, $foot, $srch) } + $rv .= $more ? '</pre><hr><pre>' : '</pre>' if $hr; + $rv; } -# only private functions below. +sub pad_link ($$;$) { + my ($mid, $level, $s) = @_; + $s ||= '...'; + my $id = id_compress($mid, 1); + (' 'x19).indent_for($level).th_pfx($level)."<a\nhref=#r$id>($s)</a>\n"; +} -sub emit_thread_html { - my ($res, $ctx, $foot, $srch) = @_; - my $mid = $ctx->{mid}; - my $msgs = load_results($srch->get_thread($mid)); - my $nr = scalar @$msgs; - return missing_thread($res, $ctx) if $nr == 0; - my $flat = $ctx->{flat}; - my $seen = {}; - my $state = { - res => $res, - ctx => $ctx, - seen => $seen, - root_anchor => anchor_for($mid), - anchor_idx => 0, - cur_level => 0, - }; - - require PublicInbox::Git; - $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); - if ($flat) { - pre_anchor_entry($seen, $_) for (@$msgs); - __thread_entry($state, $_, 0) for (@$msgs); - } else { - my $th = thread_results($msgs); - thread_entry($state, $_, 0) for $th->rootset; - if (my $max = $state->{cur_level}) { - $state->{fh}->write( - ('</ul></li>' x ($max - 1)) . '</ul>'); +sub _th_index_lite { + my ($mid_raw, $irt, $id, $ctx) = @_; + my $rv = ''; + my $mapping = $ctx->{mapping} or return $rv; + my $pad = ' '; + my ($attr, $node, $idx, $level) = @{$mapping->{$mid_raw}}; + my $children = $node->{children}; + my $nr_c = scalar @$children; + my $nr_s = 0; + my $siblings; + if (my $smsg = $node->{smsg}) { + ($$irt) = (($smsg->{references} || '') =~ m/<([^>]+)>\z/); + } + my $irt_map = $mapping->{$$irt} if defined $$irt; + if (defined $irt_map) { + $siblings = $irt_map->[1]->{children}; + $nr_s = scalar(@$siblings) - 1; + $rv .= $pad . $irt_map->[0]; + if ($idx > 0) { + my $prev = $siblings->[$idx - 1]; + my $pmid = $prev->{id}; + if ($idx > 2) { + my $s = ($idx - 1). ' preceding siblings ...'; + $rv .= pad_link($pmid, $level, $s); + } elsif ($idx == 2) { + my $ppmid = $siblings->[0]->{id}; + $rv .= $pad . $mapping->{$ppmid}->[0]; + } + $rv .= $pad . $mapping->{$pmid}->[0]; + } + } + my $s_s = nr_to_s($nr_s, 'sibling', 'siblings'); + my $s_c = nr_to_s($nr_c, 'reply', 'replies'); + $attr =~ s!\n\z!</b>\n!s; + $attr =~ s!<a\nhref.*</a> !!s; # no point in duplicating subject + $attr =~ s!<a\nhref=[^>]+>([^<]+)</a>!$1!s; # no point linking to self + $rv .= "<b>@ $attr"; + if ($nr_c) { + my $cmid = $children->[0]->{id}; + $rv .= $pad . $mapping->{$cmid}->[0]; + if ($nr_c > 2) { + my $s = ($nr_c - 1). ' more replies'; + $rv .= pad_link($cmid, $level + 1, $s); + } elsif (my $cn = $children->[1]) { + $rv .= $pad . $mapping->{$cn->{id}}->[0]; } } - Email::Address->purge_cache; - - # there could be a race due to a message being deleted in git - # but still being in the Xapian index: - my $fh = delete $state->{fh} or return missing_thread($res, $ctx); - - my $final_anchor = $state->{anchor_idx}; - my $next = "<a\nid=s$final_anchor>"; - $next .= $final_anchor == 1 ? 'only message in' : 'end of'; - $next .= " thread</a>, back to <a\nhref=\"../../\">index</a>"; - $next .= "\ndownload thread: "; - $next .= "<a\nhref=\"../t.mbox.gz\">mbox.gz</a>"; - $next .= " / follow: <a\nhref=\"../t.atom\">Atom feed</a>"; - $fh->write('<hr /><pre>' . $next . "\n\n". - $foot . '</pre></body></html>'); - $fh->close; -} - -sub index_walk { - my ($fh, $part, $enc, $part_nr, $fhref) = @_; - my $s = add_text_body($enc, $part, $part_nr, $fhref, 1); - - return if $s eq ''; - - $s .= "\n"; # ensure there's a trailing newline - $fh->write($s); + my $next = $siblings->[$idx+1] if $siblings && $idx >= 0; + if ($next) { + my $nmid = $next->{id}; + $rv .= $pad . $mapping->{$nmid}->[0]; + my $nnext = $nr_s - $idx; + if ($nnext > 2) { + my $s = ($nnext - 1).' subsequent siblings'; + $rv .= pad_link($nmid, $level, $s); + } elsif (my $nn = $siblings->[$idx + 2]) { + $rv .= $pad . $mapping->{$nn->{id}}->[0]; + } + } + $rv .= $pad ."<a\nhref=#r$id>$s_s, $s_c; $ctx->{s_nr}</a>\n"; +} + +sub walk_thread { + my ($th, $ctx, $cb) = @_; + my @q = map { (0, $_, -1) } @{$th->{rootset}}; + while (@q) { + my ($level, $node, $i) = splice(@q, 0, 3); + defined $node or next; + $cb->($ctx, $level, $node, $i); + ++$level; + $i = 0; + unshift @q, map { ($level, $_, $i++) } @{$node->{children}}; + } +} + +sub pre_thread { + my ($ctx, $level, $node, $idx) = @_; + $ctx->{mapping}->{$node->{id}} = [ '', $node, $idx, $level ]; + skel_dump($ctx, $level, $node); +} + +sub thread_index_entry { + my ($ctx, $level, $mime) = @_; + my ($beg, $end) = thread_adj_level($ctx, $level); + $beg . '<pre>' . index_entry($mime, $ctx, 0) . '</pre>' . $end; +} + +sub stream_thread ($$) { + my ($th, $ctx) = @_; + my $inbox = $ctx->{-inbox}; + my $mime; + my @q = map { (0, $_) } @{$th->{rootset}}; + my $level; + while (@q) { + $level = shift @q; + my $node = shift @q or next; + my $cl = $level + 1; + unshift @q, map { ($cl, $_) } @{$node->{children}}; + $mime = $inbox->msg_by_smsg($node->{smsg}) and last; + } + return missing_thread($ctx) unless $mime; + + $mime = Email::MIME->new($mime); + $ctx->{-title_html} = ascii_html($mime->header('Subject')); + $ctx->{-html_tip} = thread_index_entry($ctx, $level, $mime); + PublicInbox::WwwStream->response($ctx, 200, sub { + return unless $ctx; + while (@q) { + $level = shift @q; + my $node = shift @q or next; + my $cl = $level + 1; + unshift @q, map { ($cl, $_) } @{$node->{children}}; + my $mid = $node->{id}; + if ($mime = $inbox->msg_by_smsg($node->{smsg})) { + $mime = Email::MIME->new($mime); + return thread_index_entry($ctx, $level, $mime); + } else { + return ghost_index_entry($ctx, $level, $node); + } + } + my $ret = join('', thread_adj_level($ctx, 0)); + $ret .= ${$ctx->{dst}}; # skel + $ctx = undef; + $ret; + }); } -sub enc_for { - my ($ct, $default) = @_; - $default ||= $enc_utf8; - defined $ct or return $default; - my $ct_parsed = parse_content_type($ct); - if ($ct_parsed) { - if (my $charset = $ct_parsed->{attributes}->{charset}) { - my $enc = find_encoding($charset); - return $enc if $enc; +sub thread_html { + my ($ctx) = @_; + my $mid = $ctx->{mid}; + my $srch = $ctx->{srch}; + my $sres = $srch->get_thread($mid); + my $msgs = load_results($srch, $sres); + my $nr = $sres->{total}; + return missing_thread($ctx) if $nr == 0; + my $skel = '<hr><pre>'; + $skel .= $nr == 1 ? 'only message in thread' : 'end of thread'; + $skel .= ", back to <a\nhref=\"../../\">index</a>\n\n"; + $skel .= "<b\nid=t>Thread overview:</b> "; + $skel .= $nr == 1 ? '(only message)' : "$nr+ messages"; + $skel .= " (download: <a\nhref=\"../t.mbox.gz\">mbox.gz</a>"; + $skel .= " / follow: <a\nhref=\"../t.atom\">Atom feed</a>)\n"; + $skel .= "-- links below jump to the message on this page --\n"; + $ctx->{-upfx} = '../../'; + $ctx->{cur_level} = 0; + $ctx->{dst} = \$skel; + $ctx->{prev_attr} = ''; + $ctx->{prev_level} = 0; + $ctx->{root_anchor} = anchor_for($mid); + $ctx->{seen} = {}; + $ctx->{mapping} = {}; + $ctx->{s_nr} = "$nr+ messages in thread"; + + my $th = thread_results($msgs); + walk_thread($th, $ctx, *pre_thread); + $skel .= '</pre>'; + return stream_thread($th, $ctx) unless $ctx->{flat}; + + # flat display: lazy load the full message from smsg + my $inbox = $ctx->{-inbox}; + my $mime; + while ($mime = shift @$msgs) { + $mime = $inbox->msg_by_smsg($mime) and last; + } + return missing_thread($ctx) unless $mime; + $mime = Email::MIME->new($mime); + $ctx->{-title_html} = ascii_html($mime->header('Subject')); + $ctx->{-html_tip} = '<pre>'.index_entry($mime, $ctx, scalar @$msgs); + $mime = undef; + PublicInbox::WwwStream->response($ctx, 200, sub { + return unless $msgs; + while ($mime = shift @$msgs) { + $mime = $inbox->msg_by_smsg($mime) and last; } - } - $default; + if ($mime) { + $mime = Email::MIME->new($mime); + return index_entry($mime, $ctx, scalar @$msgs); + } + $msgs = undef; + $skel; + }); } sub multipart_text_as_html { - my ($mime, $full_pfx, $srch) = @_; + my ($mime, $upfx) = @_; my $rv = ""; - my $part_nr = 0; - my $enc = enc_for($mime->header("Content-Type")); # scan through all parts, looking for displayable text - $mime->walk_parts(sub { - my ($part) = @_; - $part = add_text_body($enc, $part, \$part_nr, $full_pfx, 1); - $rv .= $part; - $rv .= "\n" if $part ne ''; + msg_iter($mime, sub { + my ($p) = @_; + $rv .= add_text_body($upfx, $p); }); - $mime->body_set(''); $rv; } -sub add_filename_line { - my ($enc, $fn) = @_; - my $len = 72; - my $pad = "-"; - $fn = $enc->decode($fn); - $len -= length($fn); - $pad x= ($len/2) if ($len > 0); - "$pad " . ascii_html($fn) . " $pad\n"; -} - sub flush_quote { - my ($quot, $n, $part_nr, $full_pfx, $final, $do_anchor) = @_; - - # n.b.: do not use <blockquote> since it screws up alignment - # w.r.t. unquoted text. Repliers may rely on pre-formatted - # alignment to point out a certain word in quoted text. - if ($full_pfx) { - if (!$final && scalar(@$quot) <= MAX_INLINE_QUOTED) { - # show quote inline - my $l = PublicInbox::Linkify->new; - my $rv = join('', map { $l->linkify_1($_) } @$quot); - @$quot = (); - $rv = ascii_html($rv); - return $l->linkify_2($rv); - } - - # show a short snippet of quoted text and link to full version: - @$quot = map { s/^(?:>\s*)+//gm; $_ } @$quot; - my $cur = join(' ', @$quot); - @$quot = split(/\s+/, $cur); - $cur = ''; - do { - my $tmp = shift(@$quot); - my $len = length($tmp) + length($cur); - if ($len > MAX_TRUNC_LEN) { - @$quot = (); - } else { - $cur .= $tmp . ' '; - } - } while (@$quot && length($cur) < MAX_TRUNC_LEN); - @$quot = (); - $cur =~ s/ \z/ .../s; - $cur = ascii_html($cur); - my $nr = ++$$n; - "> [<a\nhref=\"$full_pfx#q${part_nr}_$nr\">$cur</a>]\n"; + my ($s, $l, $quot) = @_; + + # show everything in the full version with anchor from + # short version (see above) + my $rv = $l->linkify_1(join('', @$quot)); + @$quot = (); + + # we use a <span> here to allow users to specify their own + # color for quoted text + $rv = $l->linkify_2(ascii_html($rv)); + $$s .= qq(<span\nclass="q">) . $rv . '</span>' +} + +sub attach_link ($$$$;$) { + my ($upfx, $ct, $p, $fn, $err) = @_; + my ($part, $depth, @idx) = @$p; + my $nl = $idx[-1] > 1 ? "\n" : ''; + my $idx = join('.', @idx); + my $size = bytes::length($part->body); + + # hide attributes normally, unless we want to aid users in + # spotting MUA problems: + $ct =~ s/;.*// unless $err; + $ct = ascii_html($ct); + my $desc = $part->header('Content-Description'); + $desc = $fn unless defined $desc; + $desc = '' unless defined $desc; + my $sfn; + if (defined $fn && $fn =~ /\A[[:alnum:]][\w\.-]+[[:alnum:]]\z/) { + $sfn = $fn; + } elsif ($ct eq 'text/plain') { + $sfn = 'a.txt'; } else { - # show everything in the full version with anchor from - # short version (see above) - my $l = PublicInbox::Linkify->new; - my $rv .= join('', map { $l->linkify_1($_) } @$quot); - @$quot = (); - $rv = ascii_html($rv); - return $l->linkify_2($rv) unless $do_anchor; - my $nr = ++$$n; - "<a\nid=q${part_nr}_$nr></a>" . $l->linkify_2($rv); + $sfn = 'a.bin'; + } + my $ret = qq($nl<a\nhref="$upfx$idx-$sfn">); + if ($err) { + $ret .= +"[-- Warning: decoded text below may be mangled --]\n"; } + $ret .= "[-- Attachment #$idx: "; + my $ts = "Type: $ct, Size: $size bytes"; + $ret .= ($desc eq '') ? "$ts --]" : "$desc --]\n[-- $ts --]"; + $ret .= "</a>\n"; } sub add_text_body { - my ($enc_msg, $part, $part_nr, $full_pfx, $do_anchor) = @_; - return '' if $part->subparts; - - my $ct = $part->content_type; - # account for filter bugs... - if (defined $ct && $ct =~ m!\btext/x?html\b!i) { - $part->body_set(''); - return ''; - } - my $enc = enc_for($ct, $enc_msg); - my $n = 0; - my $nr = 0; - my $s = $part->body; - $part->body_set(''); - $s = $enc->decode($s); - my @lines = split(/^/m, $s); - $s = ''; + my ($upfx, $p) = @_; # from msg_iter: [ Email::MIME, depth, @idx ] + my ($part, $depth, @idx) = @$p; + my $ct = $part->content_type || 'text/plain'; + my $fn = $part->filename; - if ($$part_nr > 0) { - my $fn = $part->filename; - defined($fn) or $fn = "part #" . ($$part_nr + 1); - $s .= add_filename_line($enc, $fn); + if ($ct =~ m!\btext/x?html\b!i) { + return attach_link($upfx, $ct, $p, $fn); } + my $s = eval { $part->body_str }; + + # badly-encoded message? tell the world about it! + my $err = $@; + if ($err) { + if ($ct =~ m!\btext/plain\b!i) { + # Try to assume UTF-8 because Alpine seems to + # do wacky things and set charset=X-UNKNOWN + $part->charset_set('UTF-8'); + $s = eval { $part->body_str }; + + # If forcing charset=UTF-8 failed, + # attach_link will warn further down... + $s = $part->body if $@; + } else { + return attach_link($upfx, $ct, $p, $fn); + } + } + + my @lines = split(/^/m, $s); + $s = ''; + if (defined($fn) || $depth > 0 || $err) { + $s .= attach_link($upfx, $ct, $p, $fn, $err); + $s .= "\n"; + } my @quot; + my $l = PublicInbox::Linkify->new; while (defined(my $cur = shift @lines)) { if ($cur !~ /^>/) { # show the previously buffered quote inline - if (scalar @quot) { - $s .= flush_quote(\@quot, \$n, $$part_nr, - $full_pfx, 0, $do_anchor); - } + flush_quote(\$s, $l, \@quot) if @quot; # regular line, OK - my $l = PublicInbox::Linkify->new; $cur = $l->linkify_1($cur); $cur = ascii_html($cur); $s .= $l->linkify_2($cur); @@ -373,24 +489,29 @@ sub add_text_body { push @quot, $cur; } } - if (scalar @quot) { - $s .= flush_quote(\@quot, \$n, $$part_nr, $full_pfx, 1, - $do_anchor); - } - ++$$part_nr; + my $end = "\n"; + if (@quot) { + $end = ''; + flush_quote(\$s, $l, \@quot); + } $s =~ s/[ \t]+$//sgm; # kill per-line trailing whitespace $s =~ s/\A\n+//s; # kill leading blank lines - $s =~ s/\s+\z//s; # kill all trailing spaces (final "\n" added if ne '') - $s; + $s =~ s/\s+\z//s; # kill all trailing spaces + $s .= $end; } -sub headers_to_html_header { - my ($hdr, $full_pfx, $ctx) = @_; +sub _msg_html_prepare { + my ($hdr, $ctx) = @_; my $srch = $ctx->{srch} if $ctx; - my $rv = ""; + my $atom = ''; + my $rv = "<pre\nid=b>"; # anchor for body start + + if ($srch) { + $ctx->{-upfx} = '../'; + } my @title; - my $mid = $hdr->header_raw('Message-ID'); + my $mid = mid_clean($hdr->header_raw('Message-ID')); $mid = PublicInbox::Hval->new_msgid($mid); foreach my $h (qw(From To Cc Subject Date)) { my $v = $hdr->header($h); @@ -398,53 +519,45 @@ sub headers_to_html_header { $v = PublicInbox::Hval->new($v); if ($h eq 'From') { - my @from = Email::Address->parse($v->raw); - $title[1] = ascii_html($from[0]->name); + my @n = PublicInbox::Address::names($v->raw); + $title[1] = ascii_html(join(', ', @n)); } elsif ($h eq 'Subject') { $title[0] = $v->as_html; if ($srch) { - $rv .= "$h: <b\nid=t>"; - $rv .= $v->as_html . "</b>\n"; + $rv .= qq($h: <a\nhref="#r"\nid=t>); + $rv .= $v->as_html . "</a>\n"; next; } } - $rv .= "$h: " . $v->as_html . "\n"; + $v = $v->as_html; + $v =~ s/(\@[^,]+,) /$1\n\t/g if ($h eq 'Cc' || $h eq 'To'); + $rv .= "$h: $v\n"; } + $title[0] ||= '(no subject)'; + $ctx->{-title_html} = join(' - ', @title); $rv .= 'Message-ID: <' . $mid->as_html . '> '; - my $upfx = $full_pfx ? '' : '../'; - $rv .= "(<a\nhref=\"${upfx}raw\">raw</a>)\n"; - my $atom; - if ($srch) { - thread_inline(\$rv, $ctx, $hdr, $upfx); - - $atom = qq{<link\nrel=alternate\ntitle="Atom feed"\n} . - qq!href="${upfx}t.atom"\ntype="application/atom+xml"/>!; - } else { - $rv .= _parent_headers_nosrch($hdr); - $atom = ''; - } + $rv .= "(<a\nhref=\"raw\">raw</a>)\n"; + $rv .= _parent_headers($hdr, $srch); $rv .= "\n"; - - ("<html><head><title>". join(' - ', @title) . "</title>$atom". - PublicInbox::Hval::STYLE . "</head><body><pre>" . $rv); } -sub thread_inline { - my ($dst, $ctx, $hdr, $upfx) = @_; +sub thread_skel { + my ($dst, $ctx, $hdr, $tpfx) = @_; my $srch = $ctx->{srch}; my $mid = mid_clean($hdr->header_raw('Message-ID')); my $sres = $srch->get_thread($mid); my $nr = $sres->{total}; - my $expand = "<a\nhref=\"${upfx}t/#u\">expand</a> " . - "/ <a\nhref=\"${upfx}t.mbox.gz\">mbox.gz</a>"; + my $expand = qq(<a\nhref="${tpfx}T/#u">expand</a> ) . + qq(/ <a\nhref="${tpfx}t.mbox.gz">mbox.gz</a> ) . + qq(/ <a\nhref="${tpfx}t.atom">Atom feed</a>); - $$dst .= 'Thread: '; my $parent = in_reply_to($hdr); + $$dst .= "\n<b>Thread overview: </b>"; if ($nr <= 1) { if (defined $parent) { $$dst .= "($expand)\n "; - $$dst .= ghost_parent("$upfx../", $parent) . "\n"; + $$dst .= ghost_parent("$tpfx../", $parent) . "\n"; } else { $$dst .= "[no followups, yet] ($expand)\n"; } @@ -453,43 +566,37 @@ sub thread_inline { return; } - $$dst .= "~$nr messages ($expand"; - if ($nr > MAX_INLINE_QUOTED) { - $$dst .= qq! / <a\nhref="#b">[scroll down]</a>!; - } - $$dst .= ")\n"; + $$dst .= "$nr+ messages in thread ($expand"; + $$dst .= qq! / <a\nhref="#b">[top]</a>)\n!; my $subj = $srch->subject_path($hdr->header('Subject')); - my $state = { - seen => { $subj => 1 }, - srch => $srch, - cur => $mid, - parent_cmp => defined $parent ? $parent : '', - parent => $parent, - prev_attr => '', - prev_level => 0, - }; - for (thread_results(load_results($sres))->rootset) { - inline_dump($dst, $state, $upfx, $_, 0); - } - $$dst .= "<a\nid=b></a>"; # anchor for body start - $ctx->{next_msg} = $state->{next_msg}; - $ctx->{parent_msg} = $state->{parent}; -} - -sub _parent_headers_nosrch { - my ($hdr) = @_; + $ctx->{seen} = { $subj => 1 }; + $ctx->{cur} = $mid; + $ctx->{prev_attr} = ''; + $ctx->{prev_level} = 0; + $ctx->{dst} = $dst; + $sres = load_results($srch, $sres); + walk_thread(thread_results($sres), $ctx, *skel_dump); + $ctx->{parent_msg} = $parent; +} + +sub _parent_headers { + my ($hdr, $srch) = @_; my $rv = ''; my $irt = in_reply_to($hdr); if (defined $irt) { - my $v = PublicInbox::Hval->new_msgid($irt, 1); + my $v = PublicInbox::Hval->new_msgid($irt); my $html = $v->as_html; - my $href = $v->as_href; + my $href = $v->{href}; $rv .= "In-Reply-To: <"; $rv .= "<a\nhref=\"../$href/\">$html</a>>\n"; } + # do not display References: if search is present, + # we show the thread skeleton at the bottom, instead. + return $rv if $srch; + my $refs = $hdr->header_raw('References'); if ($refs) { # avoid redundant URLs wasting bandwidth @@ -504,12 +611,21 @@ sub _parent_headers_nosrch { } if (@refs) { - $rv .= 'References: '. join(' ', @refs) . "\n"; + $rv .= 'References: '. join("\n\t", @refs) . "\n"; } } $rv; } +sub squote_maybe ($) { + my ($val) = @_; + if ($val =~ m{([^\w@\./,\%\+\-])}) { + $val =~ s/(['!])/'\\$1'/g; # '!' for csh + return "'$val'"; + } + $val; +} + sub mailto_arg_link { my ($hdr) = @_; my %cc; # everyone else @@ -518,207 +634,110 @@ sub mailto_arg_link { foreach my $h (qw(From To Cc)) { my $v = $hdr->header($h); defined($v) && ($v ne '') or next; - my @addrs = Email::Address->parse($v); - foreach my $recip (@addrs) { - my $address = $recip->address; + my @addrs = PublicInbox::Address::emails($v); + foreach my $address (@addrs) { my $dst = lc($address); $cc{$dst} ||= $address; $to ||= $dst; } } - Email::Address->purge_cache; my @arg; my $subj = $hdr->header('Subject') || ''; $subj = "Re: $subj" unless $subj =~ /\bRe:/i; my $mid = $hdr->header_raw('Message-ID'); - push @arg, "--in-reply-to='" . ascii_html($mid) . "'"; - my $irt = uri_escape_utf8($mid); + push @arg, '--in-reply-to='.squote_maybe(mid_clean($mid)); + my $irt = mid_escape($mid); delete $cc{$to}; - push @arg, '--to=' . ascii_html($to); + push @arg, "--to=$to"; $to = uri_escape_utf8($to); $subj = uri_escape_utf8($subj); - my $cc = join(',', sort values %cc); - push @arg, '--cc=' . ascii_html($cc); - $cc = uri_escape_utf8($cc); + my @cc = sort values %cc; + push(@arg, map { "--cc=$_" } @cc); + my $cc = uri_escape_utf8(join(',', @cc)); my $href = "mailto:$to?In-Reply-To=$irt&Cc=${cc}&Subject=$subj"; - $href =~ s/%20/+/g; - (\@arg, $href); + (\@arg, ascii_html($href)); } sub html_footer { - my ($mime, $standalone, $full_pfx, $ctx, $mhref) = @_; + my ($hdr, $standalone, $ctx, $rhref) = @_; my $srch = $ctx->{srch} if $ctx; - my $upfx = $full_pfx ? '../' : '../../'; - my $tpfx = $full_pfx ? '' : '../'; + my $upfx = '../'; + my $tpfx = ''; my $idx = $standalone ? " <a\nhref=\"$upfx\">index</a>" : ''; my $irt = ''; - - if ($srch && $standalone) { - $idx .= qq{ / follow: <a\nhref="${tpfx}t.atom">Atom feed</a>\n}; - } if ($idx && $srch) { - my $p = $ctx->{parent_msg}; - my $next = $ctx->{next_msg}; - if ($p) { - $p = PublicInbox::Hval->new_msgid($p); - $p = $p->as_href; - $irt = "<a\nhref=\"$upfx$p/\">parent</a> "; - } else { - $irt = ' ' x length('parent '); + $idx .= "\n"; + thread_skel(\$idx, $ctx, $hdr, $tpfx); + my ($next, $prev); + my $parent = ' '; + $next = $prev = ' '; + + if (my $n = $ctx->{next_msg}) { + $n = PublicInbox::Hval->new_msgid($n)->{href}; + $next = "<a\nhref=\"$upfx$n/\"\nrel=next>next</a>"; } - if ($next) { - $irt .= "<a\nhref=\"$upfx$next/\">next</a> "; - } else { - $irt .= ' ' x length('next '); + my $u; + my $par = $ctx->{parent_msg}; + if ($par) { + $u = PublicInbox::Hval->new_msgid($par)->{href}; + $u = "$upfx$u/"; } - if ($p || $next) { - $irt .= "<a\nhref=\"${tpfx}t/#u\">thread</a> "; - } else { - $irt .= ' ' x length('thread '); + if (my $p = $ctx->{prev_msg}) { + $prev = PublicInbox::Hval->new_msgid($p)->{href}; + if ($p && $par && $p eq $par) { + $prev = "<a\nhref=\"$upfx$prev/\"\n" . + 'rel=prev>prev parent</a>'; + $parent = ''; + } else { + $prev = "<a\nhref=\"$upfx$prev/\"\n" . + 'rel=prev>prev</a>'; + $parent = " <a\nhref=\"$u\">parent</a>" if $u; + } + } elsif ($u) { # unlikely + $parent = " <a\nhref=\"$u\"\nrel=prev>parent</a>"; } + $irt = "$next $prev$parent "; } else { $irt = ''; } - - $mhref = './' unless defined $mhref; - $irt . qq(<a\nhref="${mhref}R/">reply</a>) . $idx; + $rhref ||= '#R'; + $irt .= qq(<a\nhref="$rhref">reply</a>); + $irt .= $idx; } sub linkify_ref_nosrch { - my $v = PublicInbox::Hval->new_msgid($_[0], 1); + my $v = PublicInbox::Hval->new_msgid($_[0]); my $html = $v->as_html; - my $href = $v->as_href; + my $href = $v->{href}; "<<a\nhref=\"../$href/\">$html</a>>"; } sub anchor_for { my ($msgid) = @_; - my $id = $msgid; - if ($id !~ /\A[a-f0-9]{40}\z/) { - $id = id_compress(mid_clean($id), 1); - } - 'm' . $id; -} - -sub thread_html_head { - my ($hdr, $state) = @_; - my $res = delete $state->{res} or die "BUG: no Plack callback in {res}"; - my $fh = $res->([200, ['Content-Type'=> 'text/html; charset=UTF-8']]); - $state->{fh} = $fh; - - my $s = ascii_html($hdr->header('Subject')); - $fh->write("<html><head><title>$s</title>". - qq{<link\nrel=alternate\ntitle="Atom feed"\n} . - qq!href="../t.atom"\ntype="application/atom+xml"/>! . - PublicInbox::Hval::STYLE . - "</head><body>"); -} - -sub pre_anchor_entry { - my ($seen, $mime) = @_; - my $id = anchor_for(mid_mime($mime)); - $seen->{$id} = "#$id"; # save the anchor for children, later + 'm' . id_compress($msgid, 1); } sub ghost_parent { my ($upfx, $mid) = @_; - # 'subject dummy' is used internally by Mail::Thread - return '[no common parent]' if ($mid eq 'subject dummy'); $mid = PublicInbox::Hval->new_msgid($mid); - my $href = $mid->as_href; + my $href = $mid->{href}; my $html = $mid->as_html; qq{[parent not found: <<a\nhref="$upfx$href/">$html</a>>]}; } -sub thread_adj_level { - my ($state, $level) = @_; - - my $max = $state->{cur_level}; - if ($level <= 0) { - return '' if $max == 0; # flat output - - # reset existing lists - my $x = $max > 1 ? ('</ul></li>' x ($max - 1)) : ''; - $state->{fh}->write($x . '</ul>'); - $state->{cur_level} = 0; - return ''; - } - if ($level == $max) { # continue existing list - $state->{fh}->write('<li>'); - } elsif ($level < $max) { - my $x = $max > 1 ? ('</ul></li>' x ($max - $level)) : ''; - $state->{fh}->write($x .= '<li>'); - $state->{cur_level} = $level; - } else { # ($level > $max) # start a new level - $state->{cur_level} = $level; - $state->{fh}->write(($max ? '<li>' : '') . '<ul><li>'); - } - '</li>'; -} - -sub ghost_flush { - my ($state, $upfx, $mid, $level) = @_; - my $end = '<pre>'. ghost_parent($upfx, $mid) . '</pre>'; - $state->{fh}->write($end .= thread_adj_level($state, $level)); -} - -sub __thread_entry { - my ($state, $mime, $level) = @_; - - # lazy load the full message from mini_mime: - $mime = eval { - my $path = mid2path(mid_clean(mid_mime($mime))); - Email::MIME->new($state->{ctx}->{git}->cat_file('HEAD:'.$path)); - } or return; - - thread_html_head($mime, $state) if $state->{anchor_idx} == 0; - if (my $ghost = delete $state->{ghost}) { - # n.b. ghost messages may only be parents, not children - foreach my $g (@$ghost) { - ghost_flush($state, '../../', @$g); - } - } - my $end = thread_adj_level($state, $level); - index_entry($mime, $level, $state); - $state->{fh}->write($end) if $end; - - 1; -} - sub indent_for { my ($level) = @_; INDENT x ($level - 1); } -sub __ghost_prepare { - my ($state, $node, $level) = @_; - my $ghost = $state->{ghost} ||= []; - push @$ghost, [ $node->messageid, $level ]; -} - -sub thread_entry { - my ($state, $node, $level) = @_; - return unless $node; - if (my $mime = $node->message) { - unless (__thread_entry($state, $mime, $level)) { - __ghost_prepare($state, $node, $level); - } - } else { - __ghost_prepare($state, $node, $level); - } - - thread_entry($state, $node->child, $level + 1); - thread_entry($state, $node->next, $level); -} - sub load_results { - my ($sres) = @_; - - [ map { $_->mini_mime } @{delete $sres->{msgs}} ]; + my ($srch, $sres) = @_; + my $msgs = delete $sres->{msgs}; + $srch->retry_reopen(sub { [ map { $_->ensure_metadata; $_ } @$msgs ] }); } sub msg_timestamp { @@ -728,21 +747,18 @@ sub msg_timestamp { } sub thread_results { - my ($msgs, $nosubject) = @_; - require PublicInbox::Thread; - my $th = PublicInbox::Thread->new(@$msgs); - no warnings 'once'; - $Mail::Thread::nosubject = $nosubject; + my ($msgs) = @_; + require PublicInbox::SearchThread; + my $th = PublicInbox::SearchThread->new($msgs); $th->thread; $th->order(*sort_ts); $th } sub missing_thread { - my ($res, $ctx) = @_; + my ($ctx) = @_; require PublicInbox::ExtMsg; - - $res->(PublicInbox::ExtMsg::ext_msg($ctx)) + PublicInbox::ExtMsg::ext_msg($ctx); } sub _msg_date { @@ -753,187 +769,255 @@ sub _msg_date { sub fmt_ts { POSIX::strftime('%Y-%m-%d %k:%M', gmtime($_[0])) } -sub _inline_header { - my ($dst, $state, $upfx, $hdr, $level) = @_; - my $dot = $level == 0 ? '' : '` '; +sub skel_dump { + my ($ctx, $level, $node) = @_; + my $smsg = $node->{smsg} or return _skel_ghost($ctx, $level, $node); - my $cur = $state->{cur}; - my $mid = mid_clean($hdr->header_raw('Message-ID')); - my $f = ascii_html($hdr->header('X-PI-From')); - my $d = _msg_date($hdr); - my $pfx = ' ' . $d . ' ' . indent_for($level); + my $dst = $ctx->{dst}; + my $cur = $ctx->{cur}; + my $mid = $smsg->{mid}; + my $f = ascii_html($smsg->from_name); + my $d = fmt_ts($smsg->{ts}) . ' ' . indent_for($level) . th_pfx($level); my $attr = $f; - $state->{first_level} ||= $level; + $ctx->{first_level} ||= $level; - if ($attr ne $state->{prev_attr} || $state->{prev_level} > $level) { - $state->{prev_attr} = $attr; - } else { - $attr = ''; + if ($attr ne $ctx->{prev_attr} || $ctx->{prev_level} > $level) { + $ctx->{prev_attr} = $attr; } - $state->{prev_level} = $level; + $ctx->{prev_level} = $level; if ($cur) { if ($cur eq $mid) { - delete $state->{cur}; - $$dst .= "$pfx$dot<b><a\nid=r\nhref=\"#b\">". + delete $ctx->{cur}; + $$dst .= "<b>$d<a\nid=r\nhref=\"#t\">". "$attr [this message]</a></b>\n"; - return; + } else { + $ctx->{prev_msg} = $mid; } } else { - $state->{next_msg} ||= $mid; + $ctx->{next_msg} ||= $mid; } # Subject is never undef, this mail was loaded from # our Xapian which would've resulted in '' if it were # really missing (and Filter rejects empty subjects) - my $s = $hdr->header('Subject'); - my $h = $state->{srch}->subject_path($s); - if ($state->{seen}->{$h}) { + my $s = $smsg->subject; + my $h = $ctx->{srch}->subject_path($s); + if ($ctx->{seen}->{$h}) { $s = undef; } else { - $state->{seen}->{$h} = 1; + $ctx->{seen}->{$h} = 1; $s = PublicInbox::Hval->new($s); $s = $s->as_html; } - my $m = PublicInbox::Hval->new_msgid($mid); - $m = $upfx . '../' . $m->as_href . '/'; - if (defined $s) { - $$dst .= "$pfx$dot<a\nhref=\"$m\">$s</a> $attr\n"; + my $m; + my $id = ''; + my $mapping = $ctx->{mapping}; + my $end = defined($s) ? "$s</a> $f\n" : "$f</a>\n"; + if ($mapping) { + my $map = $mapping->{$mid}; + $id = id_compress($mid, 1); + $m = '#m'.$id; + $map->[0] = "$d<a\nhref=\"$m\">$end"; + $id = "\nid=r".$id; } else { - $$dst .= "$pfx$dot<a\nhref=\"$m\">$f</a>\n"; + $m = $ctx->{-upfx}.mid_escape($mid).'/'; } + $$dst .= $d . "<a\nhref=\"$m\"$id>" . $end; } -sub inline_dump { - my ($dst, $state, $upfx, $node, $level) = @_; - return unless $node; - if (my $mime = $node->message) { - my $hdr = $mime->header_obj; - my $mid = mid_clean($hdr->header_raw('Message-ID')); - if ($mid eq $state->{parent_cmp}) { - $state->{parent} = $mid; - } - _inline_header($dst, $state, $upfx, $hdr, $level); +sub _skel_ghost { + my ($ctx, $level, $node) = @_; + + my $mid = $node->{id}; + my $d = $ctx->{pct} ? ' [irrelevant] ' # search result + : ' [not found] '; + $d .= indent_for($level) . th_pfx($level); + my $upfx = $ctx->{-upfx}; + my $m = PublicInbox::Hval->new_msgid($mid); + my $href = $upfx . $m->{href} . '/'; + my $html = $m->as_html; + + my $mapping = $ctx->{mapping}; + my $map = $mapping->{$mid} if $mapping; + if ($map) { + my $id = id_compress($mid, 1); + $map->[0] = $d . qq{<<a\nhref=#r$id>$html</a>>\n}; + $d .= qq{<<a\nhref="$href"\nid=r$id>$html</a>>\n}; } else { - my $dot = $level == 0 ? '' : '` '; - my $pfx = (' ' x length(' 1970-01-01 13:37 ')). - indent_for($level) . $dot; - $$dst .= $pfx; - $$dst .= ghost_parent("$upfx../", $node->messageid) . "\n"; + $d .= qq{<<a\nhref="$href">$html</a>>\n}; } - inline_dump($dst, $state, $upfx, $node->child, $level+1); - inline_dump($dst, $state, $upfx, $node->next, $level); + my $dst = $ctx->{dst}; + $$dst .= $d; } sub sort_ts { - sort { - (eval { $a->topmost->message->header('X-PI-TS') } || 0) <=> - (eval { $b->topmost->message->header('X-PI-TS') } || 0) - } @_; -} - -sub rsort_ts { - sort { - (eval { $b->topmost->message->header('X-PI-TS') } || 0) <=> - (eval { $a->topmost->message->header('X-PI-TS') } || 0) - } @_; + [ sort { + (eval { $a->topmost->{smsg}->ts } || 0) <=> + (eval { $b->topmost->{smsg}->ts } || 0) + } @{$_[0]} ]; } # accumulate recent topics if search is supported -# returns 1 if done, undef if not -sub add_topic { - my ($state, $node, $level) = @_; - return unless $node; - my $child_adjust = 1; - - if (my $x = $node->message) { - $x = $x->header_obj; - my $subj; - - $subj = $x->header('Subject'); - $subj = $state->{srch}->subject_normalized($subj); - - if (++$state->{subjs}->{$subj} == 1) { - push @{$state->{order}}, [ $level, $subj ]; +# returns 200 if done, 404 if not +sub acc_topic { + my ($ctx, $level, $node) = @_; + my $srch = $ctx->{srch}; + my $mid = $node->{id}; + my $x = $node->{smsg} || $srch->lookup_mail($mid); + my ($subj, $ts); + my $topic; + if ($x) { + $subj = $x->subject; + $subj = $srch->subject_normalized($subj); + $ts = $x->ts; + if ($level == 0) { + $topic = [ $ts, 1, { $subj => $mid }, $subj ]; + $ctx->{-cur_topic} = $topic; + push @{$ctx->{order}}, $topic; + return; } - my $mid = mid_clean($x->header_raw('Message-ID')); - - my $ts = $x->header('X-PI-TS'); - my $exist = $state->{latest}->{$subj}; - if (!$exist || $exist->[1] < $ts) { - $state->{latest}->{$subj} = [ $mid, $ts ]; + $topic = $ctx->{-cur_topic}; # should never be undef + $topic->[0] = $ts if $ts > $topic->[0]; + $topic->[1]++; + my $seen = $topic->[2]; + if (scalar(@$topic) == 3) { # parent was a ghost + push @$topic, $subj; + } elsif (!$seen->{$subj}) { + push @$topic, $level, $subj; } - } else { - # ghost message, do not bump level - $child_adjust = 0; + $seen->{$subj} = $mid; # latest for subject + } else { # ghost message + return if $level != 0; # ignore child ghosts + $topic = [ -666, 0, {} ]; + $ctx->{-cur_topic} = $topic; + push @{$ctx->{order}}, $topic; } - - add_topic($state, $node->child, $level + $child_adjust); - add_topic($state, $node->next, $level); } sub dump_topics { - my ($state) = @_; - my $order = $state->{order}; - my $subjs = $state->{subjs}; - my $latest = $state->{latest}; - return "\n[No topics in range]</pre>" unless (scalar @$order); - my $dst = ''; - my $pfx; - my $prev = 0; - my $prev_attr = ''; - while (defined(my $info = shift @$order)) { - my ($level, $subj) = @$info; - my $n = delete $subjs->{$subj}; - my ($mid, $ts) = @{delete $latest->{$subj}}; - $mid = PublicInbox::Hval->new_msgid($mid)->as_href; - $subj = PublicInbox::Hval->new($subj)->as_html; - $pfx = indent_for($level); - my $nl = $level == $prev ? "\n" : ''; - my $dot = $level == 0 ? '' : '` '; - $dst .= "$nl$pfx$dot<a\nhref=\"$mid/t/#u\"><b>$subj</b></a>\n"; - + my ($ctx) = @_; + my $order = delete $ctx->{order}; # [ ts, subj1, subj2, subj3, ... ] + if (!@$order) { + $ctx->{-html_tip} = '<pre>[No topics in range]</pre>'; + return 404; + } + + my @out; + + # sort by recency, this allows new posts to "bump" old topics... + foreach my $topic (sort { $b->[0] <=> $a->[0] } @$order) { + my ($ts, $n, $seen, $top, @ex) = @$topic; + @$topic = (); + next unless defined $top; # ghost topic + my $mid = delete $seen->{$top}; + my $href = mid_escape($mid); + $top = PublicInbox::Hval->new($top)->as_html; $ts = fmt_ts($ts); - my $attr = " $ts UTC"; # $n isn't the total number of posts on the topic, # just the number of posts in the current results window - $n = $n == 1 ? '' : " ($n+ messages)"; - - if ($level == 0 || $attr ne $prev_attr) { - my $mbox = qq(<a\nhref="$mid/t.mbox.gz">mbox.gz</a>); - my $atom = qq(<a\nhref="$mid/t.atom">Atom</a>); - $pfx .= INDENT if $level > 0; - $dst .= $pfx . $attr . $n . " - $mbox / $atom\n"; - $prev_attr = $attr; + my $anchor; + if ($n == 1) { + $n = ''; + $anchor = '#u'; # top of only message + } else { + $n = " ($n+ messages)"; + $anchor = '#t'; # thread skeleton + } + + my $mbox = qq(<a\nhref="$href/t.mbox.gz">mbox.gz</a>); + my $atom = qq(<a\nhref="$href/t.atom">Atom</a>); + my $s = "<a\nhref=\"$href/T/$anchor\"><b>$top</b></a>\n" . + " $ts UTC $n - $mbox / $atom\n"; + for (my $i = 0; $i < scalar(@ex); $i += 2) { + my $level = $ex[$i]; + my $sub = $ex[$i + 1]; + $mid = delete $seen->{$sub}; + $sub = PublicInbox::Hval->new($sub)->as_html; + $href = mid_escape($mid); + $s .= indent_for($level) . TCHILD; + $s .= "<a\nhref=\"$href/T/#u\">$sub</a>\n"; } + push @out, $s; } - $dst .= '</pre>'; + $ctx->{-html_tip} = '<pre>' . join("\n", @out) . '</pre>'; + 200; } -sub emit_index_topics { - my ($state) = @_; - my $off = $state->{ctx}->{cgi}->param('o'); - $off = 0 unless defined $off; - $state->{order} = []; - $state->{subjs} = {}; - $state->{latest} = {}; - my $max = 25; - my %opts = ( offset => int $off, limit => $max * 4 ); - while (scalar @{$state->{order}} < $max) { - my $sres = $state->{srch}->query('', \%opts); - my $nr = scalar @{$sres->{msgs}} or last; +sub index_nav { # callback for WwwStream + my (undef, $ctx) = @_; + delete $ctx->{qp} or return; + my ($next, $prev); + $next = $prev = ' '; + my $latest = ''; + + my $next_o = $ctx->{-next_o}; + if ($next_o) { + $next = qq!<a\nhref="?o=$next_o"\nrel=next>next</a>!; + } + if (my $cur_o = $ctx->{-cur_o}) { + $latest = qq! <a\nhref=.>latest</a>!; - for (rsort_ts(thread_results(load_results($sres), 1)->rootset)){ - add_topic($state, $_, 0); + my $o = $cur_o - ($next_o - $cur_o); + if ($o > 0) { + $prev = qq!<a\nhref="?o=$o"\nrel=prev>prev</a>!; + } elsif ($o == 0) { + $prev = qq!<a\nhref=.\nrel=prev>prev</a>!; } - $opts{offset} += $nr; } + "<hr><pre>page: $next $prev$latest</pre>"; +} + +sub index_topics { + my ($ctx) = @_; + my ($off) = (($ctx->{qp}->{o} || '0') =~ /(\d+)/); + my $opts = { offset => $off, limit => 200 }; + + $ctx->{order} = []; + my $srch = $ctx->{srch}; + my $sres = $srch->query('', $opts); + my $nr = scalar @{$sres->{msgs}}; + if ($nr) { + $sres = load_results($srch, $sres); + walk_thread(thread_results($sres), $ctx, *acc_topic); + } + $ctx->{-next_o} = $off+ $nr; + $ctx->{-cur_o} = $off; + PublicInbox::WwwStream->response($ctx, dump_topics($ctx), *index_nav); +} + +sub thread_adj_level { + my ($ctx, $level) = @_; + + my $max = $ctx->{cur_level}; + if ($level <= 0) { + return ('', '') if $max == 0; # flat output + + # reset existing lists + my $beg = $max > 1 ? ('</ul></li>' x ($max - 1)) : ''; + $ctx->{cur_level} = 0; + ("$beg</ul>", ''); + } elsif ($level == $max) { # continue existing list + qw(<li> </li>); + } elsif ($level < $max) { + my $beg = $max > 1 ? ('</ul></li>' x ($max - $level)) : ''; + $ctx->{cur_level} = $level; + ("$beg<li>", '</li>'); + } else { # ($level > $max) # start a new level + $ctx->{cur_level} = $level; + my $beg = ($max ? '<li>' : '') . '<ul><li>'; + ($beg, '</li>'); + } +} - $state->{fh}->write(dump_topics($state)); - $opts{offset}; +sub ghost_index_entry { + my ($ctx, $level, $node) = @_; + my ($beg, $end) = thread_adj_level($ctx, $level); + $beg . '<pre>'. ghost_parent($ctx->{-upfx}, $node->{id}) + . '</pre>' . $end; } 1; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index bb54aaa6..11fc92e9 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -13,16 +13,16 @@ package PublicInbox::WWW; use 5.008; use strict; use warnings; -use Plack::Request; -use PublicInbox::Config qw(try_cat); -use URI::Escape qw(uri_escape_utf8 uri_unescape); -use constant SSOMA_URL => '//ssoma.public-inbox.org/'; -use constant PI_URL => '//public-inbox.org/'; +use PublicInbox::Config; +use PublicInbox::Hval; +use URI::Escape qw(uri_unescape); +use PublicInbox::MID qw(mid_escape); require PublicInbox::Git; use PublicInbox::GitHTTPBackend; -our $LISTNAME_RE = qr!\A/([\w\.\-]+)!; +our $INBOX_RE = qr!\A/([\w\.\-]+)!; our $MID_RE = qr!([^/]+)!; -our $END_RE = qr!(f/|T/|t/|R/|t\.mbox(?:\.gz)?|t\.atom|raw|)!; +our $END_RE = qr!(T/|t/|t\.mbox(?:\.gz)?|t\.atom|raw|)!; +our $ATTACH_RE = qr!(\d[\.\d]*)-([[:alnum:]][\w\.-]+[[:alnum:]])!i; sub new { my ($class, $pi_config) = @_; @@ -38,16 +38,24 @@ sub run { sub call { my ($self, $env) = @_; - my $cgi = Plack::Request->new($env); - my $ctx = { cgi => $cgi, pi_config => $self->{pi_config} }; - my $path_info = $cgi->path_info; + my $ctx = { env => $env, www => $self }; + + # we don't care about multi-value + my %qp = map { + my ($k, $v) = split('=', uri_unescape($_), 2); + $v = '' unless defined $v; + $v =~ tr/+/ /; + ($k, $v) + } split(/[&;]/, $env->{QUERY_STRING}); + $ctx->{qp} = \%qp; + + my $path_info = $env->{PATH_INFO}; + my $method = $env->{REQUEST_METHOD}; - my $method = $cgi->method; if ($method eq 'POST' && - $path_info =~ m!$LISTNAME_RE/(git-upload-pack)\z!) { + $path_info =~ m!$INBOX_RE/(git-upload-pack)\z!) { my $path = $2; - return (invalid_list($self, $ctx, $1) || - serve_git($cgi, $ctx->{git}, $path)); + return invalid_inbox($ctx, $1) || serve_git($ctx, $path); } elsif ($method !~ /\AGET|HEAD\z/) { return r(405, 'Method Not Allowed'); @@ -56,33 +64,47 @@ sub call { # top-level indices and feeds if ($path_info eq '/') { r404(); - } elsif ($path_info =~ m!$LISTNAME_RE\z!o) { - invalid_list($self, $ctx, $1) || r301($ctx, $1); - } elsif ($path_info =~ m!$LISTNAME_RE(?:/|/index\.html)?\z!o) { - invalid_list($self, $ctx, $1) || get_index($ctx); - } elsif ($path_info =~ m!$LISTNAME_RE/(?:atom\.xml|new\.atom)\z!o) { - invalid_list($self, $ctx, $1) || get_atom($ctx); - - } elsif ($path_info =~ m!$LISTNAME_RE/ + } elsif ($path_info =~ m!$INBOX_RE\z!o) { + invalid_inbox($ctx, $1) || r301($ctx, $1); + } elsif ($path_info =~ m!$INBOX_RE(?:/|/index\.html)?\z!o) { + invalid_inbox($ctx, $1) || get_index($ctx); + } elsif ($path_info =~ m!$INBOX_RE/(?:atom\.xml|new\.atom)\z!o) { + invalid_inbox($ctx, $1) || get_atom($ctx); + } elsif ($path_info =~ m!$INBOX_RE/new\.html\z!o) { + invalid_inbox($ctx, $1) || get_new($ctx); + } elsif ($path_info =~ m!$INBOX_RE/ ($PublicInbox::GitHTTPBackend::ANY)\z!ox) { my $path = $2; - invalid_list($self, $ctx, $1) || - serve_git($cgi, $ctx->{git}, $path); - } elsif ($path_info =~ m!$LISTNAME_RE/$MID_RE/$END_RE\z!o) { - msg_page($self, $ctx, $1, $2, $3); - + invalid_inbox($ctx, $1) || serve_git($ctx, $path); + } elsif ($path_info =~ m!$INBOX_RE/([\w-]+).mbox\.gz\z!o) { + serve_mbox_range($ctx, $1, $2); + } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/$END_RE\z!o) { + msg_page($ctx, $1, $2, $3); + + } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/$ATTACH_RE\z!o) { + my ($idx, $fn) = ($3, $4); + invalid_inbox_mid($ctx, $1, $2) || get_attach($ctx, $idx, $fn); # in case people leave off the trailing slash: - } elsif ($path_info =~ m!$LISTNAME_RE/$MID_RE/(f|T|t|R)\z!o) { - my ($listname, $mid, $suffix) = ($1, $2, $3); + } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/(T|t)\z!o) { + my ($inbox, $mid, $suffix) = ($1, $2, $3); $suffix .= $suffix =~ /\A[tT]\z/ ? '/#u' : '/'; - r301($ctx, $listname, $mid, $suffix); + r301($ctx, $inbox, $mid, $suffix); + + } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/R/?\z!o) { + my ($inbox, $mid) = ($1, $2); + r301($ctx, $inbox, $mid, '#R'); + + } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/f/?\z!o) { + r301($ctx, $1, $2); + } elsif ($path_info =~ m!$INBOX_RE/_/text(?:/(.*))?\z!o) { + get_text($ctx, $1, $2); # convenience redirects order matters - } elsif ($path_info =~ m!$LISTNAME_RE/([^/]{2,})\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/([^/]{2,})\z!o) { r301($ctx, $1, $2); } else { - legacy_redirects($self, $ctx, $path_info); + legacy_redirects($ctx, $path_info); } } @@ -90,14 +112,14 @@ sub call { sub preload { require PublicInbox::Feed; require PublicInbox::View; - require PublicInbox::Thread; + require PublicInbox::SearchThread; require Email::MIME; require Digest::SHA; require POSIX; foreach (qw(PublicInbox::Search PublicInbox::SearchView PublicInbox::Mbox IO::Compress::Gzip - PublicInbox::NewsWWW PublicInbox::NewsGroup)) { + PublicInbox::NewsWWW)) { eval "require $_;"; } } @@ -118,13 +140,15 @@ sub r404 { sub r { [ $_[0], ['Content-Type' => 'text/plain'], [ join(' ', @_, "\n") ] ] } # returns undef if valid, array ref response if invalid -sub invalid_list { - my ($self, $ctx, $listname, $mid) = @_; - my $git_dir = $ctx->{pi_config}->get($listname, "mainrepo"); - if (defined $git_dir) { - $ctx->{git_dir} = $git_dir; - $ctx->{git} = PublicInbox::Git->new($git_dir); - $ctx->{listname} = $listname; +sub invalid_inbox ($$) { + my ($ctx, $inbox) = @_; + my $www = $ctx->{www}; + my $obj = $www->{pi_config}->lookup_name($inbox); + if (defined $obj) { + $ctx->{git_dir} = $obj->{mainrepo}; + $ctx->{git} = $obj->git; + $ctx->{-inbox} = $obj; + $ctx->{inbox} = $inbox; return; } @@ -132,13 +156,13 @@ sub invalid_list { # generation and link things intended for nntp:// to https?://, # so try to infer links and redirect them to the appropriate # list URL. - $self->news_www->call($ctx->{cgi}->{env}); + $www->news_www->call($ctx->{env}); } # returns undef if valid, array ref response if invalid -sub invalid_list_mid { - my ($self, $ctx, $listname, $mid) = @_; - my $ret = invalid_list($self, $ctx, $listname, $mid); +sub invalid_inbox_mid { + my ($ctx, $inbox, $mid) = @_; + my $ret = invalid_inbox($ctx, $inbox); return $ret if $ret; $ctx->{mid} = $mid = uri_unescape($mid); @@ -154,20 +178,26 @@ sub invalid_list_mid { undef; } -# /$LISTNAME/new.atom -> Atom feed, includes replies +# /$INBOX/new.atom -> Atom feed, includes replies sub get_atom { my ($ctx) = @_; require PublicInbox::Feed; PublicInbox::Feed::generate($ctx); } -# /$LISTNAME/?r=$GIT_COMMIT -> HTML only +# /$INBOX/new.html -> HTML only +sub get_new { + my ($ctx) = @_; + require PublicInbox::Feed; + PublicInbox::Feed::new_html($ctx); +} + +# /$INBOX/?r=$GIT_COMMIT -> HTML only sub get_index { my ($ctx) = @_; require PublicInbox::Feed; - my $srch = searcher($ctx); - footer($ctx); - if (defined $ctx->{cgi}->param('q')) { + searcher($ctx); + if ($ctx->{env}->{QUERY_STRING} =~ /(?:\A|[&;])q=/) { require PublicInbox::SearchView; PublicInbox::SearchView::sres_top_html($ctx); } else { @@ -178,12 +208,10 @@ sub get_index { # just returns a string ref for the blob in the current ctx sub mid2blob { my ($ctx) = @_; - require PublicInbox::MID; - my $path = PublicInbox::MID::mid2path($ctx->{mid}); - $ctx->{git}->cat_file("HEAD:$path"); + $ctx->{-inbox}->msg_by_mid($ctx->{mid}); } -# /$LISTNAME/$MESSAGE_ID/raw -> raw mbox +# /$INBOX/$MESSAGE_ID/raw -> raw mbox sub get_mid_txt { my ($ctx) = @_; my $x = mid2blob($ctx) or return r404($ctx); @@ -191,55 +219,37 @@ sub get_mid_txt { PublicInbox::Mbox::emit1($ctx, $x); } -# /$LISTNAME/$MESSAGE_ID/ -> HTML content (short quotes) +# /$INBOX/$MESSAGE_ID/ -> HTML content (short quotes) sub get_mid_html { my ($ctx) = @_; my $x = mid2blob($ctx) or return r404($ctx); require PublicInbox::View; - my $foot = footer($ctx); require Email::MIME; my $mime = Email::MIME->new($x); searcher($ctx); - [ 200, [ 'Content-Type' => 'text/html; charset=UTF-8' ], - [ PublicInbox::View::msg_html($ctx, $mime, 'f/', $foot) ] ]; + PublicInbox::View::msg_html($ctx, $mime); } -# /$LISTNAME/$MESSAGE_ID/f/ -> HTML content (fullquotes) -sub get_full_html { - my ($ctx) = @_; - my $x = mid2blob($ctx) or return r404($ctx); - +# /$INBOX/$MESSAGE_ID/t/ +sub get_thread { + my ($ctx, $flat) = @_; + searcher($ctx) or return need_search($ctx); + $ctx->{flat} = $flat; require PublicInbox::View; - my $foot = footer($ctx); - require Email::MIME; - my $mime = Email::MIME->new($x); - searcher($ctx); - [ 200, [ 'Content-Type' => 'text/html; charset=UTF-8' ], - [ PublicInbox::View::msg_html($ctx, $mime, undef, $foot)] ]; + PublicInbox::View::thread_html($ctx); } -# /$LISTNAME/$MESSAGE_ID/R/ -> HTML content (fullquotes) -sub get_reply_html { - my ($ctx) = @_; - my $x = mid2blob($ctx) or return r404($ctx); +# /$INBOX/_/text/$KEY/ +# /$INBOX/_/text/$KEY/raw +# KEY may contain slashes +sub get_text { + my ($ctx, $inbox, $key) = @_; + my $r404 = invalid_inbox($ctx, $inbox); + return $r404 if $r404; - require PublicInbox::View; - my $foot = footer($ctx); - require Email::MIME; - my $hdr = Email::MIME->new($x)->header_obj; - [ 200, [ 'Content-Type' => 'text/html; charset=UTF-8' ], - [ PublicInbox::View::msg_reply($ctx, $hdr, $foot)] ]; -} - -# /$LISTNAME/$MESSAGE_ID/t/ -sub get_thread { - my ($ctx, $flat) = @_; - my $srch = searcher($ctx) or return need_search($ctx); - require PublicInbox::View; - my $foot = footer($ctx); - $ctx->{flat} = $flat; - PublicInbox::View::thread_html($ctx, $foot, $srch); + require PublicInbox::WwwText; + PublicInbox::WwwText::get_text($ctx, $key); } sub ctx_get { @@ -249,66 +259,13 @@ sub ctx_get { $val; } -sub footer { - my ($ctx) = @_; - return '' unless $ctx; - my $git_dir = ctx_get($ctx, 'git_dir'); - - # favor user-supplied footer - my $footer = try_cat("$git_dir/public-inbox/footer.html"); - if (defined $footer) { - chomp $footer; - $ctx->{footer} = $footer; - return $footer; - } - - # auto-generate a footer - my $listname = ctx_get($ctx, 'listname'); - my $desc = try_cat("$git_dir/description"); - $desc = '$GIT_DIR/description missing' unless defined $desc; - chomp $desc; - - my $urls = try_cat("$git_dir/cloneurl"); - my @urls = split(/\r?\n/, $urls || ''); - my %seen = map { $_ => 1 } @urls; - my $cgi = $ctx->{cgi}; - my $http = $cgi->base->as_string . $listname; - $seen{$http} or unshift @urls, $http; - my $ssoma_url = PublicInbox::Hval::prurl($cgi->{env}, SSOMA_URL); - if (scalar(@urls) == 1) { - $urls = "URL for <a\nhref=\"" . $ssoma_url . - qq(">ssoma</a> or <b>git clone --mirror $urls[0]</b>); - } else { - $urls = "URLs for <a\nhref=\"" . $ssoma_url . - qq(">ssoma</a> or <b>git clone --mirror</b>\n) . - join("\n", map { "\tgit clone --mirror $_" } @urls); - } - - my $addr = $ctx->{pi_config}->get($listname, 'address'); - if (ref($addr) eq 'ARRAY') { - $addr = $addr->[0]; # first address is primary - } - - $addr = "<a\nhref=\"mailto:$addr\">$addr</a>"; - - $ctx->{footer} = join("\n", - '- ' . $desc, - "A <a\nhref=\"" . - PublicInbox::Hval::prurl($ctx->{cgi}->{env}, PI_URL) . - '">public-inbox</a>, ' . - 'anybody may post in plain-text (not HTML):', - $addr, - $urls - ); -} - # search support is optional, returns undef if Xapian is not installed # or not configured for the given GIT_DIR sub searcher { my ($ctx) = @_; eval { require PublicInbox::Search; - $ctx->{srch} = PublicInbox::Search->new($ctx->{git_dir}); + $ctx->{srch} = $ctx->{-inbox}->search; }; } @@ -322,8 +279,8 @@ EOF [ 501, [ 'Content-Type' => 'text/html; charset=UTF-8' ], [ $msg ] ]; } -# /$LISTNAME/$MESSAGE_ID/t.mbox -> thread as mbox -# /$LISTNAME/$MESSAGE_ID/t.mbox.gz -> thread as gzipped mbox +# /$INBOX/$MESSAGE_ID/t.mbox -> thread as mbox +# /$INBOX/$MESSAGE_ID/t.mbox.gz -> thread as gzipped mbox # note: I'm not a big fan of other compression formats since they're # significantly more expensive on CPU than gzip and less-widely available, # especially on older systems. Stick to zlib since that's what git uses. @@ -335,78 +292,83 @@ sub get_thread_mbox { } -# /$LISTNAME/$MESSAGE_ID/t.atom -> thread as Atom feed +# /$INBOX/$MESSAGE_ID/t.atom -> thread as Atom feed sub get_thread_atom { my ($ctx) = @_; searcher($ctx) or return need_search($ctx); - $ctx->{self_url} = $ctx->{cgi}->uri->as_string; require PublicInbox::Feed; PublicInbox::Feed::generate_thread_atom($ctx); } sub legacy_redirects { - my ($self, $ctx, $path_info) = @_; + my ($ctx, $path_info) = @_; # single-message pages - if ($path_info =~ m!$LISTNAME_RE/m/(\S+)/\z!o) { + if ($path_info =~ m!$INBOX_RE/m/(\S+)/\z!o) { r301($ctx, $1, $2); - } elsif ($path_info =~ m!$LISTNAME_RE/m/(\S+)/raw\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/m/(\S+)/raw\z!o) { r301($ctx, $1, $2, 'raw'); - } elsif ($path_info =~ m!$LISTNAME_RE/f/(\S+)/\z!o) { - r301($ctx, $1, $2, 'f/'); + } elsif ($path_info =~ m!$INBOX_RE/f/(\S+)/\z!o) { + r301($ctx, $1, $2); # thread display - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)/\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/t/(\S+)/\z!o) { r301($ctx, $1, $2, 't/#u'); - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)/mbox(\.gz)?\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/t/(\S+)/mbox(\.gz)?\z!o) { r301($ctx, $1, $2, "t.mbox$3"); # even older legacy redirects - } elsif ($path_info =~ m!$LISTNAME_RE/m/(\S+)\.html\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/m/(\S+)\.html\z!o) { r301($ctx, $1, $2); - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.html\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/t/(\S+)\.html\z!o) { r301($ctx, $1, $2, 't/#u'); - } elsif ($path_info =~ m!$LISTNAME_RE/f/(\S+)\.html\z!o) { - r301($ctx, $1, $2, 'f/'); + } elsif ($path_info =~ m!$INBOX_RE/f/(\S+)\.html\z!o) { + r301($ctx, $1, $2); - } elsif ($path_info =~ m!$LISTNAME_RE/(?:m|f)/(\S+)\.txt\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/(?:m|f)/(\S+)\.txt\z!o) { r301($ctx, $1, $2, 'raw'); - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)(\.mbox(?:\.gz)?)\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/t/(\S+)(\.mbox(?:\.gz)?)\z!o) { r301($ctx, $1, $2, "t$3"); # legacy convenience redirects, order still matters - } elsif ($path_info =~ m!$LISTNAME_RE/m/(\S+)\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/m/(\S+)\z!o) { r301($ctx, $1, $2); - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/t/(\S+)\z!o) { r301($ctx, $1, $2, 't/#u'); - } elsif ($path_info =~ m!$LISTNAME_RE/f/(\S+)\z!o) { - r301($ctx, $1, $2, 'f/'); + } elsif ($path_info =~ m!$INBOX_RE/f/(\S+)\z!o) { + r301($ctx, $1, $2); # some Message-IDs have slashes in them and the HTTP server # may try to be clever and unescape them :< - } elsif ($path_info =~ m!$LISTNAME_RE/(\S+/\S+)/$END_RE\z!o) { - msg_page($self, $ctx, $1, $2, $3); + } elsif ($path_info =~ m!$INBOX_RE/(\S+/\S+)/$END_RE\z!o) { + msg_page($ctx, $1, $2, $3); # in case people leave off the trailing slash: - } elsif ($path_info =~ m!$LISTNAME_RE/(\S+/\S+)/(f|T|t)\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/(\S+/\S+)/(T|t)\z!o) { r301($ctx, $1, $2, $3 eq 't' ? 't/#u' : $3); + } elsif ($path_info =~ m!$INBOX_RE/(\S+/\S+)/f\z!o) { + r301($ctx, $1, $2); } else { - $self->news_www->call($ctx->{cgi}->{env}); + $ctx->{www}->news_www->call($ctx->{env}); } } sub r301 { - my ($ctx, $listname, $mid, $suffix) = @_; - my $cgi = $ctx->{cgi}; - my $url; - my $qs = $cgi->env->{QUERY_STRING}; - $url = $cgi->base->as_string . $listname . '/'; - $url .= (uri_escape_utf8($mid) . '/') if (defined $mid); + my ($ctx, $inbox, $mid, $suffix) = @_; + my $obj = $ctx->{-inbox}; + unless ($obj) { + my $r404 = invalid_inbox($ctx, $inbox); + return $r404 if $r404; + $obj = $ctx->{-inbox}; + } + my $url = $obj->base_url($ctx->{env}); + my $qs = $ctx->{env}->{QUERY_STRING}; + $url .= (mid_escape($mid) . '/') if (defined $mid); $url .= $suffix if (defined $suffix); $url .= "?$qs" if $qs ne ''; @@ -416,32 +378,48 @@ sub r301 { } sub msg_page { - my ($self, $ctx, $list, $mid, $e) = @_; + my ($ctx, $inbox, $mid, $e) = @_; my $ret; - $ret = invalid_list_mid($self, $ctx, $list, $mid) and return $ret; + $ret = invalid_inbox_mid($ctx, $inbox, $mid) and return $ret; '' eq $e and return get_mid_html($ctx); + 'T/' eq $e and return get_thread($ctx, 1); 't/' eq $e and return get_thread($ctx); 't.atom' eq $e and return get_thread_atom($ctx); 't.mbox' eq $e and return get_thread_mbox($ctx); 't.mbox.gz' eq $e and return get_thread_mbox($ctx, '.gz'); - 'T/' eq $e and return get_thread($ctx, 1); 'raw' eq $e and return get_mid_txt($ctx); - 'f/' eq $e and return get_full_html($ctx); - 'R/' eq $e and return get_reply_html($ctx); + + # legacy, but no redirect for compatibility: + 'f/' eq $e and return get_mid_html($ctx); r404($ctx); } sub serve_git { - my ($cgi, $git, $path) = @_; - PublicInbox::GitHTTPBackend::serve($cgi, $git, $path); + my ($ctx, $path) = @_; + PublicInbox::GitHTTPBackend::serve($ctx->{env}, $ctx->{git}, $path); +} + +sub serve_mbox_range { + my ($ctx, $inbox, $range) = @_; + invalid_inbox($ctx, $inbox) || eval { + require PublicInbox::Mbox; + searcher($ctx); + PublicInbox::Mbox::emit_range($ctx, $range); + } } sub news_www { my ($self) = @_; - my $nw = $self->{news_www}; - return $nw if $nw; - require PublicInbox::NewsWWW; - $self->{news_www} = PublicInbox::NewsWWW->new($self->{pi_config}); + $self->{news_www} ||= do { + require PublicInbox::NewsWWW; + PublicInbox::NewsWWW->new($self->{pi_config}); + } +} + +sub get_attach { + my ($ctx, $idx, $fn) = @_; + require PublicInbox::WwwAttach; + PublicInbox::WwwAttach::get_attach($ctx, $idx, $fn); } 1; diff --git a/lib/PublicInbox/WWW.pod b/lib/PublicInbox/WWW.pod new file mode 100644 index 00000000..a1d33a3b --- /dev/null +++ b/lib/PublicInbox/WWW.pod @@ -0,0 +1,56 @@ +=head1 NAME + +PublicInbox::WWW - PSGI interface for public-inbox + +=head1 SYNOPSIS + +In your .psgi file: + + use PublicInbox::WWW; + + my $www = PublicInbox::WWW->new; + builder { + enable 'Head'; + mount '/inboxes' => sub { $www->call(@_) }; + }; + +=head1 DESCRIPTION + +The PSGI web interface for public-inbox. + +Using this directly is not needed unless you wish to customize +your public-inbox PSGI deployment or are using a PSGI server +other than L<public-inbox-httpd(1)>. + +While this PSGI application works with all PSGI/Plack web +servers such as L<starman(1)>, L<starlet(1)> or L<twiggy(1)>; +PublicInbox::WWW takes advantage of currently-undocumented APIs +of L<public-inbox-httpd(1)> to improve fairness when serving +large responses for thread views and git clones. + +=head1 ENVIRONMENT + +=over 8 + +=item PI_CONFIG + +Used to override the default "~/.public-inbox/config" value. + +=back + +=head1 CONTACT + +Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> + +The mail archives are hosted at L<https://public-inbox.org/meta/> +and L<http://hjrcffqmbrq6wope.onion/meta/> + +=head1 COPYRIGHT + +Copyright (C) 2016 all contributors L<mailto:meta@public-inbox.org> + +License: AGPL-3.0+ L<http://www.gnu.org/licenses/agpl-3.0.txt> + +=head1 SEE ALSO + +L<http://plackperl.org/>, L<Plack>, L<public-inbox-httpd(1)> diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm new file mode 100644 index 00000000..c8ea3ed3 --- /dev/null +++ b/lib/PublicInbox/WatchMaildir.pm @@ -0,0 +1,253 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# ref: https://cr.yp.to/proto/maildir.html +# http://wiki2.dovecot.org/MailboxFormat/Maildir +package PublicInbox::WatchMaildir; +use strict; +use warnings; +use Email::MIME; +use Email::MIME::ContentType; +$Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect +use PublicInbox::Git; +use PublicInbox::Import; +use PublicInbox::MDA; +use PublicInbox::Spawn qw(spawn); + +sub new { + my ($class, $config) = @_; + my (%mdmap, @mdir, $spamc); + + # "publicinboxwatch" is the documented namespace + # "publicinboxlearn" is legacy but may be supported + # indefinitely... + foreach my $pfx (qw(publicinboxwatch publicinboxlearn)) { + my $k = "$pfx.watchspam"; + if (my $spamdir = $config->{$k}) { + if ($spamdir =~ s/\Amaildir://) { + $spamdir =~ s!/+\z!!; + # skip "new", no MUA has seen it, yet. + my $cur = "$spamdir/cur"; + push @mdir, $cur; + $mdmap{$cur} = 'watchspam'; + } else { + warn "unsupported $k=$spamdir\n"; + } + } + } + + my $k = 'publicinboxwatch.spamcheck'; + my $spamcheck = $config->{$k}; + if ($spamcheck) { + if ($spamcheck eq 'spamc') { + $spamcheck = 'PublicInbox::Spamcheck::Spamc'; + } + if ($spamcheck =~ /::/) { + eval "require $spamcheck"; + $spamcheck = _spamcheck_cb($spamcheck->new); + } else { + warn "unsupported $k=$spamcheck\n"; + $spamcheck = undef; + } + } + foreach $k (keys %$config) { + $k =~ /\Apublicinbox\.([^\.]+)\.watch\z/ or next; + my $name = $1; + my $watch = $config->{$k}; + if ($watch =~ s/\Amaildir://) { + $watch =~ s!/+\z!!; + my $inbox = $config->lookup_name($name); + if (my $wm = $inbox->{watchheader}) { + my ($k, $v) = split(/:/, $wm, 2); + $inbox->{-watchheader} = [ $k, qr/\Q$v\E/ ]; + } + my $new = "$watch/new"; + my $cur = "$watch/cur"; + push @mdir, $new, $cur; + die "$new already in use\n" if $mdmap{$new}; + die "$cur already in use\n" if $mdmap{$cur}; + $mdmap{$new} = $mdmap{$cur} = $inbox; + } else { + warn "watch unsupported: $k=$watch\n"; + } + } + return unless @mdir; + + my $mdre = join('|', map { quotemeta($_) } @mdir); + $mdre = qr!\A($mdre)/!; + bless { + spamcheck => $spamcheck, + mdmap => \%mdmap, + mdir => \@mdir, + mdre => $mdre, + importers => {}, + }, $class; +} + +sub _done_for_now { + $_->done foreach values %{$_[0]->{importers}}; +} + +sub _try_fsn_paths { + my ($self, $paths) = @_; + _try_path($self, $_->{path}) foreach @$paths; + _done_for_now($self); +} + +sub _remove_spam { + my ($self, $path) = @_; + $path =~ /:2,[A-R]*S[T-Z]*\z/i or return; + my $mime = _path_to_mime($path) or return; + _force_mid($mime); + foreach my $inbox (values %{$self->{mdmap}}) { + next unless ref $inbox; + my $im = _importer_for($self, $inbox); + $im->remove($mime); + if (my $scrub = _scrubber_for($inbox)) { + my $scrubbed = $scrub->scrub($mime) or next; + $im->remove($scrubbed); + } + } +} + +# used to hash the relevant portions of a message when there are conflicts +sub _hash_mime2 { + my ($mime) = @_; + require Digest::SHA; + my $dig = Digest::SHA->new('SHA-1'); + $dig->add($mime->header_obj->header_raw('Subject')); + $dig->add($mime->body_raw); + $dig->hexdigest; +} + +sub _force_mid { + my ($mime) = @_; + # probably a bad idea, but we inject a Message-Id if + # one is missing, here.. + my $mid = $mime->header_obj->header_raw('Message-Id'); + if (!defined $mid || $mid =~ /\A\s*\z/) { + $mid = '<' . _hash_mime2($mime) . '@generated>'; + $mime->header_set('Message-Id', $mid); + } +} + +sub _try_path { + my ($self, $path) = @_; + my @p = split(m!/+!, $path); + return if $p[-1] !~ /\A[a-zA-Z0-9][\w:,=\.]+\z/; + if ($p[-1] =~ /:2,([A-Z]+)\z/i) { + my $flags = $1; + return if $flags =~ /[DT]/; # no [D]rafts or [T]rashed mail + } + return unless -f $path; + if ($path !~ $self->{mdre}) { + warn "unrecognized path: $path\n"; + return; + } + my $inbox = $self->{mdmap}->{$1}; + unless ($inbox) { + warn "unmappable dir: $1\n"; + return; + } + if (!ref($inbox) && $inbox eq 'watchspam') { + return _remove_spam($self, $path); + } + my $im = _importer_for($self, $inbox); + my $mime = _path_to_mime($path) or return; + $mime->header_set($_) foreach @PublicInbox::MDA::BAD_HEADERS; + my $wm = $inbox->{-watchheader}; + if ($wm) { + my $v = $mime->header_obj->header_raw($wm->[0]); + return unless ($v && $v =~ $wm->[1]); + } + if (my $scrub = _scrubber_for($inbox)) { + $mime = $scrub->scrub($mime) or return; + } + + _force_mid($mime); + $im->add($mime, $self->{spamcheck}); +} + +sub watch { + my ($self) = @_; + my $cb = sub { _try_fsn_paths($self, \@_) }; + my $mdir = $self->{mdir}; + + # lazy load here, we may support watching via IMAP IDLE + # in the future... + require Filesys::Notify::Simple; + my $watcher = Filesys::Notify::Simple->new($mdir); + $watcher->wait($cb) while (1); +} + +sub scan { + my ($self) = @_; + my $mdir = $self->{mdir}; + foreach my $dir (@$mdir) { + my $ok = opendir(my $dh, $dir); + unless ($ok) { + warn "failed to open $dir: $!\n"; + next; + } + while (my $fn = readdir($dh)) { + _try_path($self, "$dir/$fn"); + } + closedir $dh; + } + _done_for_now($self); +} + +sub _path_to_mime { + my ($path) = @_; + if (open my $fh, '<', $path) { + local $/; + my $str = <$fh>; + $str or return; + return Email::MIME->new(\$str); + } elsif ($!{ENOENT}) { + return; + } else { + warn "failed to open $path: $!\n"; + return; + } +} + +sub _importer_for { + my ($self, $inbox) = @_; + my $im = $inbox->{-import} ||= eval { + my $git = $inbox->git; + my $name = $inbox->{name}; + my $addr = $inbox->{-primary_address}; + PublicInbox::Import->new($git, $name, $addr, $inbox); + }; + $self->{importers}->{"$im"} = $im; +} + +sub _scrubber_for { + my ($inbox) = @_; + my $f = $inbox->{filter}; + if ($f && $f =~ /::/) { + eval "require $f"; + if ($@) { + warn $@; + } else { + return $f->new; + } + } + undef; +} + +sub _spamcheck_cb { + my ($sc) = @_; + sub { + my ($mime) = @_; + my $tmp = ''; + if ($sc->spamcheck($mime, \$tmp)) { + return Email::MIME->new(\$tmp); + } + warn $mime->header('Message-ID')." failed spam check\n"; + undef; + } +} + +1; diff --git a/lib/PublicInbox/WwwAtomStream.pm b/lib/PublicInbox/WwwAtomStream.pm new file mode 100644 index 00000000..5720384c --- /dev/null +++ b/lib/PublicInbox/WwwAtomStream.pm @@ -0,0 +1,134 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Atom body stream for which yields getline+close methods +package PublicInbox::WwwAtomStream; +use strict; +use warnings; + +# FIXME: locale-independence: +use POSIX qw(strftime); +use Date::Parse qw(strptime); + +use PublicInbox::Address; +use PublicInbox::Hval qw(ascii_html); +use PublicInbox::MID qw/mid_clean mid2path mid_escape/; + +# called by PSGI server after getline: +sub close {} + +sub new { + my ($class, $ctx, $cb) = @_; + $ctx->{emit_header} = 1; + $ctx->{feed_base_url} = $ctx->{-inbox}->base_url($ctx->{env}); + bless { cb => $cb || *close, ctx => $ctx }, $class; +} + +sub response { + my ($class, $ctx, $code, $cb) = @_; + [ $code, [ 'Content-Type', 'application/atom+xml' ], + $class->new($ctx, $cb) ] +} + +# called once for each message by PSGI server +sub getline { + my ($self) = @_; + if (my $middle = $self->{cb}) { + my $mime = $middle->(); + return feed_entry($self, $mime) if $mime; + } + delete $self->{cb} ? '</feed>' : undef; +} + +# private + +sub title_tag { + my ($title) = @_; + $title =~ tr/\t\n / /s; # squeeze spaces + # try to avoid the type attribute in title: + $title = ascii_html($title); + my $type = index($title, '&') >= 0 ? "\ntype=\"html\"" : ''; + "<title$type>$title</title>"; +} + +sub atom_header { + my ($ctx, $title) = @_; + my $ibx = $ctx->{-inbox}; + my $base_url = $ctx->{feed_base_url}; + my $search_q = $ctx->{search_query}; + my $self_url = $base_url; + my $mid = $ctx->{mid}; + if (defined $mid) { # per-thread + $self_url .= mid_escape($mid).'/t.atom'; + } elsif (defined $search_q) { + my $query = $search_q->{'q'}; + $title = title_tag("$query - search results"); + $base_url .= '?' . $search_q->qs_html(x => undef); + $self_url .= '?' . $search_q->qs_html; + } else { + $title = title_tag($ibx->description); + $self_url .= 'new.atom'; + } + my $mtime = (stat($ibx->{mainrepo}))[9] || time; + + qq(<?xml version="1.0" encoding="us-ascii"?>\n) . + qq{<feed\nxmlns="http://www.w3.org/2005/Atom">} . + qq{$title} . + qq(<link\nrel="alternate"\ntype="text/html") . + qq(\nhref="$base_url"/>) . + qq(<link\nrel="self"\nhref="$self_url"/>) . + qq(<id>mailto:$ibx->{-primary_address}</id>) . + feed_updated(gmtime($mtime)); +} + +# returns undef or string +sub feed_entry { + my ($self, $mime) = @_; + my $ctx = $self->{ctx}; + my $hdr = $mime->header_obj; + my $mid = mid_clean($hdr->header_raw('Message-ID')); + + my $uuid = mid2path($mid); + $uuid =~ tr!/!!d; + my $h = '[a-f0-9]'; + my (@uuid5) = ($uuid =~ m!\A($h{8})($h{4})($h{4})($h{4})($h{12})!o); + $uuid = 'urn:uuid:' . join('-', @uuid5); + + $mid = PublicInbox::Hval->new_msgid($mid); + my $href = $ctx->{feed_base_url} . $mid->{href}. '/'; + + my $date = $hdr->header('Date'); + my @t = eval { strptime($date) } if defined $date; + @t = gmtime(time) unless scalar @t; + my $updated = feed_updated(@t); + + my $title = $hdr->header('Subject'); + $title = '(no subject)' unless defined $title && $title ne ''; + $title = title_tag($title); + + my $from = $hdr->header('From') or return; + my ($email) = PublicInbox::Address::emails($from); + my $name = join(', ',PublicInbox::Address::names($from)); + $name = ascii_html($name); + $email = ascii_html($email); + + my $s = ''; + if (delete $ctx->{emit_header}) { + $s .= atom_header($ctx, $title); + } + $s .= "<entry><author><name>$name</name><email>$email</email>" . + "</author>$title$updated" . + qq{<content\ntype="xhtml">} . + qq{<div\nxmlns="http://www.w3.org/1999/xhtml">} . + qq(<pre\nstyle="white-space:pre-wrap">) . + PublicInbox::View::multipart_text_as_html($mime, $href) . + '</pre>' . + qq!</div></content><link\nhref="$href"/>!. + "<id>$uuid</id></entry>"; +} + +sub feed_updated { + '<updated>' . strftime('%Y-%m-%dT%H:%M:%SZ', @_) . '</updated>'; +} + +1; diff --git a/lib/PublicInbox/WwwAttach.pm b/lib/PublicInbox/WwwAttach.pm new file mode 100644 index 00000000..33bfce27 --- /dev/null +++ b/lib/PublicInbox/WwwAttach.pm @@ -0,0 +1,43 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# For retrieving attachments from messages in the WWW interface +package PublicInbox::WwwAttach; # internal package +use strict; +use warnings; +use Email::MIME; +use Email::MIME::ContentType qw(parse_content_type); +$Email::MIME::ContentType::STRICT_PARAMS = 0; +use PublicInbox::MsgIter; + +# /$LISTNAME/$MESSAGE_ID/$IDX-$FILENAME +sub get_attach ($$$) { + my ($ctx, $idx, $fn) = @_; + my $res = [ 404, [ 'Content-Type', 'text/plain' ], [ "Not found\n" ] ]; + my $mime = $ctx->{-inbox}->msg_by_mid($ctx->{mid}) or return $res; + $mime = Email::MIME->new($mime); + msg_iter($mime, sub { + my ($part, $depth, @idx) = @{$_[0]}; + return if join('.', @idx) ne $idx; + $res->[0] = 200; + my $ct = $part->content_type; + $ct = parse_content_type($ct) if $ct; + + # discrete == type, we remain Debian wheezy-compatible + if ($ct && (($ct->{discrete} || '') eq 'text')) { + # display all text as text/plain: + my $cset = $ct->{attributes}->{charset}; + if ($cset && ($cset =~ /\A[\w-]+\z/)) { + $res->[1]->[1] .= qq(; charset=$cset); + } + } else { # TODO: allow user to configure safe types + $res->[1]->[1] = 'application/octet-stream'; + } + $part = $part->body; + push @{$res->[1]}, 'Content-Length', bytes::length($part); + $res->[2]->[0] = $part; + }); + $res; +} + +1; diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm new file mode 100644 index 00000000..01f7b31b --- /dev/null +++ b/lib/PublicInbox/WwwStream.pm @@ -0,0 +1,129 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# HTML body stream for which yields getline+close methods +package PublicInbox::WwwStream; +use strict; +use warnings; +use PublicInbox::Hval qw(ascii_html); +use URI; +our $TOR_URL = 'https://www.torproject.org/'; +our $TOR2WEB_URL = 'https://www.tor2web.org/'; +our $CODE_URL = 'https://public-inbox.org/'; +our $PROJECT = 'public-inbox'; + +# noop for HTTP.pm (and any other PSGI servers) +sub close {} + +sub new { + my ($class, $ctx, $cb) = @_; + bless { nr => 0, cb => $cb || *close, ctx => $ctx }, $class; +} + +sub response { + my ($class, $ctx, $code, $cb) = @_; + [ $code, [ 'Content-Type', 'text/html; charset=UTF-8' ], + $class->new($ctx, $cb) ] +} + +sub _html_top ($) { + my ($self) = @_; + my $ctx = $self->{ctx}; + my $obj = $ctx->{-inbox}; + my $desc = ascii_html($obj->description); + my $title = $ctx->{-title_html} || $desc; + my $upfx = $ctx->{-upfx} || ''; + my $help = $upfx.'_/text/help'; + my $atom = $ctx->{-atom} || $upfx.'new.atom'; + my $tip = $ctx->{-html_tip} || ''; + my $top = "<b>$desc</b>"; + my $links = "<a\nhref=\"$help\">help</a> / ". + "<a\nhref=\"$atom\">Atom feed</a>"; + if ($obj->search) { + my $q_val = $ctx->{-q_value_html}; + if (defined $q_val && $q_val ne '') { + $q_val = qq(\nvalue="$q_val"); + } else { + $q_val = ''; + } + # XXX gross, for SearchView.pm + my $extra = $ctx->{-extra_form_html} || ''; + my $action = $upfx eq '' ? './' : $upfx; + $top = qq{<form\naction="$action"><pre>$top} . + qq{\n<input\nname=q\ntype=text$q_val />} . + $extra . + qq{<input\ntype=submit\nvalue=search />} . + ' ' . $links . + q{</pre></form>} + } else { + $top = '<pre>' . $top . "\n" . $links . '</pre>'; + } + "<html><head><title>$title</title>" . + "<link\nrel=alternate\ntitle=\"Atom feed\"\n". + "href=\"$atom\"\ntype=\"application/atom+xml\"/>" . + PublicInbox::Hval::STYLE . + "</head><body>". $top . $tip; +} + +sub _html_end { + my ($self) = @_; + my $urls = 'Archives are clonable:'; + my $ctx = $self->{ctx}; + my $obj = $ctx->{-inbox}; + my $desc = ascii_html($obj->description); + + my $http = $obj->base_url($ctx->{env}); + chop $http; + my %seen = ( $http => 1 ); + my @urls = ($http); + foreach my $u (@{$obj->cloneurl}) { + next if $seen{$u}; + $seen{$u} = 1; + push @urls, $u =~ /\Ahttps?:/ ? qq(<a\nhref="$u">$u</a>) : $u; + } + if (scalar(@urls) == 1) { + $urls .= " git clone --mirror $http"; + } else { + $urls .= "\n" . + join("\n", map { "\tgit clone --mirror $_" } @urls); + } + + my @nntp = map { qq(<a\nhref="$_">$_</a>) } @{$obj->nntp_url}; + if (@nntp) { + $urls .= "\n\n"; + $urls .= @nntp == 1 ? 'Newsgroup' : 'Newsgroups are'; + $urls .= ' available over NNTP:'; + $urls .= "\n\t" . join("\n\t", @nntp) . "\n"; + } + if ($urls =~ m!\b[^:]+://\w+\.onion/!) { + $urls .= "\n note: .onion URLs require Tor: "; + $urls .= qq[<a\nhref="$TOR_URL">$TOR_URL</a>]; + if ($TOR2WEB_URL) { + $urls .= "\n or Tor2web: "; + $urls .= qq[<a\nhref="$TOR2WEB_URL">$TOR2WEB_URL</a>]; + } + } + my $url = PublicInbox::Hval::prurl($ctx->{env}, $CODE_URL); + '<hr><pre>'.join("\n\n", + $desc, + $urls, + 'AGPL code for this site: '. + qq(git clone <a\nhref="$url">$url</a> $PROJECT) + ).'</pre></body></html>'; +} + +# callback for HTTP.pm (and any other PSGI servers) +sub getline { + my ($self) = @_; + my $nr = $self->{nr}++; + + return _html_top($self) if $nr == 0; + + if (my $middle = $self->{cb}) { + $middle = $middle->($nr, $self->{ctx}) and return $middle; + } + + delete $self->{cb} ? _html_end($self) : undef; +} + +1; diff --git a/lib/PublicInbox/WwwText.pm b/lib/PublicInbox/WwwText.pm new file mode 100644 index 00000000..b0f262cd --- /dev/null +++ b/lib/PublicInbox/WwwText.pm @@ -0,0 +1,207 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# serves the /$INBOX/_/* endpoints from :text/* of the git tree +package PublicInbox::WwwText; +use strict; +use warnings; +use PublicInbox::Linkify; +use PublicInbox::WwwStream; +use PublicInbox::Hval qw(ascii_html); +our $QP_URL = 'https://xapian.org/docs/queryparser.html'; +our $WIKI_URL = 'https://en.wikipedia.org/wiki'; + +# /$INBOX/_/text/$KEY/ # KEY may contain slashes +# For now, "help" is the only supported $KEY +sub get_text { + my ($ctx, $key) = @_; + my $code = 200; + + $key = 'help' if !defined $key; # this 302s to _/text/help/ + + # get the raw text the same way we get mboxrds + my $raw = ($key =~ s!/raw\z!!); + my $have_tslash = ($key =~ s!/\z!!) if !$raw; + + my $txt = ''; + if (!_default_text($ctx, $key, \$txt)) { + $code = 404; + $txt = "404 Not Found ($key)\n"; + } + if ($raw) { + return [ $code, [ 'Content-Type', 'text/plain', + 'Content-Length', bytes::length($txt) ], + [ $txt ] ] + } + + # enforce trailing slash for "wget -r" compatibility + if (!$have_tslash && $code == 200) { + my $url = $ctx->{-inbox}->base_url($ctx->{env}); + $url .= "_/text/$key/"; + + return [ 302, [ 'Content-Type', 'text/plain', + 'Location', $url ], + [ "Redirecting to $url\n" ] ]; + } + + # Follow git commit message conventions, + # first line is the Subject/title + my ($title) = ($txt =~ /\A([^\n]*)/s); + _do_linkify($txt); + $ctx->{-title_html} = ascii_html($title); + + my $nslash = ($key =~ tr!/!/!); + $ctx->{-upfx} = '../../../' . ('../' x $nslash); + + PublicInbox::WwwStream->response($ctx, $code, sub { + my ($nr, undef) = @_; + $nr == 1 ? '<pre>'.$txt.'</pre>' : undef + }); +} + +sub _do_linkify { + my $l = PublicInbox::Linkify->new; + $_[0] = $l->linkify_2(ascii_html($l->linkify_1($_[0]))); +} + +sub _srch_prefix ($$) { + my ($srch, $txt) = @_; + my $pad = 0; + my $htxt = ''; + my $help = $srch->help; + my $i; + for ($i = 0; $i < @$help; $i += 2) { + my $pfx = $help->[$i]; + my $n = length($pfx); + $pad = $n if $n > $pad; + $htxt .= $pfx . "\0"; + $htxt .= $help->[$i + 1]; + $htxt .= "\f\n"; + } + $pad += 2; + my $padding = ' ' x ($pad + 8); + $htxt =~ s/^/$padding/gms; + $htxt =~ s/^$padding(\S+)\0/" $1". + (' ' x ($pad - length($1)))/egms; + $htxt =~ s/\f\n/\n/gs; + $$txt .= $htxt; + 1; +} + + +sub _default_text ($$$) { + my ($ctx, $key, $txt) = @_; + return if $key ne 'help'; # TODO more keys? + + my $ibx = $ctx->{-inbox}; + my $base_url = $ibx->base_url($ctx->{env}); + $$txt .= "public-inbox help for $base_url\n"; + $$txt .= <<EOF; + +overview +-------- + + public-inbox uses Message-ID identifiers in URLs. + One may look up messages by substituting Message-IDs + (without the leading '<' or trailing '>') into the URL. + Forward slash ('/') characters in the Message-IDs + need to be escaped as "%2F" (without quotes). + + Thus, it is possible to retrieve any message by its + Message-ID by going to: + + $base_url<Message-ID>/ + + (without the '<' or '>') + + Message-IDs are described at: + + $WIKI_URL/Message-ID + +EOF + + # n.b. we use the Xapian DB for any regeneratable, + # order-of-arrival-independent data. + my $srch = $ibx->search; + if ($srch) { + $$txt .= <<EOF; +search +------ + + This public-inbox has search functionality provided by Xapian. + + It supports typical AND, OR, NOT, '+', '-' queries present + in other search engines. + + We also support search prefixes to limit the scope of the + search to certain fields. + + Prefixes supported in this installation include: + +EOF + _srch_prefix($srch, $txt); + + $$txt .= <<EOF; + + Most prefixes are probabilistic, meaning they support stemming + and wildcards ('*'). Ranges (such as 'd:') and boolean prefixes + do not support stemming or wildcards. + The upstream Xapian query parser documentation fully explains + the query syntax: + + $QP_URL + +message threading +----------------- + + Message threading is enabled for this public-inbox, + additional endpoints for message threads are available: + + * $base_url<Message-ID>/T/#u + + Loads the thread belonging to the given <Message-ID> + in flat chronological order. The "#u" anchor + focuses the browser on the given <Message-ID>. + + * $base_url<Message-ID>/t/#u + + Loads the thread belonging to the given <Message-ID> + in threaded order with nesting. For deep threads, + this requires a wide display or horizontal scrolling. + + Both of these HTML endpoints are suitable for offline reading + using the thread overview at the bottom of each page. + + Users of feed readers may follow a particular thread using: + + * $base_url<Message-ID>/t.atom + + Which loads the thread in Atom Syndication Standard + described at Wikipedia and RFC4287: + + $WIKI_URL/Atom_(standard) + https://tools.ietf.org/html/rfc4287 + + Finally, the gzipped mbox for a thread is available for + downloading and importing into your favorite mail client: + + * $base_url<Message-ID>/t.mbox.gz + + We use the mboxrd variant of the mbox format described + at: + + $WIKI_URL/Mbox + +contact +------- + + This help text is maintained by public-inbox developers + reachable via plain-text email at: meta\@public-inbox.org + +EOF + # TODO: support admin contact info in ~/.public-inbox/config + } + 1; +} + +1; diff --git a/sa_config/user/.spamassassin/user_prefs b/sa_config/user/.spamassassin/user_prefs index 3a4a50fb..70e12911 100644 --- a/sa_config/user/.spamassassin/user_prefs +++ b/sa_config/user/.spamassassin/user_prefs @@ -40,9 +40,27 @@ body HELLOMYDEAR /hello my dear/i describe HELLOMYDEAR "spam phrase" score HELLOMYDEAR 2.8 -# we hate HTML mail -score MIME_HTML_MOSTLY 3.0 -score HTML_MESSAGE 3.0 +body JUSTAMAILBOX /I'm just a mailbox used for sending notifications/ +describe JUSTAMAILBOX "autoreply phrase" +score JUSTAMAILBOX 5.0 + +# no delivery +header PI_DNOT subject =~ /delivery\s+(?:status\s+)?notification/i +describe PI_DNOT "delivery notification" +score PI_DNOT 3 + +# notice to appear +header PI_DNOTICE subject =~ /notice to appear/i +describe PI_DNOTICE "notice to appear" +score PI_DNOTICE 3 + +full ZIPFILE /\b(?:file)?name\=.*\.zip\b/i +describe ZIPFILE zipfile attachment +score ZIPFILE 0.5 + +score BAYES_999 3 +score BAYES_05 -1.5 +score BAYES_00 -15 # trust paid whitelist services? never score RCVD_IN_RP_SAFE 0.0 diff --git a/script/public-inbox-httpd b/script/public-inbox-httpd index 3ca974c9..8ba42c2f 100755 --- a/script/public-inbox-httpd +++ b/script/public-inbox-httpd @@ -8,7 +8,7 @@ use warnings; use Plack::Util; use PublicInbox::Daemon; use PublicInbox::HTTP; -use Plack::Request; +use PublicInbox::HTTPD; use Plack::Builder; my %httpds; my $app; @@ -24,7 +24,6 @@ my $refresh = sub { PublicInbox::WWW->preload; my $www = PublicInbox::WWW->new; $app = builder { - enable 'Chunked'; eval { enable 'Deflater', content_type => [ qw( @@ -54,80 +53,3 @@ PublicInbox::Daemon::run('0.0.0.0:8080', $refresh, my $h = $httpds{$fd} ||= PublicInbox::HTTPD->new($srv, $app); PublicInbox::HTTP->new($client, $addr, $h), }); - -1; - -# XXX This is a totally unstable API for public-inbox internal use only -# This is exposed via the 'pi-httpd.async' key in the PSGI env hash. -# The name of this key is not even stable! -# Currently is is intended for use with read-only pipes. -package PublicInbox::HTTPD::Async; -use strict; -use warnings; -use base qw(Danga::Socket); -use fields qw(cb); - -sub new { - my ($class, $io, $cb) = @_; - my $self = fields::new($class); - IO::Handle::blocking($io, 0); - $self->SUPER::new($io); - $self->{cb} = $cb; - $self->watch_read(1); - $self; -} - -sub event_read { $_[0]->{cb}->() } -sub event_hup { $_[0]->{cb}->() } -sub event_err { $_[0]->{cb}->() } -sub sysread { shift->{sock}->sysread(@_) } - -sub close { - my $self = shift; - $self->{cb} = undef; - $self->SUPER::close(@_); -} - -1; - -package PublicInbox::HTTPD; -use strict; -use warnings; -use Plack::Util; - -sub pi_httpd_async { - my ($io, $cb) = @_; - PublicInbox::HTTPD::Async->new($io, $cb); -} - -sub new { - my ($class, $sock, $app) = @_; - my $n = getsockname($sock) or die "not a socket: $sock $!\n"; - my ($host, $port) = PublicInbox::Daemon::host_with_port($n); - - my %env = ( - SERVER_NAME => $host, - SERVER_PORT => $port, - SCRIPT_NAME => '', - 'psgi.version' => [ 1, 1 ], - 'psgi.errors' => \*STDERR, - 'psgi.url_scheme' => 'http', - 'psgi.nonblocking' => Plack::Util::TRUE, - 'psgi.streaming' => Plack::Util::TRUE, - 'psgi.run_once' => Plack::Util::FALSE, - 'psgi.multithread' => Plack::Util::FALSE, - 'psgi.multiprocess' => Plack::Util::TRUE, - 'psgix.harakiri'=> Plack::Util::FALSE, - 'psgix.input.buffered' => Plack::Util::TRUE, - 'pi-httpd.async' => do { - no warnings 'once'; - *pi_httpd_async - }, - ); - bless { - app => $app, - env => \%env, - }, $class; -} - -1; diff --git a/script/public-inbox-index b/script/public-inbox-index index 578d91d5..e9bbec9e 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -8,13 +8,24 @@ use strict; use warnings; +use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); +use Cwd 'abs_path'; my $usage = "public-inbox-index GIT_DIR"; use PublicInbox::Config; +my $config = eval { PublicInbox::Config->new } || eval { + warn "public-inbox unconfigured for serving, indexing anyways...\n"; + {} +}; eval { require PublicInbox::SearchIdx }; if ($@) { print STDERR "Search::Xapian required for $0\n"; exit 1; } + +my $reindex; +my %opts = ( '--reindex' => \$reindex ); +GetOptions(%opts) or die "bad command-line args\n$usage"; + my @dirs; sub resolve_git_dir { @@ -36,8 +47,8 @@ sub resolve_git_dir { }; close $fh or die "error in $cmd: $!\n"; chomp $dir; - return $cd if ($dir eq '.' && defined $cd); - $dir; + return abs_path($cd) if ($dir eq '.' && defined $cd); + abs_path($dir); } } @@ -50,16 +61,26 @@ if (@ARGV) { sub usage { print STDERR "Usage: $usage\n"; exit 1 } usage() unless @dirs; +foreach my $k (keys %$config) { + $k =~ /\Apublicinbox\.([^\.]+)\.mainrepo\z/ or next; + my $name = $1; + my $v = $config->{$k}; + for my $i (0..$#dirs) { + next if $dirs[$i] ne $v; + my $ibx = $config->lookup_name($name); + $dirs[$i] = $ibx if $ibx; + } +} + foreach my $dir (@dirs) { index_dir($dir); } sub index_dir { my ($git_dir) = @_; - -d $git_dir or die "$git_dir does not appear to be a git repository\n"; - - system('git', "--git-dir=$git_dir", 'update-server-info') and - die "git update-server-info failed for $git_dir"; + if (!ref $git_dir && ! -d $git_dir) { + die "$git_dir does not appear to be a git repository\n"; + } my $s = PublicInbox::SearchIdx->new($git_dir, 1); - $s->index_sync; + $s->index_sync({ reindex => $reindex }); } diff --git a/script/public-inbox-init b/script/public-inbox-init index d66361df..e23d1419 100755 --- a/script/public-inbox-init +++ b/script/public-inbox-init @@ -28,6 +28,11 @@ mkpath($dir); # will croak on fatal errors my ($fh, $filename) = tempfile('pi-init-XXXXXXXX', DIR => $dir); if (-e $pi_config) { open(my $oh, '<', $pi_config) or die "unable to read $pi_config: $!\n"; + my @st = stat($oh); + my $perm = $st[2]; + defined $perm or die "(f)stat failed on $pi_config: $!\n"; + chmod($perm & 07777, $fh) or + die "(f)chmod failed on future $pi_config: $!\n"; my $old; { local $/; @@ -43,10 +48,10 @@ if (-e $pi_config) { foreach my $addr (@address) { my $found = $cfg->lookup($addr); if ($found) { - if ($found->{listname} ne $name) { + if ($found->{name} ne $name) { print STDERR "`$addr' already defined for ", - "`$found->{listname}',\n", + "`$found->{name}',\n", "does not match intend `$name'\n"; $conflict = 1; } else { @@ -63,6 +68,10 @@ my $pfx = "publicinbox.$name"; my @x = (qw/git config/, "--file=$filename"); $git_dir = abs_path($git_dir); x(qw(git init -q --bare), $git_dir); + +# set a reasonable default: +x(qw/git config/, "--file=$git_dir/config", 'repack.writeBitmaps', 'true'); + foreach my $addr (@address) { next if $seen{lc($addr)}; x(@x, "--add", "$pfx.address", $addr); diff --git a/script/public-inbox-learn b/script/public-inbox-learn index 0c7b4199..396ab489 100755 --- a/script/public-inbox-learn +++ b/script/public-inbox-learn @@ -8,82 +8,78 @@ my $usage = "$0 (spam|ham) < /path/to/message"; use strict; use warnings; use PublicInbox::Config; +use PublicInbox::Git; +use PublicInbox::Import; use Email::MIME; -use Email::Address; -use IPC::Run qw/run/; +use Email::MIME::ContentType; +$Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect +use PublicInbox::Address; +use PublicInbox::Spamcheck::Spamc; my $train = shift or die "usage: $usage\n"; if ($train !~ /\A(?:ham|spam)\z/) { die "`$train' not recognized.\nusage: $usage\n"; } +my $spamc = PublicInbox::Spamcheck::Spamc->new; my $pi_config = PublicInbox::Config->new; -my $mime = Email::MIME->new(eval { local $/; <> }); +my $err; +my $mime = Email::MIME->new(eval { + local $/; + my $data = scalar <STDIN>; + $data =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + eval { + if ($train eq 'ham') { + $spamc->hamlearn(\$data); + } else { + $spamc->spamlearn(\$data); + } + die "spamc failed with: $?\n" if $?; + }; + $err = $@; + $data +}); # get all recipients my %dests; foreach my $h (qw(Cc To)) { - foreach my $recipient (Email::Address->parse($mime->header($h))) { - $dests{lc($recipient->address)} = 1; + my $val = $mime->header($h) or next; + foreach my $email (PublicInbox::Address::emails($val)) { + $dests{lc($email)} = 1; } } -my ($name, $email, $date); - -if ($train eq "ham") { - require PublicInbox::MDA; - require PublicInbox::Filter; - PublicInbox::Filter->run($mime); - ($name, $email, $date) = PublicInbox::MDA->author_info($mime); -} - -my $in = $mime->as_string; -my $err = 0; -my @output = qw(> /dev/null > /dev/null); +require PublicInbox::MDA if $train eq "ham"; # n.b. message may be cross-posted to multiple public-inboxes foreach my $recipient (keys %dests) { my $dst = $pi_config->lookup($recipient) or next; my $git_dir = $dst->{mainrepo} or next; - my ($out, $err) = ("", ""); - + my $git = PublicInbox::Git->new($git_dir); # We do not touch GIT_COMMITTER_* env here so we can track # who trained the message. - # We will not touch GIT_AUTHOR_* when learning spam messages, either + my $name = $ENV{GIT_COMMITTER_NAME} || $dst->{name}; + my $email = $ENV{GIT_COMMITTER_EMAIL} || $recipient; + my $im = PublicInbox::Import->new($git, $name, $email); + if ($train eq "spam") { # This needs to be idempotent, as my inotify trainer # may train for each cross-posted message, and this # script already learns for every list in # ~/.public-inbox/config - if (!run(["ssoma-rm", $git_dir], \$in, \$out, \$err)) { - if ($err !~ /^git cat-file .+ failed: 32768$/) { - $err = 1; - } - } + $im->remove($mime); } else { # $train eq "ham" # no checking for spam here, we assume the message has # been reviewed by a human at this point: PublicInbox::MDA->set_list_headers($mime, $dst); - my $s = $mime->as_string; - - local $ENV{GIT_AUTHOR_NAME} = $name; - local $ENV{GIT_AUTHOR_EMAIL} = $email; - local $ENV{GIT_AUTHOR_DATE} = $date; # Ham messages are trained when they're marked into # a SEEN state, so this is idempotent: - run([PublicInbox::MDA->cmd, $git_dir], \$s, \$out, \$err); - if ($err !~ /CONFLICT/) { - $err = 1; - } - } - if (!run([qw(spamc -L), $train], \$in, @output)) { - $err = 1; + $im->add($mime); } - - $err or eval { - require PublicInbox::SearchIdx; - PublicInbox::SearchIdx->new($git_dir, 2)->index_sync; - }; + $im->done; } -exit $err; +if ($err) { + warn $err; + exit 1; +} diff --git a/script/public-inbox-mda b/script/public-inbox-mda index 24feeb81..8b5258f5 100755 --- a/script/public-inbox-mda +++ b/script/public-inbox-mda @@ -6,106 +6,90 @@ use strict; use warnings; my $usage = 'public-inbox-mda < rfc2822_message'; +my ($ems, $emm); -use Email::Filter; +sub do_exit { + my ($code) = shift; + $emm = $ems = undef; # trigger DESTROY + exit $code; +} + +use Email::Simple; use Email::MIME; -use Email::Address; -use File::Path::Expand qw/expand_filename/; -use IPC::Run qw(run); +use Email::MIME::ContentType; +$Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect use PublicInbox::MDA; -use PublicInbox::Filter; use PublicInbox::Config; +use PublicInbox::Import; +use PublicInbox::Git; +use PublicInbox::Emergency; +use PublicInbox::Filter::Base; +use PublicInbox::Spamcheck::Spamc; # n.b: hopefully we can setup the emergency path without bailing due to # user error, we really want to setup the emergency destination ASAP # in case there's bugs in our code or user error. -my $emergency = $ENV{PI_EMERGENCY} || '~/.public-inbox/emergency/'; -$emergency = expand_filename($emergency); - -# this reads the message from stdin -my $filter = Email::Filter->new(emergency => $emergency); +my $emergency = $ENV{PI_EMERGENCY} || "$ENV{HOME}/.public-inbox/emergency/"; +$ems = PublicInbox::Emergency->new($emergency); +my $str = eval { local $/; <STDIN> }; +$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; +$ems->prepare(\$str); +my $simple = Email::Simple->new(\$str); my $config = PublicInbox::Config->new; my $recipient = $ENV{ORIGINAL_RECIPIENT}; defined $recipient or die "ORIGINAL_RECIPIENT not defined in ENV\n"; my $dst = $config->lookup($recipient); # first check -defined $dst or exit(1); -my $main_repo = $dst->{mainrepo} or exit(1); -my $filtered; # string dest - -if (PublicInbox::MDA->precheck($filter, $dst->{address}) && - do_spamc($filter->simple, \$filtered)) { - # update our message with SA headers (in case our filter rejects it) - my $msg = Email::MIME->new(\$filtered); - $filtered = undef; - $filter->simple($msg); - - my $filter_arg; - my $fcfg = $dst->{filter}; - if (!defined $fcfg || $filter eq 'reject') { - $filter_arg = $filter; - } elsif ($fcfg eq 'scrub') { - $filter_arg = undef; # the default for legacy versions - } else { - warn "publicinbox.$dst->{listname}.filter=$fcfg invalid\n"; - warn "must be either 'scrub' or 'reject' (the default)\n"; - } - - if (PublicInbox::Filter->run($msg, $filter_arg)) { - # run spamc again on the HTML-free message - if (do_spamc($msg, \$filtered)) { - $msg = Email::MIME->new(\$filtered); - PublicInbox::MDA->set_list_headers($msg, $dst); - $filter->simple($msg); - - my ($name, $email, $date) = - PublicInbox::MDA->author_info($msg); - - END { - index_sync($main_repo) if ($? == 0); - }; - - local $ENV{GIT_AUTHOR_NAME} = $name; - local $ENV{GIT_AUTHOR_EMAIL} = $email; - local $ENV{GIT_AUTHOR_DATE} = $date; - local $ENV{GIT_COMMITTER_EMAIL} = $recipient; - local $ENV{GIT_COMMITTER_NAME} = $dst->{listname}; - - $filter->pipe(PublicInbox::MDA->cmd, $main_repo); - } - } +defined $dst or do_exit(1); +my $main_repo = $dst->{mainrepo} or do_exit(1); + +# pre-check, MDA has stricter rules than an importer might; +do_exit(0) unless PublicInbox::MDA->precheck($simple, $dst->{address}); +my $spamc = PublicInbox::Spamcheck::Spamc->new; +$str = ''; +my $spam_ok = $spamc->spamcheck($ems->fh, \$str); +$simple = undef; +$emm = PublicInbox::Emergency->new($emergency); +$emm->prepare(\$str); +$ems = $ems->abort; +my $mime = Email::MIME->new(\$str); +$str = ''; +do_exit(0) unless $spam_ok; + +my $fcfg = $dst->{filter} || ''; +my $filter; +if ($fcfg =~ /::/) { + eval "require $fcfg"; + die $@ if $@; + $filter = $fcfg->new; +} elsif ($fcfg eq 'scrub') { # TODO: + require PublicInbox::Filter::Mirror; + $filter = PublicInbox::Filter::Mirror->new; } else { - # Ensure emergency spam gets spamassassin headers. - # This makes it easier to prioritize obvious spam from less obvious - if (defined($filtered) && $filtered ne '') { - my $drop = Email::MIME->new(\$filtered); - $filtered = undef; - $filter->simple($drop); - } + $filter = PublicInbox::Filter::Base->new; } -exit 0; # goes to emergency -# we depend on "report_safe 0" in /etc/spamassassin/*.cf with --headers -# not using Email::Filter->pipe here since we want the stdout of -# the command even on failure (spamc will set $? on error). -sub do_spamc { - my ($msg, $out) = @_; - eval { - my $orig = $msg->as_string; - run([qw/spamc -E --headers/], \$orig, $out); - }; - - return ($@ || $? || !defined($$out) || $$out eq '') ? 0 : 1; +my $ret = $filter->delivery($mime); +if (ref($ret) && $ret->isa('Email::MIME')) { # filter altered message + $mime = $ret; +} elsif ($ret == PublicInbox::Filter::Base::IGNORE) { + do_exit(0); # chuck it to emergency +} elsif ($ret == PublicInbox::Filter::Base::REJECT) { + $! = $ret; + die $filter->err, "\n"; +} # else { accept + +PublicInbox::MDA->set_list_headers($mime, $dst); +my $git = PublicInbox::Git->new($main_repo); +my $im = PublicInbox::Import->new($git, $dst->{name}, $recipient); +if (defined $im->add($mime)) { + $emm = $emm->abort; +} else { + # this message is similar to what ssoma-mda shows: + print STDERR "CONFLICT: Message-ID: ", + $mime->header_obj->header_raw('Message-ID'), + " exists\n"; } -sub index_sync { - my ($git_dir) = @_; - - # potentially user-visible, ignore errors: - system('git', "--git-dir=$git_dir", 'update-server-info'); - - eval { - require PublicInbox::SearchIdx; - PublicInbox::SearchIdx->new($git_dir, 2)->index_sync; - }; -} +$im->done; +do_exit(0); diff --git a/script/public-inbox-nntpd b/script/public-inbox-nntpd index cea88163..be860a54 100755 --- a/script/public-inbox-nntpd +++ b/script/public-inbox-nntpd @@ -6,64 +6,9 @@ use strict; use warnings; require PublicInbox::Daemon; -require PublicInbox::NewsGroup; require PublicInbox::NNTP; -require PublicInbox::Config; +require PublicInbox::NNTPD; my $nntpd = PublicInbox::NNTPD->new; PublicInbox::Daemon::run('0.0.0.0:119', sub { $nntpd->refresh_groups }, # refresh sub ($$$) { PublicInbox::NNTP->new($_[0], $nntpd) }); # post_accept - -1; -package PublicInbox::NNTPD; -use strict; -use warnings; - -sub new { - my ($class) = @_; - bless { - groups => {}, - err => \*STDERR, - out => \*STDOUT, - grouplist => [], - }, $class; -} - -sub refresh_groups () { - my ($self) = @_; - my $pi_config = PublicInbox::Config->new; - my $new = {}; - my @list; - foreach my $k (keys %$pi_config) { - $k =~ /\Apublicinbox\.([^\.]+)\.mainrepo\z/ or next; - my $g = $1; - my $git_dir = $pi_config->{$k}; - my $addr = $pi_config->{"publicinbox.$g.address"}; - my $ngname = $pi_config->{"publicinbox.$g.newsgroup"}; - if (defined $ngname) { - next if ($ngname eq ''); # disabled - $g = $ngname; - } - my $ng = PublicInbox::NewsGroup->new($g, $git_dir, $addr); - my $old_ng = $self->{groups}->{$g}; - - # Reuse the old one if possible since it can hold - # references to valid mm and gcf objects - if ($old_ng) { - $old_ng->update($ng); - $ng = $old_ng; - } - - # Only valid if msgmap and search works - if ($ng->usable) { - $new->{$g} = $ng; - push @list, $ng; - } - } - @list = sort { $a->{name} cmp $b->{name} } @list; - $self->{grouplist} = \@list; - # this will destroy old groups that got deleted - %{$self->{groups}} = %$new; -} - -1; diff --git a/script/public-inbox-watch b/script/public-inbox-watch new file mode 100755 index 00000000..bb655929 --- /dev/null +++ b/script/public-inbox-watch @@ -0,0 +1,21 @@ +#!/usr/bin/perl -w +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use PublicInbox::WatchMaildir; +use PublicInbox::Config; +my ($config, $watch_md); +my $reload = sub { + $config = PublicInbox::Config->new; + $watch_md = PublicInbox::WatchMaildir->new($config); +}; +$reload->(); +if ($watch_md) { + my $scan = sub { $watch_md->scan if $watch_md }; + $SIG{HUP} = $reload; + $SIG{USR1} = $scan; + $SIG{ALRM} = sub { $SIG{ALRM} = 'DEFAULT'; $scan->() }; + alarm(1); + $watch_md->watch; +} diff --git a/script/public-inbox.cgi b/script/public-inbox.cgi index 5b2aefe2..2b7f737d 100755 --- a/script/public-inbox.cgi +++ b/script/public-inbox.cgi @@ -20,7 +20,7 @@ my $app = builder { # Enable to ensure redirects and Atom feed URLs are generated # properly when running behind a reverse proxy server which - # sets X-Forwarded-For and X-Forwarded-Proto request headers. + # sets the X-Forwarded-Proto request header. # See Plack::Middleware::ReverseProxy documentation for details # enable 'ReverseProxy'; diff --git a/scripts/dc-dlvr b/scripts/dc-dlvr index ca64505c..81193b23 100755 --- a/scripts/dc-dlvr +++ b/scripts/dc-dlvr @@ -1,15 +1,10 @@ #!/bin/sh -# Copyright (C) 2008-2013, Eric Wong <e@80x24.org> -# License: GPLv3 or later <http://www.gnu.org/licenses/gpl-3.0.txt> +# Copyright (C) 2008-2016 all contributors <meta@public-inbox.org> +# License: GPL-3.0+ <http://www.gnu.org/licenses/gpl-3.0.txt> # This is installed as /etc/dc-dcvr on my system # to use with postfix main.cf: mailbox_command = /etc/dc-dlvr "$EXTENSION" DELIVER=/usr/lib/dovecot/deliver - -# my personal preference is to use a catchall account to avoid generating -# backscatter, as invalid emails are usually spam -case $USER in -catchall) exec $DELIVER ;; -esac +CLAMDSCAN=clamdscan # change if your spamc/spamd listens elsewhere spamc='spamc' @@ -23,19 +18,28 @@ trainham,) exec $spamc -L ham > /dev/null 2>&1 ;; esac TMPMSG=$(mktemp -t dc-dlvr.orig.$USER.XXXXXX || exit 1) -rm_list=$TMPMSG +CDMSG=$(mktemp -t dc-dlvr.orig.$USER.XXXXXX || exit 1) +rm_list="$TMPMSG $CDMSG" + +cat >$CDMSG +$CLAMDSCAN --quiet - <$CDMSG +if test $? -eq 1 +then + $DELIVER -m INBOX.spam <$CDMSG + exec rm -f $rm_list +fi # pre-filter, for infrequently read lists which do their own spam filtering: if test -r ~/.dc-dlvr.pre then set -e - cat > $TMPMSG + mv -f $CDMSG $TMPMSG DEFAULT_INBOX=$(. ~/.dc-dlvr.pre) case $DEFAULT_INBOX in '') exec rm -f $rm_list ;; INBOX) ;; # do nothing *) - $DELIVER -m $DEFAULT_INBOX < $TMPMSG + $DELIVER -m $DEFAULT_INBOX <$TMPMSG exec rm -f $rm_list ;; esac @@ -43,9 +47,9 @@ then rm_list="$rm_list $PREMSG" set +e mv -f $TMPMSG $PREMSG - $spamc -E --headers < $PREMSG > $TMPMSG + $spamc -E --headers <$PREMSG >$TMPMSG else - $spamc -E --headers > $TMPMSG + $spamc -E --headers <$CDMSG >$TMPMSG fi err=$? @@ -53,14 +57,14 @@ err=$? set -e case $err in -1) $DELIVER -m INBOX.spam < $TMPMSG ;; +1) $DELIVER -m INBOX.spam <$TMPMSG ;; *) # users may override normal delivery and have it go elsewhere if test -r ~/.dc-dlvr.rc then . ~/.dc-dlvr.rc else - $DELIVER -m INBOX < $TMPMSG + $DELIVER -m INBOX <$TMPMSG fi ;; esac diff --git a/scripts/dc-dlvr.pre b/scripts/dc-dlvr.pre index c10e80c7..d7bc1b5c 100644 --- a/scripts/dc-dlvr.pre +++ b/scripts/dc-dlvr.pre @@ -5,7 +5,7 @@ export PATH=/usr/local/bin:/usr/bin:/bin trap 'err=$?; set +e; test $err -eq 0 || rm -f $TMPMSG; exit $err' EXIT case $1,$CLIENT_ADDRESS in -pispam,) exec public-inbox-learn spam < $TMPMSG ;; -piham,) exec public-inbox-learn ham < $TMPMSG ;; +pispam,) exec public-inbox-learn spam <$TMPMSG ;; +piham,) exec public-inbox-learn ham <$TMPMSG ;; esac -exec public-inbox-mda < $TMPMSG +exec public-inbox-mda <$TMPMSG diff --git a/scripts/import_maildir b/scripts/import_maildir index aaabe80d..c87ca1b2 100755 --- a/scripts/import_maildir +++ b/scripts/import_maildir @@ -9,8 +9,8 @@ export ORIGINAL_RECIPIENT='list@example.com' git init --bare $MAINREPO export GIT_CONFIG=$HOME/.public-inbox/config - git config publicinbox.$LISTNAME.address $ORIGINAL_RECIPIENT - git config publicinbox.$LISTNAME.mainrepo $MAINREPO + git config publicinbox.$INBOX.address $ORIGINAL_RECIPIENT + git config publicinbox.$INBOX.mainrepo $MAINREPO unset GIT_CONFIG ./import_maildir /path/to/maildir/ =cut diff --git a/scripts/import_slrnspool b/scripts/import_slrnspool index f8271f58..98850591 100755 --- a/scripts/import_slrnspool +++ b/scripts/import_slrnspool @@ -1,18 +1,19 @@ #!/usr/bin/perl -w -# Copyright (C) 2015 all contributors <meta@public-inbox.org> -# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) +# Copyright (C) 2015-2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Incremental (or one-shot) importer of a slrnpull news spool =begin usage export ORIGINAL_RECIPIENT=address@example.com - public-inbox-init $LISTNAME $GIT_DIR $HTTP_URL $ORIGINAL_RECIPIENT + public-inbox-init $INBOX $GIT_DIR $HTTP_URL $ORIGINAL_RECIPIENT ./import_slrnspool SLRNPULL_ROOT/news/foo/bar =cut use strict; use warnings; use PublicInbox::Config; -use Email::Filter; -use Email::LocalDelivery; +use Email::MIME; +use PublicInbox::Import; +use PublicInbox::Git; sub usage { "Usage:\n".join('',grep(/\t/, `head -n 10 $0`)) } my $exit = 0; my $sighandler = sub { $exit = 1 }; @@ -22,43 +23,33 @@ my $spool = shift @ARGV or die usage(); my $recipient = $ENV{ORIGINAL_RECIPIENT}; defined $recipient or die usage(); my $config = PublicInbox::Config->new; -my $cfg = $config->lookup($recipient); -defined $cfg or exit(1); -my @mda; -if ($ENV{'FILTER'}) { - @mda = qw(public-inbox-mda); -} else { - @mda = (qw(ssoma-mda -1), $cfg->{mainrepo}); -} +my $ibx = $config->lookup($recipient); +my $git = $ibx->git; +my $im = PublicInbox::Import->new($git, $ibx->{name}, $ibx->{-primary_address}); sub key { - my ($cfg) = @_; - "publicinbox.$cfg->{listname}.importslrnspoolstate"; + "publicinbox.$ibx->{name}.importslrnspoolstate"; } sub get_min { my $f = PublicInbox::Config->default_file; - my @cmd = (qw/git config/, "--file=$f", key($cfg)); - use IPC::Run qw/run/; - - my $in = ''; - my $out = ''; - unless (run(\@cmd, \$in, \$out)) { - $out = 0; - } - int($out); + my $out = $git->qx('config', "--file=$f", key($ibx)); + $out ||= 0; + chomp $out; + $out =~ /\A\d+\z/ and return $out; + 0; } sub set_min { - my ($cfg, $num) = @_; + my ($num) = @_; my $f = PublicInbox::Config->default_file; - my @cmd = (qw/git config/, "--file=$f", key($cfg), $num); + my @cmd = (qw/git config/, "--file=$f", key($ibx), $num); system(@cmd) == 0 or die join(' ', @cmd). " failed: $?\n"; } my $n = get_min(); my $ok; -my $max_gap = 10000; +my $max_gap = 200000; my $max = $n + $max_gap; for (; $exit == 0 && $n < $max; $n++) { @@ -67,40 +58,31 @@ for (; $exit == 0 && $n < $max; $n++) { open(my $fh, '<', $fn) or next; $max = $n + $max_gap; - # prevent process growth by forking a new process for each message - my $pid = fork; - die "failed to fork: $!\n" unless defined $pid; - - if ($pid == 0) { - my $f = Email::Filter->new(data => eval { local $/; <$fh> }); - close $fh; - $fh = undef; - my $s = $f->simple; + my $mime = Email::MIME->new(eval { local $/; <$fh> }); + my $hdr = $mime->header_obj; - # gmane rewrites Received headers, which increases spamminess - # Some older archives set Original-To - foreach my $x (qw(Received To)) { - my @h = $s->header("Original-$x"); - if (@h) { - $s->header_set($x, @h); - $s->header_set("Original-$x"); - } + # gmane rewrites Received headers, which increases spamminess + # Some older archives set Original-To + foreach my $x (qw(Received To)) { + my @h = $hdr->header_raw("Original-$x"); + if (@h) { + $hdr->header_set($x, @h); + $hdr->header_set("Original-$x"); } + } - # triggers for the SA HEADER_SPAM rule - foreach my $drop (qw(Approved)) { $s->header_set($drop) } + # Approved triggers for the SA HEADER_SPAM rule, + # X-From is gmane specific + foreach my $drop (qw(Approved X-From)) { + $hdr->header_set($drop); + } - # appears to be an old gmane bug: - $s->header_set('connect()'); + # appears to be an old gmane bug: + $hdr->header_set('connect()'); + $im->add($mime); - $f->exit(0); - $f->pipe(@mda); - exit 0; - } else { - close $fh; - waitpid($pid, 0); - die "error: $?\n" if $?; - } $ok = $n + 1; - set_min($cfg, $ok); + set_min($ok); } + +$im->done; diff --git a/scripts/import_vger_from_mbox b/scripts/import_vger_from_mbox new file mode 100644 index 00000000..4976e056 --- /dev/null +++ b/scripts/import_vger_from_mbox @@ -0,0 +1,47 @@ +#!/usr/bin/perl -w +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Email::MIME; +$Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect +use PublicInbox::Git; +use PublicInbox::Import; +my $usage = "usage: $0 NAME EMAIL <MBOX\n"; +chomp(my $git_dir = `git rev-parse --git-dir`); +my $git = PublicInbox::Git->new($git_dir); +my $name = shift or die $usage; # git +my $email = shift or die $usage; # git@vger.kernel.org +my $im = PublicInbox::Import->new($git, $name, $email); +binmode STDIN; +my $msg = ''; +use PublicInbox::Filter::Vger; +my $vger = PublicInbox::Filter::Vger->new; +sub do_add ($$) { + my ($im, $msg) = @_; + $$msg =~ s/(\r?\n)+\z/$1/s; + $msg = Email::MIME->new($$msg); + $msg = $vger->scrub($msg); + $im->add($msg) or + warn "duplicate: ", + $msg->header_obj->header_raw('Message-ID'), "\n"; +} + +# asctime: From example@example.com Fri Jun 23 02:56:55 2000 +my $from_strict = qr/^From \S+ \S+ \S+ +\S+ [^:]+:[^:]+:[^:]+ [^:]+/; +my $prev = undef; +while (defined(my $l = <STDIN>)) { + if ($l =~ /$from_strict/o) { + if (!defined($prev) || $prev =~ /^\r?$/) { + do_add($im, \$msg) if $msg; + $msg = ''; + $prev = $l; + next; + } + warn "W[$.] $l\n"; + } + $prev = $l; + $msg .= $l; +} +do_add($im, \$msg) if $msg; +$im->done; diff --git a/scripts/report-spam b/scripts/report-spam index 0015ef0b..325f5718 100755 --- a/scripts/report-spam +++ b/scripts/report-spam @@ -32,14 +32,14 @@ PI_USER=pi case $1 in *[/.]spam/cur/*) # non-new messages in spam get trained - $DO_SENDMAIL $PI_USER+pispam < $1 - exec $DO_SENDMAIL $USER+trainspam < $1 + $DO_SENDMAIL $PI_USER+pispam <$1 + exec $DO_SENDMAIL $USER+trainspam <$1 ;; *:2,*S*) # otherwise, seen messages only case $1 in *:2,*T*) exit 0 ;; # ignore trashed messages esac - $DO_SENDMAIL $PI_USER+piham < $1 - exec $DO_SENDMAIL $USER+trainham < $1 + $DO_SENDMAIL $PI_USER+piham <$1 + exec $DO_SENDMAIL $USER+trainham <$1 ;; esac diff --git a/scripts/ssoma-replay b/scripts/ssoma-replay new file mode 100755 index 00000000..3c3fdf48 --- /dev/null +++ b/scripts/ssoma-replay @@ -0,0 +1,96 @@ +#!/usr/bin/perl -w +# Copyright (C) 2015-2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# A work-in-progress, but one day I hope this script is no longer +# necessary and users will all pull from public-inboxes instead +# of having mail pushed to them via mlmmj. +# +# This is for use with ssoma, using "command:" delivery mechanism +# (as opposed to normal Maildir or mbox). +# It assumes mlmmj-process is in /usr/bin (mlmmj requires absolute paths) +# and assumes FOO@domain.example.com has web archives available at: +# https://domain.example.com/FOO/ +# +# The goal here is _anybody_ can setup a mirror of any public-inbox +# repository and run their own mlmmj instance to replay traffic. +=begin usage with ssoma: + +NAME=meta +URL=https://public-inbox.org/meta/ +ssoma add $NAME $URL "command:/path/to/ssoma-replay -L /path/to/spool/$NAME" + +; $GIT_DIR/ssoma.state should have something like the following target: +; (where GIT_DIR is ~/.ssoma/meta.git/ in the above example) +[target "local"] + command = /path/to/ssoma-replay -L /path/to/spool/meta +=cut +use strict; +use Email::Simple; +use URI::Escape qw/uri_escape_utf8/; +use File::Temp qw/tempfile/; +my ($fh, $filename) = tempfile('ssoma-replay-XXXXXXXX', TMPDIR => 1); +my $msg = eval { + local $/; + Email::Simple->new(<STDIN>); +}; +select $fh; + +# Note: the archive URL makes assumptions about where the +# archive is hosted. It is currently true of all the domains +# hosted by me. + +my $header_obj = $msg->header_obj; +my $body = $msg->body; +my $list_id = $header_obj->header('List-Id'); +my ($archive_url, $user, $domain); +if (defined $list_id) { + ($user, $domain) = ($list_id =~ /<(.+)\@(.+)>/g); + + if (defined $domain) { + $archive_url = "https://$domain/$user/"; + my $mid = $header_obj->header('Message-Id'); + if ($mid =~ /\A<(.+)>\z/) { + $mid = $1; + } + $mid = uri_escape_utf8($mid, + '^A-Za-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@'); + $header_obj->header_set('List-Archive', "<$archive_url>"); + + foreach my $h (qw(Help Unsubscribe Subscribe Owner)) { + my $lch = lc $h; + my $v = "<mailto:$user+$lch\@$domain>"; + $header_obj->header_set("List-$h", $v); + } + $header_obj->header_set('List-Post', "<mailto:$user\@$domain>"); + + # RFC 5064 + $header_obj->header_set('Archived-At', "<$archive_url$mid/>"); + $header_obj->header_set('X-Archived-At'); + } +} + +print $header_obj->as_string, $msg->crlf, $body; + +# don't break inline signatures +goto out if ($body =~ /^-----BEGIN PGP SIG.+-----/sm); + +# try not to break dkim/dmarc/spf crap, either +foreach (qw(domainkey-signature dkim-signature authentication-results)) { + goto out if defined $header_obj->header($_); +} + +my $ct = $header_obj->header('Content-Type'); + +if (!defined($ct) || $ct =~ m{\A\s*text/plain\b}i) { + print "\n" unless $body =~ /\n\z/s; + defined $archive_url or goto out; + # Do not add a space after '--' as is standard for user-generated + # signatures, we want to preserve the "-- \n" in original user sigs + # for mail software which splits on that. + print "--\n", "unsubscribe: $user+unsubscribe\@$domain\n", + "archive: $archive_url\n"; +} +out: +$| = 1; +exec '/usr/bin/mlmmj-process', @ARGV, '-m', $filename; diff --git a/scripts/xhdr-num2mid b/scripts/xhdr-num2mid new file mode 100755 index 00000000..bc3ede60 --- /dev/null +++ b/scripts/xhdr-num2mid @@ -0,0 +1,53 @@ +#!/usr/bin/perl -w +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# Useful for mapping article IDs from existing NNTP servers to MIDs +use strict; +use warnings; +use Net::NNTP; +use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); +my $usage = "usage: NNTPSERVER=news.example.org $0 GROUP [FIRST_NUM]\n"; +my ($msgmap, $mm); +my %opts = ( '--msgmap=s' => \$msgmap ); +GetOptions(%opts) or die "bad command-line args\n$usage"; + +if ($msgmap) { + require PublicInbox::Msgmap; + require PublicInbox::MID; # mid_clean + $mm = PublicInbox::Msgmap->new_file($msgmap, 1); +} + +my $group = shift or die $usage; +my $nntp = Net::NNTP->new($ENV{NNTPSERVER} || '127.0.0.1'); +my ($num, $first, $last) = $nntp->group($group); +die "Invalid group\n" if !(defined $num && defined $first && defined $last); +my $arg_first = shift; +if (defined $arg_first) { + $arg_first =~ /\A\d+\z/ or die $usage; + $first = $arg_first; +} elsif ($mm) { + my $last_article = $mm->meta_accessor('last_article'); + $first = $last_article + 1 if defined $last_article; +} + +my $batch = 1000; +my $i; +for ($i = $first; $i < $last; $i += $batch) { + my $j = $i + $batch - 1; + $j = $last if $j > $last; + my $num2mid = $nntp->xhdr('Message-ID', "$i-$j"); + + $mm->{dbh}->begin_work if $mm; + for my $n ($i..$j) { + defined(my $mid = $num2mid->{$n}) or next; + print "$n $mid\n"; + if ($mm) { + $mid = PublicInbox::MID::mid_clean($mid); + $mm->mid_set($n, $mid); + } + } + if ($mm) { + $mm->meta_accessor('last_article', $j); + $mm->{dbh}->commit; + } +} diff --git a/t/address.t b/t/address.t new file mode 100644 index 00000000..be0fc5b7 --- /dev/null +++ b/t/address.t @@ -0,0 +1,36 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use_ok 'PublicInbox::Address'; + +is_deeply([qw(e@example.com e@example.org)], + [PublicInbox::Address::emails('User <e@example.com>, e@example.org')], + 'address extraction works as expected'); + +is_deeply([PublicInbox::Address::emails('"ex@example.com" <ex@example.com>')], + [qw(ex@example.com)]); + +my @names = PublicInbox::Address::names( + 'User <e@e>, e@e, "John A. Doe" <j@d>, <x@x>'); +is_deeply(['User', 'e', 'John A. Doe', 'x'], \@names, + 'name extraction works as expected'); + +@names = PublicInbox::Address::names('"user@example.com" <user@example.com>'); +is_deeply(['user'], \@names, 'address-as-name extraction works as expected'); + + +{ + my $backwards = 'u@example.com (John Q. Public)'; + @names = PublicInbox::Address::names($backwards); + is_deeply(\@names, ['u'], 'backwards name OK'); + my @emails = PublicInbox::Address::emails($backwards); + is_deeply(\@emails, ['u@example.com'], 'backwards emails OK'); +} + + +@names = PublicInbox::Address::names('"Quote Unneeded" <user@example.com>'); +is_deeply(['Quote Unneeded'], \@names, 'extra quotes dropped'); + +done_testing; diff --git a/t/altid.t b/t/altid.t new file mode 100644 index 00000000..887d548f --- /dev/null +++ b/t/altid.t @@ -0,0 +1,61 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use File::Temp qw/tempdir/; +foreach my $mod (qw(DBD::SQLite Search::Xapian)) { + eval "require $mod"; + plan skip_all => "$mod missing for altid.t" if $@; +} + +use_ok 'PublicInbox::Msgmap'; +use_ok 'PublicInbox::SearchIdx'; +use_ok 'PublicInbox::Import'; +use_ok 'PublicInbox::Inbox'; +my $tmpdir = tempdir('pi-altid-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $git_dir = "$tmpdir/a.git"; +my $alt_file = "$tmpdir/another-nntp.sqlite3"; +my $altid = [ "serial:gmane:file=$alt_file" ]; + +{ + my $mm = PublicInbox::Msgmap->new_file($alt_file, 1); + $mm->mid_set(1234, 'a@example.com'); +} + +{ + is(system(qw(git init -q --bare), $git_dir), 0, 'git init ok'); + my $git = PublicInbox::Git->new($git_dir); + my $im = PublicInbox::Import->new($git, 'testbox', 'test@example'); + $im->add(Email::MIME->create( + header => [ + From => 'a@example.com', + To => 'b@example.com', + 'Content-Type' => 'text/plain', + Subject => 'boo!', + 'Message-ID' => '<a@example.com>', + ], + body => "hello world gmane:666\n", + )); + $im->done; +} +{ + my $inbox = PublicInbox::Inbox->new({mainrepo=>$git_dir}); + $inbox->{altid} = $altid; + my $rw = PublicInbox::SearchIdx->new($inbox, 1); + $rw->index_sync; +} + +{ + my $ro = PublicInbox::Search->new($git_dir, $altid); + my $res = $ro->query("gmane:1234"); + is($res->{total}, 1, 'got one match'); + is($res->{msgs}->[0]->mid, 'a@example.com'); + + $res = $ro->query("gmane:666"); + is($res->{total}, 0, 'body did NOT match'); +}; + +done_testing(); + +1; @@ -1,30 +1,27 @@ # Copyright (C) 2014-2015 all contributors <meta@public-inbox.org> # License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) +# FIXME: this test is too slow and most non-CGI-requirements +# should be moved over to things which use test_psgi use strict; use warnings; use Test::More; use Email::MIME; use File::Temp qw/tempdir/; use Cwd; -use IPC::Run qw/run/; +eval { require IPC::Run }; +plan skip_all => "missing IPC::Run for t/cgi.t" if $@; use constant CGI => "blib/script/public-inbox.cgi"; -my $mda = "blib/script/public-inbox-mda"; my $index = "blib/script/public-inbox-index"; my $tmpdir = tempdir('pi-cgi-XXXXXX', TMPDIR => 1, CLEANUP => 1); my $home = "$tmpdir/pi-home"; my $pi_home = "$home/.public-inbox"; my $pi_config = "$pi_home/config"; my $maindir = "$tmpdir/main.git"; -my $main_bin = getcwd()."/t/main-bin"; -my $main_path = "$main_bin:$ENV{PATH}"; # for spamc ham mock my $addr = 'test-public@example.com'; my $cfgpfx = "publicinbox.test"; { - ok(-x "$main_bin/spamc", - "spamc ham mock found (run in top of source tree"); - ok(-x $mda, "$mda is executable"); is(1, mkdir($home, 0755), "setup ~/ for testing"); is(1, mkdir($pi_home, 0755), "setup ~/.public-inbox"); is(0, system(qw(git init -q --bare), $maindir), "git init (main)"); @@ -42,15 +39,18 @@ my $cfgpfx = "publicinbox.test"; } } -my $failbox = "$home/fail.mbox"; -local $ENV{PI_EMERGENCY} = $failbox; +use_ok 'PublicInbox::Git'; +use_ok 'PublicInbox::Import'; +use_ok 'Email::MIME'; +my $git = PublicInbox::Git->new($maindir); +my $im = PublicInbox::Import->new($git, 'test', $addr); + { local $ENV{HOME} = $home; - local $ENV{ORIGINAL_RECIPIENT} = $addr; # ensure successful message delivery { - my $simple = Email::Simple->new(<<EOF); + my $mime = Email::MIME->new(<<EOF); From: Me <me\@example.com> To: You <you\@example.com> Cc: $addr @@ -60,16 +60,15 @@ Date: Thu, 01 Jan 1970 00:00:00 +0000 zzzzzz EOF - my $in = $simple->as_string; - run_with_env({PATH => $main_path}, [$mda], \$in); - local $ENV{GIT_DIR} = $maindir; - my $rev = `git rev-list HEAD`; + $im->add($mime); + $im->done; + my $rev = `git --git-dir=$maindir rev-list HEAD`; like($rev, qr/\A[a-f0-9]{40}/, "good revision committed"); } # deliver a reply, too { - my $reply = Email::Simple->new(<<EOF); + my $reply = Email::MIME->new(<<EOF); From: You <you\@example.com> To: Me <me\@example.com> Cc: $addr @@ -83,10 +82,9 @@ Me wrote: what? EOF - my $in = $reply->as_string; - run_with_env({PATH => $main_path}, [$mda], \$in); - local $ENV{GIT_DIR} = $maindir; - my $rev = `git rev-list HEAD`; + $im->add($reply); + $im->done; + my $rev = `git --git-dir=$maindir rev-list HEAD`; like($rev, qr/\A[a-f0-9]{40}/, "good revision committed"); } @@ -120,7 +118,7 @@ EOF like($res->{head}, qr/Status:\s*206/i, "info/refs partial past end OK"); is($res->{body}, substr($orig, 5), 'partial body OK past end'); } - +use Data::Dumper; # atom feeds { local $ENV{HOME} = $home; @@ -128,31 +126,16 @@ EOF like($res->{body}, qr/<title>test for public-inbox/, "set title in XML feed"); like($res->{body}, - qr!http://test\.example\.com/test/blah%40example\.com/!, + qr!http://test\.example\.com/test/blah\@example\.com/!, "link id set"); like($res->{body}, qr/what\?/, "reply included"); } -# indices -{ - local $ENV{HOME} = $home; - my $res = cgi_run("/test/"); - like($res->{head}, qr/Status: 200 OK/, "index returns 200"); - - my $idx = cgi_run("/test/index.html"); - $idx->{body} =~ s!/index.html(\?r=)!/$1!g; # dirty... - $idx->{body} = [ split(/\n/, $idx->{body}) ]; - $res->{body} = [ split(/\n/, $res->{body}) ]; - is_deeply($res, $idx, - '/$LISTNAME/ and /$LISTNAME/index.html are nearly identical'); - # more checks in t/feed.t -} - # message-id pages { local $ENV{HOME} = $home; my $slashy_mid = 'slashy/asdf@example.com'; - my $reply = Email::Simple->new(<<EOF); + my $reply = Email::MIME->new(<<EOF); From: You <you\@example.com> To: Me <me\@example.com> Cc: $addr @@ -162,16 +145,10 @@ Date: Thu, 01 Jan 1970 00:00:01 +0000 slashy EOF - my $in = $reply->as_string; - - { - local $ENV{HOME} = $home; - local $ENV{ORIGINAL_RECIPIENT} = $addr; - run_with_env({PATH => $main_path}, [$mda], \$in); - } - local $ENV{GIT_DIR} = $maindir; + $im->add($reply); + $im->done; - my $res = cgi_run("/test/slashy%2fasdf%40example.com/raw"); + my $res = cgi_run("/test/slashy%2fasdf\@example.com/raw"); like($res->{body}, qr/Message-Id: <\Q$slashy_mid\E>/, "slashy mid raw hit"); @@ -188,21 +165,22 @@ EOF like($res->{head}, qr/Status: 300 Multiple Choices/, "mid html miss"); $res = cgi_run("/test/blahblah\@example.com/f/"); - like($res->{body}, qr/\A<html>/, "mid html"); - like($res->{head}, qr/Status: 200 OK/, "200 response"); - $res = cgi_run("/test/blahblah\@example.con/f/"); + like($res->{head}, qr/Status: 301 Moved/, "301 response"); + like($res->{head}, + qr!^Location: http://[^/]+/test/blahblah\@example\.com/\r\n!ms, + '301 redirect location'); + $res = cgi_run("/test/blahblah\@example.con/"); like($res->{head}, qr/Status: 300 Multiple Choices/, "mid html miss"); - $res = cgi_run("/test/"); - like($res->{body}, qr/slashy%2Fasdf%40example\.com/, + $res = cgi_run("/test/new.html"); + like($res->{body}, qr/slashy%2Fasdf\@example\.com/, "slashy URL generated correctly"); } # retrieve thread as an mbox { local $ENV{HOME} = $home; - local $ENV{PATH} = $main_path; - my $path = "/test/blahblah%40example.com/t.mbox.gz"; + my $path = "/test/blahblah\@example.com/t.mbox.gz"; my $res = cgi_run($path); like($res->{head}, qr/^Status: 501 /, "search not-yet-enabled"); my $indexed = system($index, $maindir) == 0; @@ -222,7 +200,7 @@ EOF my $have_xml_feed = eval { require XML::Feed; 1 } if $indexed; if ($have_xml_feed) { - $path = "/test/blahblah%40example.com/t.atom"; + $path = "/test/blahblah\@example.com/t.atom"; $res = cgi_run($path); like($res->{head}, qr/^Status: 200 /, "atom returned 200"); like($res->{head}, qr!^Content-Type: application/atom\+xml!m, @@ -246,7 +224,7 @@ done_testing(); sub run_with_env { my ($env, @args) = @_; my $init = sub { foreach my $k (keys %$env) { $ENV{$k} = $env->{$k} } }; - run(@args, init => $init); + IPC::Run::run(@args, init => $init); } sub cgi_run { diff --git a/t/check-www-inbox.perl b/t/check-www-inbox.perl new file mode 100644 index 00000000..4319049c --- /dev/null +++ b/t/check-www-inbox.perl @@ -0,0 +1,157 @@ +#!/usr/bin/perl -w +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# Parallel WWW checker +my $usage = "$0 [-j JOBS] [-s SLOW_THRESHOLD] URL_OF_INBOX\n"; +use strict; +use warnings; +use File::Temp qw(tempfile); +use GDBM_File; +use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); +use IO::Socket; +use LWP::ConnCache; +use POSIX qw(:sys_wait_h); +use Time::HiRes qw(gettimeofday tv_interval); +use WWW::Mechanize; +use Data::Dumper; +my $nproc = 4; +my $slow = 0.5; +my %opts = ( + '-j|jobs=i' => \$nproc, + '-s|slow-threshold=f' => \$slow, +); +GetOptions(%opts) or die "bad command-line args\n$usage"; +my $root_url = shift or die $usage; + +my %workers; +$SIG{TERM} = sub { exit 0 }; +$SIG{CHLD} = sub { + while (1) { + my $pid = waitpid(-1, WNOHANG); + return if !defined $pid || $pid <= 0; + my $p = delete $workers{$pid} || '(unknown)'; + warn("$pid [$p] exited with $?\n") if $?; + } +}; + +my @todo = IO::Socket->socketpair(AF_UNIX, SOCK_SEQPACKET, 0); +die "socketpair failed: $!" unless $todo[1]; +my @done = IO::Socket->socketpair(AF_UNIX, SOCK_SEQPACKET, 0); +die "socketpair failed: $!" unless $done[1]; +$| = 1; + +foreach my $p (1..$nproc) { + my $pid = fork; + die "fork failed: $!\n" unless defined $pid; + if ($pid) { + $workers{$pid} = $p; + } else { + $todo[1]->close; + $done[0]->close; + worker_loop($todo[0], $done[1]); + } +} + +my ($fh, $tmp) = tempfile('www-check-XXXXXXXX', + SUFFIX => '.gdbm', UNLINK => 1, TMPDIR => 1); +my $gdbm = tie my %seen, 'GDBM_File', $tmp, &GDBM_WRCREAT, 0600; +defined $gdbm or die "gdbm open failed: $!\n"; +$todo[0]->close; +$done[1]->close; + +my ($rvec, $wvec); +$todo[1]->blocking(0); +$done[0]->blocking(0); +$seen{$root_url} = 1; +my $ndone = 0; +my $nsent = 1; +my @queue = ($root_url); +my $timeout = $slow * 4; +while (keys %workers) { # reacts to SIGCHLD + $wvec = $rvec = ''; + my $u; + vec($rvec, fileno($done[0]), 1) = 1; + if (@queue) { + vec($wvec, fileno($todo[1]), 1) = 1; + } elsif ($ndone == $nsent) { + kill 'TERM', keys %workers; + exit; + } + if (!select($rvec, $wvec, undef, $timeout)) { + while (my ($k, $v) = each %seen) { + next if $v == 2; + print "WAIT ($ndone/$nsent) <$k>\n"; + } + } + while ($u = shift @queue) { + my $s = $todo[1]->send($u, MSG_EOR); + if ($!{EAGAIN}) { + unshift @queue, $u; + last; + } + } + my $r; + do { + $r = $done[0]->recv($u, 65535, 0); + } while (!defined $r && $!{EINTR}); + next unless $u; + if ($u =~ s/\ADONE\t//) { + $ndone++; + $seen{$u} = 2; + } else { + next if $seen{$u}; + $seen{$u} = 1; + $nsent++; + push @queue, $u; + } +} + +sub worker_loop { + my ($todo_rd, $done_wr) = @_; + my $m = WWW::Mechanize->new(autocheck => 0); + my $cc = LWP::ConnCache->new; + $m->conn_cache($cc); + while (1) { + $todo_rd->recv(my $u, 65535, 0); + next unless $u; + + my $t = [ gettimeofday ]; + my $r = $m->get($u); + $t = tv_interval($t); + printf "SLOW %0.06f % 5d %s\n", $t, $$, $u if $t > $slow; + my @links; + if ($r->is_success) { + my %links = map { + (split('#', $_->URI->abs->as_string))[0] => 1; + } grep { + $_->tag && $_->url !~ /:/ + } $m->links; + @links = keys %links; + } elsif ($r->code != 300) { + warn "W: ".$r->code . " $u\n" + } + + my $s; + # blocking + foreach my $l (@links, "DONE\t$u") { + next if $l eq ''; + do { + $s = $done_wr->send($l, MSG_EOR); + } while (!defined $s && $!{EINTR}); + die "$$ send $!\n" unless defined $s; + my $n = length($l); + die "$$ send truncated $s < $n\n" if $s != $n; + } + + # make sure the HTML source doesn't screw up terminals + # when people curl the source (not remotely an expert + # on languages or encodings, here). + next if $r->header('Content-Type') !~ m!\btext/html\b!; + my $dc = $r->decoded_content; + if ($dc =~ /([\x00-\x08\x0d-\x1f\x7f-\x{99999999}]+)/s) { + my $o = $1; + my $c = Dumper($o); + warn "bad: $u $c\n"; + } + } +} diff --git a/t/common.perl b/t/common.perl index bec57699..1251333d 100644 --- a/t/common.perl +++ b/t/common.perl @@ -1,18 +1,15 @@ # Copyright (C) 2015 all contributors <meta@public-inbox.org> # License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) -require IO::File; -use POSIX qw/dup/; sub stream_to_string { - my ($cb) = @_; - my $headers; - my $io = IO::File->new_tmpfile; - my $dup = dup($io->fileno); - my $response = sub { $headers = \@_, $io }; - $cb->($response); - $io = IO::File->new; - $io->fdopen($dup, 'r+'); - $io->seek(0, 0); - $io->read(my $str, ($io->stat)[7]); + my ($res) = @_; + my $body = $res->[2]; + my $str = ''; + while (defined(my $chunk = $body->getline)) { + $str .= $chunk; + } + $body->close; $str; } + +1; @@ -9,10 +9,8 @@ my $tmpdir = tempdir('pi-config-XXXXXX', TMPDIR => 1, CLEANUP => 1); { is(system(qw(git init -q --bare), $tmpdir), 0, "git init successful"); - { - local $ENV{GIT_DIR} = $tmpdir; - is(system(qw(git config foo.bar hihi)), 0, "set config"); - } + my @cmd = ('git', "--git-dir=$tmpdir", qw(config foo.bar hihi)); + is(system(@cmd), 0, "set config"); my $tmp = PublicInbox::Config->new("$tmpdir/config"); @@ -28,8 +26,11 @@ my $tmpdir = tempdir('pi-config-XXXXXX', TMPDIR => 1, CLEANUP => 1); is_deeply($cfg->lookup('meta@public-inbox.org'), { 'mainrepo' => '/home/pi/meta-main.git', 'address' => 'meta@public-inbox.org', + 'domain' => 'public-inbox.org', + 'url' => 'http://example.com/meta', -primary_address => 'meta@public-inbox.org', - 'listname' => 'meta', + 'name' => 'meta', + -pi_config => $cfg, }, "lookup matches expected output"); is($cfg->lookup('blah@example.com'), undef, @@ -42,8 +43,24 @@ my $tmpdir = tempdir('pi-config-XXXXXX', TMPDIR => 1, CLEANUP => 1); 'test@public-inbox.org'], -primary_address => 'try@public-inbox.org', 'mainrepo' => '/home/pi/test-main.git', - 'listname' => 'test', + 'domain' => 'public-inbox.org', + 'name' => 'test', + 'url' => 'http://example.com/test', + -pi_config => $cfg, }, "lookup matches expected output for test"); } + +{ + my $cfgpfx = "publicinbox.test"; + my @altid = qw(serial:gmane:file=a serial:enamg:file=b); + my $config = PublicInbox::Config->new({ + "$cfgpfx.address" => 'test@example.com', + "$cfgpfx.mainrepo" => '/path/to/non/existent', + "$cfgpfx.altid" => [ @altid ], + }); + my $ibx = $config->lookup_name('test'); + is_deeply($ibx->{altid}, [ @altid ]); +} + done_testing(); diff --git a/t/config_limiter.t b/t/config_limiter.t new file mode 100644 index 00000000..3c7ec557 --- /dev/null +++ b/t/config_limiter.t @@ -0,0 +1,49 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use PublicInbox::Config; +my $cfgpfx = "publicinbox.test"; +{ + my $config = PublicInbox::Config->new({ + "$cfgpfx.address" => 'test@example.com', + "$cfgpfx.mainrepo" => '/path/to/non/existent', + "$cfgpfx.httpbackendmax" => 12, + }); + my $ibx = $config->lookup_name('test'); + my $git = $ibx->git; + my $old = "$git"; + my $lim = $git->{-httpbackend_limiter}; + ok($lim, 'Limiter exists'); + is($lim->{max}, 12, 'limiter has expected slots'); + $ibx->{git} = undef; + $git = $ibx->git; + isnt($old, "$git", 'got new Git object'); + is("$git->{-httpbackend_limiter}", "$lim", 'same limiter'); +} + +{ + my $config = PublicInbox::Config->new({ + 'limiter.named.max' => 3, + "$cfgpfx.address" => 'test@example.com', + "$cfgpfx.mainrepo" => '/path/to/non/existent', + "$cfgpfx.httpbackendmax" => 'named', + }); + my $ibx = $config->lookup_name('test'); + my $git = $ibx->git; + ok($git, 'got git object'); + my $old = "$git"; + my $lim = $git->{-httpbackend_limiter}; + ok($lim, 'Limiter exists'); + is($lim->{max}, 3, 'limiter has expected slots'); + $git = undef; + $ibx->{git} = undef; + PublicInbox::Inbox::weaken_task; + $git = $ibx->git; + isnt($old, "$git", 'got new Git object'); + is("$git->{-httpbackend_limiter}", "$lim", 'same limiter'); + is($lim->{max}, 3, 'limiter has expected slots'); +} + +done_testing; diff --git a/t/emergency.t b/t/emergency.t new file mode 100644 index 00000000..e480338d --- /dev/null +++ b/t/emergency.t @@ -0,0 +1,53 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use File::Temp qw/tempdir/; +my $tmpdir = tempdir('emergency-XXXXXX', TMPDIR => 1, CLEANUP => 1); +use_ok 'PublicInbox::Emergency'; + +{ + my $md = "$tmpdir/a"; + my $em = PublicInbox::Emergency->new($md); + ok(-d $md, 'Maildir a auto-created'); + my @tmp = <$md/tmp/*>; + is(scalar @tmp, 0, 'no temporary files exist, yet'); + $em->prepare(\"BLAH"); + @tmp = <$md/tmp/*>; + is(scalar @tmp, 1, 'globbed one temporary file'); + open my $fh, '<', $tmp[0] or die "failed to open: $!"; + is("BLAH", <$fh>, 'wrote contents to temporary location'); + my @new = <$md/new/*>; + is(scalar @new, 0, 'no new files exist, yet'); + $em = undef; + @tmp = <$md/tmp/*>; + is(scalar @tmp, 0, 'temporary file no longer exists'); + @new = <$md/new/*>; + is(scalar @new, 1, 'globbed one new file'); + open $fh, '<', $new[0] or die "failed to open: $!"; + is("BLAH", <$fh>, 'wrote contents to new location'); +} +{ + my $md = "$tmpdir/b"; + my $em = PublicInbox::Emergency->new($md); + ok(-d $md, 'Maildir b auto-created'); + my @tmp = <$md/tmp/*>; + is(scalar @tmp, 0, 'no temporary files exist, yet'); + $em->prepare(\"BLAH"); + @tmp = <$md/tmp/*>; + is(scalar @tmp, 1, 'globbed one temporary file'); + open my $fh, '<', $tmp[0] or die "failed to open: $!"; + is("BLAH", <$fh>, 'wrote contents to temporary location'); + my @new = <$md/new/*>; + is(scalar @new, 0, 'no new files exist, yet'); + is(sysread($em->fh, my $buf, 9), 4, 'read file handle exposed'); + is($buf, 'BLAH', 'got expected data'); + $em->abort; + @tmp = <$md/tmp/*>; + is(scalar @tmp, 0, 'temporary file no longer exists'); + @new = <$md/new/*>; + is(scalar @new , 0, 'new file no longer exists'); +} + +done_testing(); @@ -3,10 +3,12 @@ use strict; use warnings; use Test::More; -use Email::Simple; +use Email::MIME; use PublicInbox::Feed; +use PublicInbox::Git; +use PublicInbox::Import; use PublicInbox::Config; -use IPC::Run qw/run/; +use PublicInbox::Inbox; use File::Temp qw/tempdir/; my $have_xml_feed = eval { require XML::Feed; 1 }; require 't/common.perl'; @@ -15,15 +17,45 @@ sub string_feed { stream_to_string(PublicInbox::Feed::generate($_[0])); } +# ensure we are compatible with existing ssoma installations which +# do not use fast-import. We can probably remove this in 2018 +my %SSOMA; +sub rand_use ($) { + return 0 if $ENV{FAST}; + eval { require IPC::Run }; + return 0 if $@; + my $cmd = $_[0]; + my $x = $SSOMA{$cmd}; + unless ($x) { + $x = -1; + foreach my $p (split(':', $ENV{PATH})) { + -x "$p/$cmd" or next; + $x = 1; + last; + } + $SSOMA{$cmd} = $x; + } + return if $x < 0; + ($x > 0 && (int(rand(10)) % 2) == 1); +} + my $tmpdir = tempdir('pi-feed-XXXXXX', TMPDIR => 1, CLEANUP => 1); my $git_dir = "$tmpdir/gittest"; +my $ibx = PublicInbox::Inbox->new({ + address => 'test@example', + name => 'testbox', + mainrepo => $git_dir, + url => 'http://example.com/test', +}); +my $git = $ibx->git; +my $im = PublicInbox::Import->new($git, $ibx->{name}, 'test@example'); { is(0, system(qw(git init -q --bare), $git_dir), "git init"); local $ENV{GIT_DIR} = $git_dir; foreach my $i (1..6) { - my $simple = Email::Simple->new(<<EOF); + my $mime = Email::MIME->new(<<EOF); From: ME <me\@example.com> To: U <u\@example.com> Message-Id: <$i\@example.com> @@ -53,10 +85,16 @@ msg $i keep me EOF - my $str = $simple->as_string; - run(['ssoma-mda', $git_dir], \$str) or - die "mda failed: $?\n"; + if (rand_use('ssoma-mda')) { + $im->done; + my $str = $mime->as_string; + IPC::Run::run(['ssoma-mda', $git_dir], \$str) or + die "mda failed: $?\n"; + } else { + like($im->add($mime), qr/\A:\d+/, 'added'); + } } + $im->done; } # spam check @@ -64,7 +102,7 @@ EOF # check initial feed { my $feed = string_feed({ - git_dir => $git_dir, + -inbox => $ibx, max => 3 }); SKIP: { @@ -72,13 +110,11 @@ EOF my $p = XML::Feed->parse(\$feed); is($p->format, "Atom", "parsed atom feed"); is(scalar $p->entries, 3, "parsed three entries"); - is($p->id, 'mailto:public-inbox@example.com', + is($p->id, 'mailto:test@example', "id is set to default"); } - unlike($feed, qr/drop me/, "long quoted text dropped"); - like($feed, qr!/\d%40example\.com/f/#q!, - "/f/ url generated for long quoted text"); + like($feed, qr/drop me/, "long quoted text kept"); like($feed, qr/inline me here/, "short quoted text kept"); like($feed, qr/keep me/, "unquoted text saved"); } @@ -86,13 +122,7 @@ EOF # add a new spam message my $spam; { - my $pid = open(my $pipe, "|-"); - defined $pid or die "fork/pipe failed: $!\n"; - if ($pid == 0) { - exec("ssoma-mda", $git_dir); - } - - $spam = Email::Simple->new(<<EOF); + $spam = Email::MIME->new(<<EOF); From: SPAMMER <spammer\@example.com> To: U <u\@example.com> Message-Id: <this-is-spam\@example.com> @@ -100,14 +130,20 @@ Subject: SPAM!!!!!!!! Date: Thu, 01 Jan 1970 00:00:00 +0000 EOF - print $pipe $spam->as_string or die "print failed: $!\n"; - close $pipe or die "close pipe failed: $!\n"; + if (rand_use('ssoma-mda')) { + my $str = $spam->as_string; + IPC::Run::run(['ssoma-mda', $git_dir], \$str) or + die "mda failed: $?\n"; + } else { + $im->add($spam); + $im->done; + } } # check spam shows up { my $spammy_feed = string_feed({ - git_dir => $git_dir, + -inbox => $ibx, max => 3 }); SKIP: { @@ -120,18 +156,18 @@ EOF } # nuke spam - { + if (rand_use('ssoma-rm')) { my $spam_str = $spam->as_string; - run(["ssoma-rm", $git_dir], \$spam_str) or + IPC::Run::run(["ssoma-rm", $git_dir], \$spam_str) or die "ssoma-rm failed: $?\n"; + } else { + $im->remove($spam); + $im->done; } # spam no longer shows up { - my $feed = string_feed({ - git_dir => $git_dir, - max => 3 - }); + my $feed = string_feed({ -inbox => $ibx, max => 3 }); SKIP: { skip 'XML::Feed missing', 2 unless $have_xml_feed; my $p = XML::Feed->parse(\$feed); @@ -142,26 +178,4 @@ EOF } } -# check pi_config -{ - foreach my $addr (('a@example.com'), ['a@example.com','b@localhost']) { - my $feed = string_feed({ - git_dir => $git_dir, - max => 3, - listname => 'asdf', - pi_config => bless({ - 'publicinbox.asdf.address' => $addr, - }, 'PublicInbox::Config'), - }); - SKIP: { - skip 'XML::Feed missing', 3 unless $have_xml_feed; - my $p = XML::Feed->parse(\$feed); - is($p->id, 'mailto:a@example.com', - "ID is set correctly"); - is($p->format, "Atom", "parsed atom feed"); - is(scalar $p->entries, 3, "parsed three entries"); - } - } -} - done_testing(); diff --git a/t/filter.t b/t/filter.t deleted file mode 100644 index 80a7c123..00000000 --- a/t/filter.t +++ /dev/null @@ -1,355 +0,0 @@ -# Copyright (C) 2013-2015 all contributors <meta@public-inbox.org> -# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) -use strict; -use warnings; -use Test::More; -use Email::MIME; -use PublicInbox::Filter; - -sub count_body_parts { - my ($bodies, $part) = @_; - my $body = $part->body_raw; - $body =~ s/\A\s*//; - $body =~ s/\s*\z//; - $bodies->{$body} ||= 0; - $bodies->{$body}++; -} - -# multipart/alternative: HTML and quoted-printable, keep the plain-text -{ - my $html_body = "<html><body>hi</body></html>"; - my $parts = [ - Email::MIME->create( - attributes => { - content_type => 'text/html; charset=UTF-8', - encoding => 'base64', - }, - body => $html_body, - ), - Email::MIME->create( - attributes => { - content_type => 'text/plain', - encoding => 'quoted-printable', - }, - body => 'hi = "bye"', - ) - ]; - my $email = Email::MIME->create( - header_str => [ - From => 'a@example.com', - Subject => 'blah', - 'Content-Type' => 'multipart/alternative' - ], - parts => $parts, - ); - is(1, PublicInbox::Filter->run($email), "run was a success"); - my $parsed = Email::MIME->new($email->as_string); - is("text/plain", $parsed->header("Content-Type")); - is(scalar $parsed->parts, 1, "HTML part removed"); - my %bodies; - $parsed->walk_parts(sub { - my ($part) = @_; - return if $part->subparts; # walk_parts already recurses - count_body_parts(\%bodies, $part); - }); - is(scalar keys %bodies, 1, "one bodies"); - is($bodies{"hi =3D \"bye\"="}, 1, "QP text part unchanged"); - $parsed->walk_parts(sub { - my ($part) = @_; - my $b = $part->body; - $b =~ s/\s*\z//; - is($b, "hi = \"bye\"", "decoded body matches"); - }); -} - -# plain-text email is passed through unchanged -{ - my $s = Email::MIME->create( - header => [ - From => 'a@example.com', - To => 'b@example.com', - 'Content-Type' => 'text/plain', - Subject => 'this is a subject', - ], - body => "hello world\n", - ); - is(1, PublicInbox::Filter->run($s), "run was a success"); -} - -# convert single-part HTML to plain-text -{ - my $s = Email::MIME->create( - header => [ - From => 'a@example.com', - To => 'b@example.com', - 'Content-Type' => 'text/html', - Subject => 'HTML only badness', - ], - body => "<html><body>bad body\r\n</body></html>\n", - ); - is(1, PublicInbox::Filter->run($s), "run was a success"); - unlike($s->as_string, qr/<html>/, "HTML removed"); - is("text/plain", $s->header("Content-Type"), - "content-type changed"); - like($s->body, qr/\A\s*bad body\s*\z/, "body"); - unlike($s->body, qr/\r/, "body has no cr"); - like($s->header("X-Content-Filtered-By"), - qr/PublicInbox::Filter/, "XCFB header added"); -} - -# multipart/alternative: HTML and plain-text, keep the plain-text -{ - my $html_body = "<html><body>hi</body></html>"; - my $parts = [ - Email::MIME->create( - attributes => { - content_type => 'text/html; charset=UTF-8', - encoding => 'base64', - }, - body => $html_body, - ), - Email::MIME->create( - attributes => { - content_type => 'text/plain', - }, - body=> 'hi', - ) - ]; - my $email = Email::MIME->create( - header_str => [ - From => 'a@example.com', - Subject => 'blah', - 'Content-Type' => 'multipart/alternative' - ], - parts => $parts, - ); - is(1, PublicInbox::Filter->run($email), "run was a success"); - my $parsed = Email::MIME->new($email->as_string); - is("text/plain", $parsed->header("Content-Type")); - is(scalar $parsed->parts, 1, "HTML part removed"); - my %bodies; - $parsed->walk_parts(sub { - my ($part) = @_; - return if $part->subparts; # walk_parts already recurses - count_body_parts(\%bodies, $part); - }); - is(scalar keys %bodies, 1, "one bodies"); - is($bodies{"hi"}, 1, "plain text part unchanged"); -} - -# multi-part plain-text-only -{ - my $parts = [ - Email::MIME->create( - attributes => { content_type => 'text/plain', }, - body => 'hi', - ), - Email::MIME->create( - attributes => { content_type => 'text/plain', }, - body => 'bye', - ) - ]; - my $email = Email::MIME->create( - header_str => [ From => 'a@example.com', Subject => 'blah' ], - parts => $parts, - ); - is(1, PublicInbox::Filter->run($email), "run was a success"); - my $parsed = Email::MIME->new($email->as_string); - is(scalar $parsed->parts, 2, "still 2 parts"); - my %bodies; - $parsed->walk_parts(sub { - my ($part) = @_; - return if $part->subparts; # walk_parts already recurses - count_body_parts(\%bodies, $part); - }); - is(scalar keys %bodies, 2, "two bodies"); - is($bodies{"bye"}, 1, "bye part exists"); - is($bodies{"hi"}, 1, "hi part exists"); - is($parsed->header("X-Content-Filtered-By"), undef, - "XCFB header unset"); -} - -# multi-part HTML, several HTML parts -{ - my $parts = [ - Email::MIME->create( - attributes => { - content_type => 'text/html', - encoding => 'base64', - }, - body => '<html><body>b64 body</body></html>', - ), - Email::MIME->create( - attributes => { - content_type => 'text/html', - encoding => 'quoted-printable', - }, - body => '<html><body>qp body</body></html>', - ) - ]; - my $email = Email::MIME->create( - header_str => [ From => 'a@example.com', Subject => 'blah' ], - parts => $parts, - ); - is(1, PublicInbox::Filter->run($email), "run was a success"); - my $parsed = Email::MIME->new($email->as_string); - is(scalar $parsed->parts, 2, "still 2 parts"); - my %bodies; - $parsed->walk_parts(sub { - my ($part) = @_; - return if $part->subparts; # walk_parts already recurses - count_body_parts(\%bodies, $part); - }); - is(scalar keys %bodies, 2, "two body parts"); - is($bodies{"b64 body"}, 1, "base64 part converted"); - is($bodies{"qp body"}, 1, "qp part converted"); - like($parsed->header("X-Content-Filtered-By"), qr/PublicInbox::Filter/, - "XCFB header added"); -} - -# plain-text with image attachments, kill images -{ - my $parts = [ - Email::MIME->create( - attributes => { content_type => 'text/plain' }, - body => 'see image', - ), - Email::MIME->create( - attributes => { - content_type => 'image/jpeg', - filename => 'scary.jpg', - encoding => 'base64', - }, - body => 'bad', - ) - ]; - my $email = Email::MIME->create( - header_str => [ From => 'a@example.com', Subject => 'blah' ], - parts => $parts, - ); - is(1, PublicInbox::Filter->run($email), "run was a success"); - my $parsed = Email::MIME->new($email->as_string); - is(scalar $parsed->parts, 1, "image part removed"); - my %bodies; - $parsed->walk_parts(sub { - my ($part) = @_; - return if $part->subparts; # walk_parts already recurses - count_body_parts(\%bodies, $part); - }); - is(scalar keys %bodies, 1, "one body"); - is($bodies{'see image'}, 1, 'original body exists'); - like($parsed->header("X-Content-Filtered-By"), qr/PublicInbox::Filter/, - "XCFB header added"); -} - -# all bad -{ - my $parts = [ - Email::MIME->create( - attributes => { - content_type => 'image/jpeg', - filename => 'scary.jpg', - encoding => 'base64', - }, - body => 'bad', - ), - Email::MIME->create( - attributes => { - content_type => 'text/plain', - filename => 'scary.exe', - encoding => 'base64', - }, - body => 'bad', - ), - ]; - my $email = Email::MIME->create( - header_str => [ From => 'a@example.com', Subject => 'blah' ], - parts => $parts, - ); - is(0, PublicInbox::Filter->run($email), - "run signaled to stop delivery"); - my $parsed = Email::MIME->new($email->as_string); - is(scalar $parsed->parts, 1, "bad parts removed"); - my %bodies; - $parsed->walk_parts(sub { - my ($part) = @_; - return if $part->subparts; # walk_parts already recurses - count_body_parts(\%bodies, $part); - }); - is(scalar keys %bodies, 1, "one body"); - is($bodies{"all attachments scrubbed by PublicInbox::Filter"}, 1, - "attachment scrubber left its mark"); - like($parsed->header("X-Content-Filtered-By"), qr/PublicInbox::Filter/, - "XCFB header added"); -} - -{ - my $s = Email::MIME->create( - header => [ - From => 'a@example.com', - To => 'b@example.com', - 'Content-Type' => 'test/pain', - Subject => 'this is a subject', - ], - body => "hello world\n", - ); - is(0, PublicInbox::Filter->run($s), "run was a failure"); - like($s->as_string, qr/scrubbed/, "scrubbed message"); -} - -{ - my $s = Email::MIME->create( - header => [ - From => 'a@example.com', - To => 'b@example.com', - 'Content-Type' => 'text/plain', - 'Mail-Followup-To' => 'c@example.com', - Subject => 'mfttest', - ], - body => "mft\n", - ); - - is('c@example.com', $s->header("Mail-Followup-To"), - "mft set correctly"); - is(1, PublicInbox::Filter->run($s), "run succeeded for mft"); - is(undef, $s->header("Mail-Followup-To"), "mft stripped"); -} - -# multi-part with application/octet-stream -{ - my $os = 'application/octet-stream'; - my $parts = [ - Email::MIME->create( - attributes => { content_type => $os }, - body => <<EOF -#include <stdio.h> -int main(void) -{ - printf("Hello world\\n"); - return 0; -} - -/* some folks like ^L */ -EOF - ), - Email::MIME->create( - attributes => { - filename => 'zero.data', - encoding => 'base64', - content_type => $os, - }, - body => ("\0" x 4096), - ) - ]; - my $email = Email::MIME->create( - header_str => [ From => 'a@example.com', Subject => 'blah' ], - parts => $parts, - ); - is(1, PublicInbox::Filter->run($email), "run was a success"); - my $parsed = Email::MIME->new($email->as_string); - is(scalar $parsed->parts, 1, "only one remaining part"); - like($parsed->header("X-Content-Filtered-By"), - qr/PublicInbox::Filter/, "XCFB header added"); -} - -done_testing(); diff --git a/t/filter_base.t b/t/filter_base.t new file mode 100644 index 00000000..ee5c7307 --- /dev/null +++ b/t/filter_base.t @@ -0,0 +1,81 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use Email::MIME; +use_ok 'PublicInbox::Filter::Base'; + +{ + my $f = PublicInbox::Filter::Base->new; + ok($f, 'created stock object'); + ok(defined $f->{reject_suffix}, 'rejected suffix redefined'); + is(ref($f->{reject_suffix}), 'Regexp', 'reject_suffix should be a RE'); +} + +{ + my $f = PublicInbox::Filter::Base->new(reject_suffix => undef); + ok($f, 'created base object q/o reject_suffix'); + ok(!defined $f->{reject_suffix}, 'reject_suffix not defined'); +} + +{ + my $f = PublicInbox::Filter::Base->new; + my $html_body = "<html><body>hi</body></html>"; + my $parts = [ + Email::MIME->create( + attributes => { + content_type => 'text/xhtml; charset=UTF-8', + encoding => 'base64', + }, + body => $html_body, + ), + Email::MIME->create( + attributes => { + content_type => 'text/plain', + encoding => 'quoted-printable', + }, + body => 'hi = "bye"', + ) + ]; + my $email = Email::MIME->create( + header_str => [ + From => 'a@example.com', + Subject => 'blah', + 'Content-Type' => 'multipart/alternative' + ], + parts => $parts, + ); + is($f->delivery($email), 100, "xhtml rejected"); +} + +{ + my $f = PublicInbox::Filter::Base->new; + my $parts = [ + Email::MIME->create( + attributes => { + content_type => 'application/vnd.ms-excel', + encoding => 'base64', + }, + body => 'junk', + ), + Email::MIME->create( + attributes => { + content_type => 'text/plain', + encoding => 'quoted-printable', + }, + body => 'junk', + ) + ]; + my $email = Email::MIME->create( + header_str => [ + From => 'a@example.com', + Subject => 'blah', + 'Content-Type' => 'multipart/mixed' + ], + parts => $parts, + ); + is($f->delivery($email), 100, 'proprietary format rejected on glob'); +} + +done_testing(); diff --git a/t/filter_mirror.t b/t/filter_mirror.t new file mode 100644 index 00000000..01be282e --- /dev/null +++ b/t/filter_mirror.t @@ -0,0 +1,40 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use Email::MIME; +use_ok 'PublicInbox::Filter::Mirror'; + +my $f = PublicInbox::Filter::Mirror->new; +ok($f, 'created PublicInbox::Filter::Mirror object'); +{ + my $html_body = "<html><body>hi</body></html>"; + my $parts = [ + Email::MIME->create( + attributes => { + content_type => 'text/html; charset=UTF-8', + encoding => 'base64', + }, + body => $html_body, + ), + Email::MIME->create( + attributes => { + content_type => 'text/plain', + encoding => 'quoted-printable', + }, + body => 'hi = "bye"', + ) + ]; + my $email = Email::MIME->create( + header_str => [ + From => 'a@example.com', + Subject => 'blah', + 'Content-Type' => 'multipart/alternative' + ], + parts => $parts, + ); + is($f->ACCEPT, $f->delivery($email), 'accept any trash that comes'); +} + +done_testing(); diff --git a/t/filter_vger.t b/t/filter_vger.t new file mode 100644 index 00000000..83a4c9ee --- /dev/null +++ b/t/filter_vger.t @@ -0,0 +1,46 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use Email::MIME; +use_ok 'PublicInbox::Filter::Vger'; + +my $f = PublicInbox::Filter::Vger->new; +ok($f, 'created PublicInbox::Filter::Vger object'); +{ + my $lkml = <<'EOF'; +From: foo@example.com +Subject: test + +keep this +-- +To unsubscribe from this list: send the line "unsubscribe linux-kernel" in +the body of a message to majordomo@vger.kernel.org +More majordomo info at http://vger.kernel.org/majordomo-info.html +Please read the FAQ at http://www.tux.org/lkml/ +EOF + + my $mime = Email::MIME->new($lkml); + $mime = $f->delivery($mime); + is("keep this\n", $mime->body, 'normal message filtered OK'); +} + +{ + my $no_nl = <<'EOF'; +From: foo@example.com +Subject: test + +OSX users :P-- +To unsubscribe from this list: send the line "unsubscribe git" in +the body of a message to majordomo@vger.kernel.org +More majordomo info at http://vger.kernel.org/majordomo-info.html +EOF + + my $mime = Email::MIME->new($no_nl); + $mime = $f->delivery($mime); + is('OSX users :P', $mime->body, 'missing trailing LF in original OK'); +} + + +done_testing(); diff --git a/t/git-http-backend.psgi b/t/git-http-backend.psgi new file mode 100644 index 00000000..66f41505 --- /dev/null +++ b/t/git-http-backend.psgi @@ -0,0 +1,25 @@ +#!/usr/bin/perl -w +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use PublicInbox::GitHTTPBackend; +use PublicInbox::Git; +use Plack::Builder; +use BSD::Resource qw(getrusage); +my $git_dir = $ENV{GIANT_GIT_DIR} or die 'GIANT_GIT_DIR not defined in env'; +my $git = PublicInbox::Git->new($git_dir); +builder { + enable 'Head'; + sub { + my ($env) = @_; + if ($env->{PATH_INFO} =~ m!\A/(.+)\z!s) { + PublicInbox::GitHTTPBackend::serve($env, $git, $1); + } else { + my $ru = getrusage(); + my $b = $ru->maxrss . "\n"; + [ 200, [ qw(Content-Type text/plain Content-Length), + length($b) ], [ $b ] ] + } + } +} diff --git a/t/git-http-backend.t b/t/git-http-backend.t new file mode 100644 index 00000000..e506e772 --- /dev/null +++ b/t/git-http-backend.t @@ -0,0 +1,134 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use File::Temp qw/tempdir/; +use IO::Socket; +use Fcntl qw(FD_CLOEXEC F_SETFD F_GETFD); +use Socket qw(SO_KEEPALIVE IPPROTO_TCP TCP_NODELAY); +use POSIX qw(dup2 setsid); +use Cwd qw(getcwd); + +my $git_dir = $ENV{GIANT_GIT_DIR}; +plan 'skip_all' => 'GIANT_GIT_DIR not defined' unless $git_dir; +foreach my $mod (qw(Danga::Socket BSD::Resource + Plack::Util Plack::Builder + HTTP::Date HTTP::Status Net::HTTP)) { + eval "require $mod"; + plan skip_all => "$mod missing for git-http-backend.t" if $@; +} +my $psgi = getcwd()."/t/git-http-backend.psgi"; +my $tmpdir = tempdir('pi-git-http-backend-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $err = "$tmpdir/stderr.log"; +my $out = "$tmpdir/stdout.log"; +my $httpd = 'blib/script/public-inbox-httpd'; +my %opts = ( + LocalAddr => '127.0.0.1', + ReuseAddr => 1, + Proto => 'tcp', + Type => SOCK_STREAM, + Listen => 1024, +); +my $sock = IO::Socket::INET->new(%opts); +my $host = $sock->sockhost; +my $port = $sock->sockport; +my $pid; +END { kill 'TERM', $pid if defined $pid }; + +my $get_maxrss = sub { + my $http = Net::HTTP->new(Host => "$host:$port"); + ok($http, 'Net::HTTP object created for maxrss'); + $http->write_request(GET => '/'); + my ($code, $mess, %h) = $http->read_response_headers; + is($code, 200, 'success reading maxrss'); + my $n = $http->read_entity_body(my $buf, 256); + ok(defined $n, 'read response body'); + like($buf, qr/\A\d+\n\z/, 'got memory response'); + ok(int($buf) > 0, 'got non-zero memory response'); + int($buf); +}; + +{ + ok($sock, 'sock created'); + $pid = fork; + if ($pid == 0) { # pretend to be systemd + fcntl($sock, F_SETFD, 0); + dup2(fileno($sock), 3) or die "dup2 failed: $!\n"; + $ENV{LISTEN_PID} = $$; + $ENV{LISTEN_FDS} = 1; + $ENV{TEST_CHUNK} = '1'; + exec $httpd, "--stdout=$out", "--stderr=$err", $psgi; + die "FAIL: $!\n"; + } + ok(defined $pid, 'forked httpd process successfully'); +} +my $mem_a = $get_maxrss->(); + +SKIP: { + my $max = 0; + my $pack; + my $glob = "$git_dir/objects/pack/pack-*.pack"; + foreach my $f (glob($glob)) { + my $n = -s $f; + if ($n > $max) { + $max = $n; + $pack = $f; + } + } + skip "no packs found in $git_dir" unless defined $pack; + if ($pack !~ m!(/objects/pack/pack-[a-f0-9]{40}.pack)\z!) { + skip "bad pack name: $pack"; + } + my $url = $1; + my $http = Net::HTTP->new(Host => "$host:$port"); + ok($http, 'Net::HTTP object created'); + $http->write_request(GET => $url); + my ($code, $mess, %h) = $http->read_response_headers; + is(200, $code, 'got 200 success for pack'); + is($max, $h{'Content-Length'}, 'got expected Content-Length for pack'); + foreach my $i (1..3) { + sleep 1; + my $diff = $get_maxrss->() - $mem_a; + note "${diff}K memory increase after $i seconds"; + ok($diff < 1024, 'no bloating caused by slow dumb client'); + } +} + +{ + my $c = fork; + if ($c == 0) { + setsid(); + exec qw(git clone -q --mirror), "http://$host:$port/", + "$tmpdir/mirror.git"; + die "Failed start git clone: $!\n"; + } + select(undef, undef, undef, 0.1); + foreach my $i (1..10) { + is(1, kill('STOP', -$c), 'signaled clone STOP'); + sleep 1; + ok(kill('CONT', -$c), 'continued clone'); + my $diff = $get_maxrss->() - $mem_a; + note "${diff}K memory increase after $i seconds"; + ok($diff < 2048, 'no bloating caused by slow smart client'); + } + ok(kill('CONT', -$c), 'continued clone'); + is($c, waitpid($c, 0), 'reaped wayward slow clone'); + is($?, 0, 'clone did not error out'); + note 'clone done, fsck-ing clone result...'; + is(0, system("git", "--git-dir=$tmpdir/mirror.git", + qw(fsck --no-progress)), + 'fsck did not report corruption'); + + my $diff = $get_maxrss->() - $mem_a; + note "${diff}K memory increase after smart clone"; + ok($diff < 2048, 'no bloating caused by slow smart client'); +} + +{ + ok(kill('TERM', $pid), 'killed httpd'); + $pid = undef; + waitpid(-1, 0); +} + +done_testing(); diff --git a/t/html_index.t b/t/html_index.t index adbadaf4..f29b442d 100644 --- a/t/html_index.t +++ b/t/html_index.t @@ -3,11 +3,22 @@ use strict; use warnings; use Test::More; -use Email::Simple; +use Email::MIME; use PublicInbox::Feed; +use PublicInbox::Git; +use PublicInbox::Import; +use PublicInbox::Inbox; use File::Temp qw/tempdir/; my $tmpdir = tempdir('pi-http-XXXXXX', TMPDIR => 1, CLEANUP => 1); my $git_dir = "$tmpdir/gittest"; +my $ibx = PublicInbox::Inbox->new({ + address => 'test@example', + name => 'tester', + mainrepo => $git_dir, + url => 'http://example.com/test', +}); +my $git = $ibx->git; +my $im = PublicInbox::Import->new($git, 'tester', 'test@example'); # setup { @@ -15,19 +26,13 @@ my $git_dir = "$tmpdir/gittest"; my $prev = ""; foreach my $i (1..6) { - local $ENV{GIT_DIR} = $git_dir; - my $pid = open(my $pipe, "|-"); - defined $pid or die "fork/pipe failed: $!\n"; - if ($pid == 0) { - exec("ssoma-mda", $git_dir); - } my $mid = "<$i\@example.com>"; my $mid_line = "Message-ID: $mid"; if ($prev) { $mid_line .= "In-Reply-To: $prev"; } $prev = $mid; - my $simple = Email::Simple->new(<<EOF); + my $mime = Email::MIME->new(<<EOF); From: ME <me\@example.com> To: U <u\@example.com> $mid_line @@ -43,20 +48,9 @@ msg $i keep me EOF - print $pipe $simple->as_string or die "print failed: $!\n"; - close $pipe or die "close pipe failed: $!\n"; + like($im->add($mime), qr/\A:\d+\z/, 'inserted message'); } -} - -# check HTML index -{ - use IO::File; - my $cb = PublicInbox::Feed::generate_html_index({ - git_dir => $git_dir, - max => 3 - }); - require 't/common.perl'; - like(stream_to_string($cb), qr/html/, "feed is valid HTML :)"); + $im->done; } done_testing(); diff --git a/t/httpd-corner.psgi b/t/httpd-corner.psgi index da8a2ee8..ed1f92c0 100644 --- a/t/httpd-corner.psgi +++ b/t/httpd-corner.psgi @@ -30,6 +30,7 @@ my $app = sub { return sub { open my $f, '<', $fifo or die "open $fifo: $!\n"; + local $/ = "\n"; my @r = <$f>; $_[0]->([200, $h, \@r ]); }; @@ -38,6 +39,7 @@ my $app = sub { my $fh = $_[0]->([200, $h]); open my $f, '<', $fifo or die "open $fifo: $!\n"; + local $/ = "\n"; while (defined(my $l = <$f>)) { $fh->write($l); } @@ -56,6 +58,20 @@ my $app = sub { $fh->write($buf); $fh->close; } + } elsif ($path eq '/empty') { + $code = 200; + } elsif ($path eq '/getline-die') { + $code = 200; + $body = Plack::Util::inline_object( + getline => sub { die 'GETLINE FAIL' }, + close => sub { die 'CLOSE FAIL' }, + ); + } elsif ($path eq '/close-die') { + $code = 200; + $body = Plack::Util::inline_object( + getline => sub { undef }, + close => sub { die 'CLOSE FAIL' }, + ); } [ $code, $h, $body ] diff --git a/t/httpd-corner.t b/t/httpd-corner.t index b64f334a..8a0337c2 100644 --- a/t/httpd-corner.t +++ b/t/httpd-corner.t @@ -5,8 +5,9 @@ use strict; use warnings; use Test::More; +use Time::HiRes qw(gettimeofday tv_interval); -foreach my $mod (qw(Plack::Util Plack::Request Plack::Builder Danga::Socket +foreach my $mod (qw(Plack::Util Plack::Builder Danga::Socket HTTP::Date HTTP::Status)) { eval "require $mod"; plan skip_all => "$mod missing for httpd-corner.t" if $@; @@ -85,6 +86,30 @@ my $spawn_httpd = sub { } { + my $conn = conn_for($sock, 'getline-die'); + $conn->write("GET /getline-die HTTP/1.1\r\nHost: example.com\r\n\r\n"); + ok($conn->read(my $buf, 8192), 'read some response'); + like($buf, qr!HTTP/1\.1 200\b[^\r]*\r\n!, 'got some sort of header'); + is($conn->read(my $nil, 8192), 0, 'read EOF'); + $conn = undef; + my $after = capture($err); + is(scalar(grep(/GETLINE FAIL/, @$after)), 1, 'failure logged'); + is(scalar(grep(/CLOSE FAIL/, @$after)), 1, 'body->close not called'); +} + +{ + my $conn = conn_for($sock, 'close-die'); + $conn->write("GET /close-die HTTP/1.1\r\nHost: example.com\r\n\r\n"); + ok($conn->read(my $buf, 8192), 'read some response'); + like($buf, qr!HTTP/1\.1 200\b[^\r]*\r\n!, 'got some sort of header'); + is($conn->read(my $nil, 8192), 0, 'read EOF'); + $conn = undef; + my $after = capture($err); + is(scalar(grep(/GETLINE FAIL/, @$after)), 0, 'getline not failed'); + is(scalar(grep(/CLOSE FAIL/, @$after)), 1, 'body->close not called'); +} + +{ my $conn = conn_for($sock, 'excessive header'); $SIG{PIPE} = 'IGNORE'; $conn->write("GET /callback HTTP/1.0\r\n"); @@ -218,7 +243,6 @@ my $check_self = sub { SKIP: { use POSIX qw(dup2); - use IO::File; my $have_curl = 0; foreach my $p (split(':', $ENV{PATH})) { -x "$p/curl" or next; @@ -230,7 +254,7 @@ SKIP: { my $url = 'http://' . $sock->sockhost . ':' . $sock->sockport . '/sha1'; my ($r, $w); pipe($r, $w) or die "pipe: $!"; - my $tout = IO::File->new_tmpfile or die "new_tmpfile: $!"; + open(my $tout, '+>', undef) or die "open temporary file: $!"; my $pid = fork; defined $pid or die "fork: $!"; my @cmd = (qw(curl --tcp-nodelay --no-buffer -T- -HExpect: -sS), $url); @@ -275,6 +299,18 @@ SKIP: { } { + my $conn = conn_for($sock, 'no TCP_CORK on empty body'); + $conn->write("GET /empty HTTP/1.1\r\nHost:example.com\r\n\r\n"); + my $buf = ''; + my $t0 = [ gettimeofday ]; + until ($buf =~ /\r\n\r\n/s) { + $conn->sysread($buf, 4096, length($buf)); + } + my $elapsed = tv_interval($t0, [ gettimeofday ]); + ok($elapsed < 0.190, 'no 200ms TCP cork delay on empty body'); +} + +{ my $conn = conn_for($sock, 'graceful termination during slow request'); $conn->write("PUT /sha1 HTTP/1.0\r\n"); delay(); @@ -476,4 +512,13 @@ SKIP: { done_testing(); +sub capture { + my ($f) = @_; + open my $fh, '+<', $f or die "failed to open $f: $!\n"; + local $/ = "\n"; + my @r = <$fh>; + truncate($fh, 0) or die "truncate failed on $f: $!\n"; + \@r +} + 1; diff --git a/t/httpd-unix.t b/t/httpd-unix.t index 00adf13c..4b0f116e 100644 --- a/t/httpd-unix.t +++ b/t/httpd-unix.t @@ -5,7 +5,7 @@ use strict; use warnings; use Test::More; -foreach my $mod (qw(Plack::Util Plack::Request Plack::Builder Danga::Socket +foreach my $mod (qw(Plack::Util Plack::Builder Danga::Socket HTTP::Date HTTP::Status)) { eval "require $mod"; plan skip_all => "$mod missing for httpd-unix.t" if $@; @@ -54,6 +54,7 @@ ok(-S $unix, 'UNIX socket was bound by -httpd'); sub check_sock ($) { my ($unix) = @_; my $sock = IO::Socket::UNIX->new(Peer => $unix, Type => SOCK_STREAM); + warn "E: $! connecting to $unix\n" unless defined $sock; ok($sock, 'client UNIX socket connected'); ok($sock->write("GET /host-port HTTP/1.0\r\n\r\n"), 'wrote req to server'); @@ -103,6 +104,7 @@ SKIP: { ok(-f "$tmpdir/pid", 'pid file written'); open my $fh, '<', "$tmpdir/pid" or die "open failed: $!"; + local $/ = "\n"; my $rpid = <$fh>; chomp $rpid; like($rpid, qr/\A\d+\z/s, 'pid file looks like a pid'); @@ -4,7 +4,7 @@ use strict; use warnings; use Test::More; -foreach my $mod (qw(Plack::Util Plack::Request Plack::Builder Danga::Socket +foreach my $mod (qw(Plack::Util Plack::Builder Danga::Socket HTTP::Date HTTP::Status)) { eval "require $mod"; plan skip_all => "$mod missing for httpd.t" if $@; @@ -14,24 +14,16 @@ use Cwd qw/getcwd/; use IO::Socket; use Fcntl qw(FD_CLOEXEC F_SETFD F_GETFD); use Socket qw(SO_KEEPALIVE IPPROTO_TCP TCP_NODELAY); -use IPC::Run; # FIXME: too much setup my $tmpdir = tempdir('pi-httpd-XXXXXX', TMPDIR => 1, CLEANUP => 1); my $home = "$tmpdir/pi-home"; my $err = "$tmpdir/stderr.log"; my $out = "$tmpdir/stdout.log"; -my $pi_home = "$home/.public-inbox"; -my $pi_config = "$pi_home/config"; my $maindir = "$tmpdir/main.git"; -my $main_bin = getcwd()."/t/main-bin"; -my $main_path = "$main_bin:$ENV{PATH}"; # for spamc ham mock my $group = 'test-httpd'; my $addr = $group . '@example.com'; my $cfgpfx = "publicinbox.$group"; -my $failbox = "$home/fail.mbox"; -local $ENV{PI_EMERGENCY} = $failbox; -my $mda = 'blib/script/public-inbox-mda'; my $httpd = 'blib/script/public-inbox-httpd'; my $init = 'blib/script/public-inbox-init'; @@ -44,6 +36,9 @@ my %opts = ( ); my $sock = IO::Socket::INET->new(%opts); my $pid; +use_ok 'PublicInbox::Git'; +use_ok 'PublicInbox::Import'; +use_ok 'Email::MIME'; END { kill 'TERM', $pid if defined $pid }; { local $ENV{HOME} = $home; @@ -52,8 +47,7 @@ END { kill 'TERM', $pid if defined $pid }; # ensure successful message delivery { - local $ENV{ORIGINAL_RECIPIENT} = $addr; - my $in = <<EOF; + my $mime = Email::MIME->new(<<EOF); From: Me <me\@example.com> To: You <you\@example.com> Cc: $addr @@ -63,9 +57,11 @@ Date: Thu, 01 Jan 1970 06:06:06 +0000 nntp EOF - local $ENV{PATH} = $main_path; - IPC::Run::run([$mda], \$in); - is(0, $?, 'ran MDA correctly'); + $mime->header_set('List-Id', "<$addr>"); + my $git = PublicInbox::Git->new($maindir); + my $im = PublicInbox::Import->new($git, 'test', $addr); + $im->add($mime); + $im->done($mime); } ok($sock, 'sock created'); $! = 0; @@ -104,7 +100,16 @@ EOF is(system(qw(git clone -q --mirror), "http://$host:$port/$group", "$tmpdir/clone.git"), + 0, 'smart clone successful'); + + # ensure dumb cloning works, too: + is(system('git', "--git-dir=$maindir", + qw(config http.uploadpack false)), + 0, 'disable http.uploadpack'); + is(system(qw(git clone -q --mirror), + "http://$host:$port/$group", "$tmpdir/dumb.git"), 0, 'clone successful'); + ok(kill('TERM', $pid), 'killed httpd'); $pid = undef; waitpid(-1, 0); diff --git a/t/import.t b/t/import.t new file mode 100644 index 00000000..73f92adb --- /dev/null +++ b/t/import.t @@ -0,0 +1,69 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use Email::MIME; +use PublicInbox::Git; +use PublicInbox::Import; +use File::Temp qw/tempdir/; +my $dir = tempdir('pi-import-XXXXXX', TMPDIR => 1, CLEANUP => 1); + +is(system(qw(git init -q --bare), $dir), 0, 'git init successful'); +my $git = PublicInbox::Git->new($dir); + +my $im = PublicInbox::Import->new($git, 'testbox', 'test@example'); +my $mime = Email::MIME->create( + header => [ + From => 'a@example.com', + To => 'b@example.com', + 'Content-Type' => 'text/plain', + Subject => 'this is a subject', + 'Message-ID' => '<a@example.com>', + ], + body => "hello world\n", +); +like($im->add($mime), qr/\A:\d+\z/, 'added one message'); +$im->done; +my @revs = $git->qx(qw(rev-list HEAD)); +is(scalar @revs, 1, 'one revision created'); + +$mime->header_set('Message-ID', '<b@example.com>'); +$mime->header_set('Subject', 'msg2'); +like($im->add($mime, sub { $mime }), qr/\A:\d+\z/, 'added 2nd message'); +$im->done; +@revs = $git->qx(qw(rev-list HEAD)); +is(scalar @revs, 2, '2 revisions exist'); + +is($im->add($mime), undef, 'message only inserted once'); +$im->done; +@revs = $git->qx(qw(rev-list HEAD)); +is(scalar @revs, 2, '2 revisions exist'); + +foreach my $c ('c'..'z') { + $mime->header_set('Message-ID', "<$c\@example.com>"); + $mime->header_set('Subject', "msg - $c"); + like($im->add($mime), qr/\A:\d+\z/, "added $c message"); +} +$im->done; +@revs = $git->qx(qw(rev-list HEAD)); +is(scalar @revs, 26, '26 revisions exist after mass import'); +my ($mark, $msg) = $im->remove($mime); +like($mark, qr/\A:\d+\z/, 'got mark'); +is(ref($msg), 'Email::MIME', 'got old message deleted'); + +is(undef, $im->remove($mime), 'remove is idempotent'); + +# mismatch on identical Message-ID +$mime->header_set('Message-ID', '<a@example.com>'); +($mark, $msg) = $im->remove($mime); +is($mark, 'MISMATCH', 'mark == MISMATCH on mismatch'); +is($msg->header('Message-ID'), '<a@example.com>', 'Message-ID matches'); +isnt($msg->header('Subject'), $mime->header('Subject'), 'subject mismatch'); + +$mime->header_set('Message-Id', '<failcheck@example.com>'); +is($im->add($mime, sub { undef }), undef, 'check callback fails'); +is($im->remove($mime), undef, 'message not added, so not removed'); + +$im->done; +done_testing(); diff --git a/t/inbox.t b/t/inbox.t new file mode 100644 index 00000000..9909dc53 --- /dev/null +++ b/t/inbox.t @@ -0,0 +1,15 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use_ok 'PublicInbox::Inbox'; +my $x = PublicInbox::Inbox->new({url => '//example.com/test/'}); +is($x->base_url, 'https://example.com/test/', 'expanded protocol-relative'); +$x = PublicInbox::Inbox->new({url => 'http://example.com/test'}); +is($x->base_url, 'http://example.com/test/', 'added trailing slash'); + +$x = PublicInbox::Inbox->new({}); +is($x->base_url, undef, 'undef base_url allowed'); + +done_testing(); @@ -13,10 +13,16 @@ use constant pi_init => 'blib/script/public-inbox-init'; my $cfgfile = "$ENV{PI_DIR}/config"; my @cmd = (pi_init, 'blist', "$tmpdir/blist", qw(http://example.com/blist blist@example.com)); - is(system(@cmd), 0, 'public-inbox-init failed'); + is(system(@cmd), 0, 'public-inbox-init OK'); ok(-e $cfgfile, "config exists, now"); - is(system(@cmd), 0, 'public-inbox-init failed (idempotent)'); + is(system(@cmd), 0, 'public-inbox-init OK (idempotent)'); + + chmod 0666, $cfgfile or die "chmod failed: $!"; + @cmd = (pi_init, 'clist', "$tmpdir/clist", + qw(http://example.com/clist clist@example.com)); + is(system(@cmd), 0, 'public-inbox-init clist OK'); + is((stat($cfgfile))[2] & 07777, 0666, "permissions preserved"); } done_testing(); diff --git a/t/linkify.t b/t/linkify.t index 586691ae..99acf17d 100644 --- a/t/linkify.t +++ b/t/linkify.t @@ -23,4 +23,60 @@ use PublicInbox::Linkify; is($s, qq(<a\nhref="$u">$u</a>;), 'trailing semicolon not in URL'); } +{ + my $l = PublicInbox::Linkify->new; + my $u = 'http://example.com/url-with-(parens)'; + my $s = "hello $u world"; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq(hello <a\nhref="$u">$u</a> world), 'URL preserved'); + + $u .= "?query=a"; + $s = "hello $u world"; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq(hello <a\nhref="$u">$u</a> world), 'query preserved'); + + $u .= "#fragment"; + $s = "hello $u world"; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq(hello <a\nhref="$u">$u</a> world), + 'query + fragment preserved'); + + $u = "http://example.com/"; + $s = "hello $u world"; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq(hello <a\nhref="$u">$u</a> world), "root URL preserved"); + + $u = "http://example.com/#fragment"; + $s = "hello $u world"; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq(hello <a\nhref="$u">$u</a> world), "root + fragment"); +} + +# Markdown compatibility +{ + my $l = PublicInbox::Linkify->new; + my $u = 'http://example.com/'; + my $s = "[markdown]($u)"; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq![markdown](<a\nhref="$u">$u</a>)!, 'Markdown-compatible'); + + $s = qq![markdown]($u "title")!; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq![markdown](<a\nhref="$u">$u</a> "title")!, + 'Markdown title compatible'); + + $s = qq![markdown]($u).!; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + is($s, qq![markdown](<a\nhref="$u">$u</a>).!, + 'Markdown-compatible end of sentence'); +} + done_testing(); @@ -4,10 +4,11 @@ use strict; use warnings; use Test::More; use Email::MIME; -use Email::Filter; use File::Temp qw/tempdir/; use Cwd; -use IPC::Run qw(run); +use PublicInbox::MID qw(mid2path); +eval { require IPC::Run }; +plan skip_all => "missing IPC::Run for t/mda.t" if $@; my $mda = "blib/script/public-inbox-mda"; my $learn = "blib/script/public-inbox-learn"; @@ -22,7 +23,7 @@ my $fail_bin = getcwd()."/t/fail-bin"; my $fail_path = "$fail_bin:$ENV{PATH}"; # for spamc spam mock my $addr = 'test-public@example.com'; my $cfgpfx = "publicinbox.test"; -my $failbox = "$home/fail.mbox"; +my $faildir = "$home/faildir/"; my $mime; { @@ -47,14 +48,19 @@ my $mime; local $ENV{GIT_COMMITTER_NAME} = eval { use PublicInbox::MDA; + use PublicInbox::Address; use Encode qw/encode/; my $mbox = 't/utf8.mbox'; open(my $fh, '<', $mbox) or die "failed to open mbox: $mbox\n"; my $str = eval { local $/; <$fh> }; close $fh; - my $msg = Email::Filter->new(data => $str); - $msg = Email::MIME->new($msg->simple->as_string); - my ($author, $email, $date) = PublicInbox::MDA->author_info($msg); + my $msg = Email::MIME->new($str); + + my $from = $msg->header('From'); + my ($author) = PublicInbox::Address::names($from); + my ($email) = PublicInbox::Address::emails($from); + my $date = $msg->header('Date'); + is('Eléanor', encode('us-ascii', my $tmp = $author, Encode::HTMLCREF), 'HTML conversion is correct'); @@ -67,7 +73,7 @@ die $@ if $@; { my $good_rev; - local $ENV{PI_EMERGENCY} = $failbox; + local $ENV{PI_EMERGENCY} = $faildir; local $ENV{HOME} = $home; local $ENV{ORIGINAL_RECIPIENT} = $addr; my $simple = Email::Simple->new(<<EOF); @@ -84,12 +90,11 @@ EOF # ensure successful message delivery { local $ENV{PATH} = $main_path; - run([$mda], \$in); - local $ENV{GIT_DIR} = $maindir; - my $rev = `git rev-list HEAD`; + IPC::Run::run([$mda], \$in); + my $rev = `git --git-dir=$maindir rev-list HEAD`; like($rev, qr/\A[a-f0-9]{40}/, "good revision committed"); chomp $rev; - my $cmt = `git cat-file commit $rev`; + my $cmt = `git --git-dir=$maindir cat-file commit $rev`; like($cmt, qr/^author Me <me\@example\.com> 0 \+0000\n/m, "author info set correctly"); like($cmt, qr/^committer test <test-public\@example\.com>/m, @@ -99,13 +104,14 @@ EOF # ensure failures work, fail with bad spamc { - ok(!-e $failbox, "nothing in PI_EMERGENCY before"); + my @prev = <$faildir/new/*>; + is(scalar @prev, 0 , "nothing in PI_EMERGENCY before"); local $ENV{PATH} = $fail_path; - run([$mda], \$in); - local $ENV{GIT_DIR} = $maindir; - my @revs = `git rev-list HEAD`; + IPC::Run::run([$mda], \$in); + my @revs = `git --git-dir=$maindir rev-list HEAD`; is(scalar @revs, 1, "bad revision not committed"); - ok(-s $failbox > 0, "PI_EMERGENCY is written to"); + my @new = <$faildir/new/*>; + is(scalar @new, 1, "PI_EMERGENCY is written to"); } fail_bad_header($good_rev, "bad recipient", <<""); @@ -155,7 +161,7 @@ Date: deadbeef # spam training { - local $ENV{PI_EMERGENCY} = $failbox; + local $ENV{PI_EMERGENCY} = $faildir; local $ENV{HOME} = $home; local $ENV{ORIGINAL_RECIPIENT} = $addr; local $ENV{PATH} = $main_path; @@ -173,23 +179,25 @@ EOF { # deliver the spam message, first - run([$mda], \$in); - my $msg = `ssoma cat $mid $maindir`; + IPC::Run::run([$mda], \$in); + my $path = mid2path($mid); + my $msg = `git --git-dir=$maindir cat-file blob HEAD:$path`; like($msg, qr/\Q$mid\E/, "message delivered"); # now train it local $ENV{GIT_AUTHOR_EMAIL} = 'trainer@example.com'; local $ENV{GIT_COMMITTER_EMAIL} = 'trainer@example.com'; - run([$learn, "spam"], \$msg); + local $ENV{GIT_COMMITTER_NAME} = undef; + IPC::Run::run([$learn, "spam"], \$msg); is($?, 0, "no failure from learning spam"); - run([$learn, "spam"], \$msg); + IPC::Run::run([$learn, "spam"], \$msg); is($?, 0, "no failure from learning spam idempotently"); } } # train ham message { - local $ENV{PI_EMERGENCY} = $failbox; + local $ENV{PI_EMERGENCY} = $faildir; local $ENV{HOME} = $home; local $ENV{ORIGINAL_RECIPIENT} = $addr; local $ENV{PATH} = $main_path; @@ -210,11 +218,12 @@ EOF local $ENV{GIT_AUTHOR_EMAIL} = 'trainer@example.com'; local $ENV{GIT_COMMITTER_EMAIL} = 'trainer@example.com'; - run([$learn, "ham"], \$in); + IPC::Run::run([$learn, "ham"], \$in); is($?, 0, "learned ham without failure"); - my $msg = `ssoma cat $mid $maindir`; + my $path = mid2path($mid); + my $msg = `git --git-dir=$maindir cat-file blob HEAD:$path`; like($msg, qr/\Q$mid\E/, "ham message delivered"); - run([$learn, "ham"], \$in); + IPC::Run::run([$learn, "ham"], \$in); is($?, 0, "learned ham idempotently "); # ensure trained email is filtered, too @@ -249,81 +258,28 @@ EOF { $in = $mime->as_string; - run([$learn, "ham"], \$in); + IPC::Run::run([$learn, "ham"], \$in); is($?, 0, "learned ham without failure"); - $msg = `ssoma cat $mid $maindir`; + my $path = mid2path($mid); + $msg = `git --git-dir=$maindir cat-file blob HEAD:$path`; like($msg, qr/<\Q$mid\E>/, "ham message delivered"); unlike($msg, qr/<html>/i, '<html> filtered'); } } -# faildir - emergency destination is maildir -{ - my $faildir= "$home/faildir/"; - local $ENV{PI_EMERGENCY} = $faildir; - local $ENV{HOME} = $home; - local $ENV{ORIGINAL_RECIPIENT} = $addr; - local $ENV{PATH} = $fail_path; - my $in = <<EOF; -From: Faildir <faildir\@example.com> -To: You <you\@example.com> -Cc: $addr -Message-ID: <faildir\@example.com> -Subject: faildir subject -Date: Thu, 01 Jan 1970 00:00:00 +0000 - -EOF - run([$mda], \$in); - ok(-d $faildir, "emergency exists"); - my @new = glob("$faildir/new/*"); - is(scalar(@new), 1, "message delivered"); - is(unlink(@new), 1, "removed emergency message"); - - local $ENV{PATH} = $main_path; - $in = <<EOF; -From: Faildir <faildir\@example.com> -To: $addr -Content-Type: text/html -Message-ID: <faildir\@example.com> -Subject: faildir subject -Date: Thu, 01 Jan 1970 00:00:00 +0000 - -<html><body>bad</body></html> -EOF - my $out = ''; - my $err = ''; - run([$mda], \$in, \$out, \$err); - isnt($?, 0, "mda exited with failure"); - is(length $out, 0, 'nothing in stdout'); - isnt(length $err, 0, 'error message in stderr'); - - @new = glob("$faildir/new/*"); - is(scalar(@new), 0, "new message did not show up"); - - # reject multipart again - $in = $mime->as_string; - $err = ''; - run([$mda], \$in, \$out, \$err); - isnt($?, 0, "mda exited with failure"); - is(length $out, 0, 'nothing in stdout'); - isnt(length $err, 0, 'error message in stderr'); - @new = glob("$faildir/new/*"); - is(scalar(@new), 0, "new message did not show up"); -} - done_testing(); sub fail_bad_header { my ($good_rev, $msg, $in) = @_; - open my $fh, '>', $failbox or die "failed to open $failbox: $!\n"; - close $fh or die "failed to close $failbox: $!\n"; + my @f = glob("$faildir/*/*"); + unlink @f if @f; my ($out, $err) = ("", ""); local $ENV{PATH} = $main_path; - run([$mda], \$in, \$out, \$err); - local $ENV{GIT_DIR} = $maindir; - my $rev = `git rev-list HEAD`; + IPC::Run::run([$mda], \$in, \$out, \$err); + my $rev = `git --git-dir=$maindir rev-list HEAD`; chomp $rev; is($rev, $good_rev, "bad revision not commited ($msg)"); - ok(-s $failbox > 0, "PI_EMERGENCY is written to ($msg)"); + @f = glob("$faildir/*/*"); + is(scalar @f, 1, "faildir written to"); [ $in, $out, $err ]; } diff --git a/t/mid.t b/t/mid.t new file mode 100644 index 00000000..b0af8386 --- /dev/null +++ b/t/mid.t @@ -0,0 +1,11 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use Test::More; +use PublicInbox::MID qw(mid_escape); + +is(mid_escape('foo!@(bar)'), 'foo!@(bar)'); +is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)'); +is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)'); + +done_testing(); +1; diff --git a/t/msg_iter.t b/t/msg_iter.t new file mode 100644 index 00000000..7ade6e41 --- /dev/null +++ b/t/msg_iter.t @@ -0,0 +1,44 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use Email::MIME; +use_ok('PublicInbox::MsgIter'); + +{ + my $parts = [ Email::MIME->create(body => "a\n"), + Email::MIME->create(body => "b\n") ]; + my $mime = Email::MIME->create(parts => $parts, + header_str => [ From => 'root@localhost' ]); + my @parts; + msg_iter($mime, sub { + my ($part, $level, @ex) = @{$_[0]}; + my $s = $part->body_str; + $s =~ s/\s+//s; + push @parts, [ $s, $level, @ex ]; + }); + is_deeply(\@parts, [ [ qw(a 1 1) ], [ qw(b 1 2) ] ], 'order is fine'); +} + +{ + my $parts = [ Email::MIME->create(body => 'a'), + Email::MIME->create(body => 'b') ]; + $parts = [ Email::MIME->create(parts => $parts, + header_str => [ From => 'sub@localhost' ]), + Email::MIME->create(body => 'sig') ]; + my $mime = Email::MIME->create(parts => $parts, + header_str => [ From => 'root@localhost' ]); + my @parts; + msg_iter($mime, sub { + my ($part, $level, @ex) = @{$_[0]}; + my $s = $part->body_str; + $s =~ s/\s+//s; + push @parts, [ $s, $level, @ex ]; + }); + is_deeply(\@parts, [ [qw(a 2 1 1)], [qw(b 2 1 2)], [qw(sig 1 2)] ], + 'nested part shows up properly'); +} + +done_testing(); +1; @@ -11,6 +11,7 @@ foreach my $mod (qw(DBD::SQLite Search::Xapian Danga::Socket)) { } use_ok 'PublicInbox::NNTP'; +use_ok 'PublicInbox::Inbox'; { sub quote_str { @@ -95,4 +96,42 @@ use_ok 'PublicInbox::NNTP'; } } +{ # test setting NNTP headers in HEAD and ARTICLE requests + require Email::MIME; + my $u = 'https://example.com/a/'; + my $ng = PublicInbox::Inbox->new({ name => 'test', + mainrepo => 'test.git', + address => 'a@example.com', + -primary_address => 'a@example.com', + newsgroup => 'test', + domain => 'example.com', + url => '//example.com/a'}); + is($ng->base_url, $u, 'URL expanded'); + my $mid = 'a@b'; + my $mime = Email::MIME->new("Message-ID: <$mid>\r\n\r\n"); + PublicInbox::NNTP::set_nntp_headers($mime->header_obj, $ng, 1, $mid); + is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ], + 'Message-ID unchanged'); + is_deeply([ $mime->header('Archived-At') ], [ "<${u}a\@b/>" ], + 'Archived-At: set'); + is_deeply([ $mime->header('List-Archive') ], [ "<$u>" ], + 'List-Archive: set'); + is_deeply([ $mime->header('List-Post') ], [ '<mailto:a@example.com>' ], + 'List-Post: set'); + is_deeply([ $mime->header('Newsgroups') ], [ 'test' ], + 'Newsgroups: set'); + is_deeply([ $mime->header('Xref') ], [ 'example.com test:1' ], + 'Xref: set'); + + $ng->{-base_url} = 'http://mirror.example.com/m/'; + PublicInbox::NNTP::set_nntp_headers($mime->header_obj, $ng, 2, $mid); + is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ], + 'Message-ID unchanged'); + is_deeply([ $mime->header('Archived-At') ], + [ "<${u}a\@b/>", '<http://mirror.example.com/m/a@b/>' ], + 'Archived-At: appended'); + is_deeply([ $mime->header('Xref') ], [ 'example.com test:2' ], + 'Old Xref: clobbered'); +} + done_testing(); @@ -16,26 +16,18 @@ use Fcntl qw(FD_CLOEXEC F_SETFD F_GETFD); use Socket qw(SO_KEEPALIVE IPPROTO_TCP TCP_NODELAY); use File::Temp qw/tempdir/; use Net::NNTP; -use IPC::Run qw(run); my $tmpdir = tempdir('pi-nntpd-XXXXXX', TMPDIR => 1, CLEANUP => 1); my $home = "$tmpdir/pi-home"; my $err = "$tmpdir/stderr.log"; my $out = "$tmpdir/stdout.log"; -my $pi_home = "$home/.public-inbox"; -my $pi_config = "$pi_home/config"; my $maindir = "$tmpdir/main.git"; -my $main_bin = getcwd()."/t/main-bin"; -my $main_path = "$main_bin:$ENV{PATH}"; # for spamc ham mock my $group = 'test-nntpd'; my $addr = $group . '@example.com'; -my $cfgpfx = "publicinbox.$group"; -my $failbox = "$home/fail.mbox"; -local $ENV{PI_EMERGENCY} = $failbox; -my $mda = 'blib/script/public-inbox-mda'; my $nntpd = 'blib/script/public-inbox-nntpd'; my $init = 'blib/script/public-inbox-init'; -my $index = 'blib/script/public-inbox-index'; +use_ok 'PublicInbox::Import'; +use_ok 'PublicInbox::Git'; my %opts = ( LocalAddr => '127.0.0.1', @@ -46,29 +38,38 @@ my %opts = ( ); my $sock = IO::Socket::INET->new(%opts); my $pid; +my $len; END { kill 'TERM', $pid if defined $pid }; { local $ENV{HOME} = $home; system($init, $group, $maindir, 'http://example.com/', $addr); + is(system(qw(git config), "--file=$home/.public-inbox/config", + "publicinbox.$group.newsgroup", $group), + 0, 'enabled newsgroup'); + my $len; # ensure successful message delivery { - local $ENV{ORIGINAL_RECIPIENT} = $addr; - my $simple = Email::Simple->new(<<EOF); -From: Me <me\@example.com> -To: You <you\@example.com> + my $mime = Email::MIME->new(<<EOF); +To: =?utf-8?Q?El=C3=A9anor?= <you\@example.com> +From: =?utf-8?Q?El=C3=A9anor?= <me\@example.com> Cc: $addr Message-Id: <nntp\@example.com> -Subject: hihi +Content-Type: text/plain; charset=utf-8 +Subject: Testing for =?utf-8?Q?El=C3=A9anor?= Date: Thu, 01 Jan 1970 06:06:06 +0000 +Content-Transfer-Encoding: 8bit -nntp +This is a test message for El\xc3\xa9anor EOF - my $in = $simple->as_string; - local $ENV{PATH} = $main_path; - IPC::Run::run([$mda], \$in); - is(0, $?, 'ran MDA correctly'); - is(0, system($index, $maindir), 'indexed git dir'); + $mime->header_set('List-Id', "<$addr>"); + $len = length($mime->as_string); + my $git = PublicInbox::Git->new($maindir); + my $im = PublicInbox::Import->new($git, 'test', $addr); + $im->add($mime); + $im->done; + my $s = PublicInbox::SearchIdx->new($maindir, 1); + $s->index_sync; } ok($sock, 'sock created'); @@ -106,10 +107,10 @@ EOF my $mid = '<nntp@example.com>'; my %xhdr = ( 'message-id' => $mid, - 'subject' => 'hihi', + subject => "Testing for El\xc3\xa9anor", 'date' => 'Thu, 01 Jan 1970 06:06:06 +0000', - 'from' => 'Me <me@example.com>', - 'to' => 'You <you@example.com>', + 'from' => "El\xc3\xa9anor <me\@example.com>", + 'to' => "El\xc3\xa9anor <you\@example.com>", 'cc' => $addr, 'xref' => "example.com $group:1" ); @@ -119,6 +120,18 @@ EOF is($buf, "201 server ready - post via email\r\n", 'got greeting'); $s->autoflush(1); + syswrite($s, "NEWGROUPS\t19990424 000000 \033GMT\007\r\n"); + is(0, sysread($s, $buf, 4096), 'GOT EOF on cntrl'); + + $s = IO::Socket::INET->new(%opts); + sysread($s, $buf, 4096); + is($buf, "201 server ready - post via email\r\n", 'got greeting'); + $s->autoflush(1); + + syswrite($s, "NEWGROUPS 19990424 000000 GMT\r\n"); + $buf = read_til_dot($s); + like($buf, qr/\A231 list of /, 'newgroups OK'); + while (my ($k, $v) = each %xhdr) { is_deeply($n->xhdr("$k $mid"), { $mid => $v }, "XHDR $k by message-id works"); @@ -126,14 +139,11 @@ EOF "$k by article number works"); is_deeply($n->xhdr("$k 1-"), { 1 => $v }, "$k by article range works"); - next; $buf = ''; syswrite($s, "HDR $k $mid\r\n"); - do { - sysread($s, $buf, 4096, length($buf)); - } until ($buf =~ /\r\n\.\r\n\z/); + $buf = read_til_dot($s); my @r = split("\r\n", $buf); - like($r[0], qr/\A224 /, '224 response for HDR'); + like($r[0], qr/\A225 /, '225 response for HDR'); is($r[1], "0 $v", 'got expected response for HDR'); } @@ -146,34 +156,42 @@ EOF } is_deeply($n->xover('1-'), { - '1' => ['hihi', - 'Me <me@example.com>', + '1' => ["Testing for El\xc3\xa9anor", + "El\xc3\xa9anor <me\@example.com>", 'Thu, 01 Jan 1970 06:06:06 +0000', '<nntp@example.com>', '', - '202', + $len, '1' ] }, "XOVER range works"); is_deeply($n->xover('1'), { - '1' => ['hihi', - 'Me <me@example.com>', + '1' => ["Testing for El\xc3\xa9anor", + "El\xc3\xa9anor <me\@example.com>", 'Thu, 01 Jan 1970 06:06:06 +0000', '<nntp@example.com>', '', - '202', + $len, '1' ] }, "XOVER by article works"); + is_deeply($n->head(1), $n->head('<nntp@example.com>'), 'HEAD OK'); + is_deeply($n->body(1), $n->body('<nntp@example.com>'), 'BODY OK'); + is($n->body(1)->[0], "This is a test message for El\xc3\xa9anor\n", + 'body really matches'); + my $art = $n->article(1); + is(ref($art), 'ARRAY', 'got array for ARTICLE'); + is_deeply($art, $n->article('<nntp@example.com>'), 'ARTICLE OK'); + is($n->article(999), undef, 'non-existent num'); + is($n->article('<non-existent@example>'), undef, 'non-existent mid'); + { syswrite($s, "OVER $mid\r\n"); - $buf = ''; - do { - sysread($s, $buf, 4096, length($buf)); - } until ($buf =~ /\r\n\.\r\n\z/); + $buf = read_til_dot($s); my @r = split("\r\n", $buf); like($r[0], qr/^224 /, 'got 224 response for OVER'); - is($r[1], "0\thihi\tMe <me\@example.com>\t" . + is($r[1], "0\tTesting for El\xc3\xa9anor\t" . + "El\xc3\xa9anor <me\@example.com>\t" . "Thu, 01 Jan 1970 06:06:06 +0000\t" . - "$mid\t\t202\t1", 'OVER by Message-ID works'); + "$mid\t\t$len\t1", 'OVER by Message-ID works'); is($r[2], '.', 'correctly terminated response'); } @@ -185,7 +203,20 @@ EOF 'XHDR on invalid header returns empty'); { - syswrite($s, "HDR List-id 1-\r\n"); + my $t0 = time; + my $date = $n->date; + my $t1 = time; + ok($date >= $t0, 'valid date after start'); + ok($date <= $t1, 'valid date before stop'); + } + + { + setsockopt($s, IPPROTO_TCP, TCP_NODELAY, 1); + syswrite($s, 'HDR List-id 1-'); + select(undef, undef, undef, 0.15); + ok(kill('TERM', $pid), 'killed nntpd'); + select(undef, undef, undef, 0.15); + syswrite($s, "\r\n"); $buf = ''; do { sysread($s, $buf, 4096, length($buf)); @@ -196,11 +227,26 @@ EOF is(scalar @r, 1, 'only one response line'); } - ok(kill('TERM', $pid), 'killed nntpd'); - $pid = undef; - waitpid(-1, 0); + $n = $s = undef; + is($pid, waitpid($pid, 0), 'nntpd exited successfully'); + my $eout = eval { + local $/; + open my $fh, '<', $err or die "open $err failed: $!"; + <$fh>; + }; + is($?, 0, 'no error in exited process'); + unlike($eout, qr/wide/i, 'no Wide character warnings'); } done_testing(); +sub read_til_dot { + my ($s) = @_; + my $buf = ''; + do { + sysread($s, $buf, 4096, length($buf)); + } until ($buf =~ /\r\n\.\r\n\z/); + $buf; +} + 1; @@ -5,36 +5,23 @@ use warnings; use Test::More; use Email::MIME; use File::Temp qw/tempdir/; -use Cwd; -use IPC::Run qw/run/; my $psgi = "examples/public-inbox.psgi"; -my $mda = "blib/script/public-inbox-mda"; my $tmpdir = tempdir('pi-plack-XXXXXX', TMPDIR => 1, CLEANUP => 1); -my $home = "$tmpdir/pi-home"; -my $pi_home = "$home/.public-inbox"; -my $pi_config = "$pi_home/config"; +my $pi_config = "$tmpdir/config"; my $maindir = "$tmpdir/main.git"; -my $main_bin = getcwd()."/t/main-bin"; -my $main_path = "$main_bin:$ENV{PATH}"; # for spamc ham mock my $addr = 'test-public@example.com'; my $cfgpfx = "publicinbox.test"; -my $failbox = "$home/fail.mbox"; -local $ENV{PI_EMERGENCY} = $failbox; -my @mods = qw(HTTP::Request::Common Plack::Request Plack::Test - Mail::Thread URI::Escape); +my @mods = qw(HTTP::Request::Common Plack::Test URI::Escape); foreach my $mod (@mods) { eval "require $mod"; plan skip_all => "$mod missing for plack.t" if $@; } +use_ok 'PublicInbox::Import'; +use_ok 'PublicInbox::Git'; foreach my $mod (@mods) { use_ok $mod; } { ok(-f $psgi, "psgi example file found"); - ok(-x "$main_bin/spamc", - "spamc ham mock found (run in top of source tree"); - ok(-x $mda, "$mda is executable"); - is(1, mkdir($home, 0755), "setup ~/ for testing"); - is(1, mkdir($pi_home, 0755), "setup ~/.public-inbox"); is(0, system(qw(git init -q --bare), $maindir), "git init (main)"); open my $fh, '>', "$maindir/description" or die "open: $!\n"; print $fh "test for public-inbox\n"; @@ -42,18 +29,17 @@ foreach my $mod (@mods) { use_ok $mod; } my %cfg = ( "$cfgpfx.address" => $addr, "$cfgpfx.mainrepo" => $maindir, + "$cfgpfx.url" => 'http://example.com/test/', + "$cfgpfx.newsgroup" => 'inbox.test', ); while (my ($k,$v) = each %cfg) { is(0, system(qw(git config --file), $pi_config, $k, $v), "setup $k"); } - local $ENV{HOME} = $home; - local $ENV{ORIGINAL_RECIPIENT} = $addr; - # ensure successful message delivery { - my $simple = Email::Simple->new(<<EOF); + my $mime = Email::MIME->new(<<EOF); From: Me <me\@example.com> To: You <you\@example.com> Cc: $addr @@ -63,13 +49,38 @@ Date: Thu, 01 Jan 1970 00:00:00 +0000 zzzzzz EOF - my $in = $simple->as_string; - run_with_env({PATH => $main_path}, [$mda], \$in); - local $ENV{GIT_DIR} = $maindir; - my $rev = `git rev-list HEAD`; + my $git = PublicInbox::Git->new($maindir); + my $im = PublicInbox::Import->new($git, 'test', $addr); + $im->add($mime); + $im->done; + my $rev = `git --git-dir="$maindir" rev-list HEAD`; like($rev, qr/\A[a-f0-9]{40}/, "good revision committed"); } - my $app = require $psgi; + my $app = eval { + local $ENV{PI_CONFIG} = $pi_config; + require $psgi; + }; + + test_psgi($app, sub { + my ($cb) = @_; + foreach my $u (qw(robots.txt favicon.ico .well-known/foo)) { + my $res = $cb->(GET("http://example.com/$u")); + is($res->code, 404, "$u is missing"); + } + }); + + # redirect with newsgroup + test_psgi($app, sub { + my ($cb) = @_; + my $from = 'http://example.com/inbox.test'; + my $to = 'http://example.com/test/'; + my $res = $cb->(GET($from)); + is($res->code, 301, 'newsgroup name is permanent redirect'); + is($to, $res->header('Location'), 'redirect location matches'); + $from .= '/'; + is($res->code, 301, 'newsgroup name/ is permanent redirect'); + is($to, $res->header('Location'), 'redirect location matches'); + }); # redirect with trailing / test_psgi($app, sub { @@ -85,7 +96,7 @@ EOF foreach my $t (qw(t T)) { test_psgi($app, sub { my ($cb) = @_; - my $u = $pfx . "/blah%40example.com/$t"; + my $u = $pfx . "/blah\@example.com/$t"; my $res = $cb->(GET($u)); is(301, $res->code, "redirect for missing /"); my $location = $res->header('Location'); @@ -96,11 +107,11 @@ EOF foreach my $t (qw(f)) { test_psgi($app, sub { my ($cb) = @_; - my $u = $pfx . "/blah%40example.com/$t"; + my $u = $pfx . "/blah\@example.com/$t"; my $res = $cb->(GET($u)); - is(301, $res->code, "redirect for missing /"); + is(301, $res->code, "redirect for legacy /f"); my $location = $res->header('Location'); - like($location, qr!/\Q$t\E/\z!, + like($location, qr!/blah\@example\.com/\z!, 'redirected with missing /'); }); } @@ -108,11 +119,11 @@ EOF test_psgi($app, sub { my ($cb) = @_; my $atomurl = 'http://example.com/test/new.atom'; - my $res = $cb->(GET('http://example.com/test/')); + my $res = $cb->(GET('http://example.com/test/new.html')); is(200, $res->code, 'success response received'); - like($res->content, qr!href="\Q$atomurl\E"!, + like($res->content, qr!href="new\.atom"!, 'atom URL generated'); - like($res->content, qr!href="blah%40example\.com/"!, + like($res->content, qr!href="blah\@example\.com/"!, 'index generated'); }); @@ -121,23 +132,29 @@ EOF my $res = $cb->(GET($pfx . '/atom.xml')); is(200, $res->code, 'success response received for atom'); like($res->content, - qr!link\s+href="\Q$pfx\E/blah%40example\.com/"!s, + qr!link\s+href="\Q$pfx\E/blah\@example\.com/"!s, 'atom feed generated correct URL'); }); - foreach my $t (('', 'f/')) { - test_psgi($app, sub { - my ($cb) = @_; - my $path = "/blah%40example.com/$t"; - my $res = $cb->(GET($pfx . $path)); - is(200, $res->code, "success for $path"); - like($res->content, qr!<title>hihi - Me</title>!, - "HTML returned"); - }); - } test_psgi($app, sub { my ($cb) = @_; - my $res = $cb->(GET($pfx . '/blah%40example.com/raw')); + my $path = '/blah@example.com/'; + my $res = $cb->(GET($pfx . $path)); + is(200, $res->code, "success for $path"); + like($res->content, qr!<title>hihi - Me</title>!, + "HTML returned"); + + $path .= 'f/'; + $res = $cb->(GET($pfx . $path)); + is(301, $res->code, "redirect for $path"); + my $location = $res->header('Location'); + like($location, qr!/blah\@example\.com/\z!, + '/$MESSAGE_ID/f/ redirected to /$MESSAGE_ID/'); + }); + + test_psgi($app, sub { + my ($cb) = @_; + my $res = $cb->(GET($pfx . '/blah@example.com/raw')); is(200, $res->code, 'success response received for /*/raw'); like($res->content, qr!^From !sm, "mbox returned"); }); @@ -146,47 +163,41 @@ EOF foreach my $t (qw(m f)) { test_psgi($app, sub { my ($cb) = @_; - my $res = $cb->(GET($pfx . "/$t/blah%40example.com.txt")); + my $res = $cb->(GET($pfx . "/$t/blah\@example.com.txt")); is(301, $res->code, "redirect for old $t .txt link"); my $location = $res->header('Location'); - like($location, qr!/blah%40example\.com/raw\z!, + like($location, qr!/blah\@example\.com/raw\z!, ".txt redirected to /raw"); }); } my %umap = ( 'm' => '', - 'f' => 'f/', + 'f' => '', 't' => 't/', ); while (my ($t, $e) = each %umap) { test_psgi($app, sub { my ($cb) = @_; - my $res = $cb->(GET($pfx . "/$t/blah%40example.com.html")); + my $res = $cb->(GET($pfx . "/$t/blah\@example.com.html")); is(301, $res->code, "redirect for old $t .html link"); my $location = $res->header('Location'); like($location, - qr!/blah%40example\.com/$e(?:#u)?\z!, + qr!/blah\@example\.com/$e(?:#u)?\z!, ".html redirected to new location"); }); } foreach my $sfx (qw(mbox mbox.gz)) { test_psgi($app, sub { my ($cb) = @_; - my $res = $cb->(GET($pfx . "/t/blah%40example.com.$sfx")); + my $res = $cb->(GET($pfx . "/t/blah\@example.com.$sfx")); is(301, $res->code, 'redirect for old thread link'); my $location = $res->header('Location'); like($location, - qr!/blah%40example\.com/t\.mbox(?:\.gz)?\z!, + qr!/blah\@example\.com/t\.mbox(?:\.gz)?\z!, "$sfx redirected to /mbox.gz"); }); } } done_testing(); - -sub run_with_env { - my ($env, @args) = @_; - my $init = sub { foreach my $k (keys %$env) { $ENV{$k} = $env->{$k} } }; - run(@args, init => $init); -} diff --git a/t/precheck.t b/t/precheck.t index 3f2c5d5b..0c3ce1c4 100644 --- a/t/precheck.t +++ b/t/precheck.t @@ -4,32 +4,46 @@ use strict; use warnings; use Test::More; use Email::Simple; -use Email::Filter; use PublicInbox::MDA; sub do_checks { my ($s) = @_; - my $f = Email::Filter->new(data => $s->as_string); - my $recipient = 'foo@example.com'; - ok(!PublicInbox::MDA->precheck($f, $recipient), + ok(!PublicInbox::MDA->precheck($s, $recipient), "wrong ORIGINAL_RECIPIENT rejected"); $recipient = 'b@example.com'; - ok(PublicInbox::MDA->precheck($f, $recipient), + ok(PublicInbox::MDA->precheck($s, $recipient), "ORIGINAL_RECIPIENT in To: is OK"); $recipient = 'c@example.com'; - ok(PublicInbox::MDA->precheck($f, $recipient), + ok(PublicInbox::MDA->precheck($s, $recipient), "ORIGINAL_RECIPIENT in Cc: is OK"); $recipient = [ 'c@example.com', 'd@example.com' ]; - ok(PublicInbox::MDA->precheck($f, $recipient), + ok(PublicInbox::MDA->precheck($s, $recipient), "alias list is OK"); } { + my $s = Email::Simple->create( + header => [ + From => 'abc@example.com', + To => 'abc@example.com', + Cc => 'c@example.com, another-list@example.com', + 'Content-Type' => 'text/plain', + Subject => 'list is fine', + 'Message-ID' => '<MID@host>', + Date => 'Wed, 09 Apr 2014 01:28:34 +0000', + ], + body => "hello world\n", + ); + my $addr = [ 'c@example.com', 'd@example.com' ]; + ok(PublicInbox::MDA->precheck($s, $addr), 'Cc list is OK'); +} + +{ do_checks(Email::Simple->create( header => [ From => 'a@example.com', @@ -72,8 +86,7 @@ sub do_checks { ], body => "hello world\n", ); - my $f = Email::Filter->new(data => $s->as_string); - ok(!PublicInbox::MDA->precheck($f, $recipient), + ok(!PublicInbox::MDA->precheck($s, $recipient), "missing From: is rejected"); } diff --git a/t/psgi_attach.t b/t/psgi_attach.t new file mode 100644 index 00000000..0d20b7f7 --- /dev/null +++ b/t/psgi_attach.t @@ -0,0 +1,117 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use Email::MIME; +use File::Temp qw/tempdir/; +my $tmpdir = tempdir('psgi-attach-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $maindir = "$tmpdir/main.git"; +my $addr = 'test-public@example.com'; +my $cfgpfx = "publicinbox.test"; +my @mods = qw(HTTP::Request::Common Plack::Test URI::Escape); +foreach my $mod (@mods) { + eval "require $mod"; + plan skip_all => "$mod missing for plack.t" if $@; +} +use_ok $_ foreach @mods; +use PublicInbox::Import; +use PublicInbox::Git; +use PublicInbox::Config; +use PublicInbox::WWW; +use_ok 'PublicInbox::WwwAttach'; +use Plack::Builder; +my $config = PublicInbox::Config->new({ + "$cfgpfx.address" => $addr, + "$cfgpfx.mainrepo" => $maindir, +}); +is(0, system(qw(git init -q --bare), $maindir), "git init (main)"); +my $git = PublicInbox::Git->new($maindir); +my $im = PublicInbox::Import->new($git, 'test', $addr); + +{ + open my $fh, '<', '/dev/urandom' or die "unable to open urandom: $!\n"; + sysread($fh, my $buf, 8); + is(8, length($buf), 'read some random data'); + my $qp = "abcdef=g\n==blah\n"; + my $b64 = 'b64'.$buf."\n"; + my $txt = "plain\ntext\npass\nthrough\n"; + my $dot = "dotfile\n"; + my $parts = [ + Email::MIME->create( + attributes => { + filename => 'queue-pee', + content_type => 'text/plain', + encoding => 'quoted-printable' + }, + body => $qp), + Email::MIME->create( + attributes => { + filename => 'bayce-sixty-four', + content_type => 'appication/octet-stream', + encoding => 'base64', + }, + body => $b64), + Email::MIME->create( + attributes => { + filename => 'noop.txt', + content_type => 'text/plain', + }, + body => $txt), + Email::MIME->create( + attributes => { + filename => '.dotfile', + content_type => 'text/plain', + }, + body => $dot), + ]; + my $mime = Email::MIME->create( + parts => $parts, + header_str => [ From => 'root@z', 'Message-Id' => '<Z@B>', + Subject => 'hi'] + ); + $mime = $mime->as_string; + $mime =~ s/\r\n/\n/g; # normalize to LF only + $mime = Email::MIME->new($mime); + $im->add($mime); + $im->done; + + my $www = PublicInbox::WWW->new($config); + test_psgi(sub { $www->call(@_) }, sub { + my ($cb) = @_; + my $res; + $res = $cb->(GET('/test/Z%40B/')); + my @href = ($res->content =~ /^href="([^"]+)"/gms); + @href = grep(/\A[\d\.]+-/, @href); + is_deeply([qw(1-queue-pee 2-bayce-sixty-four 3-noop.txt + 4-a.txt)], + \@href, 'attachment links generated'); + + $res = $cb->(GET('/test/Z%40B/1-queue-pee')); + my $qp_res = $res->content; + ok(length($qp_res) >= length($qp), 'QP length is close'); + like($qp_res, qr/\n\z/s, 'trailing newline exists'); + # is(index($qp_res, $qp), 0, 'QP trailing newline is there'); + $qp_res =~ s/\r\n/\n/g; + is(index($qp_res, $qp), 0, 'QP trailing newline is there'); + + $res = $cb->(GET('/test/Z%40B/2-base-sixty-four')); + is(quotemeta($res->content), quotemeta($b64), + 'Base64 matches exactly'); + + $res = $cb->(GET('/test/Z%40B/3-noop.txt')); + my $txt_res = $res->content; + ok(length($txt_res) >= length($txt), + 'plain text almost matches'); + like($txt_res, qr/\n\z/s, 'trailing newline exists in text'); + is(index($txt_res, $txt), 0, 'plain text not truncated'); + + $res = $cb->(GET('/test/Z%40B/4-a.txt')); + my $dot_res = $res->content; + ok(length($dot_res) >= length($dot), 'dot almost matches'); + $res = $cb->(GET('/test/Z%40B/4-any-filename.txt')); + is($res->content, $dot_res, 'user-specified filename is OK'); + + }); +} +done_testing(); diff --git a/t/psgi_mount.t b/t/psgi_mount.t new file mode 100644 index 00000000..4a515c6a --- /dev/null +++ b/t/psgi_mount.t @@ -0,0 +1,78 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use Email::MIME; +use File::Temp qw/tempdir/; +my $tmpdir = tempdir('psgi-path-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $maindir = "$tmpdir/main.git"; +my $addr = 'test-public@example.com'; +my $cfgpfx = "publicinbox.test"; +my @mods = qw(HTTP::Request::Common Plack::Test URI::Escape); +foreach my $mod (@mods) { + eval "require $mod"; + plan skip_all => "$mod missing for plack.t" if $@; +} +use_ok $_ foreach @mods; +use PublicInbox::Import; +use PublicInbox::Git; +use PublicInbox::Config; +use PublicInbox::WWW; +use Plack::Builder; +use Plack::App::URLMap; +my $config = PublicInbox::Config->new({ + "$cfgpfx.address" => $addr, + "$cfgpfx.mainrepo" => $maindir, +}); +is(0, system(qw(git init -q --bare), $maindir), "git init (main)"); +my $git = PublicInbox::Git->new($maindir); +my $im = PublicInbox::Import->new($git, 'test', $addr); +{ + my $mime = Email::MIME->new(<<EOF); +From: Me <me\@example.com> +To: You <you\@example.com> +Cc: $addr +Message-Id: <blah\@example.com> +Subject: hihi +Date: Thu, 01 Jan 1970 00:00:00 +0000 + +zzzzzz +EOF + $im->add($mime); + $im->done; +} + +my $www = PublicInbox::WWW->new($config); +my $app = builder { + enable 'Head'; + mount '/a' => builder { sub { $www->call(@_) } }; + mount '/b' => builder { sub { $www->call(@_) } }; +}; + +test_psgi($app, sub { + my ($cb) = @_; + my $res; + # Atom feed: + $res = $cb->(GET('/a/test/new.atom')); + like($res->content, qr!\bhttp://[^/]+/a/test/!, + 'URLs which exist in Atom feed are mount-aware'); + unlike($res->content, qr!\b\Qhttp://[^/]+/test/\E!, + 'No URLs which are not mount-aware'); + + # redirects + $res = $cb->(GET('/a/test/blah%40example.com/')); + is($res->code, 200, 'OK with URLMap mount'); + $res = $cb->(GET('/a/test/blah%40example.com/raw')); + is($res->code, 200, 'OK with URLMap mount'); + $res = $cb->(GET('/a/test/m/blah%40example.com.html')); + is($res->header('Location'), + 'http://localhost/a/test/blah@example.com/', + 'redirect functions properly under mount'); + + $res = $cb->(GET('/test/blah%40example.com/')); + is($res->code, 404, 'intentional 404 with URLMap mount'); + +}); + +done_testing(); diff --git a/t/psgi_text.t b/t/psgi_text.t new file mode 100644 index 00000000..bf565f83 --- /dev/null +++ b/t/psgi_text.t @@ -0,0 +1,39 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use Email::MIME; +use File::Temp qw/tempdir/; +my $tmpdir = tempdir('psgi-text-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $maindir = "$tmpdir/main.git"; +my $addr = 'test-public@example.com'; +my $cfgpfx = "publicinbox.test"; +my @mods = qw(HTTP::Request::Common Plack::Test URI::Escape); +foreach my $mod (@mods) { + eval "require $mod"; + plan skip_all => "$mod missing for psgi_text.t" if $@; +} +use_ok $_ foreach @mods; +use PublicInbox::Import; +use PublicInbox::Git; +use PublicInbox::Config; +use PublicInbox::WWW; +use_ok 'PublicInbox::WwwText'; +use Plack::Builder; +my $config = PublicInbox::Config->new({ + "$cfgpfx.address" => $addr, + "$cfgpfx.mainrepo" => $maindir, +}); +is(0, system(qw(git init -q --bare), $maindir), "git init (main)"); +my $www = PublicInbox::WWW->new($config); + +test_psgi(sub { $www->call(@_) }, sub { + my ($cb) = @_; + my $res; + $res = $cb->(GET('/test/_/text/help/')); + like($res->content, qr!<title>public-inbox help.*</title>!, + 'default help'); +}); + +done_testing(); diff --git a/t/qspawn.t b/t/qspawn.t new file mode 100644 index 00000000..9c42e100 --- /dev/null +++ b/t/qspawn.t @@ -0,0 +1,62 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use Test::More; +use_ok 'PublicInbox::Qspawn'; + +my $limiter = PublicInbox::Qspawn::Limiter->new(1); +{ + my $x = PublicInbox::Qspawn->new([qw(true)]); + my $run = 0; + $x->start($limiter, sub { + my ($rpipe) = @_; + is(0, sysread($rpipe, my $buf, 1), 'read zero bytes'); + ok(!$x->finish, 'no error on finish'); + $run = 1; + }); + is($run, 1, 'callback ran alright'); +} + +{ + my $x = PublicInbox::Qspawn->new([qw(false)]); + my $run = 0; + $x->start($limiter, sub { + my ($rpipe) = @_; + is(0, sysread($rpipe, my $buf, 1), 'read zero bytes from false'); + my $err = $x->finish; + is($err, 256, 'error on finish'); + $run = 1; + }); + is($run, 1, 'callback ran alright'); +} + +foreach my $cmd ([qw(sleep 1)], [qw(sh -c), 'sleep 1; false']) { + my $s = PublicInbox::Qspawn->new($cmd); + my @run; + $s->start($limiter, sub { + my ($rpipe) = @_; + push @run, 'sleep'; + is(0, sysread($rpipe, my $buf, 1), 'read zero bytes'); + }); + my $n = 0; + my @t = map { + my $i = $n++; + my $x = PublicInbox::Qspawn->new([qw(true)]); + $x->start($limiter, sub { + my ($rpipe) = @_; + push @run, $i; + }); + [$x, $i] + } (0..2); + + if ($cmd->[-1] =~ /false\z/) { + ok($s->finish, 'got error on false after sleep'); + } else { + ok(!$s->finish, 'no error on sleep'); + } + ok(!$_->[0]->finish, "true $_->[1] succeeded") foreach @t; + is_deeply([qw(sleep 0 1 2)], \@run, 'ran in order'); +} + +done_testing(); + +1; @@ -33,13 +33,14 @@ ok($@, "exception raised on non-existent DB"); } my $rw = PublicInbox::SearchIdx->new($git_dir, 1); -my $ro = PublicInbox::Search->new($git_dir); +$rw->_xdb_acquire; +$rw->_xdb_release; $rw = undef; +my $ro = PublicInbox::Search->new($git_dir); my $rw_commit = sub { - $rw->{xdb}->commit_transaction if $rw; - $rw = undef; + $rw->{xdb}->commit_transaction if $rw && $rw->{xdb}; $rw = PublicInbox::SearchIdx->new($git_dir, 1); - $rw->{xdb}->begin_transaction; + $rw->_xdb_acquire->begin_transaction; }; { @@ -85,6 +86,7 @@ my $rw_commit = sub { 'Message-ID' => '<last@s>', From => 'John Smith <js@example.com>', To => 'list@example.com', + Cc => 'foo@example.com', ], body => "goodbye forever :<\n"); @@ -121,19 +123,19 @@ sub filter_mids { is($res->{total}, 0, "path variant `$p' does not match"); } - $res = $ro->query('subject:(Hello world)'); + $res = $ro->query('s:(Hello world)'); @res = filter_mids($res); - is_deeply(\@res, \@exp, 'got expected results for subject:() match'); + is_deeply(\@res, \@exp, 'got expected results for s:() match'); - $res = $ro->query('subject:"Hello world"'); + $res = $ro->query('s:"Hello world"'); @res = filter_mids($res); - is_deeply(\@res, \@exp, 'got expected results for subject:"" match'); + is_deeply(\@res, \@exp, 'got expected results for s:"" match'); - $res = $ro->query('subject:"Hello world"', {limit => 1}); + $res = $ro->query('s:"Hello world"', {limit => 1}); is(scalar @{$res->{msgs}}, 1, "limit works"); my $first = $res->{msgs}->[0]; - $res = $ro->query('subject:"Hello world"', {offset => 1}); + $res = $ro->query('s:"Hello world"', {offset => 1}); is(scalar @{$res->{msgs}}, 1, "offset works"); my $second = $res->{msgs}->[0]; @@ -179,7 +181,7 @@ sub filter_mids { $rw_commit->(); $ro->reopen; - # Subject: + # subject my $res = $ro->query('ghost'); my @exp = sort qw(ghost-message@s ghost-reply@s); my @res = filter_mids($res); @@ -274,10 +276,11 @@ sub filter_mids { # circular references { + my $s = 'foo://'. ('Circle' x 15).'/foo'; my $doc_id = $rw->add_message(Email::MIME->create( + header => [ Subject => $s ], header_str => [ Date => 'Sat, 02 Oct 2010 00:00:01 +0000', - Subject => 'Circle', 'Message-ID' => '<circle@a>', 'References' => '<circle@a>', 'In-Reply-To' => '<circle@a>', @@ -289,6 +292,139 @@ sub filter_mids { my $smsg = $rw->lookup_message('circle@a'); $smsg->ensure_metadata; is($smsg->references, '', "no references created"); + my $msg = PublicInbox::SearchMsg->load_doc($smsg->{doc}); + is($s, $msg->subject, 'long subject not rewritten'); +} + +{ + my $str = eval { + my $mbox = 't/utf8.mbox'; + open(my $fh, '<', $mbox) or die "failed to open mbox: $mbox\n"; + local $/; + <$fh> + }; + $str =~ s/\AFrom [^\n]+\n//s; + my $mime = Email::MIME->new($str); + my $doc_id = $rw->add_message($mime); + ok($doc_id > 0, 'message indexed doc_id with UTF-8'); + my $smsg = $rw->lookup_message('testmessage@example.com'); + my $msg = PublicInbox::SearchMsg->load_doc($smsg->{doc}); + + is($mime->header('Subject'), $msg->subject, 'UTF-8 subject preserved'); +} + +{ + my $res = $ro->query('d:19931002..20101002'); + ok(scalar @{$res->{msgs}} > 0, 'got results within range'); + $res = $ro->query('d:20101003..'); + is(scalar @{$res->{msgs}}, 0, 'nothing after 20101003'); + $res = $ro->query('d:..19931001'); + is(scalar @{$res->{msgs}}, 0, 'nothing before 19931001'); +} + +# names and addresses +{ + my $res = $ro->query('t:list@example.com'); + is(scalar @{$res->{msgs}}, 6, 'searched To: successfully'); + foreach my $smsg (@{$res->{msgs}}) { + like($smsg->to, qr/\blist\@example\.com\b/, 'to appears'); + } + + $res = $ro->query('tc:list@example.com'); + is(scalar @{$res->{msgs}}, 6, 'searched To+Cc: successfully'); + foreach my $smsg (@{$res->{msgs}}) { + my $tocc = join("\n", $smsg->to, $smsg->cc); + like($tocc, qr/\blist\@example\.com\b/, 'tocc appears'); + } + + foreach my $pfx ('tcf:', 'c:') { + $res = $ro->query($pfx . 'foo@example.com'); + is(scalar @{$res->{msgs}}, 1, + "searched $pfx successfully for Cc:"); + foreach my $smsg (@{$res->{msgs}}) { + like($smsg->cc, qr/\bfoo\@example\.com\b/, + 'cc appears'); + } + } + + foreach my $pfx ('', 'tcf:', 'f:') { + $res = $ro->query($pfx . 'Laggy'); + is(scalar @{$res->{msgs}}, 1, + "searched $pfx successfully for From:"); + foreach my $smsg (@{$res->{msgs}}) { + like($smsg->from, qr/Laggy Sender/, + "From appears with $pfx"); + } + } +} + +{ + $rw_commit->(); + $ro->reopen; + my $res = $ro->query('b:hello'); + is(scalar @{$res->{msgs}}, 0, 'no match on body search only'); + $res = $ro->query('bs:smith'); + is(scalar @{$res->{msgs}}, 0, + 'no match on body+subject search for From'); + + $res = $ro->query('q:theatre'); + is(scalar @{$res->{msgs}}, 1, 'only one quoted body'); + like($res->{msgs}->[0]->from, qr/\AQuoter/, 'got quoted body'); + + $res = $ro->query('nq:theatre'); + is(scalar @{$res->{msgs}}, 1, 'only one non-quoted body'); + like($res->{msgs}->[0]->from, qr/\ANon-Quoter/, 'got non-quoted body'); + + foreach my $pfx (qw(b: bs:)) { + $res = $ro->query($pfx . 'theatre'); + is(scalar @{$res->{msgs}}, 2, "searched both bodies for $pfx"); + like($res->{msgs}->[0]->from, qr/\ANon-Quoter/, + "non-quoter first for $pfx"); + } +} + +{ + my $part1 = Email::MIME->create( + attributes => { + content_type => 'text/plain', + disposition => 'attachment', + charset => 'US-ASCII', + encoding => 'quoted-printable', + filename => 'attached_fart.txt', + }, + body_str => 'inside the attachment', + ); + my $part2 = Email::MIME->create( + attributes => { + content_type => 'text/plain', + disposition => 'attachment', + charset => 'US-ASCII', + encoding => 'quoted-printable', + filename => 'part_deux.txt', + }, + body_str => 'inside another', + ); + my $amsg = Email::MIME->create( + header_str => [ + Subject => 'see attachment', + 'Message-ID' => '<file@attached>', + From => 'John Smith <js@example.com>', + To => 'list@example.com', + ], + parts => [ $part1, $part2 ], + ); + ok($rw->add_message($amsg), 'added attachment'); + $rw_commit->(); + $ro->reopen; + my $n = $ro->query('n:attached_fart.txt'); + is(scalar @{$n->{msgs}}, 1, 'got result for n:'); + my $res = $ro->query('part_deux.txt'); + is(scalar @{$res->{msgs}}, 1, 'got result without n:'); + is($n->{msgs}->[0]->mid, $res->{msgs}->[0]->mid, + 'same result with and without'); + my $txt = $ro->query('"inside another"'); + is($txt->{msgs}->[0]->mid, $res->{msgs}->[0]->mid, + 'search inside text attachments works'); } done_testing(); diff --git a/t/spamcheck_spamc.t b/t/spamcheck_spamc.t new file mode 100644 index 00000000..65ac5c2e --- /dev/null +++ b/t/spamcheck_spamc.t @@ -0,0 +1,49 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use Cwd; +use Email::Simple; +use IO::File; +use File::Temp qw/tempdir/; +use Fcntl qw(:DEFAULT SEEK_SET); +my $tmpdir = tempdir('spamcheck_spamc-XXXXXX', TMPDIR => 1, CLEANUP => 1); + +use_ok 'PublicInbox::Spamcheck::Spamc'; +my $spamc = PublicInbox::Spamcheck::Spamc->new; +$spamc->{checkcmd} = [qw(cat)]; + +{ + open my $fh, '+>', "$tmpdir/file" or die "open failed: $!"; + ok(!$spamc->spamcheck($fh), 'empty '.ref($fh)); +} +ok(!$spamc->spamcheck(IO::File->new_tmpfile), 'IO::File->new_tmpfile'); + +my $dst = ''; +my $src = <<'EOF'; +Date: Thu, 01 Jan 1970 00:00:00 +0000 +To: <e@example.com> +From: <e@example.com> +Subject: test +Message-ID: <testmessage@example.com> + +EOF +ok($spamc->spamcheck(Email::Simple->new($src), \$dst), 'Email::Simple works'); +is($dst, $src, 'input == output'); + +$dst = ''; +$spamc->{checkcmd} = ['sh', '-c', 'cat; false']; +ok(!$spamc->spamcheck(Email::Simple->new($src), \$dst), 'Failed check works'); +is($dst, $src, 'input == output for spammy example'); + +for my $l (qw(ham spam)) { + my $file = "$tmpdir/$l.out"; + $spamc->{$l.'cmd'} = ['tee', $file ]; + my $method = $l.'learn'; + ok($spamc->$method(Email::Simple->new($src)), "$method OK"); + open my $fh, '<', $file or die "failed to open $file: $!"; + is(eval { local $/, <$fh> }, $src, "$l command ran alright"); +} + +done_testing(); @@ -70,6 +70,15 @@ use PublicInbox::Spawn qw(which spawn popen_rd); is(sysread($fh, $buf, 6), 6, 'sysread got 6 bytes'); is($buf, "hello\n", 'tied gets works'); is(sysread($fh, $buf, 6), 0, 'sysread got EOF'); + $? = 1; + close $fh; + is($?, 0, '$? set properly'); +} + +{ + my $fh = popen_rd([qw(false)]); + close $fh; + isnt($?, 0, '$? set properly: '.$?); } { @@ -78,8 +87,9 @@ use PublicInbox::Spawn qw(which spawn popen_rd); is(kill(0, $pid), 1, 'child process is running'); ok(!defined(sysread($fh, my $buf, 1)) && $!{EAGAIN}, 'sysread returned quickly with EAGAIN'); - is(kill(15, $pid), 1, 'child process killed early'); + is(kill(9, $pid), 1, 'child process killed early'); is(waitpid($pid, 0), $pid, 'child process reapable'); + isnt($?, 0, '$? set properly: '.$?); } done_testing(); diff --git a/t/thread-cycle.t b/t/thread-cycle.t new file mode 100644 index 00000000..b0844490 --- /dev/null +++ b/t/thread-cycle.t @@ -0,0 +1,89 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use strict; +use warnings; +use Test::More; +use_ok('PublicInbox::SearchMsg'); +use_ok('PublicInbox::SearchThread'); +use Email::Simple; +my $mt = eval { + require Mail::Thread; + no warnings 'once'; + $Mail::Thread::nosubject = 1; + $Mail::Thread::noprune = 1; +}; +my @check; +my @msgs = map { + my $msg = $_; + $msg->{references} =~ s/\s+/ /sg if $msg->{references}; + my $simple = Email::Simple->create(header => [ + 'Message-Id' => "<$msg->{mid}>", + 'References' => $msg->{references}, + ]); + push @check, $simple; + bless $msg, 'PublicInbox::SearchMsg' +} ( + +# data from t/testbox-6 in Mail::Thread 2.55: + { mid => '20021124145312.GA1759@nlin.net' }, + { mid => 'slrnau448m.7l4.markj+0111@cloaked.freeserve.co.uk', + references => '<20021124145312.GA1759@nlin.net>', + }, + { mid => '15842.10677.577458.656565@jupiter.akutech-local.de', + references => '<20021124145312.GA1759@nlin.net> + <slrnau448m.7l4.markj+0111@cloaked.freeserve.co.uk>', + }, + { mid => '20021125171807.GK8236@somanetworks.com', + references => '<20021124145312.GA1759@nlin.net> + <slrnau448m.7l4.markj+0111@cloaked.freeserve.co.uk> + <15842.10677.577458.656565@jupiter.akutech-local.de>', + }, + { mid => '15843.12163.554914.469248@jupiter.akutech-local.de', + references => '<20021124145312.GA1759@nlin.net> + <slrnau448m.7l4.markj+0111@cloaked.freeserve.co.uk> + <15842.10677.577458.656565@jupiter.akutech-local.de> + <E18GPHf-0000zp-00@cloaked.freeserve.co.uk>', + }, + { mid => 'E18GPHf-0000zp-00@cloaked.freeserve.co.uk', + references => '<20021124145312.GA1759@nlin.net> + <slrnau448m.7l4.markj+0111@cloaked.freeserve.co.uk> + <15842.10677.577458.656565@jupiter.akutech-local.de>' + } +); + +my $st = thread_to_s(\@msgs); + +SKIP: { + skip 'Mail::Thread missing', 1 unless $mt; + $mt = Mail::Thread->new(@check); + $mt->thread; + $mt->order(sub { sort { $a->messageid cmp $b->messageid } @_ }); + my $check = ''; + + my @q = map { (0, $_) } $mt->rootset; + while (@q) { + my $level = shift @q; + my $node = shift @q or next; + $check .= (" "x$level) . $node->messageid . "\n"; + unshift @q, $level + 1, $node->child, $level, $node->next; + } + is($check, $st, 'Mail::Thread output matches'); +} + +done_testing(); + +sub thread_to_s { + my $th = PublicInbox::SearchThread->new(shift); + $th->thread; + $th->order(sub { [ sort { $a->{id} cmp $b->{id} } @{$_[0]} ] }); + my $st = ''; + my @q = map { (0, $_) } @{$th->{rootset}}; + while (@q) { + my $level = shift @q; + my $node = shift @q or next; + $st .= (" "x$level). "$node->{id}\n"; + my $cl = $level + 1; + unshift @q, map { ($cl, $_) } @{$node->{children}}; + } + $st; +} @@ -4,7 +4,46 @@ use strict; use warnings; use Test::More; use Email::MIME; -use PublicInbox::View; +use Plack::Util; +use_ok 'PublicInbox::View'; + +my @q = ( + 'foo@bar', 'foo@bar', + 'a b', "'a b'", + "a'b", "'a'\\''b'", +); +while (@q) { + my $input = shift @q; + my $expect = shift @q; + my $res = PublicInbox::View::squote_maybe($input); + is($res, $expect, "quote $input => $res"); +} + +# FIXME: make this test less fragile +my $ctx = { + env => { HTTP_HOST => 'example.com', 'psgi.url_scheme' => 'http' }, + -inbox => Plack::Util::inline_object( + name => 'test', + search => sub { undef }, + base_url => sub { 'http://example.com/' }, + cloneurl => sub {[]}, + nntp_url => sub {[]}, + description => sub { '' }), +}; +$ctx->{-inbox}->{-primary_address} = 'test@example.com'; + +sub msg_html ($) { + my ($mime) = @_; + + my $s = ''; + my $r = PublicInbox::View::msg_html($ctx, $mime); + my $body = $r->[2]; + while (defined(my $buf = $body->getline)) { + $s .= $buf; + } + $body->close; + $s; +} # plain text { @@ -41,26 +80,12 @@ EOF body => $body, )->as_string; my $mime = Email::MIME->new($s); - my $html = PublicInbox::View::msg_html(undef, $mime); + my $html = msg_html($mime); # ghetto tests - like($html, qr!<a\nhref="\.\./raw"!s, "raw link present"); + like($html, qr!<a\nhref="raw"!s, "raw link present"); like($html, qr/hello world\b/, "body present"); like($html, qr/> keep this inline/, "short quoted text is inline"); - like($html, qr/<a\nid=[^>]+><\/a>> Long and wordy/, - "long quoted text is anchored"); - - # short page - my $pfx = "../hello%40example.com/f/"; - $mime = Email::MIME->new($s); - my $short = PublicInbox::View::msg_html(undef, $mime, $pfx); - like($short, qr!<a\nhref="\.\./hello%40example\.com/f/!s, - "MID link present"); - like($short, qr/\n> keep this inline/, - "short quoted text is inline"); - like($short, qr/<a\nhref="\Q$pfx\E#[^>]+>Long and wordy/, - "long quoted text is made into a link"); - ok(length($short) < length($html), "short page is shorter"); } # multipart crap @@ -85,8 +110,8 @@ EOF parts => $parts, ); - my $html = PublicInbox::View::msg_html(undef, $mime); - like($html, qr/hi\n-+ part #2 -+\nbye\n/, "multipart split"); + my $html = msg_html($mime); + like($html, qr/hi\n.*-- Attachment #2.*\nbye\n/s, "multipart split"); } # multipart email with attached patch @@ -114,8 +139,8 @@ EOF parts => $parts, ); - my $html = PublicInbox::View::msg_html(undef, $mime); - like($html, qr!see attached patch\n-+ foo\.patch -+\n--- a/file\n!, + my $html = msg_html($mime); + like($html, qr!.*Attachment #2: foo\.patch --!, "parts split with filename"); } @@ -140,15 +165,18 @@ EOF ); my $orig = $mime->body_raw; - my $html = PublicInbox::View::msg_html(undef, $mime); + my $html = msg_html($mime); like($orig, qr/hi =3D bye=/, "our test used QP correctly"); like($html, qr/\bhi = bye\b/, "HTML output decoded QP"); } { use PublicInbox::MID qw/id_compress/; + + # n.b: this is probably invalid since we dropped CGI for PSGI: like(id_compress('foo%bar@wtf'), qr/\A[a-f0-9]{40}\z/, "percent always converted to sha1 to workaround buggy httpds"); + is(id_compress('foobar-wtf'), 'foobar-wtf', 'regular ID not compressed'); } diff --git a/t/watch_maildir.t b/t/watch_maildir.t new file mode 100644 index 00000000..3969c80d --- /dev/null +++ b/t/watch_maildir.t @@ -0,0 +1,126 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +use Test::More; +use File::Temp qw/tempdir/; +use Email::MIME; +use Cwd; +use PublicInbox::Config; +my @mods = qw(Filesys::Notify::Simple); +foreach my $mod (@mods) { + eval "require $mod"; + plan skip_all => "$mod missing for watch_maildir.t" if $@; +} + +my $tmpdir = tempdir('watch_maildir-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $git_dir = "$tmpdir/test.git"; +my $maildir = "$tmpdir/md"; +my $spamdir = "$tmpdir/spam"; +use_ok 'PublicInbox::WatchMaildir'; +use_ok 'PublicInbox::Emergency'; +my $cfgpfx = "publicinbox.test"; +my $addr = 'test-public@example.com'; +is(system(qw(git init -q --bare), $git_dir), 0, 'initialized git dir'); + +my $msg = <<EOF; +From: user\@example.com +To: $addr +Subject: spam +Message-Id: <a\@b.com> +Date: Sat, 18 Jun 2016 00:00:00 +0000 + +something +EOF +PublicInbox::Emergency->new($maildir)->prepare(\$msg); +ok(POSIX::mkfifo("$maildir/cur/fifo", 0777)); +my $sem = PublicInbox::Emergency->new($spamdir); # create dirs + +my $config = PublicInbox::Config->new({ + "$cfgpfx.address" => $addr, + "$cfgpfx.mainrepo" => $git_dir, + "$cfgpfx.watch" => "maildir:$maildir", + "$cfgpfx.filter" => 'PublicInbox::Filter::Vger', + "publicinboxlearn.watchspam" => "maildir:$spamdir", +}); + +PublicInbox::WatchMaildir->new($config)->scan; +my $git = PublicInbox::Git->new($git_dir); +my @list = $git->qx(qw(rev-list refs/heads/master)); +is(scalar @list, 1, 'one revision in rev-list'); + +my $write_spam = sub { + is(scalar glob("$spamdir/new/*"), undef, 'no spam existing'); + $sem->prepare(\$msg); + $sem->commit; + my @new = glob("$spamdir/new/*"); + is(scalar @new, 1); + my @p = split(m!/+!, $new[0]); + ok(link($new[0], "$spamdir/cur/".$p[-1].":2,S")); + is(unlink($new[0]), 1); +}; +$write_spam->(); +is(unlink(glob("$maildir/new/*")), 1, 'unlinked old spam'); +PublicInbox::WatchMaildir->new($config)->scan; +@list = $git->qx(qw(rev-list refs/heads/master)); +is(scalar @list, 2, 'two revisions in rev-list'); +@list = $git->qx(qw(ls-tree -r --name-only refs/heads/master)); +is(scalar @list, 0, 'tree is empty'); + +# check with scrubbing +{ + $msg .= qq(-- +To unsubscribe from this list: send the line "unsubscribe git" in +the body of a message to majordomo\@vger.kernel.org +More majordomo info at http://vger.kernel.org/majordomo-info.html\n); + PublicInbox::Emergency->new($maildir)->prepare(\$msg); + PublicInbox::WatchMaildir->new($config)->scan; + @list = $git->qx(qw(ls-tree -r --name-only refs/heads/master)); + is(scalar @list, 1, 'tree has one file'); + my $mref = $git->cat_file('HEAD:'.$list[0]); + like($$mref, qr/something\n\z/s, 'message scrubbed on import'); + + is(unlink(glob("$maildir/new/*")), 1, 'unlinked spam'); + $write_spam->(); + PublicInbox::WatchMaildir->new($config)->scan; + @list = $git->qx(qw(ls-tree -r --name-only refs/heads/master)); + is(scalar @list, 0, 'tree is empty'); + @list = $git->qx(qw(rev-list refs/heads/master)); + is(scalar @list, 4, 'four revisions in rev-list'); +} + +{ + my $fail_bin = getcwd()."/t/fail-bin"; + ok(-x "$fail_bin/spamc", "mock spamc exists"); + my $fail_path = "$fail_bin:$ENV{PATH}"; # for spamc ham mock + local $ENV{PATH} = $fail_path; + PublicInbox::Emergency->new($maildir)->prepare(\$msg); + $config->{'publicinboxwatch.spamcheck'} = 'spamc'; + { + local $SIG{__WARN__} = sub {}; # quiet spam check warning + PublicInbox::WatchMaildir->new($config)->scan; + } + @list = $git->qx(qw(ls-tree -r --name-only refs/heads/master)); + is(scalar @list, 0, 'tree has no files spamc checked'); + is(unlink(glob("$maildir/new/*")), 1); +} + +{ + my $main_bin = getcwd()."/t/main-bin"; + ok(-x "$main_bin/spamc", "mock spamc exists"); + my $main_path = "$main_bin:$ENV{PATH}"; # for spamc ham mock + local $ENV{PATH} = $main_path; + PublicInbox::Emergency->new($maildir)->prepare(\$msg); + $config->{'publicinboxwatch.spamcheck'} = 'spamc'; + @list = $git->qx(qw(ls-tree -r --name-only refs/heads/master)); + PublicInbox::WatchMaildir->new($config)->scan; + @list = $git->qx(qw(ls-tree -r --name-only refs/heads/master)); + is(scalar @list, 1, 'tree has one file after spamc checked'); + + # XXX: workaround some weird caching/memoization in cat-file, + # shouldn't be an issue in real-world use, though... + $git = PublicInbox::Git->new($git_dir); + + my $mref = $git->cat_file('refs/heads/master:'.$list[0]); + like($$mref, qr/something\n\z/s, 'message scrubbed on import'); +} + +done_testing; |