diff options
Diffstat (limited to 'lib')
55 files changed, 5294 insertions, 2466 deletions
diff --git a/lib/PublicInbox/Address.pm b/lib/PublicInbox/Address.pm new file mode 100644 index 00000000..2c0bb040 --- /dev/null +++ b/lib/PublicInbox/Address.pm @@ -0,0 +1,27 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +package PublicInbox::Address; +use strict; +use warnings; + +# very loose regexes, here. We don't need RFC-compliance, +# just enough to make thing sanely displayable and pass to git + +sub emails { + ($_[0] =~ /([\w\.\+=\-]+\@[\w\.\-]+)>?\s*(?:\(.*?\))?(?:,\s*|\z)/g) +} + +sub names { + map { + tr/\r\n\t/ /; + s/\s*<([^<]+)\z//; + my $e = $1; + s/\A['"\s]*//; + s/['"\s]*\z//; + $e = $_ =~ /\S/ ? $_ : $e; + $e =~ s/\@\S+\z//; + $e; + } split(/\@+[\w\.\-]+>?\s*(?:\(.*?\))?(?:,\s*|\z)/, $_[0]); +} + +1; diff --git a/lib/PublicInbox/AltId.pm b/lib/PublicInbox/AltId.pm new file mode 100644 index 00000000..6fdc3a2d --- /dev/null +++ b/lib/PublicInbox/AltId.pm @@ -0,0 +1,38 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +package PublicInbox::AltId; +use strict; +use warnings; +use URI::Escape qw(uri_unescape); + +# spec: TYPE:PREFIX:param1=value1¶m2=value2&... +# Example: serial:gmane:file=/path/to/altmsgmap.sqlite3 +sub new { + my ($class, $inbox, $spec) = @_; + my ($type, $prefix, $query) = split(/:/, $spec, 3); + $type eq 'serial' or die "non-serial not supported, yet\n"; + + require PublicInbox::Msgmap; + + my %params = map { + my ($k, $v) = split(/=/, uri_unescape($_), 2); + $v = '' unless defined $v; + ($k, $v); + } split(/[&;]/, $query); + my $f = $params{file} or die "file: required for $type spec $spec\n"; + unless (index($f, '/') == 0) { + $f = "$inbox->{mainrepo}/public-inbox/$f"; + } + bless { + mm_alt => PublicInbox::Msgmap->new_file($f), + xprefix => 'X'.uc($prefix), + }, $class; +} + +sub mid2alt { + my ($self, $mid) = @_; + $self->{mm_alt}->num_for($mid); +} + +1; diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index f84a9550..8d66cf8c 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -5,21 +5,31 @@ package PublicInbox::Config; use strict; use warnings; -use base qw/Exporter/; -our @EXPORT_OK = qw/try_cat/; -use File::Path::Expand qw/expand_filename/; +require PublicInbox::Inbox; +use PublicInbox::Spawn qw(popen_rd); # returns key-value pairs of config directives in a hash # if keys may be multi-value, the value is an array ref containing all values sub new { my ($class, $file) = @_; $file = default_file() unless defined($file); - bless git_config_dump($file), $class; + $file = ref $file ? $file : git_config_dump($file); + my $self = bless $file, $class; + + # caches + $self->{-by_addr} ||= {}; + $self->{-by_name} ||= {}; + $self->{-by_newsgroup} ||= {}; + $self->{-limiters} ||= {}; + $self; } sub lookup { my ($self, $recipient) = @_; my $addr = lc($recipient); + my $inbox = $self->{-by_addr}->{$addr}; + return $inbox if $inbox; + my $pfx; foreach my $k (keys %$self) { @@ -37,29 +47,65 @@ sub lookup { last; } } - defined $pfx or return; + _fill($self, $pfx); +} - my %rv; - foreach my $k (qw(mainrepo address filter)) { - my $v = $self->{"$pfx.$k"}; - $rv{$k} = $v if defined $v; +sub lookup_name ($$) { + my ($self, $name) = @_; + $self->{-by_name}->{$name} || _fill($self, "publicinbox.$name"); +} + +sub each_inbox { + my ($self, $cb) = @_; + my %seen; + foreach my $k (keys %$self) { + $k =~ /\Apublicinbox\.([A-Z0-9a-z-]+)\.mainrepo\z/ or next; + next if $seen{$1}; + $seen{$1} = 1; + my $ibx = lookup_name($self, $1) or next; + $cb->($ibx); } - my $listname = $pfx; - $listname =~ s/\Apublicinbox\.//; - $rv{listname} = $listname; - my $v = $rv{address}; - $rv{-primary_address} = ref($v) eq 'ARRAY' ? $v->[0] : $v; - \%rv; +} + +sub lookup_newsgroup { + my ($self, $ng) = @_; + $ng = lc($ng); + my $rv = $self->{-by_newsgroup}->{$ng}; + return $rv if $rv; + + foreach my $k (keys %$self) { + $k =~ /\A(publicinbox\.[\w-]+)\.newsgroup\z/ or next; + my $v = $self->{$k}; + my $pfx = $1; + if ($v eq $ng) { + $rv = _fill($self, $pfx); + return $rv; + } + } + undef; +} + +sub limiter { + my ($self, $name) = @_; + $self->{-limiters}->{$name} ||= do { + require PublicInbox::Qspawn; + my $max; + # XXX "limiter.<name>.max" was a historical mistake + foreach my $pfx (qw(publicinboxlimiter limiter)) { + $max ||= $self->{"$pfx.$name.max"}; + } + PublicInbox::Qspawn::Limiter->new($max); + }; } sub get { - my ($self, $listname, $key) = @_; + my ($self, $inbox, $key) = @_; - $self->{"publicinbox.$listname.$key"}; + $self->{"publicinbox.$inbox.$key"}; } -sub config_dir { $ENV{PI_DIR} || expand_filename('~/.public-inbox') } +sub config_dir { $ENV{PI_DIR} || "$ENV{HOME}/.public-inbox" } sub default_file { my $f = $ENV{PI_CONFIG}; @@ -72,9 +118,9 @@ sub git_config_dump { my ($in, $out); my @cmd = (qw/git config/, "--file=$file", '-l'); my $cmd = join(' ', @cmd); - my $pid = open(my $fh, '-|', @cmd); - defined $pid or die "$cmd failed: $!"; + my $fh = popen_rd(\@cmd) or die "popen_rd failed for $file: $!\n"; my %rv; + local $/ = "\n"; foreach my $line (<$fh>) { chomp $line; my ($k, $v) = split(/=/, $line, 2); @@ -90,19 +136,44 @@ sub git_config_dump { $rv{$k} = $v; } } - close $fh or die "failed to close ($cmd) pipe: $!"; - $? and warn "$$ $cmd exited with: ($pid) $?"; + close $fh or die "failed to close ($cmd) pipe: $?"; \%rv; } -sub try_cat { - my ($path) = @_; - my $rv; - if (open(my $fh, '<', $path)) { - local $/; - $rv = <$fh>; +sub _fill { + my ($self, $pfx) = @_; + my $rv = {}; + + foreach my $k (qw(mainrepo address filter url newsgroup + infourl watch watchheader httpbackendmax)) { + my $v = $self->{"$pfx.$k"}; + $rv->{$k} = $v if defined $v; + } + + # TODO: more arrays, we should support multi-value for + # more things to encourage decentralization + foreach my $k (qw(altid nntpmirror)) { + if (defined(my $v = $self->{"$pfx.$k"})) { + $rv->{$k} = ref($v) eq 'ARRAY' ? $v : [ $v ]; + } + } + + return unless $rv->{mainrepo}; + my $name = $pfx; + $name =~ s/\Apublicinbox\.//; + $rv->{name} = $name; + $rv->{-pi_config} = $self; + $rv = PublicInbox::Inbox->new($rv); + my $v = $rv->{address}; + if (ref($v) eq 'ARRAY') { + $self->{-by_addr}->{lc($_)} = $rv foreach @$v; + } else { + $self->{-by_addr}->{lc($v)} = $rv; + } + if (my $ng = $rv->{newsgroup}) { + $self->{-by_newsgroup}->{$ng} = $rv; } - $rv; + $self->{-by_name}->{$name} = $rv; } 1; diff --git a/lib/PublicInbox/Daemon.pm b/lib/PublicInbox/Daemon.pm index c9594a37..37aa4187 100644 --- a/lib/PublicInbox/Daemon.pm +++ b/lib/PublicInbox/Daemon.pm @@ -9,11 +9,13 @@ use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/; use IO::Handle; use IO::Socket; use Cwd qw/abs_path/; +use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); STDOUT->autoflush(1); STDERR->autoflush(1); require Danga::Socket; require POSIX; require PublicInbox::Listener; +require PublicInbox::ParentPipe; my @CMD; my $set_user; my (@cfg_listen, $stdout, $stderr, $group, $user, $pid_file, $daemonize); @@ -101,17 +103,18 @@ sub check_absolute ($$) { } sub daemonize () { - foreach my $i (0..$#ARGV) { - my $arg = $ARGV[$i]; - next unless -e $arg; - $ARGV[$i] = abs_path($arg); - } - check_absolute('stdout', $stdout); - check_absolute('stderr', $stderr); - check_absolute('pid-file', $pid_file); + if ($daemonize) { + foreach my $i (0..$#ARGV) { + my $arg = $ARGV[$i]; + next unless -e $arg; + $ARGV[$i] = abs_path($arg); + } + check_absolute('stdout', $stdout); + check_absolute('stderr', $stderr); + check_absolute('pid-file', $pid_file); - chdir '/' or die "chdir failed: $!"; - open(STDIN, '+<', '/dev/null') or die "redirect stdin failed: $!"; + chdir '/' or die "chdir failed: $!"; + } return unless (defined $pid_file || defined $group || defined $user || $daemonize); @@ -140,15 +143,17 @@ sub daemonize () { }; if ($daemonize) { - my ($pid, $err) = do_fork(); - die "could not fork: $err\n" unless defined $pid; + my $pid = fork; + die "could not fork: $!\n" unless defined $pid; exit if $pid; + open(STDIN, '+<', '/dev/null') or + die "redirect stdin failed: $!\n"; open STDOUT, '>&STDIN' or die "redirect stdout failed: $!\n"; open STDERR, '>&STDIN' or die "redirect stderr failed: $!\n"; POSIX::setsid(); - ($pid, $err) = do_fork(); - die "could not fork: $err\n" unless defined $pid; + $pid = fork; + die "could not fork: $!\n" unless defined $pid; exit if $pid; } if (defined $pid_file) { @@ -161,29 +166,44 @@ sub daemonize () { } } -sub worker_quit () { + +sub worker_quit { + my ($reason) = @_; # killing again terminates immediately: exit unless @listeners; $_->close foreach @listeners; # call Danga::Socket::close @listeners = (); + $reason->close if ref($reason) eq 'PublicInbox::ParentPipe'; - # give slow clients 30s to finish reading/writing whatever - Danga::Socket->AddTimer(30, sub { exit }); - + my $proc_name; + my $warn = 0; # drop idle connections and try to quit gracefully Danga::Socket->SetPostLoopCallback(sub { my ($dmap, undef) = @_; my $n = 0; + my $now = clock_gettime(CLOCK_MONOTONIC); foreach my $s (values %$dmap) { - if ($s->can('busy') && $s->busy) { - $n = 1; + $s->can('busy') or next; + if ($s->busy($now)) { + ++$n; } else { # close as much as possible, early as possible $s->close; } } + if ($n) { + if (($warn + 5) < time) { + warn "$$ quitting, $n client(s) left\n"; + $warn = time; + } + unless (defined $proc_name) { + $proc_name = (split(/\s+/, $0))[0]; + $proc_name =~ s!\A.*?([^/]+)\z!$1!; + } + $0 = "$proc_name quitting, $n client(s) left"; + } $n; # true: loop continues, false: loop breaks }); } @@ -264,9 +284,9 @@ sub upgrade () { $pid_file .= '.oldbin'; write_pid($pid_file); } - my ($pid, $err) = do_fork(); + my $pid = fork; unless (defined $pid) { - warn "fork failed: $err\n"; + warn "fork failed: $!\n"; return; } if ($pid == 0) { @@ -291,17 +311,6 @@ sub kill_workers ($) { } } -sub do_fork () { - my $new = POSIX::SigSet->new; - $new->fillset; - my $old = POSIX::SigSet->new; - POSIX::sigprocmask(&POSIX::SIG_BLOCK, $new, $old) or die "SIG_BLOCK: $!"; - my $pid = fork; - my $err = $!; - POSIX::sigprocmask(&POSIX::SIG_SETMASK, $old) or die "SIG_SETMASK: $!"; - ($pid, $err); -} - sub upgrade_aborted ($) { my ($p) = @_; warn "reexec PID($p) died with: $?\n"; @@ -336,6 +345,7 @@ sub unlink_pid_file_safe_ish ($$) { return unless defined $unlink_pid && $unlink_pid == $$; open my $fh, '<', $file or return; + local $/ = "\n"; defined(my $read_pid = <$fh>) or return; chomp $read_pid; if ($read_pid == $unlink_pid) { @@ -359,6 +369,7 @@ sub master_loop { } reopen_logs(); # main loop + my $quit = 0; while (1) { while (my $s = shift @caught) { if ($s eq 'USR1') { @@ -367,10 +378,16 @@ sub master_loop { } elsif ($s eq 'USR2') { upgrade(); } elsif ($s =~ /\A(?:QUIT|TERM|INT)\z/) { - # drops pipes and causes children to die - exit + exit if $quit++; + kill_workers($s); } elsif ($s eq 'WINCH') { - $worker_processes = 0; + if (-t STDIN || -t STDOUT || -t STDERR) { + warn +"ignoring SIGWINCH since we are not daemonized\n"; + $SIG{WINCH} = 'IGNORE'; + } else { + $worker_processes = 0; + } } elsif ($s eq 'HUP') { $worker_processes = $set_workers; kill_workers($s); @@ -390,6 +407,11 @@ sub master_loop { } my $n = scalar keys %pids; + if ($quit) { + exit if $n == 0; + $set_workers = $worker_processes = $n = 0; + } + if ($n > $worker_processes) { while (my ($k, $v) = each %pids) { kill('TERM', $k) if $v >= $worker_processes; @@ -397,9 +419,9 @@ sub master_loop { $n = $worker_processes; } foreach my $i ($n..($worker_processes - 1)) { - my ($pid, $err) = do_fork(); + my $pid = fork; if (!defined $pid) { - warn "failed to fork worker[$i]: $err\n"; + warn "failed to fork worker[$i]: $!\n"; } elsif ($pid == 0) { $set_user->() if $set_user; return $p0; # run normal work code @@ -419,13 +441,12 @@ sub daemon_loop ($$) { my $parent_pipe; if ($worker_processes > 0) { $refresh->(); # preload by default - $parent_pipe = master_loop(); # returns if in child process - my $fd = fileno($parent_pipe); - Danga::Socket->AddOtherFds($fd => *worker_quit); + my $fh = master_loop(); # returns if in child process + $parent_pipe = PublicInbox::ParentPipe->new($fh, *worker_quit); } else { reopen_logs(); $set_user->() if $set_user; - $SIG{USR2} = sub { worker_quit() if upgrade() }; + $SIG{USR2} = sub { worker_quit('USR2') if upgrade() }; $refresh->(); } $uid = $gid = undef; @@ -433,6 +454,8 @@ sub daemon_loop ($$) { $SIG{QUIT} = $SIG{INT} = $SIG{TERM} = *worker_quit; $SIG{USR1} = *reopen_logs; $SIG{HUP} = $refresh; + $SIG{CHLD} = 'DEFAULT'; + $SIG{$_} = 'IGNORE' for qw(USR2 TTIN TTOU WINCH); # this calls epoll_create: @listeners = map { PublicInbox::Listener->new($_, $post_accept) diff --git a/lib/PublicInbox/Emergency.pm b/lib/PublicInbox/Emergency.pm new file mode 100644 index 00000000..4ee86215 --- /dev/null +++ b/lib/PublicInbox/Emergency.pm @@ -0,0 +1,96 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Emergency Maildir delivery for MDA +package PublicInbox::Emergency; +use strict; +use warnings; +use Fcntl qw(:DEFAULT SEEK_SET); +use Sys::Hostname qw(hostname); +use IO::Handle; + +sub new { + my ($class, $dir) = @_; + + -d $dir or mkdir($dir) or die "failed to mkdir($dir): $!\n"; + foreach (qw(new tmp cur)) { + my $d = "$dir/$_"; + next if -d $d; + -d $d or mkdir($d) or die "failed to mkdir($d): $!\n"; + } + bless { dir => $dir, files => {}, t => 0, cnt => 0 }, $class; +} + +sub _fn_in { + my ($self, $dir) = @_; + my @host = split(/\./, hostname); + my $now = time; + if ($self->{t} != $now) { + $self->{t} = $now; + $self->{cnt} = 0; + } else { + $self->{cnt}++; + } + + my $f; + do { + $f = "$self->{dir}/$dir/$self->{t}.$$"."_$self->{cnt}.$host[0]"; + $self->{cnt}++; + } while (-e $f); + $f; +} + +sub prepare { + my ($self, $strref) = @_; + + die "already in transaction: $self->{tmp}" if $self->{tmp}; + my ($tmp, $fh); + do { + $tmp = _fn_in($self, 'tmp'); + $! = undef; + } while (!sysopen($fh, $tmp, O_CREAT|O_EXCL|O_RDWR) && $!{EEXIST}); + print $fh $$strref or die "write failed: $!"; + $fh->flush or die "flush failed: $!"; + $fh->autoflush(1); + $self->{fh} = $fh; + $self->{tmp} = $tmp; +} + +sub abort { + my ($self) = @_; + delete $self->{fh}; + my $tmp = delete $self->{tmp} or return; + + unlink($tmp) or warn "Failed to unlink $tmp: $!"; + undef; +} + +sub fh { + my ($self) = @_; + my $fh = $self->{fh} or die "{fh} not open!\n"; + seek($fh, 0, SEEK_SET) or die "seek(fh) failed: $!"; + sysseek($fh, 0, SEEK_SET) or die "sysseek(fh) failed: $!"; + $fh; +} + +sub commit { + my ($self) = @_; + + delete $self->{fh}; + my $tmp = delete $self->{tmp} or return; + my $new; + do { + $new = _fn_in($self, 'new'); + } while (!link($tmp, $new) && $!{EEXIST}); + my @sn = stat($new) or die "stat $new failed: $!"; + my @st = stat($tmp) or die "stat $tmp failed: $!"; + if ($st[0] == $sn[0] && $st[1] == $sn[1]) { + unlink($tmp) or warn "Failed to unlink $tmp: $!"; + } else { + warn "stat($new) and stat($tmp) differ"; + } +} + +sub DESTROY { commit($_[0]) } + +1; diff --git a/lib/PublicInbox/EvCleanup.pm b/lib/PublicInbox/EvCleanup.pm new file mode 100644 index 00000000..2b77c617 --- /dev/null +++ b/lib/PublicInbox/EvCleanup.pm @@ -0,0 +1,74 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# event cleanups (currently for Danga::Socket) +package PublicInbox::EvCleanup; +use strict; +use warnings; +use base qw(Danga::Socket); +use fields qw(rd); +my $singleton; +my $asapq = [ [], undef ]; +my $nextq = [ [], undef ]; +my $laterq = [ [], undef ]; + +sub once_init () { + my $self = fields::new('PublicInbox::EvCleanup'); + my ($r, $w); + pipe($r, $w) or die "pipe: $!"; + $self->SUPER::new($w); + $self->{rd} = $r; # never read, since we never write.. + $self; +} + +sub _run_all ($) { + my ($q) = @_; + + my $run = $q->[0]; + $q->[0] = []; + $q->[1] = undef; + $_->() foreach @$run; +} + +sub _run_asap () { _run_all($asapq) } +sub _run_next () { _run_all($nextq) } +sub _run_later () { _run_all($laterq) } + +# Called by Danga::Socket +sub event_write { + my ($self) = @_; + $self->watch_write(0); + _run_asap(); +} + +sub _asap_timer () { + $singleton ||= once_init(); + $singleton->watch_write(1); + 1; +} + +sub asap ($) { + my ($cb) = @_; + push @{$asapq->[0]}, $cb; + $asapq->[1] ||= _asap_timer(); +} + +sub next_tick ($) { + my ($cb) = @_; + push @{$nextq->[0]}, $cb; + $nextq->[1] ||= Danga::Socket->AddTimer(0, *_run_next); +} + +sub later ($) { + my ($cb) = @_; + push @{$laterq->[0]}, $cb; + $laterq->[1] ||= Danga::Socket->AddTimer(60, *_run_later); +} + +END { + _run_asap(); + _run_next(); + _run_later(); +} + +1; diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm index 6356c324..67ce0407 100644 --- a/lib/PublicInbox/ExtMsg.pm +++ b/lib/PublicInbox/ExtMsg.pm @@ -8,79 +8,77 @@ package PublicInbox::ExtMsg; use strict; use warnings; -use URI::Escape qw(uri_escape_utf8); use PublicInbox::Hval; use PublicInbox::MID qw/mid2path/; +use PublicInbox::WwwStream; # TODO: user-configurable our @EXT_URL = ( - 'http://mid.gmane.org/%s', - 'https://lists.debian.org/msgid-search/%s', # leading "//" denotes protocol-relative (http:// or https://) - '//mid.mail-archive.com/%s', '//marc.info/?i=%s', + '//mid.mail-archive.com/%s', + 'http://mid.gmane.org/%s', + 'https://lists.debian.org/msgid-search/%s', + '//docs.FreeBSD.org/cgi/mid.cgi?db=mid&id=%s', + 'https://www.w3.org/mid/%s', + 'http://www.postgresql.org/message-id/%s', + 'https://lists.debconf.org/cgi-lurker/keyword.cgi?'. + 'doc-url=/lurker&format=en.html&query=id:%s' ); sub ext_msg { my ($ctx) = @_; - my $pi_config = $ctx->{pi_config}; - my $listname = $ctx->{listname}; + my $cur = $ctx->{-inbox}; my $mid = $ctx->{mid}; - my $cgi = $ctx->{cgi}; - my $env = $cgi->{env}; eval { require PublicInbox::Search }; my $have_xap = $@ ? 0 : 1; - my (@nox, @pfx); + my (@nox, @ibx, @found); - foreach my $k (keys %$pi_config) { - $k =~ /\Apublicinbox\.([A-Z0-9a-z-]+)\.url\z/ or next; - my $list = $1; - next if $list eq $listname; + $ctx->{www}->{pi_config}->each_inbox(sub { + my ($other) = @_; + return if $other->{name} eq $cur->{name} || !$other->base_url; - my $git_dir = $pi_config->{"publicinbox.$list.mainrepo"}; - defined $git_dir or next; - - my $url = $pi_config->{"publicinbox.$list.url"}; - defined $url or next; - - $url =~ s!/+\z!!; - $url = PublicInbox::Hval::prurl($env, $url); + my $s = $other->search; + if (!$s) { + push @nox, $other; + return; + } # try to find the URL with Xapian to avoid forking - if ($have_xap) { - my $s; - my $doc_id = eval { - $s = PublicInbox::Search->new($git_dir); - $s->find_unique_doc_id('mid', $mid); - }; - if ($@) { - # xapian not configured for this repo - } else { - # maybe we found it! - return r302($url, $mid) if (defined $doc_id); - - # no point in trying the fork fallback if we - # know Xapian is up-to-date but missing the - # message in the current repo - push @pfx, { git_dir => $git_dir, url => $url }; - next; - } + my $doc_id = eval { $s->find_unique_doc_id('mid', $mid) }; + if ($@) { + # xapian not configured properly for this repo + push @nox, $other; + return; } - # queue up for forking after we've tried Xapian on all of them - push @nox, { git_dir => $git_dir, url => $url }; - } + # maybe we found it! + if (defined $doc_id) { + push @found, $other; + } else { + # no point in trying the fork fallback if we + # know Xapian is up-to-date but missing the + # message in the current repo + push @ibx, $other; + } + }); - # Xapian not installed or configured for some repos - my $path = "HEAD:" . mid2path($mid); + return exact($ctx, \@found, $mid) if @found; - foreach my $n (@nox) { - # TODO: reuse existing PublicInbox::Git objects to save forks - my $git = PublicInbox::Git->new($n->{git_dir}); - my (undef, $type, undef) = $git->check($path); - return r302($n->{url}, $mid) if ($type && $type eq 'blob'); + # Xapian not installed or configured for some repos, + # do a full MID check (this is expensive...): + if (@nox) { + my $path = mid2path($mid); + foreach my $other (@nox) { + my (undef, $type, undef) = $other->path_check($path); + + if ($type && $type eq 'blob') { + push @found, $other; + } + } } + return exact($ctx, \@found, $mid) if @found; # fall back to partial MID matching my $n_partial = 0; @@ -88,22 +86,15 @@ sub ext_msg { eval { require PublicInbox::Msgmap }; my $have_mm = $@ ? 0 : 1; - my $base_url = $cgi->base->as_string; if ($have_mm) { my $tmp_mid = $mid; - my $url; again: - $url = $base_url . $listname; - unshift @pfx, { git_dir => $ctx->{git_dir}, url => $url }; - foreach my $pfx (@pfx) { - my $git_dir = delete $pfx->{git_dir} or next; - my $mm = eval { PublicInbox::Msgmap->new($git_dir) }; - - $mm or next; + unshift @ibx, $cur; + foreach my $ibx (@ibx) { + my $mm = $ibx->mm or next; if (my $res = $mm->mid_prefixes($tmp_mid)) { $n_partial += scalar(@$res); - $pfx->{res} = $res; - push @partial, $pfx; + push @partial, [ $ibx, $res ]; } } # fixup common errors: @@ -113,51 +104,76 @@ again: } my $code = 404; - my $h = PublicInbox::Hval->new_msgid($mid, 1); - my $href = $h->as_href; + my $h = PublicInbox::Hval->new_msgid($mid); + my $href = $h->{href}; my $html = $h->as_html; - my $title = "Message-ID <$html> not found"; - my $s = "<html><head><title>$title</title>" . - "</head><body><pre><b>$title</b>\n"; - + my $title = "<$html> not found"; + my $s = "<pre>Message-ID <$html>\nnot found\n"; if ($n_partial) { $code = 300; my $es = $n_partial == 1 ? '' : 'es'; - $s.= "\n$n_partial partial match$es found:\n\n"; - foreach my $pfx (@partial) { - my $u = $pfx->{url}; - foreach my $m (@{$pfx->{res}}) { + $s .= "\n$n_partial partial match$es found:\n\n"; + my $cur_name = $cur->{name}; + foreach my $pair (@partial) { + my ($ibx, $res) = @$pair; + my $env = $ctx->{env} if $ibx->{name} eq $cur_name; + my $u = $ibx->base_url($env) or next; + foreach my $m (@$res) { my $p = PublicInbox::Hval->new_msgid($m); - my $r = $p->as_href; + my $r = $p->{href}; my $t = $p->as_html; - $s .= qq{<a\nhref="$u/$r/">$u/$t/</a>\n}; + $s .= qq{<a\nhref="$u$r/">$u$t/</a>\n}; } } } + my $ext = ext_urls($ctx, $mid, $href, $html); + if ($ext ne '') { + $s .= $ext; + $code = 300; + } + $ctx->{-html_tip} = $s .= '</pre>'; + $ctx->{-title_html} = $title; + $ctx->{-upfx} = '../'; + PublicInbox::WwwStream->response($ctx, $code); +} + +sub ext_urls { + my ($ctx, $mid, $href, $html) = @_; # Fall back to external repos if configured if (@EXT_URL && index($mid, '@') >= 0) { - $code = 300; - $s .= "\nPerhaps try an external site:\n\n"; + my $env = $ctx->{env}; + my $e = "\nPerhaps try an external site:\n\n"; foreach my $url (@EXT_URL) { my $u = PublicInbox::Hval::prurl($env, $url); my $r = sprintf($u, $href); my $t = sprintf($u, $html); - $s .= qq{<a\nhref="$r">$t</a>\n}; + $e .= qq{<a\nhref="$r">$t</a>\n}; } + return $e; } - $s .= '</pre></body></html>'; - - [$code, ['Content-Type'=>'text/html; charset=UTF-8'], [$s]]; + '' } -# Redirect to another public-inbox which is mapped by $pi_config -sub r302 { - my ($url, $mid) = @_; - $url .= '/' . uri_escape_utf8($mid) . '/'; - [ 302, - [ 'Location' => $url, 'Content-Type' => 'text/plain' ], - [ "Redirecting to\n$url\n" ] ] +sub exact { + my ($ctx, $found, $mid) = @_; + my $h = PublicInbox::Hval->new_msgid($mid); + my $href = $h->{href}; + my $html = $h->as_html; + my $title = "<$html> found in "; + my $end = @$found == 1 ? 'another inbox' : 'other inboxes'; + $ctx->{-title_html} = $title . $end; + $ctx->{-upfx} = '../'; + my $ext_urls = ext_urls($ctx, $mid, $href, $html); + my $code = (@$found == 1 && $ext_urls eq '') ? 200 : 300; + $ctx->{-html_tip} = join('', + "<pre>Message-ID: <$html>\nfound in $end:\n\n", + (map { + my $u = $_->base_url; + qq(<a\nhref="$u$href/">$u$html/</a>\n) + } @$found), + $ext_urls, '</pre>'); + PublicInbox::WwwStream->response($ctx, $code); } 1; diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index d014434f..31d82adb 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -5,194 +5,111 @@ package PublicInbox::Feed; use strict; use warnings; -use Email::Address; use Email::MIME; -use Date::Parse qw(strptime); -use PublicInbox::Hval qw/ascii_html/; -use PublicInbox::Git; use PublicInbox::View; -use PublicInbox::MID qw/mid_clean mid2path/; -use POSIX qw/strftime/; +use PublicInbox::WwwAtomStream; use constant { - DATEFMT => '%Y-%m-%dT%H:%M:%SZ', # Atom standard MAX_PER_PAGE => 25, # this needs to be tunable }; # main function sub generate { my ($ctx) = @_; - sub { emit_atom($_[0], $ctx) }; + my @paths; + each_recent_blob($ctx, sub { push @paths, $_[0] }); + return _no_thread() unless @paths; + + my $ibx = $ctx->{-inbox}; + PublicInbox::WwwAtomStream->response($ctx, 200, sub { + while (my $path = shift @paths) { + my $mime = do_cat_mail($ibx, $path) or next; + return $mime; + } + }); } sub generate_thread_atom { my ($ctx) = @_; - sub { emit_atom_thread($_[0], $ctx) }; -} - -sub generate_html_index { - my ($ctx) = @_; - sub { emit_html_index($_[0], $ctx) }; -} - -# private subs - -sub title_tag { - my ($title) = @_; - $title =~ tr/\t\n / /s; # squeeze spaces - # try to avoid the type attribute in title: - $title = ascii_html($title); - my $type = index($title, '&') >= 0 ? "\ntype=\"html\"" : ''; - "<title$type>$title</title>"; -} - -sub atom_header { - my ($feed_opts, $title) = @_; - - $title = title_tag($feed_opts->{description}) unless (defined $title); - - qq(<?xml version="1.0" encoding="us-ascii"?>\n) . - qq{<feed\nxmlns="http://www.w3.org/2005/Atom">} . - qq{$title} . - qq(<link\nrel="alternate"\ntype="text/html") . - qq(\nhref="$feed_opts->{url}"/>) . - qq(<link\nrel="self"\nhref="$feed_opts->{atomurl}"/>) . - qq(<id>mailto:$feed_opts->{id_addr}</id>); -} - -sub emit_atom { - my ($cb, $ctx) = @_; - my $fh = $cb->([ 200, ['Content-Type' => 'application/atom+xml']]); - my $max = $ctx->{max} || MAX_PER_PAGE; - my $feed_opts = get_feedopts($ctx); - my $x = atom_header($feed_opts); - my $git = $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); - each_recent_blob($ctx, sub { - my ($path, undef, $ts) = @_; - if (defined $x) { - $fh->write($x . feed_updated(undef, $ts)); - $x = undef; + my $mid = $ctx->{mid}; + my $res = $ctx->{srch}->get_thread($mid); + return _no_thread() unless $res->{total}; + + my $ibx = $ctx->{-inbox}; + my $html_url = $ibx->base_url($ctx->{env}); + $html_url .= PublicInbox::Hval->new_msgid($mid)->{href}; + $ctx->{-html_url} = $html_url; + my $msgs = $res->{msgs}; + PublicInbox::WwwAtomStream->response($ctx, 200, sub { + while (my $msg = shift @$msgs) { + $msg = $ibx->msg_by_smsg($msg) and + return Email::MIME->new($msg); } - add_to_feed($feed_opts, $fh, $path, $git); }); - end_feed($fh); } -sub _no_thread { - my ($cb) = @_; - my $fh = $cb->([404, ['Content-Type' => 'text/plain']]); - $fh->write("No feed found for thread\n"); - $fh->close; -} - -sub end_feed { - my ($fh) = @_; - Email::Address->purge_cache; - $fh->write('</feed>'); - $fh->close; -} - -sub emit_atom_thread { - my ($cb, $ctx) = @_; - my $res = $ctx->{srch}->get_thread($ctx->{mid}); - return _no_thread($cb) unless $res->{total}; - my $fh = $cb->([200, ['Content-Type' => 'application/atom+xml']]); - my $feed_opts = get_feedopts($ctx); - - my $html_url = $feed_opts->{atomurl} = $ctx->{self_url}; - $html_url =~ s!/t\.atom\z!/!; - $feed_opts->{url} = $html_url; - $feed_opts->{emit_header} = 1; - - my $git = $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); - foreach my $msg (@{$res->{msgs}}) { - add_to_feed($feed_opts, $fh, mid2path($msg->mid), $git); - } - end_feed($fh); -} - -sub emit_html_index { - my ($res, $ctx) = @_; - my $fh = $res->([200,['Content-Type'=>'text/html; charset=UTF-8']]); - - my $max = $ctx->{max} || MAX_PER_PAGE; - my $feed_opts = get_feedopts($ctx); - - my $title = ascii_html($feed_opts->{description} || ''); - my ($footer, $param, $last); - my $state = { ctx => $ctx, seen => {}, anchor_idx => 0, fh => $fh }; - my $srch = $ctx->{srch}; - - my $top = "<b>$title</b> (<a\nhref=\"new.atom\">Atom feed</a>)"; - - if ($srch) { - $top = qq{<form\naction=""><pre>$top} . - qq{ <input\nname=q\ntype=text />} . - qq{<input\ntype=submit\nvalue=search />} . - q{</pre></form><pre>} - } else { - $top = '<pre>' . $top . "\n"; - } - - $fh->write("<html><head><title>$title</title>" . - "<link\nrel=alternate\ntitle=\"Atom feed\"\n". - "href=\"new.atom\"\ntype=\"application/atom+xml\"/>" . - PublicInbox::Hval::STYLE . - "</head><body>$top"); - +sub generate_html_index { + my ($ctx) = @_; # if the 'r' query parameter is given, it is a legacy permalink # which we must continue supporting: - my $cgi = $ctx->{cgi}; - if ($cgi && !$cgi->param('r') && $srch) { - $state->{srch} = $srch; - $last = PublicInbox::View::emit_index_topics($state); - $param = 'o'; - } else { - $last = emit_index_nosrch($ctx, $state); - $param = 'r'; - } - $footer = nav_footer($cgi, $last, $feed_opts, $state, $param); - if ($footer) { - my $list_footer = $ctx->{footer}; - $footer .= "\n\n" . $list_footer if $list_footer; - $footer = "<hr /><pre>$footer</pre>"; + my $qp = $ctx->{qp}; + if ($qp && !$qp->{r} && $ctx->{srch}) { + return PublicInbox::View::index_topics($ctx); } - $fh->write("$footer</body></html>"); - $fh->close; + + my $env = $ctx->{env}; + my $url = $ctx->{-inbox}->base_url($env) . 'new.html'; + my $qs = $env->{QUERY_STRING}; + $url .= "?$qs" if $qs ne ''; + [302, [ 'Location', $url, 'Content-Type', 'text/plain'], + [ "Redirecting to $url\n" ] ]; } -sub emit_index_nosrch { - my ($ctx, $state) = @_; - my $git = $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); +sub new_html { + my ($ctx) = @_; + my @paths; my (undef, $last) = each_recent_blob($ctx, sub { my ($path, $commit, $ts, $u, $subj) = @_; - $state->{first} ||= $commit; - - my $mime = do_cat_mail($git, $path) or return 0; - PublicInbox::View::index_entry($mime, 0, $state); - 1; + $ctx->{first} ||= $commit; + push @paths, $path; }); - Email::Address->purge_cache; - $last; + if (!@paths) { + return [404, ['Content-Type', 'text/plain'], + ["No messages, yet\n"] ]; + } + $ctx->{-html_tip} = '<pre>'; + $ctx->{-upfx} = ''; + $ctx->{-hr} = 1; + PublicInbox::WwwStream->response($ctx, 200, sub { + while (my $path = shift @paths) { + my $m = do_cat_mail($ctx->{-inbox}, $path) or next; + my $more = scalar @paths; + my $s = PublicInbox::View::index_entry($m, $ctx, $more); + return $s; + } + new_html_footer($ctx, $last); + }); +} + +# private subs + +sub _no_thread () { + [404, ['Content-Type', 'text/plain'], ["No feed found for thread\n"]]; } -sub nav_footer { - my ($cgi, $last, $feed_opts, $state, $param) = @_; - $cgi or return ''; - my $old_r = $cgi->param($param); - my $head = ' '; +sub new_html_footer { + my ($ctx, $last) = @_; + my $qp = delete $ctx->{qp} or return; + my $old_r = $qp->{r}; + my $latest = ''; my $next = ' '; - my $first = $state->{first}; - my $anchor = $state->{anchor_idx}; if ($last) { - $next = qq!<a\nhref="?$param=$last">next</a>!; + $next = qq!<a\nhref="?r=$last"\nrel=next>next</a>!; } if ($old_r) { - $head = $cgi->path_info; - $head = qq!<a\nhref="$head">head</a>!; + $latest = qq! <a\nhref='./new.html'>latest</a>!; } - my $atom = "<a\nhref=\"$feed_opts->{atomurl}\">Atom feed</a>"; - "<a\nname=\"s$anchor\">page:</a> $next $head $atom"; + "<hr><pre>page: $next$latest</pre>"; } sub each_recent_blob { @@ -202,11 +119,11 @@ sub each_recent_blob { my $addmsg = qr!^:000000 100644 \S+ \S+ A\t(${hex}{2}/${hex}{38})$!; my $delmsg = qr!^:100644 000000 \S+ \S+ D\t(${hex}{2}/${hex}{38})$!; my $refhex = qr/(?:HEAD|${hex}{4,40})(?:~\d+)?/; - my $cgi = $ctx->{cgi}; + my $qp = $ctx->{qp}; # revision ranges may be specified my $range = 'HEAD'; - my $r = $cgi->param('r') if $cgi; + my $r = $qp->{r} if $qp; if ($r && ($r =~ /\A(?:$refhex\.\.)?$refhex\z/o)) { $range = $r; } @@ -214,9 +131,9 @@ sub each_recent_blob { # get recent messages # we could use git log -z, but, we already know ssoma will not # leave us with filenames with spaces in them.. - my $git = $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); - my $log = $git->popen(qw/log --no-notes --no-color --raw -r - --abbrev-commit/, $git->abbrev, + my $log = $ctx->{-inbox}->git->popen(qw/log + --no-notes --no-color --raw -r + --abbrev=16 --abbrev-commit/, "--format=%h%x00%ct%x00%an%x00%s%x00", $range); my %deleted; # only an optimization at this point @@ -224,11 +141,12 @@ sub each_recent_blob { my $nr = 0; my ($cur_commit, $first_commit, $last_commit); my ($ts, $subj, $u); + local $/ = "\n"; while (defined(my $line = <$log>)) { if ($line =~ /$addmsg/o) { my $add = $1; next if $deleted{$add}; # optimization-only - $nr += $cb->($add, $cur_commit, $ts, $u, $subj); + $cb->($add, $cur_commit, $ts, $u, $subj) and $nr++; if ($nr >= $max) { $last = 1; last; @@ -244,6 +162,7 @@ sub each_recent_blob { } if ($last) { + local $/ = "\n"; while (my $line = <$log>) { if ($line =~ /^(${hex}{7,40})/o) { $last_commit = $1; @@ -256,108 +175,10 @@ sub each_recent_blob { ($first_commit, $last_commit); } -# private functions below -sub get_feedopts { - my ($ctx) = @_; - my $pi_config = $ctx->{pi_config}; - my $listname = $ctx->{listname}; - my $cgi = $ctx->{cgi}; - my %rv; - if (open my $fh, '<', "$ctx->{git_dir}/description") { - chomp($rv{description} = <$fh>); - } else { - $rv{description} = '($GIT_DIR/description missing)'; - } - - if ($pi_config && defined $listname && $listname ne '') { - my $addr = $pi_config->get($listname, 'address') || ""; - $rv{address} = $addr; - $addr = $addr->[0] if ref($addr); - $rv{id_addr} = $addr; - } - $rv{id_addr} ||= 'public-inbox@example.com'; - - my $url_base; - if ($cgi) { - $url_base = $cgi->base->as_string . $listname; - if (my $mid = $ctx->{mid}) { # per-thread feed: - $rv{atomurl} = "$url_base/$mid/t.atom"; - } else { - $rv{atomurl} = "$url_base/new.atom"; - } - } else { - $url_base = "http://example.com"; - $rv{atomurl} = "$url_base/new.atom"; - } - $rv{url} ||= "$url_base/"; - $rv{midurl} = "$url_base/"; - - \%rv; -} - -sub feed_updated { - my ($date, $ts) = @_; - my @t = eval { strptime($date) } if defined $date; - @t = gmtime($ts || time) unless scalar @t; - - '<updated>' . strftime(DATEFMT, @t) . '</updated>'; -} - -# returns 0 (skipped) or 1 (added) -sub add_to_feed { - my ($feed_opts, $fh, $add, $git) = @_; - - my $mime = do_cat_mail($git, $add) or return 0; - my $url = $feed_opts->{url}; - my $midurl = $feed_opts->{midurl}; - - my $header_obj = $mime->header_obj; - my $mid = $header_obj->header_raw('Message-ID'); - defined $mid or return 0; - $mid = PublicInbox::Hval->new_msgid($mid); - my $href = $mid->as_href; - my $content = PublicInbox::View->feed_entry($mime, "$midurl$href/f/"); - defined($content) or return 0; - $mime = undef; - - my $date = $header_obj->header('Date'); - my $updated = feed_updated($date); - - my $title = $header_obj->header('Subject'); - defined $title or return 0; - $title = title_tag($title); - - my $from = $header_obj->header('From') or return 0; - my @from = Email::Address->parse($from) or return 0; - my $name = ascii_html($from[0]->name); - my $email = $from[0]->address; - $email = ascii_html($email); - - if (delete $feed_opts->{emit_header}) { - $fh->write(atom_header($feed_opts, $title) . $updated); - } - $fh->write("<entry><author><name>$name</name><email>$email</email>" . - "</author>$title$updated" . - qq{<content\ntype="xhtml">} . - qq{<div\nxmlns="http://www.w3.org/1999/xhtml">}); - $fh->write($content); - - $add =~ tr!/!!d; - my $h = '[a-f0-9]'; - my (@uuid5) = ($add =~ m!\A($h{8})($h{4})($h{4})($h{4})($h{12})!o); - my $id = 'urn:uuid:' . join('-', @uuid5); - $fh->write(qq!</div></content><link\nhref="$midurl$href/"/>!. - "<id>$id</id></entry>"); - 1; -} - sub do_cat_mail { - my ($git, $path) = @_; - my $mime = eval { - my $str = $git->cat_file("HEAD:$path"); - Email::MIME->new($str); - }; - $@ ? undef : $mime; + my ($ibx, $path) = @_; + my $mime = eval { $ibx->msg_by_path($path) } or return; + Email::MIME->new($mime); } 1; diff --git a/lib/PublicInbox/Filter.pm b/lib/PublicInbox/Filter.pm deleted file mode 100644 index ea6fd33f..00000000 --- a/lib/PublicInbox/Filter.pm +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright (C) 2013-2015 all contributors <meta@public-inbox.org> -# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) -# -# Used to filter incoming mail for -mda and importers -# This only exposes one function: run -# Note: the settings here are highly opinionated. Obviously, this is -# Free Software (AGPLv3), so you may change it if you host yourself. -package PublicInbox::Filter; -use strict; -use warnings; -use Email::MIME; -use Email::MIME::ContentType qw/parse_content_type/; -use Email::Filter; -use IPC::Run; -our $VERSION = '0.0.1'; -use constant NO_HTML => '*** We only accept plain-text email, no HTML ***'; -use constant TEXT_ONLY => '*** We only accept plain-text email ***'; - -# start with the same defaults as mailman -our $BAD_EXT = qr/\.(exe|bat|cmd|com|pif|scr|vbs|cpl|zip)\s*\z/i; -our $MIME_HTML = qr!\btext/x?html\b!i; -our $MIME_TEXT_ANY = qr!\btext/[a-z0-9\+\._-]+\b!i; - -# this is highly opinionated delivery -# returns 0 only if there is nothing to deliver -sub run { - my ($class, $mime, $filter) = @_; - - my $content_type = $mime->header('Content-Type') || 'text/plain'; - - # kill potentially bad/confusing headers - # Note: ssoma already does this, but since we mangle the message, - # we should do this before it gets to ssoma. - # We also kill Mail-{Followup,Reply}-To headers due to - # the nature of public-inbox having no real subscribers. - foreach my $d (qw(status lines content-length - mail-followup-to mail-reply-to)) { - $mime->header_set($d); - } - - if ($content_type =~ m!\btext/plain\b!i) { - return 1; # yay, nothing to do - } elsif ($content_type =~ $MIME_HTML) { - $filter->reject(NO_HTML) if $filter; - # HTML-only, non-multipart - my $body = $mime->body; - my $ct_parsed = parse_content_type($content_type); - dump_html(\$body, $ct_parsed->{attributes}->{charset}); - replace_body($mime, $body); - return 1; - } elsif ($content_type =~ m!\bmultipart/!i) { - return strip_multipart($mime, $content_type, $filter); - } else { - $filter->reject(TEXT_ONLY) if $filter; - replace_body($mime, "$content_type message scrubbed"); - return 0; - } -} - -sub replace_part { - my ($mime, $part, $type) = ($_[0], $_[1], $_[3]); - # don't copy $_[2], that's the body (it may be huge) - - # Email::MIME insists on setting Date:, so just set it consistently - # to avoid conflicts to avoid git merge conflicts in a split brain - # situation. - unless (defined $part->header('Date')) { - my $date = $mime->header('Date') || - 'Thu, 01 Jan 1970 00:00:00 +0000'; - $part->header_set('Date', $date); - } - - $part->charset_set(undef); - $part->name_set(undef); - $part->filename_set(undef); - $part->format_set(undef); - $part->encoding_set('8bit'); - $part->disposition_set(undef); - $part->content_type_set($type); - $part->body_set($_[2]); -} - -# converts one part of a multipart message to text -sub html_part_to_text { - my ($mime, $part) = @_; - my $body = $part->body; - my $ct_parsed = parse_content_type($part->content_type); - dump_html(\$body, $ct_parsed->{attributes}->{charset}); - replace_part($mime, $part, $body, 'text/plain'); -} - -# modifies $_[0] in place -sub dump_html { - my ($body, $charset) = @_; - $charset ||= 'US-ASCII'; - my @cmd = qw(lynx -stdin -stderr -dump); - my $out = ""; - my $err = ""; - - # be careful about remote command injection! - if ($charset =~ /\A([A-Za-z0-9\-]+)\z/) { - push @cmd, "-assume_charset=$charset"; - } - if (IPC::Run::run(\@cmd, $body, \$out, \$err)) { - $out =~ s/\r\n/\n/sg; - $$body = $out; - } else { - # give them an ugly version: - $$body = "public-inbox HTML conversion failed: $err\n" . - $$body . "\n"; - } -} - -# this is to correct old archives during import. -sub strip_multipart { - my ($mime, $content_type, $filter) = @_; - - my (@html, @keep); - my $rejected = 0; - my $ok = 1; - - # scan through all parts once - $mime->walk_parts(sub { - my ($part) = @_; - return if $part->subparts; # walk_parts already recurses - - # some extensions are just bad, reject them outright - my $fn = $part->filename; - if (defined($fn) && $fn =~ $BAD_EXT) { - $filter->reject("Bad file type: $1") if $filter; - $rejected++; - return; - } - - my $part_type = $part->content_type || ''; - if ($part_type =~ m!\btext/plain\b!i) { - push @keep, $part; - } elsif ($part_type =~ $MIME_HTML) { - $filter->reject(NO_HTML) if $filter; - push @html, $part; - } elsif ($part_type =~ $MIME_TEXT_ANY) { - # Give other text attachments the benefit of the doubt, - # here? Could be source code or script the user wants - # help with. - - push @keep, $part; - } elsif ($part_type eq '' || - $part_type =~ m!\bapplication/octet-stream\b!i) { - # unfortunately, some mailers don't set correct types, - # let messages of unknown type through but do not - # change the sender-specified type - if (recheck_type_ok($part)) { - push @keep, $part; - } elsif ($filter) { - $filter->reject("Bad attachment: $part_type ". - TEXT_ONLY); - } else { - $rejected++; - } - } elsif ($part_type =~ m!\bapplication/pgp-signature\b!i) { - # PGP signatures are not huge, we may keep them. - # They can only be valid if it's the last element, - # so we keep them iff the message is unmodified: - if ($rejected == 0 && !@html) { - push @keep, $part; - } - } elsif ($filter) { - $filter->reject("unacceptable mime-type: $part_type ". - TEXT_ONLY); - } else { - # reject everything else, including non-PGP signatures - $rejected++; - } - }); - - if ($content_type =~ m!\bmultipart/alternative\b!i) { - if (scalar @keep == 1) { - return collapse($mime, $keep[0]); - } - } else { # convert HTML parts to plain text - foreach my $part (@html) { - html_part_to_text($mime, $part); - push @keep, $part; - } - } - - if (@keep == 0) { - @keep = (Email::MIME->create( - attributes => { - content_type => 'text/plain', - charset => 'US-ASCII', - encoding => '8bit', - }, - body_str => 'all attachments scrubbed by '. __PACKAGE__ - )); - $ok = 0; - } - if (scalar(@html) || $rejected) { - $mime->parts_set(\@keep); - $mime->body_set($mime->body_raw); - mark_changed($mime); - } # else: no changes - - return $ok; -} - -sub mark_changed { - my ($mime) = @_; - $mime->header_set('X-Content-Filtered-By', __PACKAGE__ ." $VERSION"); -} - -sub collapse { - my ($mime, $part) = @_; - $mime->header_set('Content-Type', $part->content_type); - $mime->body_set($part->body_raw); - my $cte = $part->header('Content-Transfer-Encoding'); - if (defined($cte) && $cte ne '') { - $mime->header_set('Content-Transfer-Encoding', $cte); - } - mark_changed($mime); - return 1; -} - -sub replace_body { - my $mime = $_[0]; - $mime->body_set($_[1]); - $mime->header_set('Content-Type', 'text/plain'); - if ($mime->header('Content-Transfer-Encoding')) { - $mime->header_set('Content-Transfer-Encoding', undef); - } - mark_changed($mime); -} - -# Check for display-able text, no messed up binaries -# Note: we can not rewrite the message with the detected mime type -sub recheck_type_ok { - my ($part) = @_; - my $s = $part->body; - ((length($s) < 0x10000) && ($s =~ /\A([[:print:]\s]+)\z/s)); -} - -1; diff --git a/lib/PublicInbox/Filter/Base.pm b/lib/PublicInbox/Filter/Base.pm new file mode 100644 index 00000000..b2bb1462 --- /dev/null +++ b/lib/PublicInbox/Filter/Base.pm @@ -0,0 +1,110 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# base class for creating per-list or per-project filters +package PublicInbox::Filter::Base; +use strict; +use warnings; +use PublicInbox::MsgIter; +use constant MAX_MID_SIZE => 244; # max term size - 1 in Xapian + +sub No ($) { "*** We only accept plain-text mail, No $_[0] ***" } + +our %DEFAULTS = ( + reject_suffix => [ qw(exe bat cmd com pif scr vbs cpl zip swf swfl) ], + reject_type => [ 'text/html:'.No('HTML'), 'text/xhtml:'.No('HTML'), + 'application/vnd.*:'.No('vendor-specific formats'), + 'image/*:'.No('images'), 'video/*:'.No('video'), + 'audio/*:'.No('audio') ], +); +our $INVALID_FN = qr/\0/; + +sub REJECT () { 100 } +sub ACCEPT { scalar @_ > 1 ? $_[1] : 1 } +sub IGNORE () { 0 } + +my %patmap = ('*' => '.*', '?' => '.', '[' => '[', ']' => ']'); +sub glob2pat { + my ($glob) = @_; + $glob =~ s!(.)!$patmap{$1} || "\Q$1"!ge; + $glob; +} + +sub new { + my ($class, %opts) = @_; + my $self = bless { err => '', %opts }, $class; + foreach my $f (qw(reject_suffix reject_type)) { + # allow undef: + $self->{$f} = $DEFAULTS{$f} unless exists $self->{$f}; + } + if (defined $self->{reject_suffix}) { + my $tmp = $self->{reject_suffix}; + $tmp = join('|', map { glob2pat($_) } @$tmp); + $self->{reject_suffix} = qr/\.($tmp)\s*\z/i; + } + my $rt = []; + if (defined $self->{reject_type}) { + my $tmp = $self->{reject_type}; + @$rt = map { + my ($type, $msg) = split(':', $_, 2); + $type = lc $type; + $msg ||= "Unacceptable Content-Type: $type"; + my $re = glob2pat($type); + [ qr/\b$re\b/i, $msg ]; + } @$tmp; + } + $self->{reject_type} = $rt; + $self; +} + +sub reject ($$) { + my ($self, $reason) = @_; + $self->{err} = $reason; + REJECT; +} + +sub err ($) { $_[0]->{err} } + +# by default, scrub is a no-op, see PublicInbox::Filter::Vger::scrub +# for an example of the override +sub scrub { + my ($self, $mime) = @_; + $self->ACCEPT($mime); +} + +# for MDA +sub delivery { + my ($self, $mime) = @_; + + my $rt = $self->{reject_type}; + my $reject_suffix = $self->{reject_suffix} || $INVALID_FN; + my (%sfx, %type); + + msg_iter($mime, sub { + my ($part, $depth, @idx) = @{$_[0]}; + + my $ct = $part->content_type || 'text/plain'; + foreach my $p (@$rt) { + if ($ct =~ $p->[0]) { + $type{$p->[1]} = 1; + } + } + + my $fn = $part->filename; + if (defined($fn) && $fn =~ $reject_suffix) { + $sfx{$1} = 1; + } + }); + + my @r; + if (keys %type) { + push @r, sort keys %type; + } + if (keys %sfx) { + push @r, 'Rejected suffixes(s): '.join(', ', sort keys %sfx); + } + + @r ? $self->reject(join("\n", @r)) : $self->scrub($mime); +} + +1; diff --git a/lib/PublicInbox/Filter/Mirror.pm b/lib/PublicInbox/Filter/Mirror.pm new file mode 100644 index 00000000..d9940889 --- /dev/null +++ b/lib/PublicInbox/Filter/Mirror.pm @@ -0,0 +1,12 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Dumb filter for blindly accepting everything +package PublicInbox::Filter::Mirror; +use base qw(PublicInbox::Filter::Base); +use strict; +use warnings; + +sub delivery { $_[0]->ACCEPT }; + +1; diff --git a/lib/PublicInbox/Filter/Vger.pm b/lib/PublicInbox/Filter/Vger.pm new file mode 100644 index 00000000..2ffed184 --- /dev/null +++ b/lib/PublicInbox/Filter/Vger.pm @@ -0,0 +1,38 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Filter for vger.kernel.org list trailer +package PublicInbox::Filter::Vger; +use base qw(PublicInbox::Filter::Base); +use strict; +use warnings; + +my $l0 = qr/-+/; # older messages only had one '-' +my $l1 = + qr/To unsubscribe from this list: send the line "unsubscribe [\w-]+" in/; +my $l2 = qr/the body of a message to majordomo\@vger\.kernel\.org/; +my $l3 = + qr!More majordomo info at +http://vger\.kernel\.org/majordomo-info\.html!; + +# only LKML had this, and LKML nowadays has no list trailer since Jan 2016 +my $l4 = qr!Please read the FAQ at +http://www\.tux\.org/lkml/!; + +sub scrub { + my ($self, $mime) = @_; + my $s = $mime->as_string; + + # the vger appender seems to only work on the raw string, + # so in multipart (e.g. GPG-signed) messages, the list trailer + # becomes invisible to MIME-aware email clients. + if ($s =~ s/$l0\n$l1\n$l2\n$l3\n($l4\n)?\z//os) { + $mime = Email::MIME->new(\$s); + } + $self->ACCEPT($mime); +} + +sub delivery { + my ($self, $mime) = @_; + $self->scrub($mime); +} + +1; diff --git a/lib/PublicInbox/GetlineBody.pm b/lib/PublicInbox/GetlineBody.pm new file mode 100644 index 00000000..5f327828 --- /dev/null +++ b/lib/PublicInbox/GetlineBody.pm @@ -0,0 +1,35 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Wrap a pipe or file for PSGI streaming response bodies and calls the +# end callback when the object goes out-of-scope. +# This depends on rpipe being _blocking_ on getline. +package PublicInbox::GetlineBody; +use strict; +use warnings; + +sub new { + my ($class, $rpipe, $end, $buf) = @_; + bless { rpipe => $rpipe, end => $end, buf => $buf }, $class; +} + +# close should always be called after getline returns undef, +# but a client aborting a connection can ruin our day; so lets +# hope our underlying PSGI server does not leak references, here. +sub DESTROY { $_[0]->close } + +sub getline { + my ($self) = @_; + my $buf = delete $self->{buf}; # initial buffer + defined $buf ? $buf : $self->{rpipe}->getline; +} + +sub close { + my ($self) = @_; + my $rpipe = delete $self->{rpipe}; + close $rpipe if $rpipe; + my $end = delete $self->{end}; + $end->() if $end; +} + +1; diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index 2b6782a7..dee027a3 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -3,7 +3,7 @@ # # Used to read files from a git repository without excessive forking. # Used in our web interfaces as well as our -nntpd server. -# This is based on code in Git.pm which is GPLv2, but modified to avoid +# This is based on code in Git.pm which is GPLv2+, but modified to avoid # dependence on environment variables for compatibility with mod_perl. # There are also API changes to simplify our usage and data set. package PublicInbox::Git; @@ -53,7 +53,9 @@ sub _bidi_pipe { my @cmd = ('git', "--git-dir=$self->{git_dir}", qw(cat-file), $batch); my $redir = { 0 => fileno($out_r), 1 => fileno($in_w) }; - $self->{$pid} = spawn(\@cmd, undef, $redir); + my $p = spawn(\@cmd, undef, $redir); + defined $p or fail($self, "spawn failed: $!"); + $self->{$pid} = $p; $out_w->autoflush(1); $self->{$out} = $out_w; $self->{$in} = $in_r; @@ -124,6 +126,8 @@ sub cat_file { $rv; } +sub batch_prepare ($) { _bidi_pipe($_[0], qw(--batch in out pid)) } + sub check { my ($self, $obj) = @_; $self->_bidi_pipe(qw(--batch-check in_c out_c pid_c)); @@ -167,6 +171,8 @@ sub popen { sub qx { my ($self, @cmd) = @_; my $fh = $self->popen(@cmd); + defined $fh or return; + local $/ = "\n"; return <$fh> if wantarray; local $/; <$fh> @@ -181,3 +187,55 @@ sub cleanup { sub DESTROY { cleanup(@_) } 1; +__END__ +=pod + +=head1 NAME + +PublicInbox::Git - git wrapper + +=head1 VERSION + +version 1.0 + +=head1 SYNOPSIS + + use PublicInbox::Git; + chomp(my $git_dir = `git rev-parse --git-dir`); + $git_dir or die "GIT_DIR= must be specified\n"; + my $git = PublicInbox::Git->new($git_dir); + +=head1 DESCRIPTION + +Unstable API outside of the L</new> method. +It requires L<git(1)> to be installed. + +=head1 METHODS + +=cut + +=head2 new + + my $git = PublicInbox::Git->new($git_dir); + +Initialize a new PublicInbox::Git object for use with L<PublicInbox::Import> +This is the only public API method we support. Everything else +in this module is subject to change. + +=head1 SEE ALSO + +L<Git>, L<PublicInbox::Import> + +=head1 CONTACT + +All feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> + +The mail archives are hosted at L<https://public-inbox.org/meta/> + +=head1 COPYRIGHT + +Copyright (C) 2016 all contributors L<mailto:meta@public-inbox.org> + +License: AGPL-3.0+ L<http://www.gnu.org/licenses/agpl-3.0.txt> + +=cut diff --git a/lib/PublicInbox/GitHTTPBackend.pm b/lib/PublicInbox/GitHTTPBackend.pm index d0ce80bc..0275a2a0 100644 --- a/lib/PublicInbox/GitHTTPBackend.pm +++ b/lib/PublicInbox/GitHTTPBackend.pm @@ -7,7 +7,14 @@ package PublicInbox::GitHTTPBackend; use strict; use warnings; use Fcntl qw(:seek); -use PublicInbox::Spawn qw(spawn); +use IO::Handle; +use HTTP::Date qw(time2str); +use HTTP::Status qw(status_message); +use Plack::Util; +use PublicInbox::Qspawn; + +# 32 is same as the git-daemon connection limit +my $default_limiter = PublicInbox::Qspawn::Limiter->new(32); # n.b. serving "description" and "cloneurl" should be innocuous enough to # not cause problems. serving "config" might... @@ -20,77 +27,116 @@ my @binary = qw! objects/pack/pack-[a-f0-9]{40}\.(?:pack|idx) !; -our $ANY = join('|', @binary, @text); +our $ANY = join('|', @binary, @text, 'git-upload-pack'); my $BIN = join('|', @binary); my $TEXT = join('|', @text); -sub r { - [ $_[0] , [qw(Content-Type text/plain Content-Length 0) ], [] ] +my @no_cache = ('Expires', 'Fri, 01 Jan 1980 00:00:00 GMT', + 'Pragma', 'no-cache', + 'Cache-Control', 'no-cache, max-age=0, must-revalidate'); + +sub r ($;$) { + my ($code, $msg) = @_; + $msg ||= status_message($code); + my $len = length($msg); + [ $code, [qw(Content-Type text/plain Content-Length), $len, @no_cache], + [$msg] ] } sub serve { - my ($cgi, $git, $path) = @_; - my $service = $cgi->param('service') || ''; - if ($service =~ /\Agit-\w+-pack\z/ || $path =~ /\Agit-\w+-pack\z/) { - my $ok = serve_smart($cgi, $git, $path); + my ($env, $git, $path) = @_; + + # Documentation/technical/http-protocol.txt in git.git + # requires one and exactly one query parameter: + if ($env->{QUERY_STRING} =~ /\Aservice=git-\w+-pack\z/ || + $path =~ /\Agit-\w+-pack\z/) { + my $ok = serve_smart($env, $git, $path); return $ok if $ok; # fall through to dumb HTTP... } - serve_dumb($cgi, $git, $path); + serve_dumb($env, $git, $path); +} + +sub err ($@) { + my ($env, @msg) = @_; + $env->{'psgi.errors'}->print(@msg, "\n"); +} + +sub drop_client ($) { + if (my $io = $_[0]->{'psgix.io'}) { + $io->close; # this is Danga::Socket::close + } +} + +my $prev = 0; +my $exp; +sub cache_one_year { + my ($h) = @_; + my $t = time + 31536000; + push @$h, 'Expires', $t == $prev ? $exp : ($exp = time2str($prev = $t)), + 'Cache-Control', 'public, max-age=31536000'; } sub serve_dumb { - my ($cgi, $git, $path) = @_; + my ($env, $git, $path) = @_; - # serve dumb HTTP... + my @h; my $type; - if ($path =~ /\A(?:$BIN)\z/o) { - $type = 'application/octet-stream'; + if ($path =~ m!\Aobjects/[a-f0-9]{2}/[a-f0-9]{38}\z!) { + $type = 'application/x-git-loose-object'; + cache_one_year(\@h); + } elsif ($path =~ m!\Aobjects/pack/pack-[a-f0-9]{40}\.pack\z!) { + $type = 'application/x-git-packed-objects'; + cache_one_year(\@h); + } elsif ($path =~ m!\Aobjects/pack/pack-[a-f0-9]{40}\.idx\z!) { + $type = 'application/x-git-packed-objects-toc'; + cache_one_year(\@h); } elsif ($path =~ /\A(?:$TEXT)\z/o) { $type = 'text/plain'; + push @h, @no_cache; } else { return r(404); } - my $f = "$git->{git_dir}/$path"; - return r(404) unless -f $f && -r _; - my @st = stat(_); - my $size = $st[7]; - # TODO: If-Modified-Since and Last-Modified + my $f = (ref $git ? $git->{git_dir} : $git) . '/' . $path; + return r(404) unless -f $f && -r _; # just in case it's a FIFO :P + my $size = -s _; + + # TODO: If-Modified-Since and Last-Modified? open my $in, '<', $f or return r(404); - my $code = 200; my $len = $size; - my @h; - - my $env = $cgi->{env}; - my $range = $env->{HTTP_RANGE}; - if (defined $range && $range =~ /\bbytes=(\d*)-(\d*)\z/) { - ($code, $len) = prepare_range($cgi, $in, \@h, $1, $2, $size); + my $code = 200; + push @h, 'Content-Type', $type; + if (($env->{HTTP_RANGE} || '') =~ /\bbytes=(\d*)-(\d*)\z/) { + ($code, $len) = prepare_range($env, $in, \@h, $1, $2, $size); if ($code == 416) { push @h, 'Content-Range', "bytes */$size"; return [ 416, \@h, [] ]; } } - - push @h, 'Content-Type', $type, 'Content-Length', $len; - sub { - my ($res) = @_; # Plack callback - my $fh = $res->([ $code, \@h ]); - my $buf; - my $n = 8192; - while ($len > 0) { + push @h, 'Content-Length', $len; + my $n = 65536; + [ $code, \@h, Plack::Util::inline_object(close => sub { close $in }, + getline => sub { + return if $len == 0; $n = $len if $len < $n; - my $r = sysread($in, $buf, $n); - last if (!defined($r) || $r <= 0); - $len -= $r; - $fh->write($buf); - } - $fh->close; - } + my $r = sysread($in, my $buf, $n); + if (!defined $r) { + err($env, "$f read error: $!"); + } elsif ($r <= 0) { + err($env, "$f EOF with $len bytes left"); + } else { + $len -= $r; + $n = 8192; + return $buf; + } + drop_client($env); + return; + })] } sub prepare_range { - my ($cgi, $in, $h, $beg, $end, $size) = @_; + my ($env, $in, $h, $beg, $end, $size) = @_; my $code = 200; my $len = $size; if ($beg eq '') { @@ -119,38 +165,27 @@ sub prepare_range { if ($len <= 0) { $code = 416; } else { - seek($in, $beg, SEEK_SET) or return [ 500, [], [] ]; + sysseek($in, $beg, SEEK_SET) or return [ 500, [], [] ]; push @$h, qw(Accept-Ranges bytes Content-Range); push @$h, "bytes $beg-$end/$size"; # FIXME: Plack::Middleware::Deflater bug? - $cgi->{env}->{'psgix.no-compress'} = 1; + $env->{'psgix.no-compress'} = 1; } } ($code, $len); } +# returns undef if 403 so it falls back to dumb HTTP sub serve_smart { - my ($cgi, $git, $path) = @_; - my $env = $cgi->{env}; - - my $input = $env->{'psgi.input'}; - my $buf; - my $in; - my $err = $env->{'psgi.errors'}; - my $fd = eval { fileno($input) }; - if (defined $fd && $fd >= 0) { - $in = $input; - } else { + my ($env, $git, $path) = @_; + my $in = $env->{'psgi.input'}; + my $fd = eval { fileno($in) }; + unless (defined $fd && $fd >= 0) { $in = input_to_file($env) or return r(500); } - my ($rpipe, $wpipe); - unless (pipe($rpipe, $wpipe)) { - $err->print("error creating pipe: $!\n"); - return r(500); - } my %env = %ENV; - # GIT_HTTP_EXPORT_ALL, GIT_COMMITTER_NAME, GIT_COMMITTER_EMAIL + # GIT_COMMITTER_NAME, GIT_COMMITTER_EMAIL # may be set in the server-process and are passed as-is foreach my $name (qw(QUERY_STRING REMOTE_USER REMOTE_ADDR @@ -161,117 +196,129 @@ sub serve_smart { my $val = $env->{$name}; $env{$name} = $val if defined $val; } - my $git_dir = $git->{git_dir}; + my ($git_dir, $limiter); + if (ref $git) { + $limiter = $git->{-httpbackend_limiter} || $default_limiter; + $git_dir = $git->{git_dir}; + } else { + $limiter = $default_limiter; + $git_dir = $git; + } $env{GIT_HTTP_EXPORT_ALL} = '1'; $env{PATH_TRANSLATED} = "$git_dir/$path"; - my %rdr = ( 0 => fileno($in), 1 => fileno($wpipe), - 2 => $git->err_begin ); - my $pid = spawn([qw(git http-backend)], \%env, \%rdr); - unless (defined $pid) { - $err->print("error spawning: $!\n"); - return r(500); - } - $wpipe = $in = undef; - $buf = ''; - my ($vin, $fh, $res); + my %rdr = ( 0 => fileno($in) ); + my $x = PublicInbox::Qspawn->new([qw(git http-backend)], \%env, \%rdr); + my ($fh, $rpipe); my $end = sub { - if ($fh) { - $fh->close; - $fh = undef; - } else { - $res->(r(500)) if $res; - } - if ($rpipe) { - $rpipe->close; # _may_ be Danga::Socket::close - $rpipe = undef; - } - if (defined $pid) { - my $wpid = $pid; - $pid = undef; - return if $wpid == waitpid($wpid, 0); - $err->print("git http-backend ($git_dir): $?\n"); + if (my $err = $x->finish) { + err($env, "git http-backend ($git_dir): $err"); } + $fh->close if $fh; # async-only }; - my $fail = sub { - if ($!{EAGAIN} || $!{EINTR}) { - select($vin, undef, undef, undef) if defined $vin; - # $vin is undef on async, so this is a noop on EAGAIN - return; - } - my $e = $!; - $end->(); - $err->print("git http-backend ($git_dir): $e\n"); + + # Danga::Socket users, we queue up the read_enable callback to + # fire after pending writes are complete: + my $buf = ''; + my $rd_hdr = sub { + my $r = sysread($rpipe, $buf, 1024, length($buf)); + return if !defined($r) && ($!{EINTR} || $!{EAGAIN}); + return r(500, 'http-backend error') unless $r; + $r = parse_cgi_headers(\$buf) or return; # incomplete headers + $r->[0] == 403 ? serve_dumb($env, $git, $path) : $r; }; - my $cb = sub { # read git-http-backend output and stream to client - my $r = $rpipe ? $rpipe->sysread($buf, 8192, length($buf)) : 0; - return $fail->() unless defined $r; - return $end->() if $r == 0; # EOF - if ($fh) { # stream body from git-http-backend to HTTP client - $fh->write($buf); - $buf = ''; - } elsif (defined $res && $buf =~ s/\A(.*?)\r\n\r\n//s) { - # parse headers - my $h = $1; - my $code = 200; - my @h; - foreach my $l (split(/\r\n/, $h)) { - my ($k, $v) = split(/:\s*/, $l, 2); - if ($k =~ /\AStatus\z/i) { - ($code) = ($v =~ /\b(\d+)\b/); - } else { - push @h, $k, $v; - } - } - # incredibly convoluted, ugh... - if ($code == 403) { - my $d = serve_dumb($cgi, $git, $path); - if (ref($d) eq 'ARRAY') { # 404 - $res->($d); - } else { - $d->($res); - } - $res = undef; - $end->(); + my $res; + my $async = $env->{'pi-httpd.async'}; + my $io = $env->{'psgix.io'}; + my $cb = sub { + my $r = $rd_hdr->() or return; + $rd_hdr = undef; + if (scalar(@$r) == 3) { # error: + if ($async) { + $async->close; # calls rpipe->close } else { - # write response header: - $fh = $res->([ $code, \@h ]); - $res = undef; - $fh->write($buf); - $buf = ''; + $rpipe->close; + $end->(); } - } # else { keep reading ... } - }; - if (my $async = $env->{'pi-httpd.async'}) { - $rpipe = $async->($rpipe, $cb); - sub { ($res) = @_ } # let Danga::Socket handle the rest. - } else { # synchronous loop for other PSGI servers - $vin = ''; - vec($vin, fileno($rpipe), 1) = 1; - sub { - ($res) = @_; - while ($rpipe) { $cb->() } + return $res->($r); } - } + if ($async) { + $fh = $res->($r); + return $async->async_pass($io, $fh, \$buf); + } + + # for synchronous PSGI servers + require PublicInbox::GetlineBody; + $r->[2] = PublicInbox::GetlineBody->new($rpipe, $end, $buf); + $res->($r); + }; + sub { + ($res) = @_; + + # hopefully this doesn't break any middlewares, + # holding the input here is a waste of FDs and memory + $env->{'psgi.input'} = undef; + + $x->start($limiter, sub { # may run later, much later... + ($rpipe) = @_; + $in = undef; + if ($async) { + $async = $async->($rpipe, $cb, $end); + } else { # generic PSGI + $cb->() while $rd_hdr; + } + }); + }; } sub input_to_file { my ($env) = @_; - my $in = IO::File->new_tmpfile; + open(my $in, '+>', undef); + unless (defined $in) { + err($env, "could not open temporary file: $!"); + return; + } my $input = $env->{'psgi.input'}; my $buf; while (1) { my $r = $input->read($buf, 8192); unless (defined $r) { - my $err = $env->{'psgi.errors'}; - $err->print("error reading input: $!\n"); + err($env, "error reading input: $!"); return; } - last if ($r == 0); - $in->write($buf); + my $off = 0; + while ($r > 0) { + my $w = syswrite($in, $buf, $r, $off); + if (defined $w) { + $r -= $w; + $off += $w; + } else { + err($env, "error writing temporary file: $!"); + return; + } + } + } + unless (defined(sysseek($in, 0, SEEK_SET))) { + err($env, "error seeking temporary file: $!"); + return; } - $in->flush; - $in->sysseek(0, SEEK_SET); return $in; } +sub parse_cgi_headers { + my ($bref) = @_; + $$bref =~ s/\A(.*?)\r\n\r\n//s or return; + my $h = $1; + my $code = 200; + my @h; + foreach my $l (split(/\r\n/, $h)) { + my ($k, $v) = split(/:\s*/, $l, 2); + if ($k =~ /\AStatus\z/i) { + ($code) = ($v =~ /\b(\d+)\b/); + } else { + push @h, $k, $v; + } + } + [ $code, \@h ] +} + 1; diff --git a/lib/PublicInbox/HTTP.pm b/lib/PublicInbox/HTTP.pm index 68c3b788..cac14be3 100644 --- a/lib/PublicInbox/HTTP.pm +++ b/lib/PublicInbox/HTTP.pm @@ -4,19 +4,20 @@ # Generic PSGI server for convenience. It aims to provide # a consistent experience for public-inbox admins so they don't have # to learn different ways to admin both NNTP and HTTP components. -# There's nothing public-inbox-specific, here. +# There's nothing which depends on public-inbox, here. # Each instance of this class represents a HTTP client socket package PublicInbox::HTTP; use strict; use warnings; use base qw(Danga::Socket); -use fields qw(httpd env rbuf input_left remote_addr remote_port); +use fields qw(httpd env rbuf input_left remote_addr remote_port forward pull); use Fcntl qw(:seek); use Plack::HTTPParser qw(parse_http_request); # XS or pure Perl use HTTP::Status qw(status_message); use HTTP::Date qw(time2str); -use IO::File; +use Scalar::Util qw(weaken); +use IO::Handle; use constant { CHUNK_START => -1, # [a-f0-9]+\r\n CHUNK_END => -2, # \r\n @@ -24,13 +25,25 @@ use constant { CHUNK_MAX_HDR => 256, }; +my $pipelineq = []; +my $pipet; +sub process_pipelineq () { + my $q = $pipelineq; + $pipet = undef; + $pipelineq = []; + foreach (@$q) { + next if $_->{closed}; + rbuf_process($_); + } +} + # Use the same configuration parameter as git since this is primarily # a slow-client sponge for git-http-backend # TODO: support per-respository http.maxRequestBuffer somehow... our $MAX_REQUEST_BUFFER = $ENV{GIT_HTTP_MAX_REQUEST_BUFFER} || (10 * 1024 * 1024); -my $null_io = IO::File->new('/dev/null', '<'); +open(my $null_io, '<', '/dev/null') or die "failed to open /dev/null: $!"; my $http_date; my $prev = 0; sub http_date () { @@ -85,7 +98,7 @@ sub rbuf_process { $self->{rbuf} = substr($self->{rbuf}, $r); my $len = input_prepare($self, \%env); - defined $len or return write_err($self); # EMFILE/ENFILE + defined $len or return write_err($self, undef); # EMFILE/ENFILE $len ? event_read_input($self) : app_dispatch($self); } @@ -105,7 +118,7 @@ sub event_read_input ($) { while ($len > 0) { if ($$rbuf ne '') { my $w = write_in_full($input, $rbuf, $len); - return write_err($self) unless $w; + return write_err($self, $len) unless $w; $len -= $w; die "BUG: $len < 0 (w=$w)" if $len < 0; if ($len == 0) { # next request may be pipelined @@ -118,11 +131,11 @@ sub event_read_input ($) { return recv_err($self, $r, $len) unless $r; # continue looping if $r > 0; } - app_dispatch($self); + app_dispatch($self, $input); } -sub app_dispatch ($) { - my ($self) = @_; +sub app_dispatch { + my ($self, $input) = @_; $self->watch_read(0); my $env = $self->{env}; $env->{REMOTE_ADDR} = $self->{remote_addr}; @@ -131,10 +144,13 @@ sub app_dispatch ($) { $host =~ s/:(\d+)\z// and $env->{SERVER_PORT} = $1; $env->{SERVER_NAME} = $host; } - - sysseek($env->{'psgi.input'}, 0, SEEK_SET) or + if (defined $input) { + sysseek($input, 0, SEEK_SET) or die "BUG: psgi.input seek failed: $!"; - + } + # note: NOT $self->{sock}, we want our close (+ Danga::Socket::close), + # to do proper cleanup: + $env->{'psgix.io'} = $self; # only for ->close my $res = Plack::Util::run_app($self->{httpd}->{app}, $env); eval { if (ref($res) eq 'CODE') { @@ -163,42 +179,131 @@ sub response_header_write { if ($k =~ /\ATransfer-Encoding\z/i && $v =~ /\bchunked\b/i) { $chunked = 1; } - $h .= "$k: $v\r\n"; } my $conn = $env->{HTTP_CONNECTION} || ''; - my $alive = (defined($len) || $chunked) && - (($proto eq 'HTTP/1.1' && $conn !~ /\bclose\b/i) || - ($conn =~ /\bkeep-alive\b/i)); - - $h .= 'Connection: ' . ($alive ? 'keep-alive' : 'close'); - $h .= "\r\nDate: " . http_date() . "\r\n\r\n"; + my $term = defined($len) || $chunked; + my $prot_persist = ($proto eq 'HTTP/1.1') && ($conn !~ /\bclose\b/i); + my $alive; + if (!$term && $prot_persist) { # auto-chunk + $chunked = $alive = 2; + $h .= "Transfer-Encoding: chunked\r\n"; + # no need for "Connection: keep-alive" with HTTP/1.1 + } elsif ($term && ($prot_persist || ($conn =~ /\bkeep-alive\b/i))) { + $alive = 1; + $h .= "Connection: keep-alive\r\n"; + } else { + $alive = 0; + $h .= "Connection: close\r\n"; + } + $h .= 'Date: ' . http_date() . "\r\n\r\n"; if (($len || $chunked) && $env->{REQUEST_METHOD} ne 'HEAD') { more($self, $h); } else { $self->write($h); } - ($alive, $chunked); + $alive; +} + +# middlewares such as Deflater may write empty strings +sub chunked_wcb ($) { + my ($self) = @_; + sub { + return if $_[0] eq ''; + more($self, sprintf("%x\r\n", bytes::length($_[0]))); + more($self, $_[0]); + + # use $self->write("\n\n") if you care about real-time + # streaming responses, public-inbox WWW does not. + more($self, "\r\n"); + } +} + +sub identity_wcb ($) { + my ($self) = @_; + sub { $self->write(\($_[0])) if $_[0] ne '' } +} + +sub next_request ($) { + my ($self) = @_; + $self->watch_write(0); + if ($self->{rbuf} eq '') { # wait for next request + $self->watch_read(1); + } else { # avoid recursion for pipelined requests + push @$pipelineq, $self; + $pipet ||= PublicInbox::EvCleanup::asap(*process_pipelineq); + } +} + +sub response_done ($$) { + my ($self, $alive) = @_; + my $env = $self->{env}; + $self->{env} = undef; + $self->write("0\r\n\r\n") if $alive == 2; + $self->write(sub { $alive ? next_request($self) : $self->close }); +} + +sub getline_cb ($$$) { + my ($self, $write, $close) = @_; + local $/ = \8192; + my $forward = $self->{forward}; + # limit our own running time for fairness with other + # clients and to avoid buffering too much: + if ($forward) { + my $buf = eval { $forward->getline }; + if (defined $buf) { + $write->($buf); # may close in Danga::Socket::write + unless ($self->{closed}) { + my $next = $self->{pull}; + if ($self->{write_buf_size}) { + $self->write($next); + } else { + PublicInbox::EvCleanup::asap($next); + } + return; + } + } elsif ($@) { + err($self, "response ->getline error: $@"); + $forward = undef; + $self->close; + } + } + + $self->{forward} = $self->{pull} = undef; + # avoid recursion + if ($forward) { + eval { $forward->close }; + if ($@) { + err($self, "response ->close error: $@"); + $self->close; # idempotent + } + } + $close->(); +} + +sub getline_response { + my ($self, $body, $write, $close) = @_; + $self->{forward} = $body; + weaken($self); + my $pull = $self->{pull} = sub { getline_cb($self, $write, $close) }; + $pull->(); } sub response_write { my ($self, $env, $res) = @_; - my ($alive, $chunked) = response_header_write($self, $env, $res); - my $write = sub { $self->write($_[0]) }; - my $close = sub { - if ($alive) { - $self->event_write; # watch for readability if done + my $alive = response_header_write($self, $env, $res); + + my $write = $alive == 2 ? chunked_wcb($self) : identity_wcb($self); + my $close = sub { response_done($self, $alive) }; + if (defined(my $body = $res->[2])) { + if (ref $body eq 'ARRAY') { + $write->($_) foreach @$body; + $close->(); } else { - $self->write(sub { $self->close }); + getline_response($self, $body, $write, $close); } - $self->{env} = undef; - }; - - if (defined $res->[2]) { - Plack::Util::foreach($res->[2], $write); - $close->(); } else { # this is returned to the calling application: Plack::Util::inline_object(write => $write, close => $close); @@ -208,6 +313,7 @@ sub response_write { use constant MSG_MORE => ($^O eq 'linux') ? 0x8000 : 0; sub more ($$) { my $self = $_[0]; + return if $self->{closed}; if (MSG_MORE && !$self->{write_buf_size}) { my $n = send($self->{sock}, $_[1], MSG_MORE); if (defined $n) { @@ -220,19 +326,6 @@ sub more ($$) { $self->write($_[1]); } -# overrides existing Danga::Socket method -sub event_write { - my ($self) = @_; - # only continue watching for readability when we are done writing: - return if $self->write(undef) != 1; - - if ($self->{rbuf} eq '') { # wait for next request - $self->watch_read(1); - } else { # avoid recursion for pipelined requests - Danga::Socket->AddTimer(0, sub { rbuf_process($self) }); - } -} - sub input_prepare { my ($self, $env) = @_; my $input = $null_io; @@ -242,16 +335,15 @@ sub input_prepare { quit($self, 413); return; } - $input = IO::File->new_tmpfile; + open($input, '+>', undef); } elsif (env_chunked($env)) { $len = CHUNK_START; - $input = IO::File->new_tmpfile; + open($input, '+>', undef); } # TODO: expire idle clients on ENFILE / EMFILE return unless $input; - binmode $input; $env->{'psgi.input'} = $input; $self->{env} = $env; $self->{input_left} = $len || 0; @@ -259,11 +351,15 @@ sub input_prepare { sub env_chunked { ($_[0]->{HTTP_TRANSFER_ENCODING} || '') =~ /\bchunked\b/i } +sub err ($$) { + eval { $_[0]->{httpd}->{env}->{'psgi.errors'}->print($_[1]."\n") }; +} + sub write_err { - my ($self) = @_; - my $err = $self->{httpd}->{env}->{'psgi.errors'}; + my ($self, $len) = @_; my $msg = $! || '(zero write)'; - $err->print("error buffering to input: $msg\n"); + $msg .= " ($len bytes remaining)" if defined $len; + err($self, "error buffering to input: $msg"); quit($self, 500); } @@ -274,8 +370,7 @@ sub recv_err { $self->{input_left} = $len; return; } - my $err = $self->{httpd}->{env}->{'psgi.errors'}; - $err->print("error reading for input: $! ($len bytes remaining)\n"); + err($self, "error reading for input: $! ($len bytes remaining)"); quit($self, 500); } @@ -303,7 +398,8 @@ sub event_read_input_chunked { # unlikely... while (1) { # chunk start if ($len == CHUNK_ZEND) { - return app_dispatch($self) if $$rbuf =~ s/\A\r\n//s; + $$rbuf =~ s/\A\r\n//s and + return app_dispatch($self, $input); return quit($self, 400) if length($$rbuf) > 2; } if ($len == CHUNK_END) { @@ -337,7 +433,7 @@ sub event_read_input_chunked { # unlikely... until ($len <= 0) { if ($$rbuf ne '') { my $w = write_in_full($input, $rbuf, $len); - return write_err($self) unless $w; + return write_err($self, "$len chunk") if !$w; $len -= $w; if ($len == 0) { # we may have leftover data to parse @@ -371,11 +467,17 @@ sub quit { sub event_hup { $_[0]->close } sub event_err { $_[0]->close } -sub write ($$) : method { - my PublicInbox::HTTP $self = $_[0]; - return 1 if (defined($_[1]) && ref($_[1]) eq '' && $_[1] eq ''); - - $self->SUPER::write($_[1]); +sub close { + my $self = shift; + my $forward = $self->{forward}; + my $env = $self->{env}; + delete $env->{'psgix.io'} if $env; # prevent circular referernces + $self->{pull} = $self->{forward} = $self->{env} = undef; + if ($forward) { + eval { $forward->close }; + err($self, "forward ->close error: $@") if $@; + } + $self->SUPER::close(@_); } # for graceful shutdown in PublicInbox::Daemon: diff --git a/lib/PublicInbox/HTTPD.pm b/lib/PublicInbox/HTTPD.pm new file mode 100644 index 00000000..433d6da7 --- /dev/null +++ b/lib/PublicInbox/HTTPD.pm @@ -0,0 +1,43 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +package PublicInbox::HTTPD; +use strict; +use warnings; +use Plack::Util; +require PublicInbox::HTTPD::Async; +require PublicInbox::Daemon; + +sub pi_httpd_async { PublicInbox::HTTPD::Async->new(@_) } + +sub new { + my ($class, $sock, $app) = @_; + my $n = getsockname($sock) or die "not a socket: $sock $!\n"; + my ($host, $port) = PublicInbox::Daemon::host_with_port($n); + + my %env = ( + SERVER_NAME => $host, + SERVER_PORT => $port, + SCRIPT_NAME => '', + 'psgi.version' => [ 1, 1 ], + 'psgi.errors' => \*STDERR, + 'psgi.url_scheme' => 'http', + 'psgi.nonblocking' => Plack::Util::TRUE, + 'psgi.streaming' => Plack::Util::TRUE, + 'psgi.run_once' => Plack::Util::FALSE, + 'psgi.multithread' => Plack::Util::FALSE, + 'psgi.multiprocess' => Plack::Util::TRUE, + 'psgix.harakiri'=> Plack::Util::FALSE, + 'psgix.input.buffered' => Plack::Util::TRUE, + 'pi-httpd.async' => do { + no warnings 'once'; + *pi_httpd_async + }, + ); + bless { + app => $app, + env => \%env + }, $class; +} + +1; diff --git a/lib/PublicInbox/HTTPD/Async.pm b/lib/PublicInbox/HTTPD/Async.pm new file mode 100644 index 00000000..68514f5a --- /dev/null +++ b/lib/PublicInbox/HTTPD/Async.pm @@ -0,0 +1,78 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# XXX This is a totally unstable API for public-inbox internal use only +# This is exposed via the 'pi-httpd.async' key in the PSGI env hash. +# The name of this key is not even stable! +# Currently is is intended for use with read-only pipes. +package PublicInbox::HTTPD::Async; +use strict; +use warnings; +use base qw(Danga::Socket); +use fields qw(cb cleanup); +use Scalar::Util qw(weaken); +require PublicInbox::EvCleanup; + +sub new { + my ($class, $io, $cb, $cleanup) = @_; + my $self = fields::new($class); + IO::Handle::blocking($io, 0); + $self->SUPER::new($io); + $self->{cb} = $cb; + $self->{cleanup} = $cleanup; + $self->watch_read(1); + $self; +} + +sub restart_read_cb ($) { + my ($self) = @_; + sub { $self->watch_read(1) } +} + +sub async_pass { + my ($self, $io, $fh, $bref) = @_; + # In case the client HTTP connection ($io) dies, it + # will automatically close this ($self) object. + $io->{forward} = $self; + $fh->write($$bref); + my $restart_read = restart_read_cb($self); + weaken($self); + $self->{cb} = sub { + my $r = sysread($self->{sock}, $$bref, 8192); + if ($r) { + $fh->write($$bref); + return if $io->{closed}; + if ($io->{write_buf_size}) { + $self->watch_read(0); + $io->write($restart_read); # D::S::write + } + # stay in watch_read, but let other clients + # get some work done, too. + return; + } elsif (!defined $r) { + return if $!{EAGAIN} || $!{EINTR}; + } + + # Done! Error handling will happen in $fh->close + # called by the {cleanup} handler + $io->{forward} = undef; + $self->close; + } +} + +sub event_read { $_[0]->{cb}->() } +sub event_hup { $_[0]->{cb}->() } +sub event_err { $_[0]->{cb}->() } +sub sysread { shift->{sock}->sysread(@_) } + +sub close { + my $self = shift; + my $cleanup = $self->{cleanup}; + $self->{cleanup} = $self->{cb} = undef; + $self->SUPER::close(@_); + + # we defer this to the next timer loop since close is deferred + PublicInbox::EvCleanup::next_tick($cleanup) if $cleanup; +} + +1; diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm index c0db5667..15b5fd3e 100644 --- a/lib/PublicInbox/Hval.pm +++ b/lib/PublicInbox/Hval.pm @@ -7,8 +7,8 @@ package PublicInbox::Hval; use strict; use warnings; use Encode qw(find_encoding); +use PublicInbox::MID qw/mid_clean mid_escape/; use URI::Escape qw(uri_escape_utf8); -use PublicInbox::MID qw/mid_clean/; use base qw/Exporter/; our @EXPORT_OK = qw/ascii_html utf8_html to_attr from_attr/; @@ -42,9 +42,8 @@ sub new { } sub new_msgid { - my ($class, $msgid, $no_compress) = @_; - $msgid = mid_clean($msgid); - $class->new($msgid, $msgid); + my ($class, $msgid) = @_; + $class->new($msgid, mid_escape($msgid)); } sub new_oneline { @@ -63,10 +62,24 @@ my %xhtml_map = ( '>' => '>', ); +$xhtml_map{chr($_)} = sprintf('\\x%02x', $_) for (0..31); +# some of these overrides are standard C escapes so they're +# easy-to-understand when rendered. +$xhtml_map{"\x00"} = '\\0'; # NUL +$xhtml_map{"\x07"} = '\\a'; # bell +$xhtml_map{"\x08"} = '\\b'; # backspace +$xhtml_map{"\x09"} = "\t"; # obvious to show as-is +$xhtml_map{"\x0a"} = "\n"; # obvious to show as-is +$xhtml_map{"\x0b"} = '\\v'; # vertical tab +$xhtml_map{"\x0c"} = '\\f'; # form feed +$xhtml_map{"\x0d"} = '\\r'; # carriage ret (not preceding \n) +$xhtml_map{"\x1b"} = '^['; # ASCII escape (mutt seems to escape this way) +$xhtml_map{"\x7f"} = '\\x7f'; # DEL + sub ascii_html { my ($s) = @_; $s =~ s/\r\n/\n/sg; # fixup bad line endings - $s =~ s/([<>&'"])/$xhtml_map{$1}/ge; + $s =~ s/([<>&'"\x7f\x00-\x1f])/$xhtml_map{$1}/sge; $enc_ascii->encode($s, Encode::HTMLCREF); } diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm new file mode 100644 index 00000000..1ac112b8 --- /dev/null +++ b/lib/PublicInbox/Import.pm @@ -0,0 +1,371 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# git fast-import-based ssoma-mda MDA replacement +# This is only ever run by public-inbox-mda and public-inbox-learn, +# not the WWW or NNTP code which only requires read-only access. +package PublicInbox::Import; +use strict; +use warnings; +use Fcntl qw(:flock :DEFAULT); +use PublicInbox::Spawn qw(spawn); +use PublicInbox::MID qw(mid_mime mid2path); +use PublicInbox::Address; + +sub new { + my ($class, $git, $name, $email, $inbox) = @_; + bless { + git => $git, + ident => "$name <$email>", + mark => 1, + ref => 'refs/heads/master', + inbox => $inbox, + }, $class +} + +# idempotent start function +sub gfi_start { + my ($self) = @_; + + return ($self->{in}, $self->{out}) if $self->{pid}; + + my ($in_r, $in_w, $out_r, $out_w); + pipe($in_r, $in_w) or die "pipe failed: $!"; + pipe($out_r, $out_w) or die "pipe failed: $!"; + my $git = $self->{git}; + my $git_dir = $git->{git_dir}; + my $lockpath = "$git_dir/ssoma.lock"; + sysopen(my $lockfh, $lockpath, O_WRONLY|O_CREAT) or + die "failed to open lock $lockpath: $!"; + + # wait for other processes to be done + flock($lockfh, LOCK_EX) or die "lock failed: $!\n"; + local $/ = "\n"; + chomp($self->{tip} = $git->qx(qw(rev-parse --revs-only), $self->{ref})); + + my @cmd = ('git', "--git-dir=$git_dir", qw(fast-import + --quiet --done --date-format=rfc2822)); + my $rdr = { 0 => fileno($out_r), 1 => fileno($in_w) }; + my $pid = spawn(\@cmd, undef, $rdr); + die "spawn fast-import failed: $!" unless defined $pid; + $out_w->autoflush(1); + $self->{in} = $in_r; + $self->{out} = $out_w; + $self->{lockfh} = $lockfh; + $self->{pid} = $pid; + $self->{nchg} = 0; + ($in_r, $out_w); +} + +sub wfail () { die "write to fast-import failed: $!" } + +sub now2822 () { + my @t = gmtime(time); + my $day = qw(Sun Mon Tue Wed Thu Fri Sat)[$t[6]]; + my $mon = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)[$t[4]]; + + sprintf('%s, %2d %s %d %02d:%02d:%02d +0000', + $day, $t[3], $mon, $t[5] + 1900, $t[2], $t[1], $t[0]); +} + +sub norm_body ($) { + my ($mime) = @_; + my $b = $mime->body_raw; + $b =~ s/(\r?\n)+\z//s; + $b +} + +sub _check_path ($$$$) { + my ($r, $w, $tip, $path) = @_; + return if $tip eq ''; + print $w "ls $tip $path\n" or wfail; + local $/ = "\n"; + defined(my $info = <$r>) or die "EOF from fast-import: $!"; + $info =~ /\Amissing / ? undef : $info; +} + +# returns undef on non-existent +# ('MISMATCH', msg) on mismatch +# (:MARK, msg) on success +sub remove { + my ($self, $mime) = @_; # mime = Email::MIME + + my $mid = mid_mime($mime); + my $path = mid2path($mid); + + my ($r, $w) = $self->gfi_start; + my $tip = $self->{tip}; + my $info = _check_path($r, $w, $tip, $path) or return ('MISSING',undef); + $info =~ m!\A100644 blob ([a-f0-9]{40})\t!s or die "not blob: $info"; + my $blob = $1; + + print $w "cat-blob $blob\n" or wfail; + local $/ = "\n"; + $info = <$r>; + defined $info or die "EOF from fast-import / cat-blob: $!"; + $info =~ /\A[a-f0-9]{40} blob (\d+)\n\z/ or + die "unexpected cat-blob response: $info"; + my $left = $1; + my $offset = 0; + my $buf = ''; + my $n; + while ($left > 0) { + $n = read($r, $buf, $left, $offset); + defined($n) or die "read cat-blob failed: $!"; + $n == 0 and die 'fast-export (cat-blob) died'; + $left -= $n; + $offset += $n; + } + $n = read($r, my $lf, 1); + defined($n) or die "read final byte of cat-blob failed: $!"; + die "bad read on final byte: <$lf>" if $lf ne "\n"; + my $cur = Email::MIME->new($buf); + my $cur_s = $cur->header('Subject'); + $cur_s = '' unless defined $cur_s; + my $cur_m = $mime->header('Subject'); + $cur_m = '' unless defined $cur_m; + if ($cur_s ne $cur_m || norm_body($cur) ne norm_body($mime)) { + return ('MISMATCH', $cur); + } + + my $ref = $self->{ref}; + my $commit = $self->{mark}++; + my $parent = $tip =~ /\A:/ ? $tip : undef; + unless ($parent) { + print $w "reset $ref\n" or wfail; + } + my $ident = $self->{ident}; + my $now = now2822(); + print $w "commit $ref\nmark :$commit\n", + "author $ident $now\n", + "committer $ident $now\n", + "data 3\nrm\n\n", + 'from ', ($parent ? $parent : $tip), "\n" or wfail; + print $w "D $path\n\n" or wfail; + $self->{nchg}++; + (($self->{tip} = ":$commit"), $cur); +} + +# returns undef on duplicate +sub add { + my ($self, $mime, $check_cb) = @_; # mime = Email::MIME + + my $from = $mime->header('From'); + my ($email) = PublicInbox::Address::emails($from); + my ($name) = PublicInbox::Address::names($from); + # git gets confused with: + # "'A U Thor <u@example.com>' via foo" <foo@example.com> + # ref: + # <CAD0k6qSUYANxbjjbE4jTW4EeVwOYgBD=bXkSu=akiYC_CB7Ffw@mail.gmail.com> + $name =~ tr/<>//d; + + my $date = $mime->header('Date'); + my $subject = $mime->header('Subject'); + $subject = '(no subject)' unless defined $subject; + my $mid = mid_mime($mime); + my $path = mid2path($mid); + + my ($r, $w) = $self->gfi_start; + my $tip = $self->{tip}; + _check_path($r, $w, $tip, $path) and return; + + # kill potentially confusing/misleading headers + $mime->header_set($_) for qw(bytes lines content-length status); + if ($check_cb) { + $mime = $check_cb->($mime) or return; + } + + $mime = $mime->as_string; + my $blob = $self->{mark}++; + print $w "blob\nmark :$blob\ndata ", length($mime), "\n" or wfail; + print $w $mime, "\n" or wfail; + my $ref = $self->{ref}; + my $commit = $self->{mark}++; + my $parent = $tip =~ /\A:/ ? $tip : undef; + + unless ($parent) { + print $w "reset $ref\n" or wfail; + } + + # quiet down wide character warnings: + binmode $w, ':utf8' or die "binmode :utf8 failed: $!"; + print $w "commit $ref\nmark :$commit\n", + "author $name <$email> $date\n", + "committer $self->{ident} ", now2822(), "\n", + "data ", (bytes::length($subject) + 1), "\n", + $subject, "\n\n" or wfail; + binmode $w, ':raw' or die "binmode :raw failed: $!"; + + if ($tip ne '') { + print $w 'from ', ($parent ? $parent : $tip), "\n" or wfail; + } + print $w "M 100644 :$blob $path\n\n" or wfail; + $self->{nchg}++; + $self->{tip} = ":$commit"; +} + +sub run_die ($$) { + my ($cmd, $env) = @_; + my $pid = spawn($cmd, $env, undef); + defined $pid or die "spawning ".join(' ', @$cmd)." failed: $!"; + waitpid($pid, 0) == $pid or die join(' ', @$cmd) .' did not finish'; + $? == 0 or die join(' ', @$cmd) . " failed: $?\n"; +} + +sub done { + my ($self) = @_; + my $w = delete $self->{out} or return; + my $r = delete $self->{in} or die 'BUG: missing {in} when done'; + print $w "done\n" or wfail; + my $pid = delete $self->{pid} or die 'BUG: missing {pid} when done'; + waitpid($pid, 0) == $pid or die 'fast-import did not finish'; + $? == 0 or die "fast-import failed: $?"; + my $nchg = delete $self->{nchg}; + + # for compatibility with existing ssoma installations + # we can probably remove this entirely by 2020 + my $git_dir = $self->{git}->{git_dir}; + # XXX: change the following scope to: if (-e $index) # in 2018 or so.. + my @cmd = ('git', "--git-dir=$git_dir"); + if ($nchg && !$ENV{FAST}) { + my $index = "$git_dir/ssoma.index"; + my $env = { GIT_INDEX_FILE => $index }; + run_die([@cmd, qw(read-tree -m -v -i), $self->{ref}], $env); + } + if ($nchg) { + run_die([@cmd, 'update-server-info'], undef); + eval { + require PublicInbox::SearchIdx; + my $inbox = $self->{inbox} || $git_dir; + my $s = PublicInbox::SearchIdx->new($inbox); + $s->index_sync({ ref => $self->{ref} }); + }; + + eval { run_die([@cmd, qw(gc --auto)], undef) }; + } + + my $lockfh = delete $self->{lockfh} or die "BUG: not locked: $!"; + flock($lockfh, LOCK_UN) or die "unlock failed: $!"; + close $lockfh or die "close lock failed: $!"; +} + +1; +__END__ +=pod + +=head1 NAME + +PublicInbox::Import - message importer for public-inbox + +=head1 VERSION + +version 1.0 + +=head1 SYNOPSYS + + use Email::MIME; + use PublicInbox::Git; + use PublicInbox::Import; + + chomp(my $git_dir = `git rev-parse --git-dir`); + $git_dir or die "GIT_DIR= must be specified\n"; + my $git = PublicInbox::Git->new($git_dir); + my @committer = ('inbox', 'inbox@example.org'); + my $im = PublicInbox::Import->new($git, @committer); + + # to add a message: + my $message = "From: <u\@example.org>\n". + "Subject: test message \n" . + "Date: Thu, 01 Jan 1970 00:00:00 +0000\n" . + "Message-ID: <m\@example.org>\n". + "\ntest message"; + my $parsed = Email::MIME->new($message); + my $ret = $im->add($parsed); + if (!defined $ret) { + warn "duplicate: ", + $parsed->header_obj->header_raw('Message-ID'), "\n"; + } else { + print "imported at mark $ret\n"; + } + $im->done; + + # to remove a message + my $junk = Email::MIME->new($message); + my ($mark, $orig) = $im->remove($junk); + if ($mark eq 'MISSING') { + print "not found\n"; + } elsif ($mark eq 'MISMATCH') { + print "Message exists but does not match\n\n", + $orig->as_string, "\n",; + } else { + print "removed at mark $mark\n\n", + $orig->as_string, "\n"; + } + $im->done; + +=head1 DESCRIPTION + +An importer and remover for public-inboxes which takes L<Email::MIME> +messages as input and stores them in a ssoma repository as +documented in L<https://ssoma.public-inbox.org/ssoma_repository.txt>, +except it does not allow duplicate Message-IDs. + +It requires L<git(1)> and L<git-fast-import(1)> to be installed. + +=head1 METHODS + +=cut + +=head2 new + + my $im = PublicInbox::Import->new($git, @committer); + +Initialize a new PublicInbox::Import object. + +=head2 add + + my $parsed = Email::MIME->new($message); + $im->add($parsed); + +Adds a message to to the git repository. This will acquire +C<$GIT_DIR/ssoma.lock> and start L<git-fast-import(1)> if necessary. + +Messages added will not be visible to other processes until L</done> +is called, but L</remove> may be called on them. + +=head2 remove + + my $junk = Email::MIME->new($message); + my ($code, $orig) = $im->remove($junk); + +Removes a message from the repository. On success, it returns +a ':'-prefixed numeric code representing the git-fast-import +mark and the original messages as an Email::MIME object. +If the message could not be found, the code is "MISSING" +and the original message is undef. If there is a mismatch where +the "Message-ID" is matched but the subject and body do not match, +the returned code is "MISMATCH" and the conflicting message +is returned as orig. + +=head2 done + +Finalizes the L<git-fast-import(1)> and unlocks the repository. +Calling this is required to finalize changes to a repository. + +=head1 SEE ALSO + +L<Email::MIME> + +=head1 CONTACT + +All feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> + +The mail archives are hosted at L<https://public-inbox.org/meta/> + +=head1 COPYRIGHT + +Copyright (C) 2016 all contributors L<mailto:meta@public-inbox.org> + +License: AGPL-3.0+ L<http://www.gnu.org/licenses/agpl-3.0.txt> + +=cut diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm new file mode 100644 index 00000000..8c639082 --- /dev/null +++ b/lib/PublicInbox/Inbox.pm @@ -0,0 +1,235 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Represents a public-inbox (which may have multiple mailing addresses) +package PublicInbox::Inbox; +use strict; +use warnings; +use Scalar::Util qw(weaken isweak); +use PublicInbox::Git; +use PublicInbox::MID qw(mid2path); + +my $weakt; +eval { + $weakt = 'disabled'; + require PublicInbox::EvCleanup; + $weakt = undef; # OK if we get here +}; + +my $WEAKEN = {}; # string(inbox) -> inbox +sub weaken_task () { + $weakt = undef; + _weaken_fields($_) for values %$WEAKEN; + $WEAKEN = {}; +} + +sub _weaken_later ($) { + my ($self) = @_; + $weakt ||= PublicInbox::EvCleanup::later(*weaken_task); + $WEAKEN->{"$self"} = $self; +} + +sub new { + my ($class, $opts) = @_; + my $v = $opts->{address} ||= 'public-inbox@example.com'; + my $p = $opts->{-primary_address} = ref($v) eq 'ARRAY' ? $v->[0] : $v; + $opts->{domain} = ($p =~ /\@(\S+)\z/) ? $1 : 'localhost'; + weaken($opts->{-pi_config}); + bless $opts, $class; +} + +sub _weaken_fields { + my ($self) = @_; + foreach my $f (qw(git mm search)) { + isweak($self->{$f}) or weaken($self->{$f}); + } +} + +sub _set_limiter ($$$) { + my ($self, $git, $pfx) = @_; + my $lkey = "-${pfx}_limiter"; + $git->{$lkey} = $self->{$lkey} ||= eval { + my $mkey = $pfx.'max'; + my $val = $self->{$mkey} or return; + my $lim; + if ($val =~ /\A\d+\z/) { + require PublicInbox::Qspawn; + $lim = PublicInbox::Qspawn::Limiter->new($val); + } elsif ($val =~ /\A[a-z][a-z0-9]*\z/) { + $lim = $self->{-pi_config}->limiter($val); + warn "$mkey limiter=$val not found\n" if !$lim; + } else { + warn "$mkey limiter=$val not understood\n"; + } + $lim; + } +} + +sub git { + my ($self) = @_; + $self->{git} ||= eval { + _weaken_later($self); + my $g = PublicInbox::Git->new($self->{mainrepo}); + _set_limiter($self, $g, 'httpbackend'); + $g; + }; +} + +sub mm { + my ($self) = @_; + $self->{mm} ||= eval { + _weaken_later($self); + PublicInbox::Msgmap->new($self->{mainrepo}); + }; +} + +sub search { + my ($self) = @_; + $self->{search} ||= eval { + _weaken_later($self); + PublicInbox::Search->new($self->{mainrepo}, $self->{altid}); + }; +} + +sub try_cat { + my ($path) = @_; + my $rv = ''; + if (open(my $fh, '<', $path)) { + local $/; + $rv = <$fh>; + } + $rv; +} + +sub description { + my ($self) = @_; + my $desc = $self->{description}; + return $desc if defined $desc; + $desc = try_cat("$self->{mainrepo}/description"); + local $/ = "\n"; + chomp $desc; + $desc =~ s/\s+/ /smg; + $desc = '($GIT_DIR/description missing)' if $desc eq ''; + $self->{description} = $desc; +} + +sub cloneurl { + my ($self) = @_; + my $url = $self->{cloneurl}; + return $url if $url; + $url = try_cat("$self->{mainrepo}/cloneurl"); + my @url = split(/\s+/s, $url); + local $/ = "\n"; + chomp @url; + $self->{cloneurl} = \@url; +} + +sub base_url { + my ($self, $env) = @_; + if ($env) { # PSGI env + my $scheme = $env->{'psgi.url_scheme'}; + my $host_port = $env->{HTTP_HOST} || + "$env->{SERVER_NAME}:$env->{SERVER_PORT}"; + my $url = "$scheme://$host_port". ($env->{SCRIPT_NAME} || '/'); + # for mount in Plack::Builder + $url .= '/' if $url !~ m!/\z!; + $url .= $self->{name} . '/'; + } else { + # either called from a non-PSGI environment (e.g. NNTP/POP3) + $self->{-base_url} ||= do { + my $url = $self->{url} or return undef; + # expand protocol-relative URLs to HTTPS if we're + # not inside a web server + $url = "https:$url" if $url =~ m!\A//!; + $url .= '/' if $url !~ m!/\z!; + $url; + }; + } +} + +sub nntp_url { + my ($self) = @_; + $self->{-nntp_url} ||= do { + # no checking for nntp_usable here, we can point entirely + # to non-local servers or users run by a different user + my $ns = $self->{-pi_config}->{'publicinbox.nntpserver'}; + my $group = $self->{newsgroup}; + my @urls; + if ($ns && $group) { + $ns = [ $ns ] if ref($ns) ne 'ARRAY'; + @urls = map { + my $u = m!\Anntps?://! ? $_ : "nntp://$_"; + $u .= '/' if $u !~ m!/\z!; + $u.$group; + } @$ns; + } + + my $mirrors = $self->{nntpmirror}; + if ($mirrors) { + my @m; + foreach (@$mirrors) { + my $u = m!\Anntps?://! ? $_ : "nntp://$_"; + if ($u =~ m!\Anntps?://[^/]+/?\z!) { + if ($group) { + $u .= '/' if $u !~ m!/\z!; + $u .= $group; + } else { + warn +"publicinbox.$self->{name}.nntpmirror=$_ missing newsgroup name\n"; + } + } + # else: allow full URLs like: + # nntp://news.example.com/alt.example + push @m, $u; + } + my %seen = map { $_ => 1 } @urls; + foreach my $u (@m) { + next if $seen{$u}; + $seen{$u} = 1; + push @urls, $u; + } + } + \@urls; + }; +} + +sub nntp_usable { + my ($self) = @_; + my $ret = $self->mm && $self->search; + $self->{mm} = $self->{search} = undef; + $ret; +} + +sub msg_by_path ($$;$) { + my ($self, $path, $ref) = @_; + # TODO: allow other refs: + my $str = git($self)->cat_file('HEAD:'.$path, $ref); + $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s if $str; + $str; +} + +sub msg_by_smsg ($$;$) { + my ($self, $smsg, $ref) = @_; + + return unless defined $smsg; # ghost + + # backwards compat to fallback to msg_by_mid + # TODO: remove if we bump SCHEMA_VERSION in Search.pm: + defined(my $blob = $smsg->blob) or return msg_by_mid($self, $smsg->mid); + + my $str = git($self)->cat_file($blob, $ref); + $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s if $str; + $str; +} + +sub path_check { + my ($self, $path) = @_; + git($self)->check('HEAD:'.$path); +} + +sub msg_by_mid ($$;$) { + my ($self, $mid, $ref) = @_; + msg_by_path($self, mid2path($mid), $ref); +} + +1; diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm index 4eddedd0..acd2a47e 100644 --- a/lib/PublicInbox/Linkify.pm +++ b/lib/PublicInbox/Linkify.pm @@ -15,21 +15,32 @@ use warnings; use Digest::SHA qw/sha1_hex/; my $SALT = rand; -my $LINK_RE = qr!\b((?:ftp|https?|nntp):// +my $LINK_RE = qr{(\()?\b((?:ftps?|https?|nntps?|gopher):// [\@:\w\.-]+/ - ?[\@\w\+\&\?\.\%\;/#=-]*)!x; + (?:[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]*) + (?:\?[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]+)? + (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)? + )}xi; sub new { bless {}, shift } sub linkify_1 { my ($self, $s) = @_; $s =~ s!$LINK_RE! - my $url = $1; + my $beg = $1 || ''; + my $url = $2; my $end = ''; # it's fairly common to end URLs in messages with - # '.' or ';' to denote the end of a statement. - if ($url =~ s/(\.)\z// || $url =~ s/(;)\z//) { + # '.', ',' or ';' to denote the end of a statement; + # assume the intent was to end the statement/sentence + # in English + # Markdown compatibility: + if ($beg eq '(') { + if ($url =~ s/(\)[\.,;]?)\z//) { + $end = $1; + } + } elsif ($url =~ s/([\.,;])\z//) { $end = $1; } @@ -40,7 +51,7 @@ sub linkify_1 { # only escape ampersands, others do not match LINK_RE $url =~ s/&/&/g; $self->{$key} = $url; - 'PI-LINK-'. $key . $end; + $beg . 'PI-LINK-'. $key . $end; !ge; $s; } diff --git a/lib/PublicInbox/MDA.pm b/lib/PublicInbox/MDA.pm index 003bac65..bcf5358b 100644 --- a/lib/PublicInbox/MDA.pm +++ b/lib/PublicInbox/MDA.pm @@ -6,11 +6,22 @@ package PublicInbox::MDA; use strict; use warnings; use Email::Simple; -use Email::Address; use Date::Parse qw(strptime); use constant MAX_SIZE => 1024 * 500; # same as spamc default, should be tunable use constant MAX_MID_SIZE => 244; # max term size - 1 in Xapian -use constant cmd => qw/ssoma-mda -1/; + +our @BAD_HEADERS = ( + # postfix + qw(delivered-to x-original-to), # prevent training loops + + # The rest are taken from Mailman 2.1.15: + # could contain passwords: + qw(approved approve x-approved x-approve urgent), + # could be used phishing: + qw(return-receipt-to disposition-notification-to x-confirm-reading-to), + # Pegasus mail: + qw(x-pmrqc) +); # drop plus addressing for matching sub __drop_plus { @@ -21,16 +32,17 @@ sub __drop_plus { # do not allow Bcc, only Cc and To if recipient is set sub precheck { - my ($klass, $filter, $address) = @_; - my Email::Simple $simple = $filter->simple; - my $mid = $simple->header("Message-ID"); + my ($klass, $simple, $address) = @_; + my @mid = $simple->header('Message-ID'); + return 0 if scalar(@mid) != 1; + my $mid = $mid[0]; return 0 if (length($mid) > MAX_MID_SIZE); return 0 unless usable_str(length('<m@h>'), $mid) && $mid =~ /\@/; - return 0 unless usable_str(length('u@h'), $filter->from); + return 0 unless usable_str(length('u@h'), $simple->header("From")); return 0 unless usable_str(length(':o'), $simple->header("Subject")); return 0 unless usable_date($simple->header("Date")); return 0 if length($simple->as_string) > MAX_SIZE; - alias_specified($filter, $address); + alias_specified($simple, $address); } sub usable_str { @@ -44,17 +56,17 @@ sub usable_date { } sub alias_specified { - my ($filter, $address) = @_; + my ($simple, $address) = @_; my @address = ref($address) eq 'ARRAY' ? @$address : ($address); my %ok = map { - my @recip = Email::Address->parse($_); - lc(__drop_plus($recip[0]->address)) => 1; + lc(__drop_plus($_)) => 1; } @address; - foreach my $line ($filter->cc, $filter->to) { - foreach my $addr (Email::Address->parse($line)) { - if ($ok{lc(__drop_plus($addr->address))}) { + foreach my $line ($simple->header('Cc'), $simple->header('To')) { + my @addrs = ($line =~ /([^,<\s]+\@[^,>\s]+)/g); + foreach my $addr (@addrs) { + if ($ok{lc(__drop_plus($addr))}) { return 1; } } @@ -64,28 +76,12 @@ sub alias_specified { sub set_list_headers { my ($class, $simple, $dst) = @_; - my $pa = $dst->{-primary_address}; - - $simple->header_set("List-Id", "<$pa>"); # RFC2919 - - # remove Delivered-To: prevent training loops - # The rest are taken from Mailman 2.1.15, some may be used for phishing - foreach my $h (qw(delivered-to approved approve x-approved x-approve - urgent return-receipt-to disposition-notification-to - x-confirm-reading-to x-pmrqc)) { - $simple->header_set($h); + unless (defined $simple->header('List-Id')) { + my $pa = $dst->{-primary_address}; + $simple->header_set("List-Id", "<$pa>"); # RFC2919 } -} - -# returns a 3-element array: name, email, date -sub author_info { - my ($class, $mime) = @_; - my $from = $mime->header('From'); - my @from = Email::Address->parse($from); - my $name = $from[0]->name; - my $email = $from[0]->address; - ($name, $email, $mime->header('Date')); + $simple->header_set($_) foreach @BAD_HEADERS; } 1; diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm index 78952b95..1c2d75cc 100644 --- a/lib/PublicInbox/MID.pm +++ b/lib/PublicInbox/MID.pm @@ -6,7 +6,8 @@ package PublicInbox::MID; use strict; use warnings; use base qw/Exporter/; -our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime/; +our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime mid_escape/; +use URI::Escape qw(uri_escape_utf8); use Digest::SHA qw/sha1_hex/; use constant MID_MAX => 40; # SHA-1 hex length @@ -25,6 +26,7 @@ sub id_compress { my ($id, $force) = @_; if ($force || $id =~ /[^\w\-]/ || length($id) > MID_MAX) { + utf8::encode($id); return sha1_hex($id); } $id; @@ -36,7 +38,9 @@ sub mid2path { unless (defined $x38) { # compatibility with old links (or short Message-IDs :) - $mid = sha1_hex(mid_clean($mid)); + $mid = mid_clean($mid); + utf8::encode($mid); + $mid = sha1_hex($mid); ($x2, $x38) = ($mid =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/); } "$x2/$x38"; @@ -44,4 +48,8 @@ sub mid2path { sub mid_mime ($) { $_[0]->header_obj->header_raw('Message-ID') } +# RFC3986, section 3.3: +sub MID_ESC () { '^A-Za-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@' } +sub mid_escape ($) { uri_escape_utf8($_[0], MID_ESC) } + 1; diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index 0d67981f..2565ea58 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -6,32 +6,18 @@ package PublicInbox::Mbox; use strict; use warnings; -use PublicInbox::MID qw/mid2path mid_clean/; -use URI::Escape qw/uri_escape_utf8/; +use PublicInbox::MID qw/mid_clean mid_escape/; require Email::Simple; -sub thread_mbox { - my ($ctx, $srch, $sfx) = @_; - sub { - my ($response) = @_; # Plack callback - emit_mbox($response, $ctx, $srch, $sfx); - } -} - sub emit1 { - my $simple = Email::Simple->new(pop); - my $ctx = pop; - sub { - my ($response) = @_; - # single message should be easily renderable in browsers - my $fh = $response->([200, ['Content-Type'=>'text/plain']]); - emit_msg($ctx, $fh, $simple); - $fh->close; - } + my ($ctx, $msg) = @_; + $msg = Email::Simple->new($msg); + # single message should be easily renderable in browsers + [200, ['Content-Type', 'text/plain'], [ msg_str($ctx, $msg)] ] } -sub emit_msg { - my ($ctx, $fh, $simple) = @_; # Email::Simple object +sub msg_str { + my ($ctx, $simple) = @_; # Email::Simple object my $header_obj = $simple->header_obj; # drop potentially confusing headers, ssoma already should've dropped @@ -39,74 +25,67 @@ sub emit_msg { foreach my $d (qw(Lines Bytes Content-Length Status)) { $header_obj->header_set($d); } - my $feed_opts = $ctx->{feed_opts}; - unless ($feed_opts) { - require PublicInbox::Feed; # FIXME: gross - $feed_opts = PublicInbox::Feed::get_feedopts($ctx); - $ctx->{feed_opts} = $feed_opts; - } - my $base = $feed_opts->{url}; + my $ibx = $ctx->{-inbox}; + my $base = $ibx->base_url($ctx->{env}); my $mid = mid_clean($header_obj->header('Message-ID')); - $mid = uri_escape_utf8($mid); - my @archived_at = $header_obj->header('Archived-At'); - push @archived_at, "<$base$mid/>"; - $header_obj->header_set('Archived-At', @archived_at); - $header_obj->header_set('List-Archive', "<$base>"); - $header_obj->header_set('List-Post', "<mailto:$feed_opts->{id_addr}>"); - - my $buf = $header_obj->as_string; - unless ($buf =~ /\AFrom /) { - $fh->write("From mboxrd\@z Thu Jan 1 00:00:00 1970\n"); + $mid = mid_escape($mid); + my @append = ( + 'Archived-At', "<$base$mid/>", + 'List-Archive', "<$base>", + 'List-Post', "<mailto:$ibx->{-primary_address}>", + ); + my $crlf = $simple->crlf; + my $buf = "From mboxrd\@z Thu Jan 1 00:00:00 1970\n" . + $header_obj->as_string; + for (my $i = 0; $i < @append; $i += 2) { + my $k = $append[$i]; + my $v = $append[$i + 1]; + my @v = $header_obj->header($k); + foreach (@v) { + if ($v eq $_) { + $v = undef; + last; + } + } + $buf .= "$k: $v$crlf" if defined $v; } - $fh->write($buf .= $simple->crlf); - - $buf = $simple->body; - $simple->body_set(''); + $buf .= $crlf; # mboxrd quoting style # ref: http://www.qmail.org/man/man5/mbox.html - $buf =~ s/^(>*From )/>$1/gm; + my $body = $simple->body; + $body =~ s/^(>*From )/>$1/gm; + $buf .= $body; + $buf .= "\n"; +} - $buf .= "\n" unless $buf =~ /\n\z/s; +sub thread_mbox { + my ($ctx, $srch, $sfx) = @_; + eval { require IO::Compress::Gzip }; + return sub { need_gzip(@_) } if $@; - $fh->write($buf); + my $cb = sub { $srch->get_thread($ctx->{mid}, @_) }; + # http://www.iana.org/assignments/media-types/application/gzip + [200, ['Content-Type' => 'application/gzip'], + PublicInbox::MboxGz->new($ctx, $cb) ]; } -sub emit_mbox { - my ($response, $ctx, $srch, $sfx) = @_; - my $type = 'mbox'; - if ($sfx) { - eval { require IO::Compress::Gzip }; - return need_gzip($response) if $@; - $type = 'gzip'; +sub emit_range { + my ($ctx, $range) = @_; + + eval { require IO::Compress::Gzip }; + return sub { need_gzip(@_) } if $@; + my $query; + if ($range eq 'all') { # TODO: YYYY[-MM] + $query = ''; + } else { + return [404, [qw(Content-Type text/plain)], []]; } + my $cb = sub { $ctx->{srch}->query($query, @_) }; # http://www.iana.org/assignments/media-types/application/gzip - # http://www.iana.org/assignments/media-types/application/mbox - my $fh = $response->([200, ['Content-Type' => "application/$type"]]); - $fh = PublicInbox::MboxGz->new($fh) if $sfx; - - require PublicInbox::Git; - my $mid = $ctx->{mid}; - my $git = $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); - my %opts = (offset => 0); - my $nr; - do { - my $res = $srch->get_thread($mid, \%opts); - my $msgs = $res->{msgs}; - $nr = scalar @$msgs; - while (defined(my $smsg = shift @$msgs)) { - my $msg = eval { - my $p = 'HEAD:'.mid2path($smsg->mid); - Email::Simple->new($git->cat_file($p)); - }; - emit_msg($ctx, $fh, $msg) if $msg; - } - - $opts{offset} += $nr; - } while ($nr > 0); - - $fh->close; + [200, [qw(Content-Type application/gzip)], + PublicInbox::MboxGz->new($ctx, $cb) ]; } sub need_gzip { @@ -123,40 +102,55 @@ EOF 1; -# fh may not be a proper IO, so we wrap the write and close methods -# to prevent IO::Compress::Gzip from complaining package PublicInbox::MboxGz; use strict; use warnings; sub new { - my ($class, $fh) = @_; - my $buf; + my ($class, $ctx, $cb) = @_; + my $buf = ''; bless { buf => \$buf, - gz => IO::Compress::Gzip->new(\$buf), - fh => $fh, + gz => IO::Compress::Gzip->new(\$buf, Time => 0), + cb => $cb, + ctx => $ctx, + msgs => [], + opts => { offset => 0 }, }, $class; } -sub _flush_buf { +# called by Plack::Util::foreach or similar +sub getline { my ($self) = @_; - if (defined ${$self->{buf}}) { - $self->{fh}->write(${$self->{buf}}); - ${$self->{buf}} = undef; - } -} - -sub write { - $_[0]->{gz}->write($_[1]); - _flush_buf($_[0]); + my $ctx = $self->{ctx} or return; + my $res; + my $ibx = $ctx->{-inbox}; + my $gz = $self->{gz}; + do { + while (defined(my $smsg = shift @{$self->{msgs}})) { + my $msg = eval { $ibx->msg_by_smsg($smsg) } or next; + $msg = Email::Simple->new($msg); + $gz->write(PublicInbox::Mbox::msg_str($ctx, $msg)); + my $bref = $self->{buf}; + if (length($$bref) >= 8192) { + my $ret = $$bref; # copy :< + ${$self->{buf}} = ''; + return $ret; + } + + # be fair to other clients on public-inbox-httpd: + return ''; + } + $res = $self->{cb}->($self->{opts}); + $self->{msgs} = $res->{msgs}; + $res = scalar @{$self->{msgs}}; + $self->{opts}->{offset} += $res; + } while ($res); + $gz->close; + delete $self->{ctx}; + ${delete $self->{buf}}; } -sub close { - my ($self) = @_; - $self->{gz}->close; - _flush_buf($self); - $self->{fh}->close; -} +sub close {} # noop 1; diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm new file mode 100644 index 00000000..ef0d209f --- /dev/null +++ b/lib/PublicInbox/MsgIter.pm @@ -0,0 +1,57 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +package PublicInbox::MsgIter; +use strict; +use warnings; +use base qw(Exporter); +our @EXPORT = qw(msg_iter); +use Email::MIME; +use Scalar::Util qw(readonly); + +# Workaround Email::MIME versions without +# commit dcef9be66c49ae89c7a5027a789bbbac544499ce +# ("removing all trailing newlines was too much") +# This is necessary for Debian jessie +my $bad = 1.923; +my $good = 1.935; +my $ver = $Email::MIME::VERSION; +my $extra_nl = 1 if ($ver >= $bad && $ver < $good); + +# Like Email::MIME::walk_parts, but this is: +# * non-recursive +# * passes depth and indices to the iterator callback +sub msg_iter ($$) { + my ($mime, $cb) = @_; + my @parts = $mime->subparts; + if (@parts) { + my $i = 0; + @parts = map { [ $_, 1, ++$i ] } @parts; + while (my $p = shift @parts) { + my ($part, $depth, @idx) = @$p; + my @sub = $part->subparts; + if (@sub) { + $depth++; + $i = 0; + @sub = map { [ $_, $depth, @idx, ++$i ] } @sub; + @parts = (@sub, @parts); + } else { + if ($extra_nl) { + my $lf = $part->{mycrlf}; + my $bref = $part->{body}; + if (readonly($$bref)) { + my $s = $$bref . $lf; + $part->{body} = \$s; + } else { + $$bref .= $lf; + } + } + $cb->($p); + } + } + } else { + $cb->([$mime, 0, 0]); + } +} + +1; diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm index 8fe17a95..3fb3805f 100644 --- a/lib/PublicInbox/Msgmap.pm +++ b/lib/PublicInbox/Msgmap.pm @@ -20,7 +20,12 @@ sub new { my $err = $!; -d $d or die "$d not created: $err"; } - my $f = "$d/msgmap.sqlite3"; + new_file($class, "$d/msgmap.sqlite3", $writable); +} + +sub new_file { + my ($class, $f, $writable) = @_; + my $dbh = DBI->connect("dbi:SQLite:dbname=$f",'','', { AutoCommit => 1, RaiseError => 1, @@ -33,11 +38,14 @@ sub new { if ($writable) { create_tables($dbh); + $dbh->begin_work; $self->created_at(time) unless $self->created_at; + $dbh->commit; } $self; } +# n.b. invoked directly by scripts/xhdr-num2mid sub meta_accessor { my ($self, $key, $value) = @_; use constant { @@ -51,22 +59,14 @@ sub meta_accessor { defined $value or return $dbh->selectrow_array(meta_select, undef, $key); - $dbh->begin_work; - eval { - $prev = $dbh->selectrow_array(meta_select, undef, $key); + $prev = $dbh->selectrow_array(meta_select, undef, $key); - if (defined $prev) { - $dbh->do(meta_update, undef, $value, $key); - } else { - $dbh->do(meta_insert, undef, $key, $value); - } - $dbh->commit; - }; - my $err = $@; - return $prev unless $err; - - $dbh->rollback; - die $err; + if (defined $prev) { + $dbh->do(meta_update, undef, $value, $key); + } else { + $dbh->do(meta_insert, undef, $key, $value); + } + $prev; } sub last_commit { @@ -160,6 +160,7 @@ sub create_tables { 'val VARCHAR(255) NOT NULL)'); } +# used by NNTP.pm sub id_batch { my ($self, $num, $cb) = @_; my $dbh = $self->{dbh}; @@ -173,4 +174,15 @@ sub id_batch { $nr; } +# only used for mapping external serial numbers (e.g. articles from gmane) +# see scripts/xhdr-num2mid for usage +sub mid_set { + my ($self, $num, $mid) = @_; + my $sth = $self->{mid_set} ||= do { + my $sql = 'INSERT INTO msgmap (num, mid) VALUES (?,?)'; + $self->{dbh}->prepare($sql); + }; + $sth->execute($num, $mid); +} + 1; diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index 8740377f..9408ffb9 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -9,12 +9,14 @@ use base qw(Danga::Socket); use fields qw(nntpd article rbuf ng long_res); use PublicInbox::Search; use PublicInbox::Msgmap; +use PublicInbox::MID qw(mid_escape); use PublicInbox::Git; -use PublicInbox::MID qw(mid2path); -use Email::MIME; -use Data::Dumper qw(Dumper); +require PublicInbox::EvCleanup; +use Email::Simple; use POSIX qw(strftime); use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); +use Digest::SHA qw(sha1_hex); +use Time::Local qw(timegm timelocal); use constant { r501 => '501 command syntax error', r221 => '221 Header follows', @@ -36,21 +38,36 @@ my $LIST_HEADERS = join("\r\n", @OVERVIEW, my %DISABLED; # = map { $_ => 1 } qw(xover list_overview_fmt newnews xhdr); my $EXPMAP; # fd -> [ idle_time, $self ] -my $EXPTIMER; +my $expt; our $EXPTIME = 180; # 3 minutes +my $nextt; + +my $nextq = []; +sub next_tick () { + $nextt = undef; + my $q = $nextq; + $nextq = []; + foreach my $nntp (@$q) { + # for request && response protocols, always finish writing + # before finishing reading: + if (my $long_cb = $nntp->{long_res}) { + $nntp->write($long_cb); + } elsif (&Danga::Socket::POLLIN & $nntp->{event_watch}) { + event_read($nntp); + } + } +} sub update_idle_time ($) { my ($self) = @_; - my $tmp = $self->{sock} or return; - $tmp = fileno($tmp); - defined $tmp and $EXPMAP->{$tmp} = [ now(), $self ]; + my $fd = $self->{fd}; + defined $fd and $EXPMAP->{$fd} = [ now(), $self ]; } sub expire_old () { my $now = now(); my $exp = $EXPTIME; my $old = $now - $exp; - my $next = $now + $exp; my $nr = 0; my %new; while (my ($fd, $v) = each %$EXPMAP) { @@ -58,36 +75,31 @@ sub expire_old () { if ($idle_time < $old) { $nntp->close; # idempotent } else { - my $nexp = $idle_time + $exp; - $next = $nexp if ($nexp < $next); ++$nr; $new{$fd} = $v; } } $EXPMAP = \%new; if ($nr) { - $next -= $now; - $next = 0 if $next < 0; - $EXPTIMER = Danga::Socket->AddTimer($next, *expire_old); + $expt = PublicInbox::EvCleanup::later(*expire_old); } else { - $EXPTIMER = undef; - # noop to kick outselves out of the loop so descriptors + $expt = undef; + # noop to kick outselves out of the loop ASAP so descriptors # really get closed - Danga::Socket->AddTimer(0, sub {}); + PublicInbox::EvCleanup::asap(sub {}); } } sub new ($$$) { my ($class, $sock, $nntpd) = @_; my $self = fields::new($class); - binmode $sock, ':utf8'; # RFC 3977 $self->SUPER::new($sock); $self->{nntpd} = $nntpd; res($self, '201 server ready - post via email'); $self->{rbuf} = ''; $self->watch_read(1); update_idle_time($self); - $EXPTIMER ||= Danga::Socket->AddTimer($EXPTIME, *expire_old); + $expt ||= PublicInbox::EvCleanup::later(*expire_old); $self; } @@ -115,7 +127,8 @@ sub process_line ($$) { my $res = eval { $req->($self, @args) }; my $err = $@; if ($err && !$self->{closed}) { - chomp($l = Dumper(\$l)); + local $/ = "\n"; + chomp($l); err($self, 'error from: %s (%s)', $l, $err); $res = '503 program fault - command not performed'; } @@ -153,7 +166,7 @@ sub list_active ($;$) { my ($self, $wildmat) = @_; wildmat2re($wildmat); foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{name} =~ $wildmat or next; + $ng->{newsgroup} =~ $wildmat or next; group_line($self, $ng); } } @@ -162,9 +175,9 @@ sub list_active_times ($;$) { my ($self, $wildmat) = @_; wildmat2re($wildmat); foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{name} =~ $wildmat or next; + $ng->{newsgroup} =~ $wildmat or next; my $c = eval { $ng->mm->created_at } || time; - more($self, "$ng->{name} $c $ng->{address}"); + more($self, "$ng->{newsgroup} $c $ng->{-primary_address}"); } } @@ -172,9 +185,9 @@ sub list_newsgroups ($;$) { my ($self, $wildmat) = @_; wildmat2re($wildmat); foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{name} =~ $wildmat or next; + $ng->{newsgroup} =~ $wildmat or next; my $d = $ng->description; - more($self, "$ng->{name} $d"); + more($self, "$ng->{newsgroup} $d"); } } @@ -226,7 +239,6 @@ sub cmd_listgroup ($;$) { sub parse_time ($$;$) { my ($date, $time, $gmt) = @_; - use Time::Local qw(); my ($hh, $mm, $ss) = unpack('A2A2A2', $time); if (defined $gmt) { $gmt =~ /\A(?:UTC|GMT)\z/i or die "GM invalid: $gmt"; @@ -238,22 +250,22 @@ sub parse_time ($$;$) { ($YYYY, $MM, $DD) = unpack('A4A2A2', $date); } else { # legacy clients send YYMMDD ($YYYY, $MM, $DD) = unpack('A2A2A2', $date); - if ($YYYY > strftime('%y', @now)) { - my $cur_year = $now[5] + 1900; + my $cur_year = $now[5] + 1900; + if ($YYYY > $cur_year) { $YYYY += int($cur_year / 1000) * 1000 - 100; } } if ($gmt) { - Time::Local::timegm($ss, $mm, $hh, $DD, $MM - 1, $YYYY); + timegm($ss, $mm, $hh, $DD, $MM - 1, $YYYY); } else { - Time::Local::timelocal($ss, $mm, $hh, $DD, $MM - 1, $YYYY); + timelocal($ss, $mm, $hh, $DD, $MM - 1, $YYYY); } } sub group_line ($$) { my ($self, $ng) = @_; my ($min, $max) = $ng->mm->minmax; - more($self, "$ng->{name} $max $min n") if defined $min && defined $max; + more($self, "$ng->{newsgroup} $max $min n") if defined $min && defined $max; } sub cmd_newgroups ($$$;$$) { @@ -275,7 +287,6 @@ sub wildmat2re (;$) { return $_[0] = qr/.*/ if (!defined $_[0] || $_[0] eq '*'); my %keep; my $salt = rand; - use Digest::SHA qw(sha1_hex); my $tmp = $_[0]; $tmp =~ s#(?<!\\)\[(.+)(?<!\\)\]# @@ -313,8 +324,8 @@ sub cmd_newnews ($$$$;$$) { ngpat2re($skip); my @srch; foreach my $ng (@{$self->{nntpd}->{grouplist}}) { - $ng->{name} =~ $keep or next; - $ng->{name} =~ $skip and next; + $ng->{newsgroup} =~ $keep or next; + $ng->{newsgroup} =~ $skip and next; my $srch = $ng->search or next; push @srch, $srch; }; @@ -382,7 +393,8 @@ sub cmd_last ($) { article_adj($_[0], -1) } sub cmd_post ($) { my ($self) = @_; my $ng = $self->{ng}; - $ng ? "440 mailto:$ng->{address} to post" : '440 posting not allowed' + $ng ? "440 mailto:$ng->{-primary_address} to post" + : '440 posting not allowed' } sub cmd_quit ($) { @@ -392,6 +404,29 @@ sub cmd_quit ($) { undef; } +sub header_append ($$$) { + my ($hdr, $k, $v) = @_; + my @v = $hdr->header($k); + foreach (@v) { + return if $v eq $_; + } + $hdr->header_set($k, @v, $v); +} + +sub set_nntp_headers { + my ($hdr, $ng, $n, $mid) = @_; + + # clobber some + $hdr->header_set('Newsgroups', $ng->{newsgroup}); + $hdr->header_set('Xref', xref($ng, $n)); + header_append($hdr, 'List-Post', "<mailto:$ng->{-primary_address}>"); + if (my $url = $ng->base_url) { + $mid = mid_escape($mid); + header_append($hdr, 'Archived-At', "<$url$mid/>"); + header_append($hdr, 'List-Archive', "<$url>"); + } +} + sub art_lookup ($$$) { my ($self, $art, $set_headers) = @_; my $ng = $self->{ng}; @@ -428,14 +463,12 @@ find_mid: defined $mid or return $err; } found: - my $o = 'HEAD:' . mid2path($mid); my $bytes; - my $s = eval { Email::MIME->new($ng->gcf->cat_file($o, \$bytes)) }; - return $err unless $s; + my $s = eval { $ng->msg_by_mid($mid, \$bytes) } or return $err; + $s = Email::Simple->new($s); my $lines; if ($set_headers) { - $s->header_set('Newsgroups', $ng->{name}); - $s->header_set('Xref', xref($ng, $n)); + set_nntp_headers($s->header_obj, $ng, $n, $mid); $lines = $s->body =~ tr!\n!\n!; # must be last @@ -460,6 +493,12 @@ sub set_art { $self->{article} = $art if defined $art && $art =~ /\A\d+\z/; } +sub _header ($) { + my $hdr = $_[0]->header_obj->as_string; + utf8::encode($hdr); + $hdr +} + sub cmd_article ($;$) { my ($self, $art) = @_; my $r = art_lookup($self, $art, 1); @@ -467,7 +506,7 @@ sub cmd_article ($;$) { my ($n, $mid, $s) = @$r; set_art($self, $art); more($self, "220 $n <$mid> article retrieved - head and body follow"); - do_more($self, $s->header_obj->as_string); + do_more($self, _header($s)); do_more($self, "\r\n"); simple_body_write($self, $s); } @@ -479,7 +518,7 @@ sub cmd_head ($;$) { my ($n, $mid, $s) = @$r; set_art($self, $art); more($self, "221 $n <$mid> article retrieved - head follows"); - do_more($self, $s->header_obj->as_string); + do_more($self, _header($s)); '.' } @@ -533,16 +572,6 @@ sub get_range ($$) { [ $beg, $end ]; } -sub hdr_val ($$) { - my ($r, $header) = @_; - return $r->[3] if $header =~ /\A:?bytes\z/i; - return $r->[4] if $header =~ /\A:?lines\z/i; - $r = $r->[2]->header_obj->header($header); - defined $r or return; - $r =~ s/[\r\n\t]+/ /sg; - $r; -} - sub long_response ($$$$) { my ($self, $beg, $end, $cb) = @_; die "BUG: nested long response" if $self->{long_res}; @@ -584,9 +613,9 @@ sub long_response ($$$$) { # no recursion, schedule another call ASAP # but only after all pending writes are done update_idle_time($self); - Danga::Socket->AddTimer(0, sub { - $self->write($self->{long_res}); - }); + + push @$nextq, $self; + $nextt ||= PublicInbox::EvCleanup::asap(*next_tick); } else { # all done! $self->{long_res} = undef; $self->watch_read(1); @@ -622,7 +651,7 @@ sub hdr_message_id ($$$) { # optimize XHDR Message-ID [range] for slrnpull. sub xref ($$) { my ($ng, $n) = @_; - "$ng->{domain} $ng->{name}:$n" + "$ng->{domain} $ng->{newsgroup}:$n" } sub mid_lookup ($$) { @@ -665,8 +694,7 @@ sub hdr_xref ($$$) { # optimize XHDR Xref [range] for rtin sub search_header_for { my ($srch, $mid, $field) = @_; - my $smsg = $srch->lookup_message($mid) or return; - $smsg = PublicInbox::SearchMsg->load_doc($smsg->{doc}); + my $smsg = $srch->lookup_mail($mid) or return; $smsg->$field; } @@ -696,6 +724,7 @@ sub hdr_searchmsg ($$$$) { foreach my $s (@$msgs) { $tmp .= $s->num . ' ' . $s->$field . "\r\n"; } + utf8::encode($tmp); do_more($self, $tmp); # -1 to adjust for implicit increment in long_response $$i = $nr ? $$i + $nr - 1 : long_response_limit; @@ -784,7 +813,7 @@ sub over_line ($$) { my ($num, $smsg) = @_; # n.b. field access and procedural calls can be # 10%-15% faster than OO method calls: - join("\t", $num, + my $s = join("\t", $num, $smsg->{subject}, $smsg->{from}, PublicInbox::SearchMsg::date($smsg), @@ -792,16 +821,17 @@ sub over_line ($$) { $smsg->{references}, PublicInbox::SearchMsg::bytes($smsg), PublicInbox::SearchMsg::lines($smsg)); + utf8::encode($s); + $s } sub cmd_over ($;$) { my ($self, $range) = @_; if ($range && $range =~ /\A<(.+)>\z/) { my ($ng, $n) = mid_lookup($self, $1); - my $smsg = $ng->search->lookup_message($range) or + my $smsg = $ng->search->lookup_mail($range) or return '430 No article with that message-id'; more($self, '224 Overview information follows (multi-line)'); - $smsg = PublicInbox::SearchMsg->load_doc($smsg->{doc}); # Only set article number column if it's the current group my $self_ng = $self->{ng}; @@ -846,7 +876,7 @@ sub cmd_xpath ($$) { my @paths; foreach my $ng (values %{$self->{nntpd}->{groups}}) { my $n = $ng->mm->num_for($mid); - push @paths, "$ng->{name}/$n" if defined $n; + push @paths, "$ng->{newsgroup}/$n" if defined $n; } return '430 no such article on server' unless @paths; '223 '.join(' ', @paths); @@ -865,7 +895,7 @@ sub more ($$) { sub do_write ($$) { my ($self, $data) = @_; my $done = $self->write($data); - die if $self->{closed}; + return if $self->{closed}; # Do not watch for readability if we have data in the queue, # instead re-enable watching for readability when we can @@ -922,6 +952,7 @@ sub event_read { $self->{rbuf} .= $$buf; while ($r > 0 && $self->{rbuf} =~ s/\A\s*([^\r\n]+)\r?\n//) { my $line = $1; + return $self->close if $line =~ /[[:cntrl:]]/s; my $t0 = now(); my $fd = $self->{fd}; $r = eval { process_line($self, $line) }; @@ -945,19 +976,25 @@ sub watch_read { # and we must double-check again by the time the timer fires # in case we really did dispatch a read event and started # another long response. - Danga::Socket->AddTimer(0, sub { - if (&Danga::Socket::POLLIN & $self->{event_watch}) { - event_read($self); - } - }); + push @$nextq, $self; + $nextt ||= PublicInbox::EvCleanup::asap(*next_tick); } $rv; } +sub not_idle_long ($$) { + my ($self, $now) = @_; + defined(my $fd = $self->{fd}) or return; + my $ary = $EXPMAP->{$fd} or return; + my $exp_at = $ary->[0] + $EXPTIME; + $exp_at > $now; +} + # for graceful shutdown in PublicInbox::Daemon: -sub busy () { - my ($self) = @_; - ($self->{rbuf} ne '' || $self->{long_res} || $self->{write_buf_size}); +sub busy { + my ($self, $now) = @_; + ($self->{rbuf} ne '' || $self->{long_res} || $self->{write_buf_size} || + not_idle_long($self, $now)); } 1; diff --git a/lib/PublicInbox/NNTPD.pm b/lib/PublicInbox/NNTPD.pm new file mode 100644 index 00000000..eb43a2bf --- /dev/null +++ b/lib/PublicInbox/NNTPD.pm @@ -0,0 +1,44 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# represents an NNTPD (currently a singleton), +# see script/public-inbox-nntpd for how it is used +package PublicInbox::NNTPD; +use strict; +use warnings; +require PublicInbox::Config; + +sub new { + my ($class) = @_; + bless { + groups => {}, + err => \*STDERR, + out => \*STDOUT, + grouplist => [], + }, $class; +} + +sub refresh_groups () { + my ($self) = @_; + my $pi_config = PublicInbox::Config->new; + my $new = {}; + my @list; + $pi_config->each_inbox(sub { + my ($ng) = @_; + my $ngname = $ng->{newsgroup} or return; + if (ref $ngname) { + warn 'multiple newsgroups not supported: '. + join(', ', @$ngname). "\n"; + } elsif ($ng->nntp_usable) { + # Only valid if msgmap and search works + $new->{$ngname} = $ng; + push @list, $ng; + } + }); + @list = sort { $a->{newsgroup} cmp $b->{newsgroup} } @list; + $self->{grouplist} = \@list; + # this will destroy old groups that got deleted + %{$self->{groups}} = %$new; +} + +1; diff --git a/lib/PublicInbox/NewsGroup.pm b/lib/PublicInbox/NewsGroup.pm deleted file mode 100644 index b20180e6..00000000 --- a/lib/PublicInbox/NewsGroup.pm +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (C) 2015 all contributors <meta@public-inbox.org> -# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) -# -# Used only by the NNTP server to represent a public-inbox git repository -# as a newsgroup -package PublicInbox::NewsGroup; -use strict; -use warnings; -use Scalar::Util qw(weaken); -require Danga::Socket; -require PublicInbox::Msgmap; -require PublicInbox::Search; -require PublicInbox::Git; - -sub new { - my ($class, $name, $git_dir, $address) = @_; - $address = $address->[0] if ref($address); - my $self = bless { - name => $name, - git_dir => $git_dir, - address => $address, - }, $class; - $self->{domain} = ($address =~ /\@(\S+)\z/) ? $1 : 'localhost'; - $self; -} - -sub defer_weaken { - my ($self, $field) = @_; - Danga::Socket->AddTimer(30, sub { weaken($self->{$field}) }); -} - -sub gcf { - my ($self) = @_; - $self->{gcf} ||= eval { - my $gcf = PublicInbox::Git->new($self->{git_dir}); - - # git repos may be repacked and old packs unlinked - defer_weaken($self, 'gcf'); - $gcf; - }; -} - -sub usable { - my ($self) = @_; - eval { - PublicInbox::Msgmap->new($self->{git_dir}); - PublicInbox::Search->new($self->{git_dir}); - }; -} - -sub mm { - my ($self) = @_; - $self->{mm} ||= eval { - my $mm = PublicInbox::Msgmap->new($self->{git_dir}); - - # may be needed if we run low on handles - defer_weaken($self, 'mm'); - $mm; - }; -} - -sub search { - my ($self) = @_; - $self->{search} ||= eval { - my $search = PublicInbox::Search->new($self->{git_dir}); - - # may be needed if we run low on handles - defer_weaken($self, 'search'); - $search; - }; -} - -sub description { - my ($self) = @_; - open my $fh, '<', "$self->{git_dir}/description" or return ''; - my $desc = eval { local $/; <$fh> }; - chomp $desc; - $desc =~ s/\s+/ /smg; - $desc; -} - -sub update { - my ($self, $new) = @_; - $self->{address} = $new->{address}; - $self->{domain} = $new->{domain}; - if ($self->{git_dir} ne $new->{git_dir}) { - # new git_dir requires a new mm and gcf - $self->{mm} = $self->{gcf} = undef; - $self->{git_dir} = $new->{git_dir}; - } -} - -1; diff --git a/lib/PublicInbox/NewsWWW.pm b/lib/PublicInbox/NewsWWW.pm index dfc00217..b4d74763 100644 --- a/lib/PublicInbox/NewsWWW.pm +++ b/lib/PublicInbox/NewsWWW.pm @@ -2,14 +2,14 @@ # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Plack app redirector for mapping /$NEWSGROUP requests to -# the appropriate /$LISTNAME in PublicInbox::WWW because some +# the appropriate /$INBOX in PublicInbox::WWW because some # auto-linkifiers cannot handle nntp:// redirects properly. # This is also used directly by PublicInbox::WWW package PublicInbox::NewsWWW; use strict; use warnings; use PublicInbox::Config; -use URI::Escape qw(uri_escape_utf8); +use PublicInbox::MID qw(mid_escape); sub new { my ($class, $pi_config) = @_; @@ -19,7 +19,6 @@ sub new { sub call { my ($self, $env) = @_; - my $ng_map = $self->newsgroup_map; my $path = $env->{PATH_INFO}; $path =~ s!\A/+!!; $path =~ s!/+\z!!; @@ -27,54 +26,24 @@ sub call { # some links may have the article number in them: # /inbox.foo.bar/123456 my ($ng, $article) = split(m!/+!, $path, 2); - if (my $info = $ng_map->{$ng}) { - my $url = PublicInbox::Hval::prurl($env, $info->{url}); + if (my $inbox = $self->{pi_config}->lookup_newsgroup($ng)) { + my $url = PublicInbox::Hval::prurl($env, $inbox->{url}); my $code = 301; - my $h = [ Location => $url, 'Content-Type' => 'text/plain' ]; if (defined $article && $article =~ /\A\d+\z/) { - my $mid = eval { ng_mid_for($ng, $info, $article) }; + my $mid = eval { $inbox->mm->mid_for($article) }; if (defined $mid) { # article IDs are not stable across clones, # do not encourage caching/bookmarking them $code = 302; - $url .= uri_escape_utf8($mid) . '/'; + $url .= mid_escape($mid) . '/'; } } - return [ $code, $h, [ "Redirecting to $url\n" ] ] - } - [ 404, [ 'Content-Type' => 'text/plain' ], [] ]; -} - -sub ng_mid_for { - my ($ng, $info, $article) = @_; - # may fail due to lack of Danga::Socket - # for defer_weaken: - require PublicInbox::NewsGroup; - $ng = $info->{ng} ||= - PublicInbox::NewsGroup->new($ng, $info->{git_dir}, ''); - $ng->mm->mid_for($article); -} - -sub newsgroup_map { - my ($self) = @_; - my $rv; - $rv = $self->{ng_map} and return $rv; - my $pi_config = $self->{pi_config}; - my %ng_map; - foreach my $k (keys %$pi_config) { - $k =~ /\Apublicinbox\.([^\.]+)\.mainrepo\z/ or next; - my $listname = $1; - my $git_dir = $pi_config->{"publicinbox.$listname.mainrepo"}; - my $url = $pi_config->{"publicinbox.$listname.url"}; - defined $url or next; - my $ng = $pi_config->{"publicinbox.$listname.newsgroup"}; - next if (!defined $ng) || ($ng eq ''); # disabled + my $h = [ Location => $url, 'Content-Type' => 'text/plain' ]; - $url =~ m!/\z! or $url .= '/'; - $ng_map{$ng} = { url => $url, git_dir => $git_dir }; + return [ $code, $h, [ "Redirecting to $url\n" ] ] } - $self->{ng_map} = \%ng_map; + [ 404, [ 'Content-Type' => 'text/plain' ], [ "404 Not Found\n" ] ]; } 1; diff --git a/lib/PublicInbox/ParentPipe.pm b/lib/PublicInbox/ParentPipe.pm new file mode 100644 index 00000000..d2d054ce --- /dev/null +++ b/lib/PublicInbox/ParentPipe.pm @@ -0,0 +1,21 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# only for PublicInbox::Daemon +package PublicInbox::ParentPipe; +use strict; +use warnings; +use base qw(Danga::Socket); +use fields qw(cb); + +sub new ($$$) { + my ($class, $pipe, $cb) = @_; + my $self = fields::new($class); + $self->SUPER::new($pipe); + $self->{cb} = $cb; + $self->watch_read(1); + $self; +} + +sub event_read { $_[0]->{cb}->($_[0]) } + +1; diff --git a/lib/PublicInbox/Qspawn.pm b/lib/PublicInbox/Qspawn.pm new file mode 100644 index 00000000..697c55a1 --- /dev/null +++ b/lib/PublicInbox/Qspawn.pm @@ -0,0 +1,71 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Limits the number of processes spawned +# This does not depend on Danga::Socket or any other external +# scheduling mechanism, you just need to call start and finish +# appropriately +package PublicInbox::Qspawn; +use strict; +use warnings; +use PublicInbox::Spawn qw(popen_rd); + +sub new ($$$;) { + my ($class, $cmd, $env, $opt) = @_; + bless { args => [ $cmd, $env, $opt ] }, $class; +} + +sub _do_spawn { + my ($self, $cb) = @_; + my $err; + + ($self->{rpipe}, $self->{pid}) = popen_rd(@{$self->{args}}); + if (defined $self->{pid}) { + $self->{limiter}->{running}++; + } else { + $self->{err} = $!; + } + $cb->($self->{rpipe}); +} + +sub finish ($) { + my ($self) = @_; + my $limiter = $self->{limiter}; + if (delete $self->{rpipe}) { + my $pid = delete $self->{pid}; + $self->{err} = $pid == waitpid($pid, 0) ? $? : + "PID:$pid still running?"; + $limiter->{running}--; + } + if (my $next = shift @{$limiter->{run_queue}}) { + _do_spawn(@$next); + } + $self->{err}; +} + +sub start { + my ($self, $limiter, $cb) = @_; + $self->{limiter} = $limiter; + + if ($limiter->{running} < $limiter->{max}) { + _do_spawn($self, $cb); + } else { + push @{$limiter->{run_queue}}, [ $self, $cb ]; + } +} + +package PublicInbox::Qspawn::Limiter; +use strict; +use warnings; + +sub new { + my ($class, $max) = @_; + bless { + # 32 is same as the git-daemon connection limit + max => $max || 32, + running => 0, + run_queue => [], + }, $class; +} + +1; diff --git a/lib/PublicInbox/Repobrowse.pm b/lib/PublicInbox/Repobrowse.pm index 0a812f72..cdd708e9 100644 --- a/lib/PublicInbox/Repobrowse.pm +++ b/lib/PublicInbox/Repobrowse.pm @@ -96,6 +96,7 @@ sub call { extra => \@extra, # path cgi => $cgi, rconfig => $rconfig, + env => $env, }; my $tslash = 0; my $cmd = shift @extra; diff --git a/lib/PublicInbox/RepobrowseConfig.pm b/lib/PublicInbox/RepobrowseConfig.pm index 77ef46bb..a08c6cec 100644 --- a/lib/PublicInbox/RepobrowseConfig.pm +++ b/lib/PublicInbox/RepobrowseConfig.pm @@ -3,7 +3,8 @@ package PublicInbox::RepobrowseConfig; use strict; use warnings; -use PublicInbox::Config qw/try_cat/; +use PublicInbox::Inbox; +use PublicInbox::Config; require PublicInbox::Hval; sub new { @@ -52,7 +53,7 @@ sub lookup { # gitweb compatibility foreach my $key (qw(description cloneurl)) { - $rv->{$key} = try_cat("$path/$key"); + $rv->{$key} = PublicInbox::Inbox::try_cat("$path/$key"); } $rv->{desc_html} = diff --git a/lib/PublicInbox/RepobrowseGitFallback.pm b/lib/PublicInbox/RepobrowseGitFallback.pm index 696e5b94..38640139 100644 --- a/lib/PublicInbox/RepobrowseGitFallback.pm +++ b/lib/PublicInbox/RepobrowseGitFallback.pm @@ -15,8 +15,7 @@ sub call { my $expath = $req->{expath}; return if index($expath, '..') >= 0; # prevent path traversal my $git = $req->{repo_info}->{git}; - my $cgi = $req->{cgi}; - PublicInbox::GitHTTPBackend::serve($cgi, $git, $expath); + PublicInbox::GitHTTPBackend::serve($req->{env}, $git, $expath); } 1; diff --git a/lib/PublicInbox/SaPlugin/ListMirror.pm b/lib/PublicInbox/SaPlugin/ListMirror.pm new file mode 100644 index 00000000..3808196c --- /dev/null +++ b/lib/PublicInbox/SaPlugin/ListMirror.pm @@ -0,0 +1,109 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Rules useful for running a mailing list mirror. We want to: +# * ensure Received: headers are really from the list mail server +# users expect. This is to prevent malicious users from +# injecting spam into mirrors without going through the expected +# server +# * flag messages where the mailing list is Bcc:-ed since it is +# common for spam to have wrong or non-existent To:/Cc: headers. + +package PublicInbox::SaPlugin::ListMirror; +use strict; +use warnings; +use base qw(Mail::SpamAssassin::Plugin); + +# constructor: register the eval rules +sub new { + my ($class, $mail) = @_; + + # some boilerplate... + $class = ref($class) || $class; + my $self = $class->SUPER::new($mail); + bless $self, $class; + $mail->{conf}->{list_mirror_check} = []; + $self->register_eval_rule('check_list_mirror_received'); + $self->register_eval_rule('check_list_mirror_bcc'); + $self->set_config($mail->{conf}); + $self; +} + +sub check_list_mirror_received { + my ($self, $pms) = @_; + my $recvd = $pms->get('Received') || ''; + $recvd =~ s/\n.*\z//s; + + foreach my $cfg (@{$pms->{conf}->{list_mirror_check}}) { + my ($hdr, $hval, $host_re, $addr_re) = @$cfg; + my $v = $pms->get($hdr) or next; + local $/ = "\n"; + chomp $v; + next if $v ne $hval; + return 1 if $recvd !~ $host_re; + } + + 0; +} + +sub check_list_mirror_bcc { + my ($self, $pms) = @_; + my $tocc = $pms->get('ToCc'); + + foreach my $cfg (@{$pms->{conf}->{list_mirror_check}}) { + my ($hdr, $hval, $host_re, $addr_re) = @$cfg; + defined $addr_re or next; + my $v = $pms->get($hdr) or next; + local $/ = "\n"; + chomp $v; + next if $v ne $hval; + return 1 if !$tocc || $tocc !~ $addr_re; + } + + 0; +} + +# list_mirror HEADER HEADER_VALUE HOSTNAME_GLOB [LIST_ADDRESS] +# list_mirror X-Mailing-List git@vger.kernel.org *.kernel.org +# list_mirror List-Id <foo.example.org> *.example.org foo@example.org +sub config_list_mirror { + my ($self, $key, $value, $line) = @_; + + defined $value or + return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE; + + my ($hdr, $hval, $host_glob, @extra) = split(/\s+/, $value); + my $addr = shift @extra; + + if (defined $addr) { + $addr !~ /\@/ and + return $Mail::SpamAssassin::Conf::INVALID_VALUE; + $addr = join('|', map { quotemeta } split(/,/, $addr)); + $addr = qr/\b$addr\b/i; + } + + @extra and return $Mail::SpamAssassin::Conf::INVALID_VALUE; + + defined $host_glob or + return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE; + + my %patmap = ('*' => '\S+', '?' => '.', '[' => '[', ']' => ']'); + $host_glob =~ s!(.)!$patmap{$1} || "\Q$1"!ge; + my $host_re = qr/\A\s*from\s+$host_glob(?:\s|$)/si; + + push @{$self->{list_mirror_check}}, [ $hdr, $hval, $host_re, $addr ]; +} + +sub set_config { + my ($self, $conf) = @_; + my @cmds; + push @cmds, { + setting => 'list_mirror', + default => '', + type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING, + code => *config_list_mirror, + }; + $conf->{parser}->register_commands(\@cmds); +} + +1; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 0f7815fb..24cb2667 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -12,6 +12,7 @@ use constant TS => 0; # timestamp use constant NUM => 1; # NNTP article number use constant BYTES => 2; # :bytes as defined in RFC 3977 use constant LINES => 3; # :lines as defined in RFC 3977 +use constant YYYYMMDD => 4; # for searching in the WWW UI use Search::Xapian qw/:standard/; use PublicInbox::SearchMsg; @@ -36,7 +37,9 @@ use constant { # 8 - remove redundant/unneeded document data # 9 - disable Message-ID compression (SHA-1) # 10 - optimize doc for NNTP overviews - SCHEMA_VERSION => 10, + # 11 - merge threads when vivifying ghosts + # 12 - change YYYYMMDD value column to numeric + SCHEMA_VERSION => 12, # n.b. FLAG_PURE_NOT is expensive not suitable for a public website # as it could become a denial-of-service vector @@ -50,16 +53,57 @@ my %bool_pfx_internal = ( ); my %bool_pfx_external = ( + # do we still need these? probably not.. path => 'XPATH', mid => 'Q', # uniQue id (Message-ID) ); my %prob_prefix = ( - subject => 'S', - s => 'S', # for mairix compatibility - m => 'Q', # 'mid' is exact, 'm' can do partial + # for mairix compatibility + s => 'S', + m => 'XMID', # 'mid:' (bool) is exact, 'm:' (prob) can do partial + f => 'A', + t => 'XTO', + tc => 'XTO XCC', + c => 'XCC', + tcf => 'XTO XCC A', + a => 'XTO XCC A', + b => 'XNQ XQUOT', + bs => 'XNQ XQUOT S', + n => 'XFN', + + q => 'XQUOT', + nq => 'XNQ', + + # default: + '' => 'XMID S A XNQ XQUOT XFN', ); +# not documenting m: and mid: for now, the using the URLs works w/o Xapian +our @HELP = ( + 's:' => 'match within Subject e.g. s:"a quick brown fox"', + 'd:' => <<EOF, +date range as YYYYMMDD e.g. d:19931002..20101002 +Open-ended ranges such as d:19931002.. and d:..20101002 +are also supported +EOF + 'b:' => 'match within message body, including text attachments', + 'nq:' => 'match non-quoted text within message body', + 'quot:' => 'match quoted text within message body', + 'n:' => 'match filename of attachment(s)', + 't:' => 'match within the To header', + 'c:' => 'match within the Cc header', + 'f:' => 'match within the From header', + 'a:' => 'match within the To, Cc, and From headers', + 'tc:' => 'match within the To and Cc headers', + 'bs:' => 'match within the Subject and body', +); +chomp @HELP; +# TODO: +# df (filenames from diff) +# da (diff a/ removed lines) +# db (diff b/ added lines) + my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix); sub xpfx { $all_pfx{$_[0]} } @@ -78,10 +122,10 @@ sub xdir { } sub new { - my ($class, $git_dir) = @_; + my ($class, $git_dir, $altid) = @_; my $dir = $class->xdir($git_dir); my $db = Search::Xapian::Database->new($dir); - bless { xdb => $db, git_dir => $git_dir }, $class; + bless { xdb => $db, git_dir => $git_dir, altid => $altid }, $class; } sub reopen { $_[0]->{xdb}->reopen } @@ -97,7 +141,7 @@ sub query { $opts->{relevance} = 1 unless exists $opts->{relevance}; } - $self->do_enquire($query, $opts); + _do_enquire($self, $query, $opts); } sub get_thread { @@ -106,13 +150,44 @@ sub get_thread { return { total => 0, msgs => [] } unless $smsg; my $qtid = Search::Xapian::Query->new(xpfx('thread').$smsg->thread_id); - my $path = id_compress($smsg->path); - my $qsub = Search::Xapian::Query->new(xpfx('path').$path); - my $query = Search::Xapian::Query->new(OP_OR, $qtid, $qsub); - $self->do_enquire($query, $opts); + my $path = $smsg->path; + if (defined $path && $path ne '') { + my $path = id_compress($smsg->path); + my $qsub = Search::Xapian::Query->new(xpfx('path').$path); + $qtid = Search::Xapian::Query->new(OP_OR, $qtid, $qsub); + } + $opts ||= {}; + $opts->{limit} ||= 1000; + + # always sort threads by timestamp, this makes life easier + # for the threading algorithm (in SearchThread.pm) + $opts->{asc} = 1; + + _do_enquire($self, $qtid, $opts); +} + +sub retry_reopen { + my ($self, $cb) = @_; + my $ret; + for (1..10) { + eval { $ret = $cb->() }; + return $ret unless $@; + # Exception: The revision being read has been discarded - + # you should call Xapian::Database::reopen() + if (ref($@) eq 'Search::Xapian::DatabaseModifiedError') { + reopen($self); + } else { + die; + } + } +} + +sub _do_enquire { + my ($self, $query, $opts) = @_; + retry_reopen($self, sub { _enquire_once($self, $query, $opts) }); } -sub do_enquire { +sub _enquire_once { my ($self, $query, $opts) = @_; my $enquire = $self->enquire; if (defined $query) { @@ -125,6 +200,8 @@ sub do_enquire { my $desc = !$opts->{asc}; if ($opts->{relevance}) { $enquire->set_sort_by_relevance_then_value(TS, $desc); + } elsif ($opts->{num}) { + $enquire->set_sort_by_value(NUM, 0); } else { $enquire->set_sort_by_value_then_relevance(TS, $desc); } @@ -155,28 +232,37 @@ sub qp { $qp->set_database($self->{xdb}); $qp->set_stemmer($self->stemmer); $qp->set_stemming_strategy(STEM_SOME); - $qp->add_valuerangeprocessor($self->ts_range_processor); - $qp->add_valuerangeprocessor($self->date_range_processor); + $qp->add_valuerangeprocessor( + Search::Xapian::NumberValueRangeProcessor->new(YYYYMMDD, 'd:')); while (my ($name, $prefix) = each %bool_pfx_external) { $qp->add_boolean_prefix($name, $prefix); } + # we do not actually create AltId objects, + # just parse the spec to avoid the extra DB handles for now. + if (my $altid = $self->{altid}) { + my $user_pfx = $self->{-user_pfx} ||= []; + for (@$altid) { + # $_ = 'serial:gmane:/path/to/gmane.msgmap.sqlite3' + /\Aserial:(\w+):/ or next; + my $pfx = $1; + push @$user_pfx, "$pfx:", <<EOF; +alternate serial number e.g. $pfx:12345 (boolean) +EOF + # gmane => XGMANE + $qp->add_boolean_prefix($pfx, 'X'.uc($pfx)); + } + chomp @$user_pfx; + } + while (my ($name, $prefix) = each %prob_prefix) { - $qp->add_prefix($name, $prefix); + $qp->add_prefix($name, $_) foreach split(/ /, $prefix); } $self->{query_parser} = $qp; } -sub ts_range_processor { - $_[0]->{tsrp} ||= Search::Xapian::NumberValueRangeProcessor->new(TS); -} - -sub date_range_processor { - $_[0]->{drp} ||= Search::Xapian::DateValueRangeProcessor->new(TS); -} - sub num_range_processor { $_[0]->{nrp} ||= Search::Xapian::NumberValueRangeProcessor->new(NUM); } @@ -184,21 +270,12 @@ sub num_range_processor { # only used for NNTP server sub query_xover { my ($self, $beg, $end, $offset) = @_; - my $enquire = $self->enquire; my $qp = Search::Xapian::QueryParser->new; $qp->set_database($self->{xdb}); $qp->add_valuerangeprocessor($self->num_range_processor); my $query = $qp->parse_query("$beg..$end", QP_FLAGS); - $query = Search::Xapian::Query->new(OP_AND, $mail_query, $query); - $enquire->set_query($query); - $enquire->set_sort_by_value(NUM, 0); - my $limit = 200; - my $mset = $enquire->get_mset($offset, $limit); - my @msgs = map { - PublicInbox::SearchMsg->load_doc($_->get_document); - } $mset->items; - { total => $mset->get_matches_estimated, msgs => \@msgs } + _do_enquire($self, $query, {num => 1, limit => 200, offset => $offset}); } sub lookup_message { @@ -216,6 +293,12 @@ sub lookup_message { $smsg; } +sub lookup_mail { # no ghosts! + my ($self, $mid) = @_; + my $smsg = lookup_message($self, $mid) or return; + PublicInbox::SearchMsg->load_doc($smsg->{doc}); +} + sub find_unique_doc_id { my ($self, $term, $value) = @_; @@ -295,4 +378,14 @@ sub enquire { $self->{enquire} ||= Search::Xapian::Enquire->new($self->{xdb}); } +sub help { + my ($self) = @_; + $self->qp; # parse altids + my @ret = @HELP; + if (my $user_pfx = $self->{-user_pfx}) { + push @ret, @$user_pfx; + } + \@ret; +} + 1; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 63be6810..832d1cbf 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -4,13 +4,20 @@ # # Indexes mail with Xapian and our (SQLite-based) ::Msgmap for use # with the web and NNTP interfaces. This index maintains thread -# relationships for use by Mail::Thread. This writes to the search -# index. +# relationships for use by PublicInbox::SearchThread. +# This writes to the search index. package PublicInbox::SearchIdx; use strict; use warnings; +use Fcntl qw(:flock :DEFAULT); +use Email::MIME; +use Email::MIME::ContentType; +$Email::MIME::ContentType::STRICT_PARAMS = 0; use base qw(PublicInbox::Search); use PublicInbox::MID qw/mid_clean id_compress mid_mime/; +use PublicInbox::MsgIter; +use Carp qw(croak); +use POSIX qw(strftime); require PublicInbox::Git; *xpfx = *PublicInbox::Search::xpfx; @@ -24,135 +31,206 @@ use constant { }; sub new { - my ($class, $git_dir, $writable) = @_; - my $dir = $class->xdir($git_dir); + my ($class, $inbox, $creat) = @_; + my $git_dir = $inbox; + my $altid; + if (ref $inbox) { + $git_dir = $inbox->{mainrepo}; + $altid = $inbox->{altid}; + if ($altid) { + require PublicInbox::AltId; + $altid = [ map { + PublicInbox::AltId->new($inbox, $_); + } @$altid ]; + } + } require Search::Xapian::WritableDatabase; - my $flag = Search::Xapian::DB_OPEN; - my $self = bless { git_dir => $git_dir }, $class; + my $self = bless { git_dir => $git_dir, -altid => $altid }, $class; my $perm = $self->_git_config_perm; my $umask = _umask_for($perm); $self->{umask} = $umask; - $self->{xdb} = $self->with_umask(sub { - if ($writable == 1) { - require File::Path; - File::Path::mkpath($dir); - $flag = Search::Xapian::DB_CREATE_OR_OPEN; - } - Search::Xapian::WritableDatabase->new($dir, $flag); - }); + $self->{lock_path} = "$git_dir/ssoma.lock"; + $self->{git} = PublicInbox::Git->new($git_dir); + $self->{creat} = ($creat || 0) == 1; $self; } -sub add_val { +sub _xdb_release { + my ($self) = @_; + my $xdb = delete $self->{xdb} or croak 'not acquired'; + $xdb->close; + _lock_release($self) if $self->{creat}; + undef; +} + +sub _xdb_acquire { + my ($self) = @_; + croak 'already acquired' if $self->{xdb}; + my $dir = PublicInbox::Search->xdir($self->{git_dir}); + my $flag = Search::Xapian::DB_OPEN; + if ($self->{creat}) { + require File::Path; + _lock_acquire($self); + File::Path::mkpath($dir); + $self->{batch_size} = 100; + $flag = Search::Xapian::DB_CREATE_OR_OPEN; + } + $self->{xdb} = Search::Xapian::WritableDatabase->new($dir, $flag); +} + +# we only acquire the flock if creating or reindexing; +# PublicInbox::Import already has the lock on its own. +sub _lock_acquire { + my ($self) = @_; + croak 'already locked' if $self->{lockfh}; + sysopen(my $lockfh, $self->{lock_path}, O_WRONLY|O_CREAT) or + die "failed to open lock $self->{lock_path}: $!\n"; + flock($lockfh, LOCK_EX) or die "lock failed: $!\n"; + $self->{lockfh} = $lockfh; +} + +sub _lock_release { + my ($self) = @_; + my $lockfh = delete $self->{lockfh} or croak 'not locked'; + flock($lockfh, LOCK_UN) or die "unlock failed: $!\n"; + close $lockfh or die "close failed: $!\n"; +} + +sub add_val ($$$) { my ($doc, $col, $num) = @_; $num = Search::Xapian::sortable_serialise($num); $doc->add_value($col, $num); } +sub add_values ($$$) { + my ($smsg, $bytes, $num) = @_; + + my $ts = $smsg->ts; + my $doc = $smsg->{doc}; + add_val($doc, &PublicInbox::Search::TS, $ts); + + defined($num) and add_val($doc, &PublicInbox::Search::NUM, $num); + + defined($bytes) and add_val($doc, &PublicInbox::Search::BYTES, $bytes); + + add_val($doc, &PublicInbox::Search::LINES, + $smsg->{mime}->body_raw =~ tr!\n!\n!); + + my $yyyymmdd = strftime('%Y%m%d', gmtime($ts)); + add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd); +} + +sub index_users ($$) { + my ($tg, $smsg) = @_; + + my $from = $smsg->from; + my $to = $smsg->to; + my $cc = $smsg->cc; + + $tg->index_text($from, 1, 'A'); # A - author + $tg->increase_termpos; + $tg->index_text($to, 1, 'XTO') if $to ne ''; + $tg->increase_termpos; + $tg->index_text($cc, 1, 'XCC') if $cc ne ''; + $tg->increase_termpos; +} + +sub index_body ($$$) { + my ($tg, $lines, $inc) = @_; + $tg->index_text(join("\n", @$lines), $inc, $inc ? 'XNQ' : 'XQUOT'); + @$lines = (); + $tg->increase_termpos; +} + sub add_message { - my ($self, $mime, $bytes, $num) = @_; # mime = Email::MIME object + my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object my $db = $self->{xdb}; - my $doc_id; + my ($doc_id, $old_tid); my $mid = mid_clean(mid_mime($mime)); - my $was_ghost = 0; - my $ct_msg = $mime->header('Content-Type') || 'text/plain'; eval { die 'Message-ID too long' if length($mid) > MAX_MID_SIZE; my $smsg = $self->lookup_message($mid); - my $doc; - if ($smsg) { - $smsg->ensure_metadata; # convert a ghost to a regular message # it will also clobber any existing regular message - $smsg->mime($mime); - $doc = $smsg->{doc}; - - my $type = xpfx('type'); - eval { - $doc->remove_term($type . 'ghost'); - $was_ghost = 1; - }; - - # probably does not exist: - eval { $doc->remove_term($type . 'mail') }; - $doc->add_term($type . 'mail'); - } else { - $smsg = PublicInbox::SearchMsg->new($mime); - $doc = $smsg->{doc}; - $doc->add_term(xpfx('mid') . $mid); + $doc_id = $smsg->doc_id; + $old_tid = $smsg->thread_id; } + $smsg = PublicInbox::SearchMsg->new($mime); + my $doc = $smsg->{doc}; + $doc->add_term(xpfx('mid') . $mid); my $subj = $smsg->subject; - if ($subj ne '') { my $path = $self->subject_path($subj); $doc->add_term(xpfx('path') . id_compress($path)); } - add_val($doc, &PublicInbox::Search::TS, $smsg->ts); - - defined($num) and - add_val($doc, &PublicInbox::Search::NUM, $num); - - defined($bytes) and - add_val($doc, &PublicInbox::Search::BYTES, $bytes); - - add_val($doc, &PublicInbox::Search::LINES, - $mime->body_raw =~ tr!\n!\n!); + add_values($smsg, $bytes, $num); my $tg = $self->term_generator; $tg->set_document($doc); $tg->index_text($subj, 1, 'S') if $subj; $tg->increase_termpos; - $tg->index_text($subj) if $subj; - $tg->increase_termpos; - $tg->index_text($smsg->from); - $tg->increase_termpos; + index_users($tg, $smsg); - $mime->walk_parts(sub { - my ($part) = @_; - return if $part->subparts; # walk_parts already recurses - my $ct = $part->content_type || $ct_msg; + msg_iter($mime, sub { + my ($part, $depth, @idx) = @{$_[0]}; + my $ct = $part->content_type || 'text/plain'; + my $fn = $part->filename; + if (defined $fn && $fn ne '') { + $tg->index_text($fn, 1, 'XFN'); + } - # account for filter bugs... - $ct =~ m!\btext/plain\b!i or return; + return if $ct =~ m!\btext/x?html\b!i; + + my $s = eval { $part->body_str }; + if ($@) { + if ($ct =~ m!\btext/plain\b!i) { + # Try to assume UTF-8 because Alpine + # seems to do wacky things and set + # charset=X-UNKNOWN + $part->charset_set('UTF-8'); + $s = eval { $part->body_str }; + $s = $part->body if $@; + } + } + defined $s or return; my (@orig, @quot); my $body = $part->body; - $part->body_set(''); my @lines = split(/\n/, $body); while (defined(my $l = shift @lines)) { - if ($l =~ /^\s*>/) { + if ($l =~ /^>/) { + index_body($tg, \@orig, 1) if @orig; push @quot, $l; } else { + index_body($tg, \@quot, 0) if @quot; push @orig, $l; } } - if (@quot) { - $tg->index_text(join("\n", @quot), 0); - @quot = (); - $tg->increase_termpos; - } - if (@orig) { - $tg->index_text(join("\n", @orig)); - @orig = (); - $tg->increase_termpos; - } + index_body($tg, \@quot, 0) if @quot; + index_body($tg, \@orig, 1) if @orig; }); - if ($was_ghost) { - $doc_id = $smsg->doc_id; - $self->link_message($smsg, 0); - $doc->set_data($smsg->to_doc_data); + link_message($self, $smsg, $old_tid); + $tg->index_text($mid, 1, 'XMID'); + $doc->set_data($smsg->to_doc_data($blob)); + + if (my $altid = $self->{-altid}) { + foreach my $alt (@$altid) { + my $id = $alt->mid2alt($mid); + next unless defined $id; + $doc->add_term($alt->{xprefix} . $id); + } + } + if (defined $doc_id) { $db->replace_document($doc_id, $doc); } else { - $self->link_message($smsg, 0); - $doc->set_data($smsg->to_doc_data); $doc_id = $db->add_document($doc); } }; @@ -208,27 +286,17 @@ sub next_thread_id { } sub link_message { - my ($self, $smsg, $is_ghost) = @_; - - if ($is_ghost) { - $smsg->ensure_metadata; - } else { - $self->link_message_to_parents($smsg); - } -} - -sub link_message_to_parents { - my ($self, $smsg) = @_; + my ($self, $smsg, $old_tid) = @_; my $doc = $smsg->{doc}; my $mid = $smsg->mid; my $mime = $smsg->mime; my $hdr = $mime->header_obj; my $refs = $hdr->header_raw('References'); my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : (); - if (my $irt = $hdr->header_raw('In-Reply-To')) { - # last References should be $irt - # we will de-dupe later - push @refs, mid_clean($irt); + my $irt = $hdr->header_raw('In-Reply-To'); + if (defined $irt) { + $irt = mid_clean($irt); + $irt = undef if $mid eq $irt; } my $tid; @@ -237,6 +305,15 @@ sub link_message_to_parents { my @orig_refs = @refs; @refs = (); + if (defined $irt) { + # to check MAX_MID_SIZE + push @orig_refs, $irt; + + # below, we will ensure IRT (if specified) + # is the last References + $uniq{$irt} = 1; + } + # prevent circular references via References: here: foreach my $ref (@orig_refs) { if (length($ref) > MAX_MID_SIZE) { @@ -247,6 +324,11 @@ sub link_message_to_parents { push @refs, $ref; } } + + # last References should be IRT, but some mail clients do things + # out of order, so trust IRT over References iff IRT exists + push @refs, $irt if defined $irt; + if (@refs) { $smsg->{references} = '<'.join('> <', @refs).'>'; @@ -254,13 +336,12 @@ sub link_message_to_parents { # but we can never trust clients to do the right thing my $ref = shift @refs; $tid = $self->_resolve_mid_to_tid($ref); + $self->merge_threads($tid, $old_tid) if defined $old_tid; # the rest of the refs should point to this tid: foreach $ref (@refs) { my $ptid = $self->_resolve_mid_to_tid($ref); - if ($tid ne $ptid) { - $self->merge_threads($tid, $ptid); - } + merge_threads($self, $tid, $ptid); } } else { $tid = $self->next_thread_id; @@ -269,135 +350,196 @@ sub link_message_to_parents { } sub index_blob { - my ($self, $git, $mime, $bytes, $num) = @_; - $self->add_message($mime, $bytes, $num); + my ($self, $mime, $bytes, $num, $blob) = @_; + $self->add_message($mime, $bytes, $num, $blob); } sub unindex_blob { - my ($self, $git, $mime) = @_; + my ($self, $mime) = @_; my $mid = eval { mid_clean(mid_mime($mime)) }; $self->remove_message($mid) if defined $mid; } sub index_mm { - my ($self, $git, $mime) = @_; + my ($self, $mime) = @_; $self->{mm}->mid_insert(mid_clean(mid_mime($mime))); } sub unindex_mm { - my ($self, $git, $mime) = @_; + my ($self, $mime) = @_; $self->{mm}->mid_delete(mid_clean(mid_mime($mime))); } sub index_mm2 { - my ($self, $git, $mime, $bytes) = @_; + my ($self, $mime, $bytes, $blob) = @_; my $num = $self->{mm}->num_for(mid_clean(mid_mime($mime))); - index_blob($self, $git, $mime, $bytes, $num); + index_blob($self, $mime, $bytes, $num, $blob); } sub unindex_mm2 { - my ($self, $git, $mime) = @_; + my ($self, $mime) = @_; $self->{mm}->mid_delete(mid_clean(mid_mime($mime))); - unindex_blob($self, $git, $mime); + unindex_blob($self, $mime); } sub index_both { - my ($self, $git, $mime, $bytes) = @_; - my $num = index_mm($self, $git, $mime); - index_blob($self, $git, $mime, $bytes, $num); + my ($self, $mime, $bytes, $blob) = @_; + my $num = index_mm($self, $mime); + index_blob($self, $mime, $bytes, $num, $blob); } sub unindex_both { - my ($self, $git, $mime) = @_; - unindex_blob($self, $git, $mime); - unindex_mm($self, $git, $mime); + my ($self, $mime) = @_; + unindex_blob($self, $mime); + unindex_mm($self, $mime); } sub do_cat_mail { my ($git, $blob, $sizeref) = @_; my $mime = eval { my $str = $git->cat_file($blob, $sizeref); + # fixup bugs from import: + $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; Email::MIME->new($str); }; $@ ? undef : $mime; } sub index_sync { - my ($self, $head) = @_; - $self->with_umask(sub { $self->_index_sync($head) }); + my ($self, $opts) = @_; + with_umask($self, sub { $self->_index_sync($opts) }); } sub rlog { - my ($self, $range, $add_cb, $del_cb) = @_; + my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_; my $hex = '[a-f0-9]'; my $h40 = $hex .'{40}'; my $addmsg = qr!^:000000 100644 \S+ ($h40) A\t${hex}{2}/${hex}{38}$!; my $delmsg = qr!^:100644 000000 ($h40) \S+ D\t${hex}{2}/${hex}{38}$!; - my $git = PublicInbox::Git->new($self->{git_dir}); - my $log = $git->popen(qw/log --reverse --no-notes --no-color - --raw -r --no-abbrev/, $range); + my $git = $self->{git}; my $latest; my $bytes; - while (defined(my $line = <$log>)) { + my $max = $self->{batch_size}; # may be undef + local $/ = "\n"; + my $line; + while (defined($line = <$log>)) { if ($line =~ /$addmsg/o) { - my $mime = do_cat_mail($git, $1, \$bytes) or next; - $add_cb->($self, $git, $mime, $bytes); + my $blob = $1; + my $mime = do_cat_mail($git, $blob, \$bytes) or next; + $add_cb->($self, $mime, $bytes, $blob); } elsif ($line =~ /$delmsg/o) { - my $mime = do_cat_mail($git, $1) or next; - $del_cb->($self, $git, $mime); + my $blob = $1; + my $mime = do_cat_mail($git, $blob) or next; + $del_cb->($self, $mime); } elsif ($line =~ /^commit ($h40)/o) { + if (defined $max && --$max <= 0) { + $max = $self->{batch_size}; + $batch_cb->($latest, 1); + } $latest = $1; } } - $latest; + $batch_cb->($latest, 0); } -# indexes all unindexed messages -sub _index_sync { - my ($self, $head) = @_; - my $db = $self->{xdb}; - $head ||= 'HEAD'; - my $mm = $self->{mm} = eval { +sub _msgmap_init { + my ($self) = @_; + $self->{mm} = eval { require PublicInbox::Msgmap; PublicInbox::Msgmap->new($self->{git_dir}, 1); }; +} + +sub _git_log { + my ($self, $range) = @_; + $self->{git}->popen(qw/log --reverse --no-notes --no-color + --raw -r --no-abbrev/, $range); +} + +# indexes all unindexed messages +sub _index_sync { + my ($self, $opts) = @_; + my $tip = $opts->{ref} || 'HEAD'; + my $reindex = $opts->{reindex}; + my ($mkey, $last_commit, $lx, $xlog); + $self->{git}->batch_prepare; + my $xdb = _xdb_acquire($self); + $xdb->begin_transaction; + do { + $xlog = undef; + $mkey = 'last_commit'; + $last_commit = $xdb->get_metadata('last_commit'); + $lx = $last_commit; + if ($reindex) { + $lx = ''; + $mkey = undef if $last_commit ne ''; + } + $xdb->cancel_transaction; + $xdb = _xdb_release($self); + + # ensure we leak no FDs to "git log" + my $range = $lx eq '' ? $tip : "$lx..$tip"; + $xlog = _git_log($self, $range); + + $xdb = _xdb_acquire($self); + $xdb->begin_transaction; + } while ($xdb->get_metadata('last_commit') ne $last_commit); + + my $mm = _msgmap_init($self); + my $dbh = $mm->{dbh} if $mm; + my $mm_only; + my $cb = sub { + my ($commit, $more) = @_; + if ($dbh) { + $mm->last_commit($commit) if $commit; + $dbh->commit; + } + if (!$mm_only) { + $xdb->set_metadata($mkey, $commit) if $mkey && $commit; + $xdb->commit_transaction; + $xdb = _xdb_release($self); + } + # let another process do some work... < + if ($more) { + if (!$mm_only) { + $xdb = _xdb_acquire($self); + $xdb->begin_transaction; + } + $dbh->begin_work if $dbh; + } + }; - $db->begin_transaction; - my $lx = $db->get_metadata('last_commit'); - my $range = $lx eq '' ? $head : "$lx..$head"; if ($mm) { - $mm->{dbh}->begin_work; + $dbh->begin_work; my $lm = $mm->last_commit || ''; if ($lm eq $lx) { # Common case is the indexes are synced, # we only need to run git-log once: - $lx = $self->rlog($range, *index_both, *unindex_both); - $mm->{dbh}->commit; - if (defined $lx) { - $db->set_metadata('last_commit', $lx); - $mm->last_commit($lx); - } + rlog($self, $xlog, *index_both, *unindex_both, $cb); } else { - # dumb case, msgmap and xapian are out-of-sync - # do not care for performance: - my $r = $lm eq '' ? $head : "$lm..$head"; - $lm = $self->rlog($r, *index_mm, *unindex_mm); - $mm->{dbh}->commit; - $mm->last_commit($lm) if defined $lm; - - $lx = $self->rlog($range, *index_mm2, *unindex_mm2); - $db->set_metadata('last_commit', $lx) if defined $lx; + # Uncommon case, msgmap and xapian are out-of-sync + # do not care for performance (but git is fast :>) + # This happens if we have to reindex Xapian since + # msgmap is a frozen format and our Xapian format + # is evolving. + my $r = $lm eq '' ? $tip : "$lm..$tip"; + + # first, ensure msgmap is up-to-date: + my $mkey_prev = $mkey; + $mkey = undef; # ignore xapian, for now + my $mlog = _git_log($self, $r); + $mm_only = 1; + rlog($self, $mlog, *index_mm, *unindex_mm, $cb); + $mm_only = $mlog = undef; + + # now deal with Xapian + $mkey = $mkey_prev; + $dbh = undef; + rlog($self, $xlog, *index_mm2, *unindex_mm2, $cb); } } else { # user didn't install DBD::SQLite and DBI - $lx = $self->rlog($range, *index_blob, *unindex_blob); - $db->set_metadata('last_commit', $lx) if defined $lx; - } - if ($@) { - $db->cancel_transaction; - $mm->{dbh}->rollback if $mm; - } else { - $db->commit_transaction; + rlog($self, $xlog, *index_blob, *unindex_blob, $cb); } } @@ -410,17 +552,15 @@ sub _resolve_mid_to_tid { } sub create_ghost { - my ($self, $mid, $tid) = @_; - - $tid = $self->next_thread_id unless defined $tid; + my ($self, $mid) = @_; + my $tid = $self->next_thread_id; my $doc = Search::Xapian::Document->new; $doc->add_term(xpfx('mid') . $mid); $doc->add_term(xpfx('thread') . $tid); $doc->add_term(xpfx('type') . 'ghost'); my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid); - $self->link_message($smsg, 1); $self->{xdb}->add_document($doc); $smsg; @@ -428,6 +568,7 @@ sub create_ghost { sub merge_threads { my ($self, $winner_tid, $loser_tid) = @_; + return if $winner_tid == $loser_tid; my ($head, $tail) = $self->find_doc_ids('thread', $loser_tid); my $thread_pfx = xpfx('thread'); my $db = $self->{xdb}; @@ -445,6 +586,7 @@ sub _read_git_config_perm { my ($self) = @_; my @cmd = qw(config core.sharedRepository); my $fh = PublicInbox::Git->new($self->{git_dir})->popen(@cmd); + local $/ = "\n"; my $perm = <$fh>; chomp $perm if defined $perm; $perm; @@ -496,4 +638,10 @@ sub with_umask { $rv; } +sub DESTROY { + # order matters for unlocking + $_[0]->{xdb} = undef; + $_[0]->{lockfh} = undef; +} + 1; diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index a0899159..5779d1e2 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -7,14 +7,11 @@ package PublicInbox::SearchMsg; use strict; use warnings; use Search::Xapian; -use Email::Address qw//; use POSIX qw//; use Date::Parse qw/str2time/; use PublicInbox::MID qw/mid_clean/; -use Encode qw/find_encoding/; -my $enc_utf8 = find_encoding('UTF-8'); +use PublicInbox::Address; our $PFX2TERM_RE = undef; -use constant EPOCH_822 => 'Thu, 01 Jan 1970 00:00:00 +0000'; use POSIX qw(strftime); sub new { @@ -37,10 +34,10 @@ sub get_val ($$) { sub load_doc { my ($class, $doc) = @_; - my $data = $doc->get_data; + my $data = $doc->get_data or return; my $ts = get_val($doc, &PublicInbox::Search::TS); - $data = $enc_utf8->decode($data); - my ($subj, $from, $refs, $to, $cc) = split(/\n/, $data); + utf8::decode($data); + my ($subj, $from, $refs, $to, $cc, $blob) = split(/\n/, $data); bless { doc => $doc, subject => $subj, @@ -49,6 +46,7 @@ sub load_doc { references => $refs, to => $to, cc => $cc, + blob => $blob, }, $class; } @@ -80,16 +78,15 @@ sub date ($) { return $date if defined $date; my $ts = $self->{ts}; return unless defined $ts; - $self->{date} = strftime('%a, %d %b %Y %T %z', gmtime($ts)); + $self->{date} = strftime('%a, %d %b %Y %T +0000', gmtime($ts)); } sub from ($) { my ($self) = @_; my $from = __hdr($self, 'from'); if (defined $from && !defined $self->{from_name}) { - $from =~ tr/\t\r\n/ /; - my @from = Email::Address->parse($from); - $self->{from_name} = $from[0]->name; + my @n = PublicInbox::Address::names($from); + $self->{from_name} = join(', ', @n); } $from; } @@ -108,9 +105,11 @@ sub ts { } sub to_doc_data { - my ($self) = @_; - join("\n", $self->subject, $self->from, $self->references, - $self->to, $self->cc); + my ($self, $blob) = @_; + my @rows = ($self->subject, $self->from, $self->references, + $self->to, $self->cc); + push @rows, $blob if defined $blob; + join("\n", @rows); } sub references { @@ -144,38 +143,6 @@ sub ensure_metadata { } } -# for threading only -sub mini_mime { - my ($self) = @_; - $self->ensure_metadata; - my @hs = ( - 'Subject' => $self->subject, - 'X-PI-From' => $self->from_name, - ); - - my @h = ( - # prevent Email::Simple::Creator from running, - # this header is useless for threading as we use X-PI-TS - # for sorting and display: - 'Date' => EPOCH_822, - 'Message-ID' => "<$self->{mid}>", - 'X-PI-TS' => $self->ts, - ); - if (my $refs = $self->{references}) { - push @h, References => $refs; - } - my $mime = Email::MIME->create(header_str => \@hs, header => \@h); - my $h = $mime->header_obj; - - # set these headers manually since Encode::encode('MIME-Q', ...) - # will add spaces to long values when using header_str above. - - # drop useless headers Email::MIME set for us - $h->header_set('Date'); - $h->header_set('MIME-Version'); - $mime; -} - sub mid ($;$) { my ($self, $mid) = @_; @@ -191,6 +158,15 @@ sub mid ($;$) { sub _extract_mid { mid_clean(mid_mime($_[0]->mime)) } +sub blob { + my ($self, $x40) = @_; + if (defined $x40) { + $self->{blob} = $x40; + } else { + $self->{blob}; + } +} + sub mime { my ($self, $mime) = @_; if (defined $mime) { diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm new file mode 100644 index 00000000..601a84b0 --- /dev/null +++ b/lib/PublicInbox/SearchThread.pm @@ -0,0 +1,163 @@ +# This library is free software; you can redistribute it and/or modify +# it under the same terms as Perl itself. +# +# This license differs from the rest of public-inbox +# +# Our own jwz-style threading class based on Mail::Thread from CPAN. +# Mail::Thread is unmaintained and unavailable on some distros. +# We also do not want pruning or subject grouping, since we want +# to encourage strict threading and hopefully encourage people +# to use proper In-Reply-To. +# +# This includes fixes from several open bugs for Mail::Thread +# +# Avoid circular references +# - https://rt.cpan.org/Public/Bug/Display.html?id=22817 +# +# And avoid recursion in recurse_down: +# - https://rt.cpan.org/Ticket/Display.html?id=116727 +# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=833479 +package PublicInbox::SearchThread; +use strict; +use warnings; + +sub new { + return bless { + messages => $_[1], + id_table => {}, + rootset => [] + }, $_[0]; +} + +sub thread { + my $self = shift; + _add_message($self, $_) foreach @{$self->{messages}}; + my $id_table = delete $self->{id_table}; + $self->{rootset} = [ grep { + !delete($_->{parent}) && $_->visible } values %$id_table ]; +} + +sub _get_cont_for_id ($$) { + my ($self, $mid) = @_; + $self->{id_table}{$mid} ||= PublicInbox::SearchThread::Msg->new($mid); +} + +sub _add_message ($$) { + my ($self, $smsg) = @_; + + # A. if id_table... + my $this = _get_cont_for_id($self, $smsg->{mid}); + $this->{smsg} = $smsg; + + # B. For each element in the message's References field: + defined(my $refs = $smsg->{references}) or return; + + # This loop exists to help fill in gaps left from missing + # messages. It is not needed in a perfect world where + # everything is perfectly referenced, only the last ref + # matters. + my $prev; + foreach my $ref ($refs =~ m/<([^>]+)>/g) { + # Find a Container object for the given Message-ID + my $cont = _get_cont_for_id($self, $ref); + + # Link the References field's Containers together in + # the order implied by the References header + # + # * If they are already linked don't change the + # existing links + # * Do not add a link if adding that link would + # introduce a loop... + if ($prev && + !$cont->{parent} && # already linked + !$cont->has_descendent($prev) # would loop + ) { + $prev->add_child($cont); + } + $prev = $cont; + } + + # C. Set the parent of this message to be the last element in + # References. + $prev->add_child($this) if defined $prev; +} + +sub order { + my ($self, $ordersub) = @_; + my $rootset = $ordersub->($self->{rootset}); + $self->{rootset} = $rootset; + $_->order_children($ordersub) for @$rootset; +} + +package PublicInbox::SearchThread::Msg; +use strict; +use warnings; +use Carp qw(croak); + +sub new { + bless { + id => $_[1], + children => {}, # becomes an array when sorted by ->order(...) + }, $_[0]; +} + +sub topmost { + my ($self) = @_; + my @q = ($self); + while (my $cont = shift @q) { + return $cont if $cont->{smsg}; + push @q, values %{$cont->{children}}; + } + undef; +} + +sub add_child { + my ($self, $child) = @_; + croak "Cowardly refusing to become my own parent: $self" + if $self == $child; + + my $cid = $child->{id}; + + # reparenting: + if (defined(my $parent = $child->{parent})) { + delete $parent->{children}->{$cid}; + } + + $self->{children}->{$cid} = $child; + $child->{parent} = $self; +} + +sub has_descendent { + my ($self, $child) = @_; + my %seen; # loop prevention XXX may not be necessary + while ($child) { + return 1 if $self == $child || $seen{$child}++; + $child = $child->{parent}; + } + 0; +} + +# Do not show/keep ghosts iff they have no children. Sometimes +# a ghost Message-ID is the result of a long header line +# being folded/mangled by a MUA, and not a missing message. +sub visible ($) { + my ($self) = @_; + $self->{smsg} || scalar values %{$self->{children}}; +} + +sub order_children { + my ($cur, $ordersub) = @_; + + my %seen = ($cur => 1); # self-referential loop prevention + my @q = ($cur); + while (defined($cur = shift @q)) { + my $c = $cur->{children}; # The hashref here... + + $c = [ grep { !$seen{$_}++ && visible($_) } values %$c ]; + $c = $ordersub->($c) if scalar @$c > 1; + $cur->{children} = $c; # ...becomes an arrayref + push @q, @$c; + } +} + +1; diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index ab0ff19a..50a2c01c 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -8,14 +8,18 @@ use warnings; use PublicInbox::SearchMsg; use PublicInbox::Hval qw/ascii_html/; use PublicInbox::View; -use PublicInbox::MID qw(mid2path mid_clean mid_mime); +use PublicInbox::WwwAtomStream; +use PublicInbox::MID qw(mid2path mid_mime mid_clean mid_escape); use Email::MIME; require PublicInbox::Git; +require PublicInbox::SearchThread; our $LIM = 50; +sub noop {} + sub sres_top_html { my ($ctx) = @_; - my $q = PublicInbox::SearchQuery->new($ctx->{cgi}); + my $q = PublicInbox::SearchQuery->new($ctx->{qp}); my $code = 200; # double the limit for expanded views: @@ -26,72 +30,92 @@ sub sres_top_html { relevance => $q->{r}, }; my ($mset, $total); - eval { - $mset = $ctx->{srch}->query($q->{q}, $opts); + $mset = $ctx->{srch}->query($q->{'q'}, $opts); $total = $mset->get_matches_estimated; }; my $err = $@; - my $res = html_start($q, $ctx) . '<pre>'; + ctx_prepare($q, $ctx); + my $cb; if ($err) { $code = 400; - $res .= err_txt($ctx, $err) . "</pre><hr /><pre>" . foot($ctx); + $ctx->{-html_tip} = '<pre>'.err_txt($ctx, $err).'</pre><hr>'; + $cb = *noop; } elsif ($total == 0) { $code = 404; - $res .= "\n\n[No results found]</pre><hr /><pre>".foot($ctx); + $ctx->{-html_tip} = "<pre>\n[No results found]</pre><hr>"; + $cb = *noop; } else { my $x = $q->{x}; - return sub { adump($_[0], $mset, $q, $ctx) } if ($x eq 'A'); + return adump($_[0], $mset, $q, $ctx) if $x eq 'A'; - $res .= search_nav_top($mset, $q) . "\n\n"; + $ctx->{-html_tip} = search_nav_top($mset, $q) . "\n\n"; if ($x eq 't') { - return sub { tdump($_[0], $res, $mset, $q, $ctx) }; + $cb = mset_thread($ctx, $mset, $q); + } else { + $cb = mset_summary($ctx, $mset, $q); } - dump_mset(\$res, $mset); - $res .= '</pre>' . search_nav_bot($mset, $q) . - "\n\n" . foot($ctx); } + PublicInbox::WwwStream->response($ctx, $code, $cb); +} + +# allow undef for individual doc loads... +sub load_doc_retry { + my ($srch, $mitem) = @_; - $res .= "</pre></body></html>"; - [$code, ['Content-Type'=>'text/html; charset=UTF-8'], [$res]]; + eval { + $srch->retry_reopen(sub { + PublicInbox::SearchMsg->load_doc($mitem->get_document) + }); + } } # display non-threaded search results similar to what users expect from # regular WWW search engines: -sub dump_mset { - my ($res, $mset) = @_; +sub mset_summary { + my ($ctx, $mset, $q) = @_; my $total = $mset->get_matches_estimated; my $pad = length("$total"); my $pfx = ' ' x $pad; + my $res = \($ctx->{-html_tip}); + my $srch = $ctx->{srch}; foreach my $m ($mset->items) { my $rank = sprintf("%${pad}d", $m->get_rank + 1); my $pct = $m->get_percent; - my $smsg = PublicInbox::SearchMsg->load_doc($m->get_document); + my $smsg = load_doc_retry($srch, $m); + unless ($smsg) { + eval { + $m = "$m ".$m->get_docid . " expired\n"; + $ctx->{env}->{'psgi.errors'}->print($m); + }; + next; + } my $s = ascii_html($smsg->subject); my $f = ascii_html($smsg->from_name); my $ts = PublicInbox::View::fmt_ts($smsg->ts); - my $mid = PublicInbox::Hval->new_msgid($smsg->mid)->as_href; + my $mid = PublicInbox::Hval->new_msgid($smsg->mid)->{href}; $$res .= qq{$rank. <b><a\nhref="$mid/">}. $s . "</a></b>\n"; $$res .= "$pfx - by $f @ $ts UTC [$pct%]\n\n"; } + $$res .= search_nav_bot($mset, $q); + *noop; } sub err_txt { my ($ctx, $err) = @_; - my $u = '//xapian.org/docs/queryparser.html'; - $u = PublicInbox::Hval::prurl($ctx->{cgi}->{env}, $u); + my $u = $ctx->{-inbox}->base_url($ctx->{env}) . '_/text/help/'; $err =~ s/^\s*Exception:\s*//; # bad word to show users :P $err = ascii_html($err); - "\n\nBad query: <b>$err</b>\n" . - qq{See <a\nhref="$u">$u</a> for Xapian query syntax}; + "\nBad query: <b>$err</b>\n" . + qq{See <a\nhref="$u">$u</a> for help on using search}; } sub search_nav_top { my ($mset, $q) = @_; - my $rv = "Search results ordered by ["; + my $rv = "<pre>Search results ordered by ["; if ($q->{r}) { my $d = $q->qs_html(r => 0); $rv .= qq{<a\nhref="?$d">date</a>|<b>relevance</b>}; @@ -121,156 +145,121 @@ sub search_nav_bot { my $o = $q->{o}; my $end = $o + $nr; my $beg = $o + 1; - my $rv = "<hr /><pre>Results $beg-$end of $total"; + my $rv = "</pre><hr><pre>Results $beg-$end of $total"; my $n = $o + $LIM; if ($n < $total) { my $qs = $q->qs_html(o => $n); - $rv .= qq{, <a\nhref="?$qs">next</a>} + $rv .= qq{, <a\nhref="?$qs"\nrel=next>next</a>} } if ($o > 0) { $rv .= $n < $total ? '/' : ', '; my $p = $o - $LIM; my $qs = $q->qs_html(o => ($p > 0 ? $p : 0)); - $rv .= qq{<a\nhref="?$qs">prev</a>}; + $rv .= qq{<a\nhref="?$qs"\nrel=prev>prev</a>}; } - $rv; + $rv .= '</pre>'; } -sub tdump { - my ($cb, $res, $mset, $q, $ctx) = @_; - my $fh = $cb->([200, ['Content-Type'=>'text/html; charset=UTF-8']]); - $fh->write($res .= '</pre>'); +sub mset_thread { + my ($ctx, $mset, $q) = @_; my %pct; - my @m = map { + my $msgs = $ctx->{srch}->retry_reopen(sub { [ map { my $i = $_; - my $m = PublicInbox::SearchMsg->load_doc($i->get_document); - $pct{$m->mid} = $i->get_percent; - $m = $m->mini_mime; - $m; - } ($mset->items); + my $smsg = PublicInbox::SearchMsg->load_doc($i->get_document); + $pct{$smsg->mid} = $i->get_percent; + $smsg; + } ($mset->items) ]}); - require PublicInbox::Thread; - my $th = PublicInbox::Thread->new(@m); - { - no warnings 'once'; - $Mail::Thread::nosubject = 0; - } + my $th = PublicInbox::SearchThread->new($msgs); $th->thread; - if ($q->{r}) { + if ($q->{r}) { # order by relevance $th->order(sub { - sort { (eval { $pct{$b->topmost->messageid} } || 0) + [ sort { (eval { $pct{$b->topmost->{id}} } || 0) <=> - (eval { $pct{$a->topmost->messageid} } || 0) - } @_; + (eval { $pct{$a->topmost->{id}} } || 0) + } @{$_[0]} ]; }); - } else { - no warnings 'once'; - $th->order(*PublicInbox::View::rsort_ts); + } else { # order by time (default for threaded view) + $th->order(*PublicInbox::View::sort_ts); } - - my $git = $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); - my $state = { - ctx => $ctx, - anchor_idx => 0, - pct => \%pct, - cur_level => 0, - fh => $fh, + my $skel = search_nav_bot($mset, $q). "<pre>"; + my $inbox = $ctx->{-inbox}; + $ctx->{-upfx} = ''; + $ctx->{anchor_idx} = 1; + $ctx->{cur_level} = 0; + $ctx->{dst} = \$skel; + $ctx->{mapping} = {}; + $ctx->{pct} = \%pct; + $ctx->{prev_attr} = ''; + $ctx->{prev_level} = 0; + $ctx->{seen} = {}; + $ctx->{s_nr} = scalar(@$msgs).'+ results'; + + PublicInbox::View::walk_thread($th, $ctx, + *PublicInbox::View::pre_thread); + + my $mime; + sub { + return unless $msgs; + while ($mime = shift @$msgs) { + $mime = $inbox->msg_by_smsg($mime) and last; + } + if ($mime) { + $mime = Email::MIME->new($mime); + return PublicInbox::View::index_entry($mime, $ctx, + scalar @$msgs); + } + $msgs = undef; + $skel .= "\n</pre>"; }; - $ctx->{searchview} = 1; - tdump_ent($git, $state, $_, 0) for $th->rootset; - PublicInbox::View::thread_adj_level($state, 0); - Email::Address->purge_cache; - - $fh->write(search_nav_bot($mset, $q). "\n\n" . - foot($ctx). '</pre></body></html>'); - - $fh->close; } -sub tdump_ent { - my ($git, $state, $node, $level) = @_; - return unless $node; - my $mime = $node->message; - - if ($mime) { - # lazy load the full message from mini_mime: - my $mid = mid_mime($mime); - $mime = eval { - my $path = mid2path(mid_clean($mid)); - Email::MIME->new($git->cat_file('HEAD:'.$path)); - }; - } - if ($mime) { - my $end = PublicInbox::View::thread_adj_level($state, $level); - PublicInbox::View::index_entry($mime, $level, $state); - $state->{fh}->write($end) if $end; - } else { - my $mid = $node->messageid; - PublicInbox::View::ghost_flush($state, '', $mid, $level); - } - tdump_ent($git, $state, $node->child, $level + 1); - tdump_ent($git, $state, $node->next, $level); -} - -sub foot { - my ($ctx) = @_; - my $foot = $ctx->{footer} || ''; - qq{Back to <a\nhref=".">index</a>.\n$foot}; -} - -sub html_start { +sub ctx_prepare { my ($q, $ctx) = @_; my $qh = ascii_html($q->{'q'}); - my $A = $q->qs_html(x => 'A', r => undef); - my $res = '<html><head>' . PublicInbox::Hval::STYLE . - "<title>$qh - search results</title>" . - qq{<link\nrel=alternate\ntitle="Atom feed"\n} . - qq!href="?$A"\ntype="application/atom+xml"/></head>! . - qq{<body><form\naction="">} . - qq{<input\nname=q\nvalue="$qh"\ntype=text />}; - - $res .= qq{<input\ntype=hidden\nname=r />} if $q->{r}; + $ctx->{-q_value_html} = $qh; + $ctx->{-atom} = '?'.$q->qs_html(x => 'A', r => undef); + $ctx->{-title_html} = "$qh - search results"; + my $extra = ''; + $extra .= qq{<input\ntype=hidden\nname=r />} if $q->{r}; if (my $x = $q->{x}) { $x = ascii_html($x); - $res .= qq{<input\ntype=hidden\nname=x\nvalue="$x" />}; + $extra .= qq{<input\ntype=hidden\nname=x\nvalue="$x" />}; } - - $res .= qq{<input\ntype=submit\nvalue=search /></form>}; + $ctx->{-extra_form_html} = $extra; } sub adump { my ($cb, $mset, $q, $ctx) = @_; - my $fh = $cb->([ 200, ['Content-Type' => 'application/atom+xml']]); - my $git = $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); - my $feed_opts = PublicInbox::Feed::get_feedopts($ctx); - my $x = ascii_html($q->{'q'}); - $x = qq{$x - search results}; - $feed_opts->{atomurl} = $feed_opts->{url} . '?'. $q->qs_html; - $feed_opts->{url} .= '?'. $q->qs_html(x => undef); - $x = PublicInbox::Feed::atom_header($feed_opts, $x); - $fh->write($x. PublicInbox::Feed::feed_updated()); - - for ($mset->items) { - $x = PublicInbox::SearchMsg->load_doc($_->get_document)->mid; - $x = mid2path($x); - PublicInbox::Feed::add_to_feed($feed_opts, $fh, $x, $git); - } - PublicInbox::Feed::end_feed($fh); + my $ibx = $ctx->{-inbox}; + my @items = $mset->items; + $ctx->{search_query} = $q; + my $srch = $ctx->{srch}; + PublicInbox::WwwAtomStream->response($ctx, 200, sub { + while (my $x = shift @items) { + $x = load_doc_retry($srch, $x); + $x = $ibx->msg_by_smsg($x) and + return Email::MIME->new($x); + } + return undef; + }); } package PublicInbox::SearchQuery; use strict; use warnings; use PublicInbox::Hval; +use PublicInbox::MID qw(mid_escape); sub new { - my ($class, $cgi) = @_; - my $r = $cgi->param('r'); + my ($class, $qp) = @_; + + my $r = $qp->{r}; bless { - q => $cgi->param('q'), - x => $cgi->param('x') || '', - o => int($cgi->param('o') || 0) || 0, + q => $qp->{'q'}, + x => $qp->{x} || '', + o => (($qp->{o} || '0') =~ /(\d+)/), r => (defined $r && $r ne '0'), }, $class; } @@ -286,7 +275,7 @@ sub qs_html { $self = $tmp; } - my $q = PublicInbox::Hval->new($self->{'q'})->as_href; + my $q = mid_escape($self->{'q'}); $q =~ s/%20/+/g; # improve URL readability my $qs = "q=$q"; diff --git a/lib/PublicInbox/Spamcheck/Spamc.pm b/lib/PublicInbox/Spamcheck/Spamc.pm new file mode 100644 index 00000000..30eec95c --- /dev/null +++ b/lib/PublicInbox/Spamcheck/Spamc.pm @@ -0,0 +1,93 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +package PublicInbox::Spamcheck::Spamc; +use strict; +use warnings; +use PublicInbox::Spawn qw(popen_rd spawn); +use IO::Handle; +use Fcntl qw(:DEFAULT SEEK_SET); + +sub new { + my ($class) = @_; + bless { + checkcmd => [qw(spamc -E --headers)], + hamcmd => [qw(spamc -L ham)], + spamcmd => [qw(spamc -L spam)], + }, $class; +} + +sub spamcheck { + my ($self, $msg, $out) = @_; + + my $tmp; + my $fd = _msg_to_fd($self, $msg, \$tmp); + my $rdr = { 0 => $fd }; + my ($fh, $pid) = popen_rd($self->{checkcmd}, undef, $rdr); + defined $pid or die "failed to popen_rd spamc: $!\n"; + my $r; + unless (ref $out) { + my $buf = ''; + $out = \$buf; + } + do { + $r = sysread($fh, $$out, 65536, length($$out)); + } while (defined($r) && $r != 0); + defined $r or die "read failed: $!"; + close $fh or die "close failed: $!"; + waitpid($pid, 0); + ($? || $$out eq '') ? 0 : 1; +} + +sub hamlearn { + my ($self, $msg, $rdr) = @_; + _learn($self, $msg, $rdr, 'hamcmd'); +} + +sub spamlearn { + my ($self, $msg, $rdr) = @_; + _learn($self, $msg, $rdr, 'spamcmd'); +} + +sub _learn { + my ($self, $msg, $rdr, $field) = @_; + $rdr ||= {}; + $rdr->{1} ||= $self->_devnull; + $rdr->{2} ||= $self->_devnull; + my $tmp; + $rdr->{0} = _msg_to_fd($self, $msg, \$tmp); + my $pid = spawn($self->{$field}, undef, $rdr); + waitpid($pid, 0); + !$?; +} + +sub _devnull { + my ($self) = @_; + my $fd = $self->{-devnullfd}; + return $fd if defined $fd; + open my $fh, '+>', '/dev/null' or + die "failed to open /dev/null: $!"; + $self->{-devnull} = $fh; + $self->{-devnullfd} = fileno($fh); +} + +sub _msg_to_fd { + my ($self, $msg, $tmpref) = @_; + my $fd; + if (my $ref = ref($msg)) { + my $fileno = eval { fileno($msg) }; + return $fileno if defined $fileno; + + open(my $tmpfh, '+>', undef) or die "failed to open: $!"; + $tmpfh->autoflush(1); + $msg = \($msg->as_string) if $ref ne 'SCALAR'; + print $tmpfh $$msg or die "failed to print: $!"; + sysseek($tmpfh, 0, SEEK_SET) or + die "sysseek(fh) failed: $!"; + $$tmpref = $tmpfh; + + return fileno($tmpfh); + } + $msg; +} + +1; diff --git a/lib/PublicInbox/Spawn.pm b/lib/PublicInbox/Spawn.pm index 23f303fb..41b08a33 100644 --- a/lib/PublicInbox/Spawn.pm +++ b/lib/PublicInbox/Spawn.pm @@ -24,6 +24,8 @@ my $vfork_spawn = <<'VFORK_SPAWN'; #include <sys/uio.h> #include <unistd.h> #include <alloca.h> +#include <signal.h> +#include <assert.h> #define AV_ALLOCA(av, max) alloca((max = (av_len((av)) + 1)) * sizeof(char *)) @@ -81,6 +83,8 @@ int public_inbox_fork_exec(int in, int out, int err, pid_t pid; char **argv, **envp; I32 max; + sigset_t set, old; + int ret, errnum; argv = AV_ALLOCA(cmd, max); av2c_copy(argv, cmd, max); @@ -88,14 +92,30 @@ int public_inbox_fork_exec(int in, int out, int err, envp = AV_ALLOCA(env, max); av2c_copy(envp, env, max); + ret = sigfillset(&set); + assert(ret == 0 && "BUG calling sigfillset"); + ret = sigprocmask(SIG_SETMASK, &set, &old); + assert(ret == 0 && "BUG calling sigprocmask to block"); pid = vfork(); if (pid == 0) { + int sig; + REDIR(in, 0); REDIR(out, 1); REDIR(err, 2); + for (sig = 1; sig < NSIG; sig++) + signal(sig, SIG_DFL); /* ignore errors on signals */ + /* + * don't bother unblocking, we don't want signals + * to the group taking out a subprocess + */ execve(filename, argv, envp); xerr("execve failed"); } + errnum = errno; + ret = sigprocmask(SIG_SETMASK, &old, NULL); + assert(ret == 0 && "BUG calling sigprocmask to restore"); + errno = errnum; return (int)pid; } @@ -111,7 +131,7 @@ if (defined $vfork_spawn) { my $f = "$inline_dir/.public-inbox.lock"; open my $fh, '>', $f or die "failed to open $f: $!\n"; flock($fh, LOCK_EX) or die "LOCK_EX failed on $f: $!\n"; - eval 'use Inline C => $vfork_spawn'; + eval 'use Inline C => $vfork_spawn'; #, BUILD_NOISY => 1'; my $err = $@; flock($fh, LOCK_UN) or die "LOCK_UN failed on $f: $!\n"; die $err if $err; @@ -128,6 +148,7 @@ unless (defined $vfork_spawn) { *public_inbox_fork_exec = *PublicInbox::SpawnPP::public_inbox_fork_exec } +# n.b. we never use absolute paths with this sub which ($) { my ($file) = @_; foreach my $p (split(':', $ENV{PATH})) { @@ -161,7 +182,8 @@ sub spawn ($;$$) { my $in = $opts->{0} || 0; my $out = $opts->{1} || 1; my $err = $opts->{2} || 2; - public_inbox_fork_exec($in, $out, $err, $f, $cmd, \@env); + my $pid = public_inbox_fork_exec($in, $out, $err, $f, $cmd, \@env); + $pid < 0 ? undef : $pid; } sub popen_rd { @@ -172,6 +194,7 @@ sub popen_rd { IO::Handle::blocking($r, $blocking) if defined $blocking; $opts->{1} = fileno($w); my $pid = spawn($cmd, $env, $opts); + return unless defined $pid; return ($r, $pid) if wantarray; my $ret = gensym; tie *$ret, 'PublicInbox::ProcessPipe', $pid, $r; diff --git a/lib/PublicInbox/SpawnPP.pm b/lib/PublicInbox/SpawnPP.pm index dc2ef364..179aba5e 100644 --- a/lib/PublicInbox/SpawnPP.pm +++ b/lib/PublicInbox/SpawnPP.pm @@ -3,12 +3,21 @@ package PublicInbox::SpawnPP; use strict; use warnings; -use POSIX qw(dup2); +use POSIX qw(dup2 :signal_h); # Pure Perl implementation for folks that do not use Inline::C sub public_inbox_fork_exec ($$$$$$) { my ($in, $out, $err, $f, $cmd, $env) = @_; + my $old = POSIX::SigSet->new(); + my $set = POSIX::SigSet->new(); + $set->fillset or die "fillset failed: $!"; + sigprocmask(SIG_SETMASK, $set, $old) or die "can't block signals: $!"; + my $syserr; my $pid = fork; + unless (defined $pid) { # compat with Inline::C version + $syserr = $!; + $pid = -1; + } if ($pid == 0) { if ($in != 0) { dup2($in, 0) or die "dup2 failed for stdin: $!"; @@ -19,9 +28,18 @@ sub public_inbox_fork_exec ($$$$$$) { if ($err != 2) { dup2($err, 2) or die "dup2 failed for stderr: $!"; } - exec qw(env -i), @$env, @$cmd; - die "exec env -i ... $cmd->[0] failed: $!\n"; + + if ($ENV{MOD_PERL}) { + exec qw(env -i), @$env, @$cmd; + die "exec env -i ... $cmd->[0] failed: $!\n"; + } else { + local %ENV = map { split(/=/, $_, 2) } @$env; + exec @$cmd; + die "exec $cmd->[0] failed: $!\n"; + } } + sigprocmask(SIG_SETMASK, $old) or die "can't unblock signals: $!"; + $! = $syserr; $pid; } diff --git a/lib/PublicInbox/Thread.pm b/lib/PublicInbox/Thread.pm deleted file mode 100644 index 781fffff..00000000 --- a/lib/PublicInbox/Thread.pm +++ /dev/null @@ -1,42 +0,0 @@ -# subclass Mail::Thread and use this to workaround a memory leak -# Based on the patch in: https://rt.cpan.org/Public/Bug/Display.html?id=22817 -# -# Additionally, workaround for a bug where $walk->topmost returns undef: -# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=795913 -# - https://rt.cpan.org/Ticket/Display.html?id=106498 -# -# License differs from the rest of public-inbox (but is compatible): -# This library is free software; you can redistribute it and/or modify -# it under the same terms as Perl itself. -package PublicInbox::Thread; -use strict; -use warnings; -use base qw(Mail::Thread); - -if ($Mail::Thread::VERSION <= 2.55) { - eval q(sub _container_class { 'PublicInbox::Thread::Container' }); -} - -package PublicInbox::Thread::Container; -use strict; -use warnings; -use base qw(Mail::Thread::Container); -use Scalar::Util qw(weaken); -sub parent { @_ == 2 ? weaken($_[0]->{parent} = $_[1]) : $_[0]->{parent} } - -sub topmost { - $_[0]->SUPER::topmost || PublicInbox::Thread::CPANRTBug106498->new; -} - -# ref: -# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=795913 -# - https://rt.cpan.org/Ticket/Display.html?id=106498 -package PublicInbox::Thread::CPANRTBug106498; -use strict; -use warnings; - -sub new { bless {}, $_[0] } - -sub simple_subject {} - -1; diff --git a/lib/PublicInbox/Unsubscribe.pm b/lib/PublicInbox/Unsubscribe.pm new file mode 100644 index 00000000..fca300e5 --- /dev/null +++ b/lib/PublicInbox/Unsubscribe.pm @@ -0,0 +1,180 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Standalone PSGI app to handle HTTP(s) unsubscribe links generated +# by milters like examples/unsubscribe.milter to mailing lists. +# +# This does not depend on any other modules in the PublicInbox::* +# and ought to be usable with any mailing list software. +package PublicInbox::Unsubscribe; +use strict; +use warnings; +use Crypt::CBC; +use Plack::Util; +use MIME::Base64 qw(decode_base64url); +my $CODE_URL = 'https://public-inbox.org/public-inbox.git'; +my @CT_HTML = ('Content-Type', 'text/html; charset=UTF-8'); + +sub new { + my ($class, %opt) = @_; + my $key_file = $opt{key_file}; + defined $key_file or die "`key_file' needed"; + open my $fh, '<', $key_file or die + "failed to open key_file=$key_file: $!\n"; + my ($key, $iv); + if (read($fh, $key, 8) != 8 || read($fh, $iv, 8) != 8 || + read($fh, my $end, 8) != 0) { + die "key_file must be 16 bytes\n"; + } + + # these parameters were chosen to generate shorter parameters + # to reduce the possibility of copy+paste errors + my $cipher = Crypt::CBC->new(-key => $key, + -iv => $iv, + -header => 'none', + -cipher => 'Blowfish'); + + my $e = $opt{owner_email} or die "`owner_email' not specified\n"; + my $unsubscribe = $opt{unsubscribe} or + die "`unsubscribe' callback not given\n"; + + bless { + pi_config => $opt{pi_config}, # PublicInbox::Config + owner_email => $opt{owner_email}, + cipher => $cipher, + unsubscribe => $unsubscribe, + contact => qq(<a\nhref="mailto:$e">$e</a>), + code_url => $opt{code_url} || $CODE_URL, + confirm => $opt{confirm}, + }, $class; +} + +# entry point for PSGI +sub call { + my ($self, $env) = @_; + my $m = $env->{REQUEST_METHOD}; + if ($m eq 'GET' || $m eq 'HEAD') { + $self->{confirm} ? confirm_prompt($self, $env) + : finalize_unsub($self, $env); + } elsif ($m eq 'POST') { + finalize_unsub($self, $env); + } else { + r($self, 405, + Plack::Util::encode_html($m).' method not allowed'); + } +} + +sub _user_list_addr { + my ($self, $env) = @_; + my ($blank, $u, $list) = split('/', $env->{PATH_INFO}); + + if (!defined $u || $u eq '') { + return r($self, 400, 'Bad request', + 'Missing encrypted email address in path component'); + } + if (!defined $list && $list eq '') { + return r($self, 400, 'Bad request', + 'Missing mailing list name in path component'); + } + my $user = eval { $self->{cipher}->decrypt(decode_base64url($u)) }; + if (!defined $user || index($user, '@') < 1) { + my $err = quotemeta($@); + my $errors = $env->{'psgi.errors'}; + $errors->print("error decrypting: $u\n"); + $errors->print("$_\n") for split("\n", $err); + $u = Plack::Util::encode_html($u); + return r($self, 400, 'Bad request', "Failed to decrypt: $u"); + } + + # The URLs are too damn long if we have the encrypted domain + # name in the PATH_INFO + if (index($list, '@') < 0) { + my $host = (split(':', $env->{HTTP_HOST}))[0]; + $list .= '@'.$host; + } + ($user, $list); +} + +sub confirm_prompt { # on GET + my ($self, $env) = @_; + my ($user_addr, $list_addr) = _user_list_addr($self, $env); + return $user_addr if ref $user_addr; + + my $xl = Plack::Util::encode_html($list_addr); + my $xu = Plack::Util::encode_html($user_addr); + my @body = ( + "Confirmation required to remove", '', + "\t$xu", '', + "from the mailing list at", '', + "\t$xl", '', + 'You will get one last email once you hit "Confirm" below:', + qq(</pre><form\nmethod=post\naction="">) . + qq(<input\ntype=submit\nvalue="Confirm" />) . + '</form><pre>'); + + push @body, archive_info($self, $env, $list_addr); + + r($self, 200, "Confirm unsubscribe for $xl", @body); +} + +sub finalize_unsub { # on POST + my ($self, $env) = @_; + my ($user_addr, $list_addr) = _user_list_addr($self, $env); + return $user_addr if ref $user_addr; + + my @archive = archive_info($self, $env, $list_addr); + if (my $err = $self->{unsubscribe}->($user_addr, $list_addr)) { + return r($self, 500, Plack::Util::encode_html($err), @archive); + } + + my $xl = Plack::Util::encode_html($list_addr); + r($self, 200, "Unsubscribed from $xl", + 'You may get one final goodbye message', @archive); +} + +sub r { + my ($self, $code, $title, @body) = @_; + [ $code, [ @CT_HTML ], [ + "<html><head><title>$title</title></head><body><pre>". + join("\n", "<b>$title</b>\n", @body) . '</pre><hr>'. + "<pre>This page is available under AGPL-3.0+\n" . + "git clone $self->{code_url}\n" . + qq(Email $self->{contact} if you have any questions). + '</pre></body></html>' + ] ]; +} + +sub archive_info { + my ($self, $env, $list_addr) = @_; + my $archive_url = $self->{archive_urls}->{$list_addr}; + + unless ($archive_url) { + if (my $config = $self->{pi_config}) { + # PublicInbox::Config::lookup + my $inbox = $config->lookup($list_addr); + # PublicInbox::Inbox::base_url + $archive_url = $inbox->base_url if $inbox; + } + } + + # protocol-relative URL: "//example.com/" => "https://example.com/" + if ($archive_url && $archive_url =~ m!\A//!) { + $archive_url = "$env->{'psgi.url_scheme'}:$archive_url"; + } + + # maybe there are other places where we could map + # list_addr => archive_url without ~/.public-inbox/config + if ($archive_url) { + $archive_url = Plack::Util::encode_html($archive_url); + ('', + 'HTML and git clone-able archives are available at:', + qq(<a\nhref="$archive_url">$archive_url</a>)) + } else { + ('', + 'There ought to be archives for this list,', + 'but unfortunately the admin did not configure '. + __PACKAGE__. ' to show you the URL'); + } +} + +1; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index a4047aa2..fa47a16a 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -8,86 +8,93 @@ use strict; use warnings; use URI::Escape qw/uri_escape_utf8/; use Date::Parse qw/str2time/; -use Encode qw/find_encoding/; -use Encode::MIME::Header; -use Email::MIME::ContentType qw/parse_content_type/; use PublicInbox::Hval qw/ascii_html/; use PublicInbox::Linkify; -use PublicInbox::MID qw/mid_clean id_compress mid2path mid_mime/; +use PublicInbox::MID qw/mid_clean id_compress mid_mime mid_escape/; +use PublicInbox::MsgIter; +use PublicInbox::Address; +use PublicInbox::WwwStream; require POSIX; -# TODO: make these constants tunable -use constant MAX_INLINE_QUOTED => 12; # half an 80x24 terminal -use constant MAX_TRUNC_LEN => 72; -use constant T_ANCHOR => '#u'; use constant INDENT => ' '; +use constant TCHILD => '` '; +sub th_pfx ($) { $_[0] == 0 ? '' : TCHILD }; -my $enc_utf8 = find_encoding('UTF-8'); - -# public functions: +# public functions: (unstable) sub msg_html { - my ($ctx, $mime, $full_pfx, $footer) = @_; - $footer = defined($footer) ? "\n$footer" : ''; + my ($ctx, $mime) = @_; my $hdr = $mime->header_obj; - headers_to_html_header($hdr, $full_pfx, $ctx) . - multipart_text_as_html($mime, $full_pfx) . - '</pre><hr /><pre>' . - html_footer($hdr, 1, $full_pfx, $ctx) . - $footer . - '</pre></body></html>'; + my $tip = _msg_html_prepare($hdr, $ctx); + PublicInbox::WwwStream->response($ctx, 200, sub { + my ($nr, undef) = @_; + if ($nr == 1) { + $tip . multipart_text_as_html($mime, '') . '</pre><hr>' + } elsif ($nr == 2) { + # fake an EOF if generating the footer fails; + # we want to at least show the message if something + # here crashes: + eval { + '<pre>' . html_footer($hdr, 1, $ctx) . + '</pre>' . msg_reply($ctx, $hdr) + }; + } else { + undef + } + }); } -# /$LISTNAME/$MESSAGE_ID/R/ +# /$INBOX/$MESSAGE_ID/#R sub msg_reply { - my ($ctx, $hdr, $footer) = @_; - my $s = $hdr->header('Subject'); - $s = '(no subject)' if (!defined $s) || ($s eq ''); - my $f = $hdr->header('From'); - $f = '' unless defined $f; - my $mid = $hdr->header_raw('Message-ID'); - $mid = PublicInbox::Hval->new_msgid($mid); - my $t = ascii_html($s); + my ($ctx, $hdr) = @_; my $se_url = 'https://kernel.org/pub/software/scm/git/docs/git-send-email.html'; + my $p_url = + 'https://en.wikipedia.org/wiki/Posting_style#Interleaved_style'; + + my $info = ''; + if (my $url = $ctx->{-inbox}->{infourl}) { + $url = PublicInbox::Hval::prurl($ctx->{env}, $url); + $info = qq(\n List information: <a\nhref="$url">$url</a>\n); + } my ($arg, $link) = mailto_arg_link($hdr); push @$arg, '/path/to/YOUR_REPLY'; + $arg = ascii_html(join(" \\\n ", '', @$arg)); + <<EOF +<hr><pre +id=R><b>Reply instructions:</b> + +You may reply publically to <a +href=#t>this message</a> via plain-text email +using any one of the following methods: + +* Save the following mbox file, import it into your mail client, + and reply-to-all from there: <a +href=raw>mbox</a> + + Avoid top-posting and favor interleaved quoting: + <a +href="$p_url">$p_url</a> +$info +* Reply to all the recipients using the <b>--to</b>, <b>--cc</b>, + and <b>--in-reply-to</b> switches of git-send-email(1): + + git send-email$arg - "<html><head><title>replying to \"$t\"</title></head><body><pre>" . - "replying to message:\n\n" . - "Subject: <b>$t</b>\n" . - "From: ". ascii_html($f) . - "\nDate: " . ascii_html($hdr->header('Date')) . - "\nMessage-ID: <" . $mid->as_html . ">\n\n" . - "There are multiple ways to reply:\n\n" . - "* Save the following mbox file, import it into your mail client,\n" . - " and reply-to-all from there: <a\nhref=../raw>mbox</a>\n\n" . - "* Reply to all the recipients using the <b>--to</b>, <b>--cc</b>,\n" . - " and <b>--in-reply-to</b> switches of git-send-email(1):\n\n" . - "\tgit send-email \\\n\t\t" . - join(" \\ \n\t\t", @$arg ). "\n\n" . - qq( <a\nhref="$se_url">$se_url</a>\n\n) . - "* If your mail client supports setting the <b>In-Reply-To</b>" . - " header\n via mailto: links, try the " . - qq(<a\nhref="$link">mailto: link</a>\n) . - "\nFor context, the original <a\nhref=../>message</a> or " . - qq(<a\nhref="../t/#u">thread</a>) . - '</pre><hr /><pre>' . $footer . '</pre></body></html>'; -} - -sub feed_entry { - my ($class, $mime, $full_pfx) = @_; - - # no <head> here for <style>... - qq(<pre\nstyle="white-space:pre-wrap">) . - multipart_text_as_html($mime, $full_pfx) . '</pre>'; + <a +href="$se_url">$se_url</a> + +* If your mail client supports setting the <b>In-Reply-To</b> header + via mailto: links, try the <a +href="$link">mailto: link</a></pre> +EOF } sub in_reply_to { my ($hdr) = @_; my $irt = $hdr->header_raw('In-Reply-To'); - return mid_clean($irt) if (defined $irt); + return mid_clean($irt) if defined $irt && $irt ne ''; my $refs = $hdr->header_raw('References'); if ($refs && $refs =~ /<([^>]+)>\s*\z/s) { @@ -96,276 +103,385 @@ sub in_reply_to { undef; } +sub _hdr_names ($$) { + my ($hdr, $field) = @_; + my $val = $hdr->header($field) or return ''; + ascii_html(join(', ', PublicInbox::Address::names($val))); +} + +sub nr_to_s ($$$) { + my ($nr, $singular, $plural) = @_; + return "0 $plural" if $nr == 0; + $nr == 1 ? "$nr $singular" : "$nr $plural"; +} + # this is already inside a <pre> sub index_entry { - my ($mime, $level, $state) = @_; - my $midx = $state->{anchor_idx}++; - my $ctx = $state->{ctx}; + my ($mime, $ctx, $more) = @_; my $srch = $ctx->{srch}; - my $part_nr = 0; my $hdr = $mime->header_obj; - my $enc = enc_for($hdr->header("Content-Type")); my $subj = $hdr->header('Subject'); my $mid_raw = mid_clean(mid_mime($mime)); - my $id = anchor_for($mid_raw); - my $seen = $state->{seen}; - $seen->{$id} = "#$id"; # save the anchor for children, later - - my $mid = PublicInbox::Hval->new_msgid($mid_raw); - my $from = $hdr->header('From'); - my @from = Email::Address->parse($from); - $from = $from[0]->name; - - my $root_anchor = $state->{root_anchor} || ''; - my $path = $root_anchor ? '../../' : ''; - my $href = $mid->as_href; - my $irt = in_reply_to($hdr); - my $parent_anchor = $seen->{anchor_for($irt)} if defined $irt; - - $from = ascii_html($from); - $subj = ascii_html($subj); - $subj = "<a\nhref=\"${path}$href/\">$subj</a>"; - $subj = "<u\nid=u>$subj</u>" if $root_anchor eq $id; - - my $ts = _msg_date($hdr); - my $rv = "<pre\nid=s$midx>"; - $rv .= "<b\nid=$id>$subj</b>\n"; - my $txt = "${path}$href/raw"; - my $fh = $state->{fh}; - $fh->write($rv .= "- $from @ $ts UTC (<a\nhref=\"$txt\">raw</a>)\n\n"); - - my $fhref; - my $mhref = "${path}$href/"; - - # show full message if it's our root message - my $neq = $root_anchor ne $id; - if ($neq || ($neq && $level != 0 && !$ctx->{flat})) { - $fhref = "${path}$href/f/"; + my $id = id_compress($mid_raw, 1); + my $id_m = 'm'.$id; + + my $root_anchor = $ctx->{root_anchor} || ''; + my $irt; + + my $rv = "<a\nhref=#e$id\nid=m$id>*</a> "; + $subj = '<b>'.ascii_html($subj).'</b>'; + $subj = "<u\nid=u>$subj</u>" if $root_anchor eq $id_m; + $rv .= $subj . "\n"; + $rv .= _th_index_lite($mid_raw, \$irt, $id, $ctx); + my @tocc; + foreach my $f (qw(To Cc)) { + my $dst = _hdr_names($hdr, $f); + push @tocc, "$f: $dst" if $dst ne ''; + } + $rv .= "From: "._hdr_names($hdr, 'From').' @ '._msg_date($hdr)." UTC"; + my $upfx = $ctx->{-upfx}; + my $mhref = $upfx . mid_escape($mid_raw) . '/'; + $rv .= qq{ (<a\nhref="$mhref">permalink</a> / }; + $rv .= qq{<a\nhref="${mhref}raw">raw</a>)\n}; + $rv .= ' '.join('; +', @tocc) . "\n" if @tocc; + + my $mapping = $ctx->{mapping}; + if (!$mapping && (defined($irt) || defined($irt = in_reply_to($hdr)))) { + my $mirt = PublicInbox::Hval->new_msgid($irt); + my $href = $upfx . $mirt->{href}. '/'; + my $html = $mirt->as_html; + $rv .= qq(In-Reply-To: <<a\nhref="$href">$html</a>>\n) } - # scan through all parts, looking for displayable text - $mime->walk_parts(sub { - index_walk($fh, $_[0], $enc, \$part_nr, $fhref); - }); - $mime->body_set(''); - $rv = "\n" . html_footer($hdr, 0, undef, $ctx, $mhref); + $rv .= "\n"; - if (defined $irt) { - unless (defined $parent_anchor) { - my $v = PublicInbox::Hval->new_msgid($irt, 1); - $v = $v->as_href; - $parent_anchor = "${path}$v/"; - } - $rv .= " <a\nhref=\"$parent_anchor\">parent</a>"; - } - if (my $pct = $state->{pct}) { # used by SearchView.pm - $rv .= " [relevance $pct->{$mid_raw}%]"; - } elsif ($srch) { + # scan through all parts, looking for displayable text + msg_iter($mime, sub { $rv .= add_text_body($mhref, $_[0]) }); + + # add the footer + $rv .= "\n<a\nhref=#$id_m\nid=e$id>^</a> ". + "<a\nhref=\"$mhref\">permalink</a>" . + " <a\nhref=\"${mhref}raw\">raw</a>" . + " <a\nhref=\"${mhref}#R\">reply</a>"; + + my $hr; + if (my $pct = $ctx->{pct}) { # used by SearchView.pm + $rv .= "\t[relevance $pct->{$mid_raw}%]"; + $hr = 1; + } elsif ($mapping) { my $threaded = 'threaded'; my $flat = 'flat'; + my $end = ''; if ($ctx->{flat}) { + $hr = 1; $flat = "<b>$flat</b>"; } else { $threaded = "<b>$threaded</b>"; } - $rv .= " [<a\nhref=\"${path}$href/t/#u\">$threaded</a>"; - $rv .= "|<a\nhref=\"${path}$href/T/#u\">$flat</a>]"; + $rv .= "\t[<a\nhref=\"${mhref}T/#u\">$flat</a>"; + $rv .= "|<a\nhref=\"${mhref}t/#u\">$threaded</a>]"; + $rv .= " <a\nhref=#r$id>$ctx->{s_nr}</a>"; + } else { + $hr = $ctx->{-hr}; } - $fh->write($rv .= '</pre>'); -} -sub thread_html { - my ($ctx, $foot, $srch) = @_; - # $_[0] in sub is the Plack callback - sub { emit_thread_html($_[0], $ctx, $foot, $srch) } + $rv .= $more ? '</pre><hr><pre>' : '</pre>' if $hr; + $rv; } -# only private functions below. +sub pad_link ($$;$) { + my ($mid, $level, $s) = @_; + $s ||= '...'; + my $id = id_compress($mid, 1); + (' 'x19).indent_for($level).th_pfx($level)."<a\nhref=#r$id>($s)</a>\n"; +} -sub emit_thread_html { - my ($res, $ctx, $foot, $srch) = @_; - my $mid = $ctx->{mid}; - my $msgs = load_results($srch->get_thread($mid)); - my $nr = scalar @$msgs; - return missing_thread($res, $ctx) if $nr == 0; - my $flat = $ctx->{flat}; - my $seen = {}; - my $state = { - res => $res, - ctx => $ctx, - seen => $seen, - root_anchor => anchor_for($mid), - anchor_idx => 0, - cur_level => 0, - }; - - require PublicInbox::Git; - $ctx->{git} ||= PublicInbox::Git->new($ctx->{git_dir}); - if ($flat) { - pre_anchor_entry($seen, $_) for (@$msgs); - __thread_entry($state, $_, 0) for (@$msgs); - } else { - my $th = thread_results($msgs); - thread_entry($state, $_, 0) for $th->rootset; - if (my $max = $state->{cur_level}) { - $state->{fh}->write( - ('</ul></li>' x ($max - 1)) . '</ul>'); +sub _th_index_lite { + my ($mid_raw, $irt, $id, $ctx) = @_; + my $rv = ''; + my $mapping = $ctx->{mapping} or return $rv; + my $pad = ' '; + my ($attr, $node, $idx, $level) = @{$mapping->{$mid_raw}}; + my $children = $node->{children}; + my $nr_c = scalar @$children; + my $nr_s = 0; + my $siblings; + if (my $smsg = $node->{smsg}) { + ($$irt) = (($smsg->{references} || '') =~ m/<([^>]+)>\z/); + } + my $irt_map = $mapping->{$$irt} if defined $$irt; + if (defined $irt_map) { + $siblings = $irt_map->[1]->{children}; + $nr_s = scalar(@$siblings) - 1; + $rv .= $pad . $irt_map->[0]; + if ($idx > 0) { + my $prev = $siblings->[$idx - 1]; + my $pmid = $prev->{id}; + if ($idx > 2) { + my $s = ($idx - 1). ' preceding siblings ...'; + $rv .= pad_link($pmid, $level, $s); + } elsif ($idx == 2) { + my $ppmid = $siblings->[0]->{id}; + $rv .= $pad . $mapping->{$ppmid}->[0]; + } + $rv .= $pad . $mapping->{$pmid}->[0]; + } + } + my $s_s = nr_to_s($nr_s, 'sibling', 'siblings'); + my $s_c = nr_to_s($nr_c, 'reply', 'replies'); + $attr =~ s!\n\z!</b>\n!s; + $attr =~ s!<a\nhref.*</a> !!s; # no point in duplicating subject + $attr =~ s!<a\nhref=[^>]+>([^<]+)</a>!$1!s; # no point linking to self + $rv .= "<b>@ $attr"; + if ($nr_c) { + my $cmid = $children->[0]->{id}; + $rv .= $pad . $mapping->{$cmid}->[0]; + if ($nr_c > 2) { + my $s = ($nr_c - 1). ' more replies'; + $rv .= pad_link($cmid, $level + 1, $s); + } elsif (my $cn = $children->[1]) { + $rv .= $pad . $mapping->{$cn->{id}}->[0]; } } - Email::Address->purge_cache; - - # there could be a race due to a message being deleted in git - # but still being in the Xapian index: - my $fh = delete $state->{fh} or return missing_thread($res, $ctx); - - my $final_anchor = $state->{anchor_idx}; - my $next = "<a\nid=s$final_anchor>"; - $next .= $final_anchor == 1 ? 'only message in' : 'end of'; - $next .= " thread</a>, back to <a\nhref=\"../../\">index</a>"; - $next .= "\ndownload thread: "; - $next .= "<a\nhref=\"../t.mbox.gz\">mbox.gz</a>"; - $next .= " / follow: <a\nhref=\"../t.atom\">Atom feed</a>"; - $fh->write('<hr /><pre>' . $next . "\n\n". - $foot . '</pre></body></html>'); - $fh->close; -} - -sub index_walk { - my ($fh, $part, $enc, $part_nr, $fhref) = @_; - my $s = add_text_body($enc, $part, $part_nr, $fhref, 1); - - return if $s eq ''; - - $s .= "\n"; # ensure there's a trailing newline - $fh->write($s); + my $next = $siblings->[$idx+1] if $siblings && $idx >= 0; + if ($next) { + my $nmid = $next->{id}; + $rv .= $pad . $mapping->{$nmid}->[0]; + my $nnext = $nr_s - $idx; + if ($nnext > 2) { + my $s = ($nnext - 1).' subsequent siblings'; + $rv .= pad_link($nmid, $level, $s); + } elsif (my $nn = $siblings->[$idx + 2]) { + $rv .= $pad . $mapping->{$nn->{id}}->[0]; + } + } + $rv .= $pad ."<a\nhref=#r$id>$s_s, $s_c; $ctx->{s_nr}</a>\n"; +} + +sub walk_thread { + my ($th, $ctx, $cb) = @_; + my @q = map { (0, $_, -1) } @{$th->{rootset}}; + while (@q) { + my ($level, $node, $i) = splice(@q, 0, 3); + defined $node or next; + $cb->($ctx, $level, $node, $i); + ++$level; + $i = 0; + unshift @q, map { ($level, $_, $i++) } @{$node->{children}}; + } +} + +sub pre_thread { + my ($ctx, $level, $node, $idx) = @_; + $ctx->{mapping}->{$node->{id}} = [ '', $node, $idx, $level ]; + skel_dump($ctx, $level, $node); +} + +sub thread_index_entry { + my ($ctx, $level, $mime) = @_; + my ($beg, $end) = thread_adj_level($ctx, $level); + $beg . '<pre>' . index_entry($mime, $ctx, 0) . '</pre>' . $end; +} + +sub stream_thread ($$) { + my ($th, $ctx) = @_; + my $inbox = $ctx->{-inbox}; + my $mime; + my @q = map { (0, $_) } @{$th->{rootset}}; + my $level; + while (@q) { + $level = shift @q; + my $node = shift @q or next; + my $cl = $level + 1; + unshift @q, map { ($cl, $_) } @{$node->{children}}; + $mime = $inbox->msg_by_smsg($node->{smsg}) and last; + } + return missing_thread($ctx) unless $mime; + + $mime = Email::MIME->new($mime); + $ctx->{-title_html} = ascii_html($mime->header('Subject')); + $ctx->{-html_tip} = thread_index_entry($ctx, $level, $mime); + PublicInbox::WwwStream->response($ctx, 200, sub { + return unless $ctx; + while (@q) { + $level = shift @q; + my $node = shift @q or next; + my $cl = $level + 1; + unshift @q, map { ($cl, $_) } @{$node->{children}}; + my $mid = $node->{id}; + if ($mime = $inbox->msg_by_smsg($node->{smsg})) { + $mime = Email::MIME->new($mime); + return thread_index_entry($ctx, $level, $mime); + } else { + return ghost_index_entry($ctx, $level, $node); + } + } + my $ret = join('', thread_adj_level($ctx, 0)); + $ret .= ${$ctx->{dst}}; # skel + $ctx = undef; + $ret; + }); } -sub enc_for { - my ($ct, $default) = @_; - $default ||= $enc_utf8; - defined $ct or return $default; - my $ct_parsed = parse_content_type($ct); - if ($ct_parsed) { - if (my $charset = $ct_parsed->{attributes}->{charset}) { - my $enc = find_encoding($charset); - return $enc if $enc; +sub thread_html { + my ($ctx) = @_; + my $mid = $ctx->{mid}; + my $srch = $ctx->{srch}; + my $sres = $srch->get_thread($mid); + my $msgs = load_results($srch, $sres); + my $nr = $sres->{total}; + return missing_thread($ctx) if $nr == 0; + my $skel = '<hr><pre>'; + $skel .= $nr == 1 ? 'only message in thread' : 'end of thread'; + $skel .= ", back to <a\nhref=\"../../\">index</a>\n\n"; + $skel .= "<b\nid=t>Thread overview:</b> "; + $skel .= $nr == 1 ? '(only message)' : "$nr+ messages"; + $skel .= " (download: <a\nhref=\"../t.mbox.gz\">mbox.gz</a>"; + $skel .= " / follow: <a\nhref=\"../t.atom\">Atom feed</a>)\n"; + $skel .= "-- links below jump to the message on this page --\n"; + $ctx->{-upfx} = '../../'; + $ctx->{cur_level} = 0; + $ctx->{dst} = \$skel; + $ctx->{prev_attr} = ''; + $ctx->{prev_level} = 0; + $ctx->{root_anchor} = anchor_for($mid); + $ctx->{seen} = {}; + $ctx->{mapping} = {}; + $ctx->{s_nr} = "$nr+ messages in thread"; + + my $th = thread_results($msgs); + walk_thread($th, $ctx, *pre_thread); + $skel .= '</pre>'; + return stream_thread($th, $ctx) unless $ctx->{flat}; + + # flat display: lazy load the full message from smsg + my $inbox = $ctx->{-inbox}; + my $mime; + while ($mime = shift @$msgs) { + $mime = $inbox->msg_by_smsg($mime) and last; + } + return missing_thread($ctx) unless $mime; + $mime = Email::MIME->new($mime); + $ctx->{-title_html} = ascii_html($mime->header('Subject')); + $ctx->{-html_tip} = '<pre>'.index_entry($mime, $ctx, scalar @$msgs); + $mime = undef; + PublicInbox::WwwStream->response($ctx, 200, sub { + return unless $msgs; + while ($mime = shift @$msgs) { + $mime = $inbox->msg_by_smsg($mime) and last; } - } - $default; + if ($mime) { + $mime = Email::MIME->new($mime); + return index_entry($mime, $ctx, scalar @$msgs); + } + $msgs = undef; + $skel; + }); } sub multipart_text_as_html { - my ($mime, $full_pfx, $srch) = @_; + my ($mime, $upfx) = @_; my $rv = ""; - my $part_nr = 0; - my $enc = enc_for($mime->header("Content-Type")); # scan through all parts, looking for displayable text - $mime->walk_parts(sub { - my ($part) = @_; - $part = add_text_body($enc, $part, \$part_nr, $full_pfx, 1); - $rv .= $part; - $rv .= "\n" if $part ne ''; + msg_iter($mime, sub { + my ($p) = @_; + $rv .= add_text_body($upfx, $p); }); - $mime->body_set(''); $rv; } -sub add_filename_line { - my ($enc, $fn) = @_; - my $len = 72; - my $pad = "-"; - $fn = $enc->decode($fn); - $len -= length($fn); - $pad x= ($len/2) if ($len > 0); - "$pad " . ascii_html($fn) . " $pad\n"; -} - sub flush_quote { - my ($quot, $n, $part_nr, $full_pfx, $final, $do_anchor) = @_; - - # n.b.: do not use <blockquote> since it screws up alignment - # w.r.t. unquoted text. Repliers may rely on pre-formatted - # alignment to point out a certain word in quoted text. - if ($full_pfx) { - if (!$final && scalar(@$quot) <= MAX_INLINE_QUOTED) { - # show quote inline - my $l = PublicInbox::Linkify->new; - my $rv = join('', map { $l->linkify_1($_) } @$quot); - @$quot = (); - $rv = ascii_html($rv); - return $l->linkify_2($rv); - } - - # show a short snippet of quoted text and link to full version: - @$quot = map { s/^(?:>\s*)+//gm; $_ } @$quot; - my $cur = join(' ', @$quot); - @$quot = split(/\s+/, $cur); - $cur = ''; - do { - my $tmp = shift(@$quot); - my $len = length($tmp) + length($cur); - if ($len > MAX_TRUNC_LEN) { - @$quot = (); - } else { - $cur .= $tmp . ' '; - } - } while (@$quot && length($cur) < MAX_TRUNC_LEN); - @$quot = (); - $cur =~ s/ \z/ .../s; - $cur = ascii_html($cur); - my $nr = ++$$n; - "> [<a\nhref=\"$full_pfx#q${part_nr}_$nr\">$cur</a>]\n"; + my ($s, $l, $quot) = @_; + + # show everything in the full version with anchor from + # short version (see above) + my $rv = $l->linkify_1(join('', @$quot)); + @$quot = (); + + # we use a <span> here to allow users to specify their own + # color for quoted text + $rv = $l->linkify_2(ascii_html($rv)); + $$s .= qq(<span\nclass="q">) . $rv . '</span>' +} + +sub attach_link ($$$$;$) { + my ($upfx, $ct, $p, $fn, $err) = @_; + my ($part, $depth, @idx) = @$p; + my $nl = $idx[-1] > 1 ? "\n" : ''; + my $idx = join('.', @idx); + my $size = bytes::length($part->body); + + # hide attributes normally, unless we want to aid users in + # spotting MUA problems: + $ct =~ s/;.*// unless $err; + $ct = ascii_html($ct); + my $desc = $part->header('Content-Description'); + $desc = $fn unless defined $desc; + $desc = '' unless defined $desc; + my $sfn; + if (defined $fn && $fn =~ /\A[[:alnum:]][\w\.-]+[[:alnum:]]\z/) { + $sfn = $fn; + } elsif ($ct eq 'text/plain') { + $sfn = 'a.txt'; } else { - # show everything in the full version with anchor from - # short version (see above) - my $l = PublicInbox::Linkify->new; - my $rv .= join('', map { $l->linkify_1($_) } @$quot); - @$quot = (); - $rv = ascii_html($rv); - return $l->linkify_2($rv) unless $do_anchor; - my $nr = ++$$n; - "<a\nid=q${part_nr}_$nr></a>" . $l->linkify_2($rv); + $sfn = 'a.bin'; + } + my $ret = qq($nl<a\nhref="$upfx$idx-$sfn">); + if ($err) { + $ret .= +"[-- Warning: decoded text below may be mangled --]\n"; } + $ret .= "[-- Attachment #$idx: "; + my $ts = "Type: $ct, Size: $size bytes"; + $ret .= ($desc eq '') ? "$ts --]" : "$desc --]\n[-- $ts --]"; + $ret .= "</a>\n"; } sub add_text_body { - my ($enc_msg, $part, $part_nr, $full_pfx, $do_anchor) = @_; - return '' if $part->subparts; - - my $ct = $part->content_type; - # account for filter bugs... - if (defined $ct && $ct =~ m!\btext/x?html\b!i) { - $part->body_set(''); - return ''; - } - my $enc = enc_for($ct, $enc_msg); - my $n = 0; - my $nr = 0; - my $s = $part->body; - $part->body_set(''); - $s = $enc->decode($s); - my @lines = split(/^/m, $s); - $s = ''; + my ($upfx, $p) = @_; # from msg_iter: [ Email::MIME, depth, @idx ] + my ($part, $depth, @idx) = @$p; + my $ct = $part->content_type || 'text/plain'; + my $fn = $part->filename; - if ($$part_nr > 0) { - my $fn = $part->filename; - defined($fn) or $fn = "part #" . ($$part_nr + 1); - $s .= add_filename_line($enc, $fn); + if ($ct =~ m!\btext/x?html\b!i) { + return attach_link($upfx, $ct, $p, $fn); } + my $s = eval { $part->body_str }; + + # badly-encoded message? tell the world about it! + my $err = $@; + if ($err) { + if ($ct =~ m!\btext/plain\b!i) { + # Try to assume UTF-8 because Alpine seems to + # do wacky things and set charset=X-UNKNOWN + $part->charset_set('UTF-8'); + $s = eval { $part->body_str }; + + # If forcing charset=UTF-8 failed, + # attach_link will warn further down... + $s = $part->body if $@; + } else { + return attach_link($upfx, $ct, $p, $fn); + } + } + + my @lines = split(/^/m, $s); + $s = ''; + if (defined($fn) || $depth > 0 || $err) { + $s .= attach_link($upfx, $ct, $p, $fn, $err); + $s .= "\n"; + } my @quot; + my $l = PublicInbox::Linkify->new; while (defined(my $cur = shift @lines)) { if ($cur !~ /^>/) { # show the previously buffered quote inline - if (scalar @quot) { - $s .= flush_quote(\@quot, \$n, $$part_nr, - $full_pfx, 0, $do_anchor); - } + flush_quote(\$s, $l, \@quot) if @quot; # regular line, OK - my $l = PublicInbox::Linkify->new; $cur = $l->linkify_1($cur); $cur = ascii_html($cur); $s .= $l->linkify_2($cur); @@ -373,24 +489,29 @@ sub add_text_body { push @quot, $cur; } } - if (scalar @quot) { - $s .= flush_quote(\@quot, \$n, $$part_nr, $full_pfx, 1, - $do_anchor); - } - ++$$part_nr; + my $end = "\n"; + if (@quot) { + $end = ''; + flush_quote(\$s, $l, \@quot); + } $s =~ s/[ \t]+$//sgm; # kill per-line trailing whitespace $s =~ s/\A\n+//s; # kill leading blank lines - $s =~ s/\s+\z//s; # kill all trailing spaces (final "\n" added if ne '') - $s; + $s =~ s/\s+\z//s; # kill all trailing spaces + $s .= $end; } -sub headers_to_html_header { - my ($hdr, $full_pfx, $ctx) = @_; +sub _msg_html_prepare { + my ($hdr, $ctx) = @_; my $srch = $ctx->{srch} if $ctx; - my $rv = ""; + my $atom = ''; + my $rv = "<pre\nid=b>"; # anchor for body start + + if ($srch) { + $ctx->{-upfx} = '../'; + } my @title; - my $mid = $hdr->header_raw('Message-ID'); + my $mid = mid_clean($hdr->header_raw('Message-ID')); $mid = PublicInbox::Hval->new_msgid($mid); foreach my $h (qw(From To Cc Subject Date)) { my $v = $hdr->header($h); @@ -398,53 +519,45 @@ sub headers_to_html_header { $v = PublicInbox::Hval->new($v); if ($h eq 'From') { - my @from = Email::Address->parse($v->raw); - $title[1] = ascii_html($from[0]->name); + my @n = PublicInbox::Address::names($v->raw); + $title[1] = ascii_html(join(', ', @n)); } elsif ($h eq 'Subject') { $title[0] = $v->as_html; if ($srch) { - $rv .= "$h: <b\nid=t>"; - $rv .= $v->as_html . "</b>\n"; + $rv .= qq($h: <a\nhref="#r"\nid=t>); + $rv .= $v->as_html . "</a>\n"; next; } } - $rv .= "$h: " . $v->as_html . "\n"; + $v = $v->as_html; + $v =~ s/(\@[^,]+,) /$1\n\t/g if ($h eq 'Cc' || $h eq 'To'); + $rv .= "$h: $v\n"; } + $title[0] ||= '(no subject)'; + $ctx->{-title_html} = join(' - ', @title); $rv .= 'Message-ID: <' . $mid->as_html . '> '; - my $upfx = $full_pfx ? '' : '../'; - $rv .= "(<a\nhref=\"${upfx}raw\">raw</a>)\n"; - my $atom; - if ($srch) { - thread_inline(\$rv, $ctx, $hdr, $upfx); - - $atom = qq{<link\nrel=alternate\ntitle="Atom feed"\n} . - qq!href="${upfx}t.atom"\ntype="application/atom+xml"/>!; - } else { - $rv .= _parent_headers_nosrch($hdr); - $atom = ''; - } + $rv .= "(<a\nhref=\"raw\">raw</a>)\n"; + $rv .= _parent_headers($hdr, $srch); $rv .= "\n"; - - ("<html><head><title>". join(' - ', @title) . "</title>$atom". - PublicInbox::Hval::STYLE . "</head><body><pre>" . $rv); } -sub thread_inline { - my ($dst, $ctx, $hdr, $upfx) = @_; +sub thread_skel { + my ($dst, $ctx, $hdr, $tpfx) = @_; my $srch = $ctx->{srch}; my $mid = mid_clean($hdr->header_raw('Message-ID')); my $sres = $srch->get_thread($mid); my $nr = $sres->{total}; - my $expand = "<a\nhref=\"${upfx}t/#u\">expand</a> " . - "/ <a\nhref=\"${upfx}t.mbox.gz\">mbox.gz</a>"; + my $expand = qq(<a\nhref="${tpfx}T/#u">expand</a> ) . + qq(/ <a\nhref="${tpfx}t.mbox.gz">mbox.gz</a> ) . + qq(/ <a\nhref="${tpfx}t.atom">Atom feed</a>); - $$dst .= 'Thread: '; my $parent = in_reply_to($hdr); + $$dst .= "\n<b>Thread overview: </b>"; if ($nr <= 1) { if (defined $parent) { $$dst .= "($expand)\n "; - $$dst .= ghost_parent("$upfx../", $parent) . "\n"; + $$dst .= ghost_parent("$tpfx../", $parent) . "\n"; } else { $$dst .= "[no followups, yet] ($expand)\n"; } @@ -453,43 +566,37 @@ sub thread_inline { return; } - $$dst .= "~$nr messages ($expand"; - if ($nr > MAX_INLINE_QUOTED) { - $$dst .= qq! / <a\nhref="#b">[scroll down]</a>!; - } - $$dst .= ")\n"; + $$dst .= "$nr+ messages in thread ($expand"; + $$dst .= qq! / <a\nhref="#b">[top]</a>)\n!; my $subj = $srch->subject_path($hdr->header('Subject')); - my $state = { - seen => { $subj => 1 }, - srch => $srch, - cur => $mid, - parent_cmp => defined $parent ? $parent : '', - parent => $parent, - prev_attr => '', - prev_level => 0, - }; - for (thread_results(load_results($sres))->rootset) { - inline_dump($dst, $state, $upfx, $_, 0); - } - $$dst .= "<a\nid=b></a>"; # anchor for body start - $ctx->{next_msg} = $state->{next_msg}; - $ctx->{parent_msg} = $state->{parent}; -} - -sub _parent_headers_nosrch { - my ($hdr) = @_; + $ctx->{seen} = { $subj => 1 }; + $ctx->{cur} = $mid; + $ctx->{prev_attr} = ''; + $ctx->{prev_level} = 0; + $ctx->{dst} = $dst; + $sres = load_results($srch, $sres); + walk_thread(thread_results($sres), $ctx, *skel_dump); + $ctx->{parent_msg} = $parent; +} + +sub _parent_headers { + my ($hdr, $srch) = @_; my $rv = ''; my $irt = in_reply_to($hdr); if (defined $irt) { - my $v = PublicInbox::Hval->new_msgid($irt, 1); + my $v = PublicInbox::Hval->new_msgid($irt); my $html = $v->as_html; - my $href = $v->as_href; + my $href = $v->{href}; $rv .= "In-Reply-To: <"; $rv .= "<a\nhref=\"../$href/\">$html</a>>\n"; } + # do not display References: if search is present, + # we show the thread skeleton at the bottom, instead. + return $rv if $srch; + my $refs = $hdr->header_raw('References'); if ($refs) { # avoid redundant URLs wasting bandwidth @@ -504,12 +611,21 @@ sub _parent_headers_nosrch { } if (@refs) { - $rv .= 'References: '. join(' ', @refs) . "\n"; + $rv .= 'References: '. join("\n\t", @refs) . "\n"; } } $rv; } +sub squote_maybe ($) { + my ($val) = @_; + if ($val =~ m{([^\w@\./,\%\+\-])}) { + $val =~ s/(['!])/'\\$1'/g; # '!' for csh + return "'$val'"; + } + $val; +} + sub mailto_arg_link { my ($hdr) = @_; my %cc; # everyone else @@ -518,207 +634,110 @@ sub mailto_arg_link { foreach my $h (qw(From To Cc)) { my $v = $hdr->header($h); defined($v) && ($v ne '') or next; - my @addrs = Email::Address->parse($v); - foreach my $recip (@addrs) { - my $address = $recip->address; + my @addrs = PublicInbox::Address::emails($v); + foreach my $address (@addrs) { my $dst = lc($address); $cc{$dst} ||= $address; $to ||= $dst; } } - Email::Address->purge_cache; my @arg; my $subj = $hdr->header('Subject') || ''; $subj = "Re: $subj" unless $subj =~ /\bRe:/i; my $mid = $hdr->header_raw('Message-ID'); - push @arg, "--in-reply-to='" . ascii_html($mid) . "'"; - my $irt = uri_escape_utf8($mid); + push @arg, '--in-reply-to='.squote_maybe(mid_clean($mid)); + my $irt = mid_escape($mid); delete $cc{$to}; - push @arg, '--to=' . ascii_html($to); + push @arg, "--to=$to"; $to = uri_escape_utf8($to); $subj = uri_escape_utf8($subj); - my $cc = join(',', sort values %cc); - push @arg, '--cc=' . ascii_html($cc); - $cc = uri_escape_utf8($cc); + my @cc = sort values %cc; + push(@arg, map { "--cc=$_" } @cc); + my $cc = uri_escape_utf8(join(',', @cc)); my $href = "mailto:$to?In-Reply-To=$irt&Cc=${cc}&Subject=$subj"; - $href =~ s/%20/+/g; - (\@arg, $href); + (\@arg, ascii_html($href)); } sub html_footer { - my ($mime, $standalone, $full_pfx, $ctx, $mhref) = @_; + my ($hdr, $standalone, $ctx, $rhref) = @_; my $srch = $ctx->{srch} if $ctx; - my $upfx = $full_pfx ? '../' : '../../'; - my $tpfx = $full_pfx ? '' : '../'; + my $upfx = '../'; + my $tpfx = ''; my $idx = $standalone ? " <a\nhref=\"$upfx\">index</a>" : ''; my $irt = ''; - - if ($srch && $standalone) { - $idx .= qq{ / follow: <a\nhref="${tpfx}t.atom">Atom feed</a>\n}; - } if ($idx && $srch) { - my $p = $ctx->{parent_msg}; - my $next = $ctx->{next_msg}; - if ($p) { - $p = PublicInbox::Hval->new_msgid($p); - $p = $p->as_href; - $irt = "<a\nhref=\"$upfx$p/\">parent</a> "; - } else { - $irt = ' ' x length('parent '); + $idx .= "\n"; + thread_skel(\$idx, $ctx, $hdr, $tpfx); + my ($next, $prev); + my $parent = ' '; + $next = $prev = ' '; + + if (my $n = $ctx->{next_msg}) { + $n = PublicInbox::Hval->new_msgid($n)->{href}; + $next = "<a\nhref=\"$upfx$n/\"\nrel=next>next</a>"; } - if ($next) { - $irt .= "<a\nhref=\"$upfx$next/\">next</a> "; - } else { - $irt .= ' ' x length('next '); + my $u; + my $par = $ctx->{parent_msg}; + if ($par) { + $u = PublicInbox::Hval->new_msgid($par)->{href}; + $u = "$upfx$u/"; } - if ($p || $next) { - $irt .= "<a\nhref=\"${tpfx}t/#u\">thread</a> "; - } else { - $irt .= ' ' x length('thread '); + if (my $p = $ctx->{prev_msg}) { + $prev = PublicInbox::Hval->new_msgid($p)->{href}; + if ($p && $par && $p eq $par) { + $prev = "<a\nhref=\"$upfx$prev/\"\n" . + 'rel=prev>prev parent</a>'; + $parent = ''; + } else { + $prev = "<a\nhref=\"$upfx$prev/\"\n" . + 'rel=prev>prev</a>'; + $parent = " <a\nhref=\"$u\">parent</a>" if $u; + } + } elsif ($u) { # unlikely + $parent = " <a\nhref=\"$u\"\nrel=prev>parent</a>"; } + $irt = "$next $prev$parent "; } else { $irt = ''; } - - $mhref = './' unless defined $mhref; - $irt . qq(<a\nhref="${mhref}R/">reply</a>) . $idx; + $rhref ||= '#R'; + $irt .= qq(<a\nhref="$rhref">reply</a>); + $irt .= $idx; } sub linkify_ref_nosrch { - my $v = PublicInbox::Hval->new_msgid($_[0], 1); + my $v = PublicInbox::Hval->new_msgid($_[0]); my $html = $v->as_html; - my $href = $v->as_href; + my $href = $v->{href}; "<<a\nhref=\"../$href/\">$html</a>>"; } sub anchor_for { my ($msgid) = @_; - my $id = $msgid; - if ($id !~ /\A[a-f0-9]{40}\z/) { - $id = id_compress(mid_clean($id), 1); - } - 'm' . $id; -} - -sub thread_html_head { - my ($hdr, $state) = @_; - my $res = delete $state->{res} or die "BUG: no Plack callback in {res}"; - my $fh = $res->([200, ['Content-Type'=> 'text/html; charset=UTF-8']]); - $state->{fh} = $fh; - - my $s = ascii_html($hdr->header('Subject')); - $fh->write("<html><head><title>$s</title>". - qq{<link\nrel=alternate\ntitle="Atom feed"\n} . - qq!href="../t.atom"\ntype="application/atom+xml"/>! . - PublicInbox::Hval::STYLE . - "</head><body>"); -} - -sub pre_anchor_entry { - my ($seen, $mime) = @_; - my $id = anchor_for(mid_mime($mime)); - $seen->{$id} = "#$id"; # save the anchor for children, later + 'm' . id_compress($msgid, 1); } sub ghost_parent { my ($upfx, $mid) = @_; - # 'subject dummy' is used internally by Mail::Thread - return '[no common parent]' if ($mid eq 'subject dummy'); $mid = PublicInbox::Hval->new_msgid($mid); - my $href = $mid->as_href; + my $href = $mid->{href}; my $html = $mid->as_html; qq{[parent not found: <<a\nhref="$upfx$href/">$html</a>>]}; } -sub thread_adj_level { - my ($state, $level) = @_; - - my $max = $state->{cur_level}; - if ($level <= 0) { - return '' if $max == 0; # flat output - - # reset existing lists - my $x = $max > 1 ? ('</ul></li>' x ($max - 1)) : ''; - $state->{fh}->write($x . '</ul>'); - $state->{cur_level} = 0; - return ''; - } - if ($level == $max) { # continue existing list - $state->{fh}->write('<li>'); - } elsif ($level < $max) { - my $x = $max > 1 ? ('</ul></li>' x ($max - $level)) : ''; - $state->{fh}->write($x .= '<li>'); - $state->{cur_level} = $level; - } else { # ($level > $max) # start a new level - $state->{cur_level} = $level; - $state->{fh}->write(($max ? '<li>' : '') . '<ul><li>'); - } - '</li>'; -} - -sub ghost_flush { - my ($state, $upfx, $mid, $level) = @_; - my $end = '<pre>'. ghost_parent($upfx, $mid) . '</pre>'; - $state->{fh}->write($end .= thread_adj_level($state, $level)); -} - -sub __thread_entry { - my ($state, $mime, $level) = @_; - - # lazy load the full message from mini_mime: - $mime = eval { - my $path = mid2path(mid_clean(mid_mime($mime))); - Email::MIME->new($state->{ctx}->{git}->cat_file('HEAD:'.$path)); - } or return; - - thread_html_head($mime, $state) if $state->{anchor_idx} == 0; - if (my $ghost = delete $state->{ghost}) { - # n.b. ghost messages may only be parents, not children - foreach my $g (@$ghost) { - ghost_flush($state, '../../', @$g); - } - } - my $end = thread_adj_level($state, $level); - index_entry($mime, $level, $state); - $state->{fh}->write($end) if $end; - - 1; -} - sub indent_for { my ($level) = @_; INDENT x ($level - 1); } -sub __ghost_prepare { - my ($state, $node, $level) = @_; - my $ghost = $state->{ghost} ||= []; - push @$ghost, [ $node->messageid, $level ]; -} - -sub thread_entry { - my ($state, $node, $level) = @_; - return unless $node; - if (my $mime = $node->message) { - unless (__thread_entry($state, $mime, $level)) { - __ghost_prepare($state, $node, $level); - } - } else { - __ghost_prepare($state, $node, $level); - } - - thread_entry($state, $node->child, $level + 1); - thread_entry($state, $node->next, $level); -} - sub load_results { - my ($sres) = @_; - - [ map { $_->mini_mime } @{delete $sres->{msgs}} ]; + my ($srch, $sres) = @_; + my $msgs = delete $sres->{msgs}; + $srch->retry_reopen(sub { [ map { $_->ensure_metadata; $_ } @$msgs ] }); } sub msg_timestamp { @@ -728,21 +747,18 @@ sub msg_timestamp { } sub thread_results { - my ($msgs, $nosubject) = @_; - require PublicInbox::Thread; - my $th = PublicInbox::Thread->new(@$msgs); - no warnings 'once'; - $Mail::Thread::nosubject = $nosubject; + my ($msgs) = @_; + require PublicInbox::SearchThread; + my $th = PublicInbox::SearchThread->new($msgs); $th->thread; $th->order(*sort_ts); $th } sub missing_thread { - my ($res, $ctx) = @_; + my ($ctx) = @_; require PublicInbox::ExtMsg; - - $res->(PublicInbox::ExtMsg::ext_msg($ctx)) + PublicInbox::ExtMsg::ext_msg($ctx); } sub _msg_date { @@ -753,187 +769,255 @@ sub _msg_date { sub fmt_ts { POSIX::strftime('%Y-%m-%d %k:%M', gmtime($_[0])) } -sub _inline_header { - my ($dst, $state, $upfx, $hdr, $level) = @_; - my $dot = $level == 0 ? '' : '` '; +sub skel_dump { + my ($ctx, $level, $node) = @_; + my $smsg = $node->{smsg} or return _skel_ghost($ctx, $level, $node); - my $cur = $state->{cur}; - my $mid = mid_clean($hdr->header_raw('Message-ID')); - my $f = ascii_html($hdr->header('X-PI-From')); - my $d = _msg_date($hdr); - my $pfx = ' ' . $d . ' ' . indent_for($level); + my $dst = $ctx->{dst}; + my $cur = $ctx->{cur}; + my $mid = $smsg->{mid}; + my $f = ascii_html($smsg->from_name); + my $d = fmt_ts($smsg->{ts}) . ' ' . indent_for($level) . th_pfx($level); my $attr = $f; - $state->{first_level} ||= $level; + $ctx->{first_level} ||= $level; - if ($attr ne $state->{prev_attr} || $state->{prev_level} > $level) { - $state->{prev_attr} = $attr; - } else { - $attr = ''; + if ($attr ne $ctx->{prev_attr} || $ctx->{prev_level} > $level) { + $ctx->{prev_attr} = $attr; } - $state->{prev_level} = $level; + $ctx->{prev_level} = $level; if ($cur) { if ($cur eq $mid) { - delete $state->{cur}; - $$dst .= "$pfx$dot<b><a\nid=r\nhref=\"#b\">". + delete $ctx->{cur}; + $$dst .= "<b>$d<a\nid=r\nhref=\"#t\">". "$attr [this message]</a></b>\n"; - return; + } else { + $ctx->{prev_msg} = $mid; } } else { - $state->{next_msg} ||= $mid; + $ctx->{next_msg} ||= $mid; } # Subject is never undef, this mail was loaded from # our Xapian which would've resulted in '' if it were # really missing (and Filter rejects empty subjects) - my $s = $hdr->header('Subject'); - my $h = $state->{srch}->subject_path($s); - if ($state->{seen}->{$h}) { + my $s = $smsg->subject; + my $h = $ctx->{srch}->subject_path($s); + if ($ctx->{seen}->{$h}) { $s = undef; } else { - $state->{seen}->{$h} = 1; + $ctx->{seen}->{$h} = 1; $s = PublicInbox::Hval->new($s); $s = $s->as_html; } - my $m = PublicInbox::Hval->new_msgid($mid); - $m = $upfx . '../' . $m->as_href . '/'; - if (defined $s) { - $$dst .= "$pfx$dot<a\nhref=\"$m\">$s</a> $attr\n"; + my $m; + my $id = ''; + my $mapping = $ctx->{mapping}; + my $end = defined($s) ? "$s</a> $f\n" : "$f</a>\n"; + if ($mapping) { + my $map = $mapping->{$mid}; + $id = id_compress($mid, 1); + $m = '#m'.$id; + $map->[0] = "$d<a\nhref=\"$m\">$end"; + $id = "\nid=r".$id; } else { - $$dst .= "$pfx$dot<a\nhref=\"$m\">$f</a>\n"; + $m = $ctx->{-upfx}.mid_escape($mid).'/'; } + $$dst .= $d . "<a\nhref=\"$m\"$id>" . $end; } -sub inline_dump { - my ($dst, $state, $upfx, $node, $level) = @_; - return unless $node; - if (my $mime = $node->message) { - my $hdr = $mime->header_obj; - my $mid = mid_clean($hdr->header_raw('Message-ID')); - if ($mid eq $state->{parent_cmp}) { - $state->{parent} = $mid; - } - _inline_header($dst, $state, $upfx, $hdr, $level); +sub _skel_ghost { + my ($ctx, $level, $node) = @_; + + my $mid = $node->{id}; + my $d = $ctx->{pct} ? ' [irrelevant] ' # search result + : ' [not found] '; + $d .= indent_for($level) . th_pfx($level); + my $upfx = $ctx->{-upfx}; + my $m = PublicInbox::Hval->new_msgid($mid); + my $href = $upfx . $m->{href} . '/'; + my $html = $m->as_html; + + my $mapping = $ctx->{mapping}; + my $map = $mapping->{$mid} if $mapping; + if ($map) { + my $id = id_compress($mid, 1); + $map->[0] = $d . qq{<<a\nhref=#r$id>$html</a>>\n}; + $d .= qq{<<a\nhref="$href"\nid=r$id>$html</a>>\n}; } else { - my $dot = $level == 0 ? '' : '` '; - my $pfx = (' ' x length(' 1970-01-01 13:37 ')). - indent_for($level) . $dot; - $$dst .= $pfx; - $$dst .= ghost_parent("$upfx../", $node->messageid) . "\n"; + $d .= qq{<<a\nhref="$href">$html</a>>\n}; } - inline_dump($dst, $state, $upfx, $node->child, $level+1); - inline_dump($dst, $state, $upfx, $node->next, $level); + my $dst = $ctx->{dst}; + $$dst .= $d; } sub sort_ts { - sort { - (eval { $a->topmost->message->header('X-PI-TS') } || 0) <=> - (eval { $b->topmost->message->header('X-PI-TS') } || 0) - } @_; -} - -sub rsort_ts { - sort { - (eval { $b->topmost->message->header('X-PI-TS') } || 0) <=> - (eval { $a->topmost->message->header('X-PI-TS') } || 0) - } @_; + [ sort { + (eval { $a->topmost->{smsg}->ts } || 0) <=> + (eval { $b->topmost->{smsg}->ts } || 0) + } @{$_[0]} ]; } # accumulate recent topics if search is supported -# returns 1 if done, undef if not -sub add_topic { - my ($state, $node, $level) = @_; - return unless $node; - my $child_adjust = 1; - - if (my $x = $node->message) { - $x = $x->header_obj; - my $subj; - - $subj = $x->header('Subject'); - $subj = $state->{srch}->subject_normalized($subj); - - if (++$state->{subjs}->{$subj} == 1) { - push @{$state->{order}}, [ $level, $subj ]; +# returns 200 if done, 404 if not +sub acc_topic { + my ($ctx, $level, $node) = @_; + my $srch = $ctx->{srch}; + my $mid = $node->{id}; + my $x = $node->{smsg} || $srch->lookup_mail($mid); + my ($subj, $ts); + my $topic; + if ($x) { + $subj = $x->subject; + $subj = $srch->subject_normalized($subj); + $ts = $x->ts; + if ($level == 0) { + $topic = [ $ts, 1, { $subj => $mid }, $subj ]; + $ctx->{-cur_topic} = $topic; + push @{$ctx->{order}}, $topic; + return; } - my $mid = mid_clean($x->header_raw('Message-ID')); - - my $ts = $x->header('X-PI-TS'); - my $exist = $state->{latest}->{$subj}; - if (!$exist || $exist->[1] < $ts) { - $state->{latest}->{$subj} = [ $mid, $ts ]; + $topic = $ctx->{-cur_topic}; # should never be undef + $topic->[0] = $ts if $ts > $topic->[0]; + $topic->[1]++; + my $seen = $topic->[2]; + if (scalar(@$topic) == 3) { # parent was a ghost + push @$topic, $subj; + } elsif (!$seen->{$subj}) { + push @$topic, $level, $subj; } - } else { - # ghost message, do not bump level - $child_adjust = 0; + $seen->{$subj} = $mid; # latest for subject + } else { # ghost message + return if $level != 0; # ignore child ghosts + $topic = [ -666, 0, {} ]; + $ctx->{-cur_topic} = $topic; + push @{$ctx->{order}}, $topic; } - - add_topic($state, $node->child, $level + $child_adjust); - add_topic($state, $node->next, $level); } sub dump_topics { - my ($state) = @_; - my $order = $state->{order}; - my $subjs = $state->{subjs}; - my $latest = $state->{latest}; - return "\n[No topics in range]</pre>" unless (scalar @$order); - my $dst = ''; - my $pfx; - my $prev = 0; - my $prev_attr = ''; - while (defined(my $info = shift @$order)) { - my ($level, $subj) = @$info; - my $n = delete $subjs->{$subj}; - my ($mid, $ts) = @{delete $latest->{$subj}}; - $mid = PublicInbox::Hval->new_msgid($mid)->as_href; - $subj = PublicInbox::Hval->new($subj)->as_html; - $pfx = indent_for($level); - my $nl = $level == $prev ? "\n" : ''; - my $dot = $level == 0 ? '' : '` '; - $dst .= "$nl$pfx$dot<a\nhref=\"$mid/t/#u\"><b>$subj</b></a>\n"; - + my ($ctx) = @_; + my $order = delete $ctx->{order}; # [ ts, subj1, subj2, subj3, ... ] + if (!@$order) { + $ctx->{-html_tip} = '<pre>[No topics in range]</pre>'; + return 404; + } + + my @out; + + # sort by recency, this allows new posts to "bump" old topics... + foreach my $topic (sort { $b->[0] <=> $a->[0] } @$order) { + my ($ts, $n, $seen, $top, @ex) = @$topic; + @$topic = (); + next unless defined $top; # ghost topic + my $mid = delete $seen->{$top}; + my $href = mid_escape($mid); + $top = PublicInbox::Hval->new($top)->as_html; $ts = fmt_ts($ts); - my $attr = " $ts UTC"; # $n isn't the total number of posts on the topic, # just the number of posts in the current results window - $n = $n == 1 ? '' : " ($n+ messages)"; - - if ($level == 0 || $attr ne $prev_attr) { - my $mbox = qq(<a\nhref="$mid/t.mbox.gz">mbox.gz</a>); - my $atom = qq(<a\nhref="$mid/t.atom">Atom</a>); - $pfx .= INDENT if $level > 0; - $dst .= $pfx . $attr . $n . " - $mbox / $atom\n"; - $prev_attr = $attr; + my $anchor; + if ($n == 1) { + $n = ''; + $anchor = '#u'; # top of only message + } else { + $n = " ($n+ messages)"; + $anchor = '#t'; # thread skeleton + } + + my $mbox = qq(<a\nhref="$href/t.mbox.gz">mbox.gz</a>); + my $atom = qq(<a\nhref="$href/t.atom">Atom</a>); + my $s = "<a\nhref=\"$href/T/$anchor\"><b>$top</b></a>\n" . + " $ts UTC $n - $mbox / $atom\n"; + for (my $i = 0; $i < scalar(@ex); $i += 2) { + my $level = $ex[$i]; + my $sub = $ex[$i + 1]; + $mid = delete $seen->{$sub}; + $sub = PublicInbox::Hval->new($sub)->as_html; + $href = mid_escape($mid); + $s .= indent_for($level) . TCHILD; + $s .= "<a\nhref=\"$href/T/#u\">$sub</a>\n"; } + push @out, $s; } - $dst .= '</pre>'; + $ctx->{-html_tip} = '<pre>' . join("\n", @out) . '</pre>'; + 200; } -sub emit_index_topics { - my ($state) = @_; - my $off = $state->{ctx}->{cgi}->param('o'); - $off = 0 unless defined $off; - $state->{order} = []; - $state->{subjs} = {}; - $state->{latest} = {}; - my $max = 25; - my %opts = ( offset => int $off, limit => $max * 4 ); - while (scalar @{$state->{order}} < $max) { - my $sres = $state->{srch}->query('', \%opts); - my $nr = scalar @{$sres->{msgs}} or last; +sub index_nav { # callback for WwwStream + my (undef, $ctx) = @_; + delete $ctx->{qp} or return; + my ($next, $prev); + $next = $prev = ' '; + my $latest = ''; + + my $next_o = $ctx->{-next_o}; + if ($next_o) { + $next = qq!<a\nhref="?o=$next_o"\nrel=next>next</a>!; + } + if (my $cur_o = $ctx->{-cur_o}) { + $latest = qq! <a\nhref=.>latest</a>!; - for (rsort_ts(thread_results(load_results($sres), 1)->rootset)){ - add_topic($state, $_, 0); + my $o = $cur_o - ($next_o - $cur_o); + if ($o > 0) { + $prev = qq!<a\nhref="?o=$o"\nrel=prev>prev</a>!; + } elsif ($o == 0) { + $prev = qq!<a\nhref=.\nrel=prev>prev</a>!; } - $opts{offset} += $nr; } + "<hr><pre>page: $next $prev$latest</pre>"; +} + +sub index_topics { + my ($ctx) = @_; + my ($off) = (($ctx->{qp}->{o} || '0') =~ /(\d+)/); + my $opts = { offset => $off, limit => 200 }; + + $ctx->{order} = []; + my $srch = $ctx->{srch}; + my $sres = $srch->query('', $opts); + my $nr = scalar @{$sres->{msgs}}; + if ($nr) { + $sres = load_results($srch, $sres); + walk_thread(thread_results($sres), $ctx, *acc_topic); + } + $ctx->{-next_o} = $off+ $nr; + $ctx->{-cur_o} = $off; + PublicInbox::WwwStream->response($ctx, dump_topics($ctx), *index_nav); +} + +sub thread_adj_level { + my ($ctx, $level) = @_; + + my $max = $ctx->{cur_level}; + if ($level <= 0) { + return ('', '') if $max == 0; # flat output + + # reset existing lists + my $beg = $max > 1 ? ('</ul></li>' x ($max - 1)) : ''; + $ctx->{cur_level} = 0; + ("$beg</ul>", ''); + } elsif ($level == $max) { # continue existing list + qw(<li> </li>); + } elsif ($level < $max) { + my $beg = $max > 1 ? ('</ul></li>' x ($max - $level)) : ''; + $ctx->{cur_level} = $level; + ("$beg<li>", '</li>'); + } else { # ($level > $max) # start a new level + $ctx->{cur_level} = $level; + my $beg = ($max ? '<li>' : '') . '<ul><li>'; + ($beg, '</li>'); + } +} - $state->{fh}->write(dump_topics($state)); - $opts{offset}; +sub ghost_index_entry { + my ($ctx, $level, $node) = @_; + my ($beg, $end) = thread_adj_level($ctx, $level); + $beg . '<pre>'. ghost_parent($ctx->{-upfx}, $node->{id}) + . '</pre>' . $end; } 1; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index bb54aaa6..11fc92e9 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -13,16 +13,16 @@ package PublicInbox::WWW; use 5.008; use strict; use warnings; -use Plack::Request; -use PublicInbox::Config qw(try_cat); -use URI::Escape qw(uri_escape_utf8 uri_unescape); -use constant SSOMA_URL => '//ssoma.public-inbox.org/'; -use constant PI_URL => '//public-inbox.org/'; +use PublicInbox::Config; +use PublicInbox::Hval; +use URI::Escape qw(uri_unescape); +use PublicInbox::MID qw(mid_escape); require PublicInbox::Git; use PublicInbox::GitHTTPBackend; -our $LISTNAME_RE = qr!\A/([\w\.\-]+)!; +our $INBOX_RE = qr!\A/([\w\.\-]+)!; our $MID_RE = qr!([^/]+)!; -our $END_RE = qr!(f/|T/|t/|R/|t\.mbox(?:\.gz)?|t\.atom|raw|)!; +our $END_RE = qr!(T/|t/|t\.mbox(?:\.gz)?|t\.atom|raw|)!; +our $ATTACH_RE = qr!(\d[\.\d]*)-([[:alnum:]][\w\.-]+[[:alnum:]])!i; sub new { my ($class, $pi_config) = @_; @@ -38,16 +38,24 @@ sub run { sub call { my ($self, $env) = @_; - my $cgi = Plack::Request->new($env); - my $ctx = { cgi => $cgi, pi_config => $self->{pi_config} }; - my $path_info = $cgi->path_info; + my $ctx = { env => $env, www => $self }; + + # we don't care about multi-value + my %qp = map { + my ($k, $v) = split('=', uri_unescape($_), 2); + $v = '' unless defined $v; + $v =~ tr/+/ /; + ($k, $v) + } split(/[&;]/, $env->{QUERY_STRING}); + $ctx->{qp} = \%qp; + + my $path_info = $env->{PATH_INFO}; + my $method = $env->{REQUEST_METHOD}; - my $method = $cgi->method; if ($method eq 'POST' && - $path_info =~ m!$LISTNAME_RE/(git-upload-pack)\z!) { + $path_info =~ m!$INBOX_RE/(git-upload-pack)\z!) { my $path = $2; - return (invalid_list($self, $ctx, $1) || - serve_git($cgi, $ctx->{git}, $path)); + return invalid_inbox($ctx, $1) || serve_git($ctx, $path); } elsif ($method !~ /\AGET|HEAD\z/) { return r(405, 'Method Not Allowed'); @@ -56,33 +64,47 @@ sub call { # top-level indices and feeds if ($path_info eq '/') { r404(); - } elsif ($path_info =~ m!$LISTNAME_RE\z!o) { - invalid_list($self, $ctx, $1) || r301($ctx, $1); - } elsif ($path_info =~ m!$LISTNAME_RE(?:/|/index\.html)?\z!o) { - invalid_list($self, $ctx, $1) || get_index($ctx); - } elsif ($path_info =~ m!$LISTNAME_RE/(?:atom\.xml|new\.atom)\z!o) { - invalid_list($self, $ctx, $1) || get_atom($ctx); - - } elsif ($path_info =~ m!$LISTNAME_RE/ + } elsif ($path_info =~ m!$INBOX_RE\z!o) { + invalid_inbox($ctx, $1) || r301($ctx, $1); + } elsif ($path_info =~ m!$INBOX_RE(?:/|/index\.html)?\z!o) { + invalid_inbox($ctx, $1) || get_index($ctx); + } elsif ($path_info =~ m!$INBOX_RE/(?:atom\.xml|new\.atom)\z!o) { + invalid_inbox($ctx, $1) || get_atom($ctx); + } elsif ($path_info =~ m!$INBOX_RE/new\.html\z!o) { + invalid_inbox($ctx, $1) || get_new($ctx); + } elsif ($path_info =~ m!$INBOX_RE/ ($PublicInbox::GitHTTPBackend::ANY)\z!ox) { my $path = $2; - invalid_list($self, $ctx, $1) || - serve_git($cgi, $ctx->{git}, $path); - } elsif ($path_info =~ m!$LISTNAME_RE/$MID_RE/$END_RE\z!o) { - msg_page($self, $ctx, $1, $2, $3); - + invalid_inbox($ctx, $1) || serve_git($ctx, $path); + } elsif ($path_info =~ m!$INBOX_RE/([\w-]+).mbox\.gz\z!o) { + serve_mbox_range($ctx, $1, $2); + } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/$END_RE\z!o) { + msg_page($ctx, $1, $2, $3); + + } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/$ATTACH_RE\z!o) { + my ($idx, $fn) = ($3, $4); + invalid_inbox_mid($ctx, $1, $2) || get_attach($ctx, $idx, $fn); # in case people leave off the trailing slash: - } elsif ($path_info =~ m!$LISTNAME_RE/$MID_RE/(f|T|t|R)\z!o) { - my ($listname, $mid, $suffix) = ($1, $2, $3); + } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/(T|t)\z!o) { + my ($inbox, $mid, $suffix) = ($1, $2, $3); $suffix .= $suffix =~ /\A[tT]\z/ ? '/#u' : '/'; - r301($ctx, $listname, $mid, $suffix); + r301($ctx, $inbox, $mid, $suffix); + + } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/R/?\z!o) { + my ($inbox, $mid) = ($1, $2); + r301($ctx, $inbox, $mid, '#R'); + + } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/f/?\z!o) { + r301($ctx, $1, $2); + } elsif ($path_info =~ m!$INBOX_RE/_/text(?:/(.*))?\z!o) { + get_text($ctx, $1, $2); # convenience redirects order matters - } elsif ($path_info =~ m!$LISTNAME_RE/([^/]{2,})\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/([^/]{2,})\z!o) { r301($ctx, $1, $2); } else { - legacy_redirects($self, $ctx, $path_info); + legacy_redirects($ctx, $path_info); } } @@ -90,14 +112,14 @@ sub call { sub preload { require PublicInbox::Feed; require PublicInbox::View; - require PublicInbox::Thread; + require PublicInbox::SearchThread; require Email::MIME; require Digest::SHA; require POSIX; foreach (qw(PublicInbox::Search PublicInbox::SearchView PublicInbox::Mbox IO::Compress::Gzip - PublicInbox::NewsWWW PublicInbox::NewsGroup)) { + PublicInbox::NewsWWW)) { eval "require $_;"; } } @@ -118,13 +140,15 @@ sub r404 { sub r { [ $_[0], ['Content-Type' => 'text/plain'], [ join(' ', @_, "\n") ] ] } # returns undef if valid, array ref response if invalid -sub invalid_list { - my ($self, $ctx, $listname, $mid) = @_; - my $git_dir = $ctx->{pi_config}->get($listname, "mainrepo"); - if (defined $git_dir) { - $ctx->{git_dir} = $git_dir; - $ctx->{git} = PublicInbox::Git->new($git_dir); - $ctx->{listname} = $listname; +sub invalid_inbox ($$) { + my ($ctx, $inbox) = @_; + my $www = $ctx->{www}; + my $obj = $www->{pi_config}->lookup_name($inbox); + if (defined $obj) { + $ctx->{git_dir} = $obj->{mainrepo}; + $ctx->{git} = $obj->git; + $ctx->{-inbox} = $obj; + $ctx->{inbox} = $inbox; return; } @@ -132,13 +156,13 @@ sub invalid_list { # generation and link things intended for nntp:// to https?://, # so try to infer links and redirect them to the appropriate # list URL. - $self->news_www->call($ctx->{cgi}->{env}); + $www->news_www->call($ctx->{env}); } # returns undef if valid, array ref response if invalid -sub invalid_list_mid { - my ($self, $ctx, $listname, $mid) = @_; - my $ret = invalid_list($self, $ctx, $listname, $mid); +sub invalid_inbox_mid { + my ($ctx, $inbox, $mid) = @_; + my $ret = invalid_inbox($ctx, $inbox); return $ret if $ret; $ctx->{mid} = $mid = uri_unescape($mid); @@ -154,20 +178,26 @@ sub invalid_list_mid { undef; } -# /$LISTNAME/new.atom -> Atom feed, includes replies +# /$INBOX/new.atom -> Atom feed, includes replies sub get_atom { my ($ctx) = @_; require PublicInbox::Feed; PublicInbox::Feed::generate($ctx); } -# /$LISTNAME/?r=$GIT_COMMIT -> HTML only +# /$INBOX/new.html -> HTML only +sub get_new { + my ($ctx) = @_; + require PublicInbox::Feed; + PublicInbox::Feed::new_html($ctx); +} + +# /$INBOX/?r=$GIT_COMMIT -> HTML only sub get_index { my ($ctx) = @_; require PublicInbox::Feed; - my $srch = searcher($ctx); - footer($ctx); - if (defined $ctx->{cgi}->param('q')) { + searcher($ctx); + if ($ctx->{env}->{QUERY_STRING} =~ /(?:\A|[&;])q=/) { require PublicInbox::SearchView; PublicInbox::SearchView::sres_top_html($ctx); } else { @@ -178,12 +208,10 @@ sub get_index { # just returns a string ref for the blob in the current ctx sub mid2blob { my ($ctx) = @_; - require PublicInbox::MID; - my $path = PublicInbox::MID::mid2path($ctx->{mid}); - $ctx->{git}->cat_file("HEAD:$path"); + $ctx->{-inbox}->msg_by_mid($ctx->{mid}); } -# /$LISTNAME/$MESSAGE_ID/raw -> raw mbox +# /$INBOX/$MESSAGE_ID/raw -> raw mbox sub get_mid_txt { my ($ctx) = @_; my $x = mid2blob($ctx) or return r404($ctx); @@ -191,55 +219,37 @@ sub get_mid_txt { PublicInbox::Mbox::emit1($ctx, $x); } -# /$LISTNAME/$MESSAGE_ID/ -> HTML content (short quotes) +# /$INBOX/$MESSAGE_ID/ -> HTML content (short quotes) sub get_mid_html { my ($ctx) = @_; my $x = mid2blob($ctx) or return r404($ctx); require PublicInbox::View; - my $foot = footer($ctx); require Email::MIME; my $mime = Email::MIME->new($x); searcher($ctx); - [ 200, [ 'Content-Type' => 'text/html; charset=UTF-8' ], - [ PublicInbox::View::msg_html($ctx, $mime, 'f/', $foot) ] ]; + PublicInbox::View::msg_html($ctx, $mime); } -# /$LISTNAME/$MESSAGE_ID/f/ -> HTML content (fullquotes) -sub get_full_html { - my ($ctx) = @_; - my $x = mid2blob($ctx) or return r404($ctx); - +# /$INBOX/$MESSAGE_ID/t/ +sub get_thread { + my ($ctx, $flat) = @_; + searcher($ctx) or return need_search($ctx); + $ctx->{flat} = $flat; require PublicInbox::View; - my $foot = footer($ctx); - require Email::MIME; - my $mime = Email::MIME->new($x); - searcher($ctx); - [ 200, [ 'Content-Type' => 'text/html; charset=UTF-8' ], - [ PublicInbox::View::msg_html($ctx, $mime, undef, $foot)] ]; + PublicInbox::View::thread_html($ctx); } -# /$LISTNAME/$MESSAGE_ID/R/ -> HTML content (fullquotes) -sub get_reply_html { - my ($ctx) = @_; - my $x = mid2blob($ctx) or return r404($ctx); +# /$INBOX/_/text/$KEY/ +# /$INBOX/_/text/$KEY/raw +# KEY may contain slashes +sub get_text { + my ($ctx, $inbox, $key) = @_; + my $r404 = invalid_inbox($ctx, $inbox); + return $r404 if $r404; - require PublicInbox::View; - my $foot = footer($ctx); - require Email::MIME; - my $hdr = Email::MIME->new($x)->header_obj; - [ 200, [ 'Content-Type' => 'text/html; charset=UTF-8' ], - [ PublicInbox::View::msg_reply($ctx, $hdr, $foot)] ]; -} - -# /$LISTNAME/$MESSAGE_ID/t/ -sub get_thread { - my ($ctx, $flat) = @_; - my $srch = searcher($ctx) or return need_search($ctx); - require PublicInbox::View; - my $foot = footer($ctx); - $ctx->{flat} = $flat; - PublicInbox::View::thread_html($ctx, $foot, $srch); + require PublicInbox::WwwText; + PublicInbox::WwwText::get_text($ctx, $key); } sub ctx_get { @@ -249,66 +259,13 @@ sub ctx_get { $val; } -sub footer { - my ($ctx) = @_; - return '' unless $ctx; - my $git_dir = ctx_get($ctx, 'git_dir'); - - # favor user-supplied footer - my $footer = try_cat("$git_dir/public-inbox/footer.html"); - if (defined $footer) { - chomp $footer; - $ctx->{footer} = $footer; - return $footer; - } - - # auto-generate a footer - my $listname = ctx_get($ctx, 'listname'); - my $desc = try_cat("$git_dir/description"); - $desc = '$GIT_DIR/description missing' unless defined $desc; - chomp $desc; - - my $urls = try_cat("$git_dir/cloneurl"); - my @urls = split(/\r?\n/, $urls || ''); - my %seen = map { $_ => 1 } @urls; - my $cgi = $ctx->{cgi}; - my $http = $cgi->base->as_string . $listname; - $seen{$http} or unshift @urls, $http; - my $ssoma_url = PublicInbox::Hval::prurl($cgi->{env}, SSOMA_URL); - if (scalar(@urls) == 1) { - $urls = "URL for <a\nhref=\"" . $ssoma_url . - qq(">ssoma</a> or <b>git clone --mirror $urls[0]</b>); - } else { - $urls = "URLs for <a\nhref=\"" . $ssoma_url . - qq(">ssoma</a> or <b>git clone --mirror</b>\n) . - join("\n", map { "\tgit clone --mirror $_" } @urls); - } - - my $addr = $ctx->{pi_config}->get($listname, 'address'); - if (ref($addr) eq 'ARRAY') { - $addr = $addr->[0]; # first address is primary - } - - $addr = "<a\nhref=\"mailto:$addr\">$addr</a>"; - - $ctx->{footer} = join("\n", - '- ' . $desc, - "A <a\nhref=\"" . - PublicInbox::Hval::prurl($ctx->{cgi}->{env}, PI_URL) . - '">public-inbox</a>, ' . - 'anybody may post in plain-text (not HTML):', - $addr, - $urls - ); -} - # search support is optional, returns undef if Xapian is not installed # or not configured for the given GIT_DIR sub searcher { my ($ctx) = @_; eval { require PublicInbox::Search; - $ctx->{srch} = PublicInbox::Search->new($ctx->{git_dir}); + $ctx->{srch} = $ctx->{-inbox}->search; }; } @@ -322,8 +279,8 @@ EOF [ 501, [ 'Content-Type' => 'text/html; charset=UTF-8' ], [ $msg ] ]; } -# /$LISTNAME/$MESSAGE_ID/t.mbox -> thread as mbox -# /$LISTNAME/$MESSAGE_ID/t.mbox.gz -> thread as gzipped mbox +# /$INBOX/$MESSAGE_ID/t.mbox -> thread as mbox +# /$INBOX/$MESSAGE_ID/t.mbox.gz -> thread as gzipped mbox # note: I'm not a big fan of other compression formats since they're # significantly more expensive on CPU than gzip and less-widely available, # especially on older systems. Stick to zlib since that's what git uses. @@ -335,78 +292,83 @@ sub get_thread_mbox { } -# /$LISTNAME/$MESSAGE_ID/t.atom -> thread as Atom feed +# /$INBOX/$MESSAGE_ID/t.atom -> thread as Atom feed sub get_thread_atom { my ($ctx) = @_; searcher($ctx) or return need_search($ctx); - $ctx->{self_url} = $ctx->{cgi}->uri->as_string; require PublicInbox::Feed; PublicInbox::Feed::generate_thread_atom($ctx); } sub legacy_redirects { - my ($self, $ctx, $path_info) = @_; + my ($ctx, $path_info) = @_; # single-message pages - if ($path_info =~ m!$LISTNAME_RE/m/(\S+)/\z!o) { + if ($path_info =~ m!$INBOX_RE/m/(\S+)/\z!o) { r301($ctx, $1, $2); - } elsif ($path_info =~ m!$LISTNAME_RE/m/(\S+)/raw\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/m/(\S+)/raw\z!o) { r301($ctx, $1, $2, 'raw'); - } elsif ($path_info =~ m!$LISTNAME_RE/f/(\S+)/\z!o) { - r301($ctx, $1, $2, 'f/'); + } elsif ($path_info =~ m!$INBOX_RE/f/(\S+)/\z!o) { + r301($ctx, $1, $2); # thread display - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)/\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/t/(\S+)/\z!o) { r301($ctx, $1, $2, 't/#u'); - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)/mbox(\.gz)?\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/t/(\S+)/mbox(\.gz)?\z!o) { r301($ctx, $1, $2, "t.mbox$3"); # even older legacy redirects - } elsif ($path_info =~ m!$LISTNAME_RE/m/(\S+)\.html\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/m/(\S+)\.html\z!o) { r301($ctx, $1, $2); - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.html\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/t/(\S+)\.html\z!o) { r301($ctx, $1, $2, 't/#u'); - } elsif ($path_info =~ m!$LISTNAME_RE/f/(\S+)\.html\z!o) { - r301($ctx, $1, $2, 'f/'); + } elsif ($path_info =~ m!$INBOX_RE/f/(\S+)\.html\z!o) { + r301($ctx, $1, $2); - } elsif ($path_info =~ m!$LISTNAME_RE/(?:m|f)/(\S+)\.txt\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/(?:m|f)/(\S+)\.txt\z!o) { r301($ctx, $1, $2, 'raw'); - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)(\.mbox(?:\.gz)?)\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/t/(\S+)(\.mbox(?:\.gz)?)\z!o) { r301($ctx, $1, $2, "t$3"); # legacy convenience redirects, order still matters - } elsif ($path_info =~ m!$LISTNAME_RE/m/(\S+)\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/m/(\S+)\z!o) { r301($ctx, $1, $2); - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/t/(\S+)\z!o) { r301($ctx, $1, $2, 't/#u'); - } elsif ($path_info =~ m!$LISTNAME_RE/f/(\S+)\z!o) { - r301($ctx, $1, $2, 'f/'); + } elsif ($path_info =~ m!$INBOX_RE/f/(\S+)\z!o) { + r301($ctx, $1, $2); # some Message-IDs have slashes in them and the HTTP server # may try to be clever and unescape them :< - } elsif ($path_info =~ m!$LISTNAME_RE/(\S+/\S+)/$END_RE\z!o) { - msg_page($self, $ctx, $1, $2, $3); + } elsif ($path_info =~ m!$INBOX_RE/(\S+/\S+)/$END_RE\z!o) { + msg_page($ctx, $1, $2, $3); # in case people leave off the trailing slash: - } elsif ($path_info =~ m!$LISTNAME_RE/(\S+/\S+)/(f|T|t)\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/(\S+/\S+)/(T|t)\z!o) { r301($ctx, $1, $2, $3 eq 't' ? 't/#u' : $3); + } elsif ($path_info =~ m!$INBOX_RE/(\S+/\S+)/f\z!o) { + r301($ctx, $1, $2); } else { - $self->news_www->call($ctx->{cgi}->{env}); + $ctx->{www}->news_www->call($ctx->{env}); } } sub r301 { - my ($ctx, $listname, $mid, $suffix) = @_; - my $cgi = $ctx->{cgi}; - my $url; - my $qs = $cgi->env->{QUERY_STRING}; - $url = $cgi->base->as_string . $listname . '/'; - $url .= (uri_escape_utf8($mid) . '/') if (defined $mid); + my ($ctx, $inbox, $mid, $suffix) = @_; + my $obj = $ctx->{-inbox}; + unless ($obj) { + my $r404 = invalid_inbox($ctx, $inbox); + return $r404 if $r404; + $obj = $ctx->{-inbox}; + } + my $url = $obj->base_url($ctx->{env}); + my $qs = $ctx->{env}->{QUERY_STRING}; + $url .= (mid_escape($mid) . '/') if (defined $mid); $url .= $suffix if (defined $suffix); $url .= "?$qs" if $qs ne ''; @@ -416,32 +378,48 @@ sub r301 { } sub msg_page { - my ($self, $ctx, $list, $mid, $e) = @_; + my ($ctx, $inbox, $mid, $e) = @_; my $ret; - $ret = invalid_list_mid($self, $ctx, $list, $mid) and return $ret; + $ret = invalid_inbox_mid($ctx, $inbox, $mid) and return $ret; '' eq $e and return get_mid_html($ctx); + 'T/' eq $e and return get_thread($ctx, 1); 't/' eq $e and return get_thread($ctx); 't.atom' eq $e and return get_thread_atom($ctx); 't.mbox' eq $e and return get_thread_mbox($ctx); 't.mbox.gz' eq $e and return get_thread_mbox($ctx, '.gz'); - 'T/' eq $e and return get_thread($ctx, 1); 'raw' eq $e and return get_mid_txt($ctx); - 'f/' eq $e and return get_full_html($ctx); - 'R/' eq $e and return get_reply_html($ctx); + + # legacy, but no redirect for compatibility: + 'f/' eq $e and return get_mid_html($ctx); r404($ctx); } sub serve_git { - my ($cgi, $git, $path) = @_; - PublicInbox::GitHTTPBackend::serve($cgi, $git, $path); + my ($ctx, $path) = @_; + PublicInbox::GitHTTPBackend::serve($ctx->{env}, $ctx->{git}, $path); +} + +sub serve_mbox_range { + my ($ctx, $inbox, $range) = @_; + invalid_inbox($ctx, $inbox) || eval { + require PublicInbox::Mbox; + searcher($ctx); + PublicInbox::Mbox::emit_range($ctx, $range); + } } sub news_www { my ($self) = @_; - my $nw = $self->{news_www}; - return $nw if $nw; - require PublicInbox::NewsWWW; - $self->{news_www} = PublicInbox::NewsWWW->new($self->{pi_config}); + $self->{news_www} ||= do { + require PublicInbox::NewsWWW; + PublicInbox::NewsWWW->new($self->{pi_config}); + } +} + +sub get_attach { + my ($ctx, $idx, $fn) = @_; + require PublicInbox::WwwAttach; + PublicInbox::WwwAttach::get_attach($ctx, $idx, $fn); } 1; diff --git a/lib/PublicInbox/WWW.pod b/lib/PublicInbox/WWW.pod new file mode 100644 index 00000000..a1d33a3b --- /dev/null +++ b/lib/PublicInbox/WWW.pod @@ -0,0 +1,56 @@ +=head1 NAME + +PublicInbox::WWW - PSGI interface for public-inbox + +=head1 SYNOPSIS + +In your .psgi file: + + use PublicInbox::WWW; + + my $www = PublicInbox::WWW->new; + builder { + enable 'Head'; + mount '/inboxes' => sub { $www->call(@_) }; + }; + +=head1 DESCRIPTION + +The PSGI web interface for public-inbox. + +Using this directly is not needed unless you wish to customize +your public-inbox PSGI deployment or are using a PSGI server +other than L<public-inbox-httpd(1)>. + +While this PSGI application works with all PSGI/Plack web +servers such as L<starman(1)>, L<starlet(1)> or L<twiggy(1)>; +PublicInbox::WWW takes advantage of currently-undocumented APIs +of L<public-inbox-httpd(1)> to improve fairness when serving +large responses for thread views and git clones. + +=head1 ENVIRONMENT + +=over 8 + +=item PI_CONFIG + +Used to override the default "~/.public-inbox/config" value. + +=back + +=head1 CONTACT + +Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org> + +The mail archives are hosted at L<https://public-inbox.org/meta/> +and L<http://hjrcffqmbrq6wope.onion/meta/> + +=head1 COPYRIGHT + +Copyright (C) 2016 all contributors L<mailto:meta@public-inbox.org> + +License: AGPL-3.0+ L<http://www.gnu.org/licenses/agpl-3.0.txt> + +=head1 SEE ALSO + +L<http://plackperl.org/>, L<Plack>, L<public-inbox-httpd(1)> diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm new file mode 100644 index 00000000..c8ea3ed3 --- /dev/null +++ b/lib/PublicInbox/WatchMaildir.pm @@ -0,0 +1,253 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# ref: https://cr.yp.to/proto/maildir.html +# http://wiki2.dovecot.org/MailboxFormat/Maildir +package PublicInbox::WatchMaildir; +use strict; +use warnings; +use Email::MIME; +use Email::MIME::ContentType; +$Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect +use PublicInbox::Git; +use PublicInbox::Import; +use PublicInbox::MDA; +use PublicInbox::Spawn qw(spawn); + +sub new { + my ($class, $config) = @_; + my (%mdmap, @mdir, $spamc); + + # "publicinboxwatch" is the documented namespace + # "publicinboxlearn" is legacy but may be supported + # indefinitely... + foreach my $pfx (qw(publicinboxwatch publicinboxlearn)) { + my $k = "$pfx.watchspam"; + if (my $spamdir = $config->{$k}) { + if ($spamdir =~ s/\Amaildir://) { + $spamdir =~ s!/+\z!!; + # skip "new", no MUA has seen it, yet. + my $cur = "$spamdir/cur"; + push @mdir, $cur; + $mdmap{$cur} = 'watchspam'; + } else { + warn "unsupported $k=$spamdir\n"; + } + } + } + + my $k = 'publicinboxwatch.spamcheck'; + my $spamcheck = $config->{$k}; + if ($spamcheck) { + if ($spamcheck eq 'spamc') { + $spamcheck = 'PublicInbox::Spamcheck::Spamc'; + } + if ($spamcheck =~ /::/) { + eval "require $spamcheck"; + $spamcheck = _spamcheck_cb($spamcheck->new); + } else { + warn "unsupported $k=$spamcheck\n"; + $spamcheck = undef; + } + } + foreach $k (keys %$config) { + $k =~ /\Apublicinbox\.([^\.]+)\.watch\z/ or next; + my $name = $1; + my $watch = $config->{$k}; + if ($watch =~ s/\Amaildir://) { + $watch =~ s!/+\z!!; + my $inbox = $config->lookup_name($name); + if (my $wm = $inbox->{watchheader}) { + my ($k, $v) = split(/:/, $wm, 2); + $inbox->{-watchheader} = [ $k, qr/\Q$v\E/ ]; + } + my $new = "$watch/new"; + my $cur = "$watch/cur"; + push @mdir, $new, $cur; + die "$new already in use\n" if $mdmap{$new}; + die "$cur already in use\n" if $mdmap{$cur}; + $mdmap{$new} = $mdmap{$cur} = $inbox; + } else { + warn "watch unsupported: $k=$watch\n"; + } + } + return unless @mdir; + + my $mdre = join('|', map { quotemeta($_) } @mdir); + $mdre = qr!\A($mdre)/!; + bless { + spamcheck => $spamcheck, + mdmap => \%mdmap, + mdir => \@mdir, + mdre => $mdre, + importers => {}, + }, $class; +} + +sub _done_for_now { + $_->done foreach values %{$_[0]->{importers}}; +} + +sub _try_fsn_paths { + my ($self, $paths) = @_; + _try_path($self, $_->{path}) foreach @$paths; + _done_for_now($self); +} + +sub _remove_spam { + my ($self, $path) = @_; + $path =~ /:2,[A-R]*S[T-Z]*\z/i or return; + my $mime = _path_to_mime($path) or return; + _force_mid($mime); + foreach my $inbox (values %{$self->{mdmap}}) { + next unless ref $inbox; + my $im = _importer_for($self, $inbox); + $im->remove($mime); + if (my $scrub = _scrubber_for($inbox)) { + my $scrubbed = $scrub->scrub($mime) or next; + $im->remove($scrubbed); + } + } +} + +# used to hash the relevant portions of a message when there are conflicts +sub _hash_mime2 { + my ($mime) = @_; + require Digest::SHA; + my $dig = Digest::SHA->new('SHA-1'); + $dig->add($mime->header_obj->header_raw('Subject')); + $dig->add($mime->body_raw); + $dig->hexdigest; +} + +sub _force_mid { + my ($mime) = @_; + # probably a bad idea, but we inject a Message-Id if + # one is missing, here.. + my $mid = $mime->header_obj->header_raw('Message-Id'); + if (!defined $mid || $mid =~ /\A\s*\z/) { + $mid = '<' . _hash_mime2($mime) . '@generated>'; + $mime->header_set('Message-Id', $mid); + } +} + +sub _try_path { + my ($self, $path) = @_; + my @p = split(m!/+!, $path); + return if $p[-1] !~ /\A[a-zA-Z0-9][\w:,=\.]+\z/; + if ($p[-1] =~ /:2,([A-Z]+)\z/i) { + my $flags = $1; + return if $flags =~ /[DT]/; # no [D]rafts or [T]rashed mail + } + return unless -f $path; + if ($path !~ $self->{mdre}) { + warn "unrecognized path: $path\n"; + return; + } + my $inbox = $self->{mdmap}->{$1}; + unless ($inbox) { + warn "unmappable dir: $1\n"; + return; + } + if (!ref($inbox) && $inbox eq 'watchspam') { + return _remove_spam($self, $path); + } + my $im = _importer_for($self, $inbox); + my $mime = _path_to_mime($path) or return; + $mime->header_set($_) foreach @PublicInbox::MDA::BAD_HEADERS; + my $wm = $inbox->{-watchheader}; + if ($wm) { + my $v = $mime->header_obj->header_raw($wm->[0]); + return unless ($v && $v =~ $wm->[1]); + } + if (my $scrub = _scrubber_for($inbox)) { + $mime = $scrub->scrub($mime) or return; + } + + _force_mid($mime); + $im->add($mime, $self->{spamcheck}); +} + +sub watch { + my ($self) = @_; + my $cb = sub { _try_fsn_paths($self, \@_) }; + my $mdir = $self->{mdir}; + + # lazy load here, we may support watching via IMAP IDLE + # in the future... + require Filesys::Notify::Simple; + my $watcher = Filesys::Notify::Simple->new($mdir); + $watcher->wait($cb) while (1); +} + +sub scan { + my ($self) = @_; + my $mdir = $self->{mdir}; + foreach my $dir (@$mdir) { + my $ok = opendir(my $dh, $dir); + unless ($ok) { + warn "failed to open $dir: $!\n"; + next; + } + while (my $fn = readdir($dh)) { + _try_path($self, "$dir/$fn"); + } + closedir $dh; + } + _done_for_now($self); +} + +sub _path_to_mime { + my ($path) = @_; + if (open my $fh, '<', $path) { + local $/; + my $str = <$fh>; + $str or return; + return Email::MIME->new(\$str); + } elsif ($!{ENOENT}) { + return; + } else { + warn "failed to open $path: $!\n"; + return; + } +} + +sub _importer_for { + my ($self, $inbox) = @_; + my $im = $inbox->{-import} ||= eval { + my $git = $inbox->git; + my $name = $inbox->{name}; + my $addr = $inbox->{-primary_address}; + PublicInbox::Import->new($git, $name, $addr, $inbox); + }; + $self->{importers}->{"$im"} = $im; +} + +sub _scrubber_for { + my ($inbox) = @_; + my $f = $inbox->{filter}; + if ($f && $f =~ /::/) { + eval "require $f"; + if ($@) { + warn $@; + } else { + return $f->new; + } + } + undef; +} + +sub _spamcheck_cb { + my ($sc) = @_; + sub { + my ($mime) = @_; + my $tmp = ''; + if ($sc->spamcheck($mime, \$tmp)) { + return Email::MIME->new(\$tmp); + } + warn $mime->header('Message-ID')." failed spam check\n"; + undef; + } +} + +1; diff --git a/lib/PublicInbox/WwwAtomStream.pm b/lib/PublicInbox/WwwAtomStream.pm new file mode 100644 index 00000000..5720384c --- /dev/null +++ b/lib/PublicInbox/WwwAtomStream.pm @@ -0,0 +1,134 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# Atom body stream for which yields getline+close methods +package PublicInbox::WwwAtomStream; +use strict; +use warnings; + +# FIXME: locale-independence: +use POSIX qw(strftime); +use Date::Parse qw(strptime); + +use PublicInbox::Address; +use PublicInbox::Hval qw(ascii_html); +use PublicInbox::MID qw/mid_clean mid2path mid_escape/; + +# called by PSGI server after getline: +sub close {} + +sub new { + my ($class, $ctx, $cb) = @_; + $ctx->{emit_header} = 1; + $ctx->{feed_base_url} = $ctx->{-inbox}->base_url($ctx->{env}); + bless { cb => $cb || *close, ctx => $ctx }, $class; +} + +sub response { + my ($class, $ctx, $code, $cb) = @_; + [ $code, [ 'Content-Type', 'application/atom+xml' ], + $class->new($ctx, $cb) ] +} + +# called once for each message by PSGI server +sub getline { + my ($self) = @_; + if (my $middle = $self->{cb}) { + my $mime = $middle->(); + return feed_entry($self, $mime) if $mime; + } + delete $self->{cb} ? '</feed>' : undef; +} + +# private + +sub title_tag { + my ($title) = @_; + $title =~ tr/\t\n / /s; # squeeze spaces + # try to avoid the type attribute in title: + $title = ascii_html($title); + my $type = index($title, '&') >= 0 ? "\ntype=\"html\"" : ''; + "<title$type>$title</title>"; +} + +sub atom_header { + my ($ctx, $title) = @_; + my $ibx = $ctx->{-inbox}; + my $base_url = $ctx->{feed_base_url}; + my $search_q = $ctx->{search_query}; + my $self_url = $base_url; + my $mid = $ctx->{mid}; + if (defined $mid) { # per-thread + $self_url .= mid_escape($mid).'/t.atom'; + } elsif (defined $search_q) { + my $query = $search_q->{'q'}; + $title = title_tag("$query - search results"); + $base_url .= '?' . $search_q->qs_html(x => undef); + $self_url .= '?' . $search_q->qs_html; + } else { + $title = title_tag($ibx->description); + $self_url .= 'new.atom'; + } + my $mtime = (stat($ibx->{mainrepo}))[9] || time; + + qq(<?xml version="1.0" encoding="us-ascii"?>\n) . + qq{<feed\nxmlns="http://www.w3.org/2005/Atom">} . + qq{$title} . + qq(<link\nrel="alternate"\ntype="text/html") . + qq(\nhref="$base_url"/>) . + qq(<link\nrel="self"\nhref="$self_url"/>) . + qq(<id>mailto:$ibx->{-primary_address}</id>) . + feed_updated(gmtime($mtime)); +} + +# returns undef or string +sub feed_entry { + my ($self, $mime) = @_; + my $ctx = $self->{ctx}; + my $hdr = $mime->header_obj; + my $mid = mid_clean($hdr->header_raw('Message-ID')); + + my $uuid = mid2path($mid); + $uuid =~ tr!/!!d; + my $h = '[a-f0-9]'; + my (@uuid5) = ($uuid =~ m!\A($h{8})($h{4})($h{4})($h{4})($h{12})!o); + $uuid = 'urn:uuid:' . join('-', @uuid5); + + $mid = PublicInbox::Hval->new_msgid($mid); + my $href = $ctx->{feed_base_url} . $mid->{href}. '/'; + + my $date = $hdr->header('Date'); + my @t = eval { strptime($date) } if defined $date; + @t = gmtime(time) unless scalar @t; + my $updated = feed_updated(@t); + + my $title = $hdr->header('Subject'); + $title = '(no subject)' unless defined $title && $title ne ''; + $title = title_tag($title); + + my $from = $hdr->header('From') or return; + my ($email) = PublicInbox::Address::emails($from); + my $name = join(', ',PublicInbox::Address::names($from)); + $name = ascii_html($name); + $email = ascii_html($email); + + my $s = ''; + if (delete $ctx->{emit_header}) { + $s .= atom_header($ctx, $title); + } + $s .= "<entry><author><name>$name</name><email>$email</email>" . + "</author>$title$updated" . + qq{<content\ntype="xhtml">} . + qq{<div\nxmlns="http://www.w3.org/1999/xhtml">} . + qq(<pre\nstyle="white-space:pre-wrap">) . + PublicInbox::View::multipart_text_as_html($mime, $href) . + '</pre>' . + qq!</div></content><link\nhref="$href"/>!. + "<id>$uuid</id></entry>"; +} + +sub feed_updated { + '<updated>' . strftime('%Y-%m-%dT%H:%M:%SZ', @_) . '</updated>'; +} + +1; diff --git a/lib/PublicInbox/WwwAttach.pm b/lib/PublicInbox/WwwAttach.pm new file mode 100644 index 00000000..33bfce27 --- /dev/null +++ b/lib/PublicInbox/WwwAttach.pm @@ -0,0 +1,43 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# For retrieving attachments from messages in the WWW interface +package PublicInbox::WwwAttach; # internal package +use strict; +use warnings; +use Email::MIME; +use Email::MIME::ContentType qw(parse_content_type); +$Email::MIME::ContentType::STRICT_PARAMS = 0; +use PublicInbox::MsgIter; + +# /$LISTNAME/$MESSAGE_ID/$IDX-$FILENAME +sub get_attach ($$$) { + my ($ctx, $idx, $fn) = @_; + my $res = [ 404, [ 'Content-Type', 'text/plain' ], [ "Not found\n" ] ]; + my $mime = $ctx->{-inbox}->msg_by_mid($ctx->{mid}) or return $res; + $mime = Email::MIME->new($mime); + msg_iter($mime, sub { + my ($part, $depth, @idx) = @{$_[0]}; + return if join('.', @idx) ne $idx; + $res->[0] = 200; + my $ct = $part->content_type; + $ct = parse_content_type($ct) if $ct; + + # discrete == type, we remain Debian wheezy-compatible + if ($ct && (($ct->{discrete} || '') eq 'text')) { + # display all text as text/plain: + my $cset = $ct->{attributes}->{charset}; + if ($cset && ($cset =~ /\A[\w-]+\z/)) { + $res->[1]->[1] .= qq(; charset=$cset); + } + } else { # TODO: allow user to configure safe types + $res->[1]->[1] = 'application/octet-stream'; + } + $part = $part->body; + push @{$res->[1]}, 'Content-Length', bytes::length($part); + $res->[2]->[0] = $part; + }); + $res; +} + +1; diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm new file mode 100644 index 00000000..01f7b31b --- /dev/null +++ b/lib/PublicInbox/WwwStream.pm @@ -0,0 +1,129 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# HTML body stream for which yields getline+close methods +package PublicInbox::WwwStream; +use strict; +use warnings; +use PublicInbox::Hval qw(ascii_html); +use URI; +our $TOR_URL = 'https://www.torproject.org/'; +our $TOR2WEB_URL = 'https://www.tor2web.org/'; +our $CODE_URL = 'https://public-inbox.org/'; +our $PROJECT = 'public-inbox'; + +# noop for HTTP.pm (and any other PSGI servers) +sub close {} + +sub new { + my ($class, $ctx, $cb) = @_; + bless { nr => 0, cb => $cb || *close, ctx => $ctx }, $class; +} + +sub response { + my ($class, $ctx, $code, $cb) = @_; + [ $code, [ 'Content-Type', 'text/html; charset=UTF-8' ], + $class->new($ctx, $cb) ] +} + +sub _html_top ($) { + my ($self) = @_; + my $ctx = $self->{ctx}; + my $obj = $ctx->{-inbox}; + my $desc = ascii_html($obj->description); + my $title = $ctx->{-title_html} || $desc; + my $upfx = $ctx->{-upfx} || ''; + my $help = $upfx.'_/text/help'; + my $atom = $ctx->{-atom} || $upfx.'new.atom'; + my $tip = $ctx->{-html_tip} || ''; + my $top = "<b>$desc</b>"; + my $links = "<a\nhref=\"$help\">help</a> / ". + "<a\nhref=\"$atom\">Atom feed</a>"; + if ($obj->search) { + my $q_val = $ctx->{-q_value_html}; + if (defined $q_val && $q_val ne '') { + $q_val = qq(\nvalue="$q_val"); + } else { + $q_val = ''; + } + # XXX gross, for SearchView.pm + my $extra = $ctx->{-extra_form_html} || ''; + my $action = $upfx eq '' ? './' : $upfx; + $top = qq{<form\naction="$action"><pre>$top} . + qq{\n<input\nname=q\ntype=text$q_val />} . + $extra . + qq{<input\ntype=submit\nvalue=search />} . + ' ' . $links . + q{</pre></form>} + } else { + $top = '<pre>' . $top . "\n" . $links . '</pre>'; + } + "<html><head><title>$title</title>" . + "<link\nrel=alternate\ntitle=\"Atom feed\"\n". + "href=\"$atom\"\ntype=\"application/atom+xml\"/>" . + PublicInbox::Hval::STYLE . + "</head><body>". $top . $tip; +} + +sub _html_end { + my ($self) = @_; + my $urls = 'Archives are clonable:'; + my $ctx = $self->{ctx}; + my $obj = $ctx->{-inbox}; + my $desc = ascii_html($obj->description); + + my $http = $obj->base_url($ctx->{env}); + chop $http; + my %seen = ( $http => 1 ); + my @urls = ($http); + foreach my $u (@{$obj->cloneurl}) { + next if $seen{$u}; + $seen{$u} = 1; + push @urls, $u =~ /\Ahttps?:/ ? qq(<a\nhref="$u">$u</a>) : $u; + } + if (scalar(@urls) == 1) { + $urls .= " git clone --mirror $http"; + } else { + $urls .= "\n" . + join("\n", map { "\tgit clone --mirror $_" } @urls); + } + + my @nntp = map { qq(<a\nhref="$_">$_</a>) } @{$obj->nntp_url}; + if (@nntp) { + $urls .= "\n\n"; + $urls .= @nntp == 1 ? 'Newsgroup' : 'Newsgroups are'; + $urls .= ' available over NNTP:'; + $urls .= "\n\t" . join("\n\t", @nntp) . "\n"; + } + if ($urls =~ m!\b[^:]+://\w+\.onion/!) { + $urls .= "\n note: .onion URLs require Tor: "; + $urls .= qq[<a\nhref="$TOR_URL">$TOR_URL</a>]; + if ($TOR2WEB_URL) { + $urls .= "\n or Tor2web: "; + $urls .= qq[<a\nhref="$TOR2WEB_URL">$TOR2WEB_URL</a>]; + } + } + my $url = PublicInbox::Hval::prurl($ctx->{env}, $CODE_URL); + '<hr><pre>'.join("\n\n", + $desc, + $urls, + 'AGPL code for this site: '. + qq(git clone <a\nhref="$url">$url</a> $PROJECT) + ).'</pre></body></html>'; +} + +# callback for HTTP.pm (and any other PSGI servers) +sub getline { + my ($self) = @_; + my $nr = $self->{nr}++; + + return _html_top($self) if $nr == 0; + + if (my $middle = $self->{cb}) { + $middle = $middle->($nr, $self->{ctx}) and return $middle; + } + + delete $self->{cb} ? _html_end($self) : undef; +} + +1; diff --git a/lib/PublicInbox/WwwText.pm b/lib/PublicInbox/WwwText.pm new file mode 100644 index 00000000..b0f262cd --- /dev/null +++ b/lib/PublicInbox/WwwText.pm @@ -0,0 +1,207 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# serves the /$INBOX/_/* endpoints from :text/* of the git tree +package PublicInbox::WwwText; +use strict; +use warnings; +use PublicInbox::Linkify; +use PublicInbox::WwwStream; +use PublicInbox::Hval qw(ascii_html); +our $QP_URL = 'https://xapian.org/docs/queryparser.html'; +our $WIKI_URL = 'https://en.wikipedia.org/wiki'; + +# /$INBOX/_/text/$KEY/ # KEY may contain slashes +# For now, "help" is the only supported $KEY +sub get_text { + my ($ctx, $key) = @_; + my $code = 200; + + $key = 'help' if !defined $key; # this 302s to _/text/help/ + + # get the raw text the same way we get mboxrds + my $raw = ($key =~ s!/raw\z!!); + my $have_tslash = ($key =~ s!/\z!!) if !$raw; + + my $txt = ''; + if (!_default_text($ctx, $key, \$txt)) { + $code = 404; + $txt = "404 Not Found ($key)\n"; + } + if ($raw) { + return [ $code, [ 'Content-Type', 'text/plain', + 'Content-Length', bytes::length($txt) ], + [ $txt ] ] + } + + # enforce trailing slash for "wget -r" compatibility + if (!$have_tslash && $code == 200) { + my $url = $ctx->{-inbox}->base_url($ctx->{env}); + $url .= "_/text/$key/"; + + return [ 302, [ 'Content-Type', 'text/plain', + 'Location', $url ], + [ "Redirecting to $url\n" ] ]; + } + + # Follow git commit message conventions, + # first line is the Subject/title + my ($title) = ($txt =~ /\A([^\n]*)/s); + _do_linkify($txt); + $ctx->{-title_html} = ascii_html($title); + + my $nslash = ($key =~ tr!/!/!); + $ctx->{-upfx} = '../../../' . ('../' x $nslash); + + PublicInbox::WwwStream->response($ctx, $code, sub { + my ($nr, undef) = @_; + $nr == 1 ? '<pre>'.$txt.'</pre>' : undef + }); +} + +sub _do_linkify { + my $l = PublicInbox::Linkify->new; + $_[0] = $l->linkify_2(ascii_html($l->linkify_1($_[0]))); +} + +sub _srch_prefix ($$) { + my ($srch, $txt) = @_; + my $pad = 0; + my $htxt = ''; + my $help = $srch->help; + my $i; + for ($i = 0; $i < @$help; $i += 2) { + my $pfx = $help->[$i]; + my $n = length($pfx); + $pad = $n if $n > $pad; + $htxt .= $pfx . "\0"; + $htxt .= $help->[$i + 1]; + $htxt .= "\f\n"; + } + $pad += 2; + my $padding = ' ' x ($pad + 8); + $htxt =~ s/^/$padding/gms; + $htxt =~ s/^$padding(\S+)\0/" $1". + (' ' x ($pad - length($1)))/egms; + $htxt =~ s/\f\n/\n/gs; + $$txt .= $htxt; + 1; +} + + +sub _default_text ($$$) { + my ($ctx, $key, $txt) = @_; + return if $key ne 'help'; # TODO more keys? + + my $ibx = $ctx->{-inbox}; + my $base_url = $ibx->base_url($ctx->{env}); + $$txt .= "public-inbox help for $base_url\n"; + $$txt .= <<EOF; + +overview +-------- + + public-inbox uses Message-ID identifiers in URLs. + One may look up messages by substituting Message-IDs + (without the leading '<' or trailing '>') into the URL. + Forward slash ('/') characters in the Message-IDs + need to be escaped as "%2F" (without quotes). + + Thus, it is possible to retrieve any message by its + Message-ID by going to: + + $base_url<Message-ID>/ + + (without the '<' or '>') + + Message-IDs are described at: + + $WIKI_URL/Message-ID + +EOF + + # n.b. we use the Xapian DB for any regeneratable, + # order-of-arrival-independent data. + my $srch = $ibx->search; + if ($srch) { + $$txt .= <<EOF; +search +------ + + This public-inbox has search functionality provided by Xapian. + + It supports typical AND, OR, NOT, '+', '-' queries present + in other search engines. + + We also support search prefixes to limit the scope of the + search to certain fields. + + Prefixes supported in this installation include: + +EOF + _srch_prefix($srch, $txt); + + $$txt .= <<EOF; + + Most prefixes are probabilistic, meaning they support stemming + and wildcards ('*'). Ranges (such as 'd:') and boolean prefixes + do not support stemming or wildcards. + The upstream Xapian query parser documentation fully explains + the query syntax: + + $QP_URL + +message threading +----------------- + + Message threading is enabled for this public-inbox, + additional endpoints for message threads are available: + + * $base_url<Message-ID>/T/#u + + Loads the thread belonging to the given <Message-ID> + in flat chronological order. The "#u" anchor + focuses the browser on the given <Message-ID>. + + * $base_url<Message-ID>/t/#u + + Loads the thread belonging to the given <Message-ID> + in threaded order with nesting. For deep threads, + this requires a wide display or horizontal scrolling. + + Both of these HTML endpoints are suitable for offline reading + using the thread overview at the bottom of each page. + + Users of feed readers may follow a particular thread using: + + * $base_url<Message-ID>/t.atom + + Which loads the thread in Atom Syndication Standard + described at Wikipedia and RFC4287: + + $WIKI_URL/Atom_(standard) + https://tools.ietf.org/html/rfc4287 + + Finally, the gzipped mbox for a thread is available for + downloading and importing into your favorite mail client: + + * $base_url<Message-ID>/t.mbox.gz + + We use the mboxrd variant of the mbox format described + at: + + $WIKI_URL/Mbox + +contact +------- + + This help text is maintained by public-inbox developers + reachable via plain-text email at: meta\@public-inbox.org + +EOF + # TODO: support admin contact info in ~/.public-inbox/config + } + 1; +} + +1; |