about summary refs log tree commit homepage
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2016-12-03 00:24:06 +0000
committerEric Wong <e@80x24.org>2016-12-03 01:48:14 +0000
commit95d4bf7aded41cb3b0040c321d315532f68633e1 (patch)
treeb971609ef6fd3665d7d68352600d22ffe5b05cf8
parent21f5b7a8bcd942b19475c1c0c265f39dfdf93608 (diff)
downloadpublic-inbox-95d4bf7aded41cb3b0040c321d315532f68633e1.tar.gz
This will let us stream larger Atom documents bodies without
wasting too much memory and reduce the amount of round-trip
requests needed to get necessary information.

Hopefully clients are using streaming (SAX) parsers, too.

This is the final transition in the core public-inbox
code to allow migrating to a "pull"-based body streaming
scheme which allows a HTTP server to respond appropriately
to backpressure from slow clients.
-rw-r--r--MANIFEST1
-rw-r--r--lib/PublicInbox/Feed.pm188
-rw-r--r--lib/PublicInbox/SearchView.pm29
-rw-r--r--lib/PublicInbox/WwwAtomStream.pm134
-rw-r--r--t/common.perl21
5 files changed, 186 insertions, 187 deletions
diff --git a/MANIFEST b/MANIFEST
index 3a4d7c4d..3388b1a1 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -88,6 +88,7 @@ lib/PublicInbox/View.pm
 lib/PublicInbox/WWW.pm
 lib/PublicInbox/WWW.pod
 lib/PublicInbox/WatchMaildir.pm
+lib/PublicInbox/WwwAtomStream.pm
 lib/PublicInbox/WwwAttach.pm
 lib/PublicInbox/WwwStream.pm
 lib/PublicInbox/WwwText.pm
diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm
index 25fec10a..31d82adb 100644
--- a/lib/PublicInbox/Feed.pm
+++ b/lib/PublicInbox/Feed.pm
@@ -6,26 +6,45 @@ package PublicInbox::Feed;
 use strict;
 use warnings;
 use Email::MIME;
-use Date::Parse qw(strptime);
-use PublicInbox::Hval qw/ascii_html/;
 use PublicInbox::View;
-use PublicInbox::MID qw/mid_clean mid2path/;
-use PublicInbox::Address;
-use POSIX qw/strftime/;
+use PublicInbox::WwwAtomStream;
 use constant {
-        DATEFMT => '%Y-%m-%dT%H:%M:%SZ', # Atom standard
         MAX_PER_PAGE => 25, # this needs to be tunable
 };
 
 # main function
 sub generate {
         my ($ctx) = @_;
-        sub { emit_atom($_[0], $ctx) };
+        my @paths;
+        each_recent_blob($ctx, sub { push @paths, $_[0] });
+        return _no_thread() unless @paths;
+
+        my $ibx = $ctx->{-inbox};
+        PublicInbox::WwwAtomStream->response($ctx, 200, sub {
+                while (my $path = shift @paths) {
+                        my $mime = do_cat_mail($ibx, $path) or next;
+                        return $mime;
+                }
+        });
 }
 
 sub generate_thread_atom {
         my ($ctx) = @_;
-        sub { emit_atom_thread($_[0], $ctx) };
+        my $mid = $ctx->{mid};
+        my $res = $ctx->{srch}->get_thread($mid);
+        return _no_thread() unless $res->{total};
+
+        my $ibx = $ctx->{-inbox};
+        my $html_url = $ibx->base_url($ctx->{env});
+        $html_url .= PublicInbox::Hval->new_msgid($mid)->{href};
+        $ctx->{-html_url} = $html_url;
+        my $msgs = $res->{msgs};
+        PublicInbox::WwwAtomStream->response($ctx, 200, sub {
+                while (my $msg = shift @$msgs) {
+                        $msg = $ibx->msg_by_smsg($msg) and
+                                        return Email::MIME->new($msg);
+                }
+        });
 }
 
 sub generate_html_index {
@@ -73,80 +92,8 @@ sub new_html {
 
 # private subs
 
-sub title_tag {
-        my ($title) = @_;
-        $title =~ tr/\t\n / /s; # squeeze spaces
-        # try to avoid the type attribute in title:
-        $title = ascii_html($title);
-        my $type = index($title, '&') >= 0 ? "\ntype=\"html\"" : '';
-        "<title$type>$title</title>";
-}
-
-sub atom_header {
-        my ($feed_opts, $title) = @_;
-
-        $title = title_tag($feed_opts->{description}) unless (defined $title);
-
-        qq(<?xml version="1.0" encoding="us-ascii"?>\n) .
-        qq{<feed\nxmlns="http://www.w3.org/2005/Atom">} .
-        qq{$title} .
-        qq(<link\nrel="alternate"\ntype="text/html") .
-                qq(\nhref="$feed_opts->{url}"/>) .
-        qq(<link\nrel="self"\nhref="$feed_opts->{atomurl}"/>) .
-        qq(<id>mailto:$feed_opts->{id_addr}</id>);
-}
-
-sub emit_atom {
-        my ($cb, $ctx) = @_;
-        my $feed_opts = get_feedopts($ctx);
-        my $fh = $cb->([ 200, ['Content-Type' => 'application/atom+xml']]);
-        my $max = $ctx->{max} || MAX_PER_PAGE;
-        my $x = atom_header($feed_opts);
-        my $ibx = $ctx->{-inbox};
-        each_recent_blob($ctx, sub {
-                my ($path, undef, $ts) = @_;
-                if (defined $x) {
-                        $fh->write($x . feed_updated(undef, $ts));
-                        $x = undef;
-                }
-                my $s = feed_entry($feed_opts, $path, $ibx) or return 0;
-                $fh->write($s);
-                1;
-        });
-        end_feed($fh);
-}
-
-sub _no_thread {
-        my ($cb) = @_;
-        $cb->([404, ['Content-Type', 'text/plain'],
-                ["No feed found for thread\n"]]);
-}
-
-sub end_feed {
-        my ($fh) = @_;
-        $fh->write('</feed>');
-        $fh->close;
-}
-
-sub emit_atom_thread {
-        my ($cb, $ctx) = @_;
-        my $mid = $ctx->{mid};
-        my $res = $ctx->{srch}->get_thread($mid);
-        return _no_thread($cb) unless $res->{total};
-        my $feed_opts = get_feedopts($ctx);
-        my $fh = $cb->([200, ['Content-Type' => 'application/atom+xml']]);
-        my $ibx = $ctx->{-inbox};
-        my $html_url = $ibx->base_url($ctx->{env});
-        $html_url .= PublicInbox::Hval->new_msgid($mid)->{href};
-
-        $feed_opts->{url} = $html_url;
-        $feed_opts->{emit_header} = 1;
-
-        foreach my $msg (@{$res->{msgs}}) {
-                my $s = feed_entry($feed_opts, mid2path($msg->mid), $ibx);
-                $fh->write($s) if defined $s;
-        }
-        end_feed($fh);
+sub _no_thread () {
+        [404, ['Content-Type', 'text/plain'], ["No feed found for thread\n"]];
 }
 
 sub new_html_footer {
@@ -199,7 +146,7 @@ sub each_recent_blob {
                 if ($line =~ /$addmsg/o) {
                         my $add = $1;
                         next if $deleted{$add}; # optimization-only
-                        $nr += $cb->($add, $cur_commit, $ts, $u, $subj);
+                        $cb->($add, $cur_commit, $ts, $u, $subj) and $nr++;
                         if ($nr >= $max) {
                                 $last = 1;
                                 last;
@@ -228,81 +175,6 @@ sub each_recent_blob {
         ($first_commit, $last_commit);
 }
 
-# private functions below
-sub get_feedopts {
-        my ($ctx) = @_;
-        my $inbox = $ctx->{inbox};
-        my $obj = $ctx->{-inbox};
-        my %rv = ( description => $obj->description );
-
-        $rv{address} = $obj->{address};
-        $rv{id_addr} = $obj->{-primary_address};
-        my $url_base = $obj->base_url($ctx->{env});
-        if (my $mid = $ctx->{mid}) { # per-thread feed:
-                $rv{atomurl} = "$url_base$mid/t.atom";
-        } else {
-                $rv{atomurl} = $url_base."new.atom";
-        }
-        $rv{url} ||= $url_base;
-        $rv{midurl} = $url_base;
-
-        \%rv;
-}
-
-sub feed_updated {
-        my ($date, $ts) = @_;
-        my @t = eval { strptime($date) } if defined $date;
-        @t = gmtime($ts || time) unless scalar @t;
-
-        '<updated>' . strftime(DATEFMT, @t) . '</updated>';
-}
-
-# returns undef or string
-sub feed_entry {
-        my ($feed_opts, $add, $ibx) = @_;
-
-        my $mime = do_cat_mail($ibx, $add) or return;
-        my $url = $feed_opts->{url};
-        my $midurl = $feed_opts->{midurl};
-
-        my $header_obj = $mime->header_obj;
-        my $mid = mid_clean($header_obj->header_raw('Message-ID'));
-        $mid = PublicInbox::Hval->new_msgid($mid);
-        my $href = $midurl . $mid->{href}. '/';
-
-        my $date = $header_obj->header('Date');
-        my $updated = feed_updated($date);
-
-        my $title = $header_obj->header('Subject');
-        defined $title or return;
-        $title = title_tag($title);
-
-        my $from = $header_obj->header('From') or return;
-        my ($email) = PublicInbox::Address::emails($from);
-        my $name = join(', ',PublicInbox::Address::names($from));
-        $name = ascii_html($name);
-        $email = ascii_html($email);
-
-        my $s = '';
-        if (delete $feed_opts->{emit_header}) {
-                $s .= atom_header($feed_opts, $title) . $updated;
-        }
-        $s .= "<entry><author><name>$name</name><email>$email</email>" .
-                "</author>$title$updated" .
-                qq{<content\ntype="xhtml">} .
-                qq{<div\nxmlns="http://www.w3.org/1999/xhtml">} .
-                qq(<pre\nstyle="white-space:pre-wrap">) .
-                PublicInbox::View::multipart_text_as_html($mime, $href) .
-                '</pre>';
-
-        $add =~ tr!/!!d;
-        my $h = '[a-f0-9]';
-        my (@uuid5) = ($add =~ m!\A($h{8})($h{4})($h{4})($h{4})($h{12})!o);
-        my $id = 'urn:uuid:' . join('-', @uuid5);
-        $s .= qq!</div></content><link\nhref="$href"/>!.
-                "<id>$id</id></entry>";
-}
-
 sub do_cat_mail {
         my ($ibx, $path) = @_;
         my $mime = eval { $ibx->msg_by_path($path) } or return;
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index fbba9c49..6af151a4 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -8,6 +8,7 @@ use warnings;
 use PublicInbox::SearchMsg;
 use PublicInbox::Hval qw/ascii_html/;
 use PublicInbox::View;
+use PublicInbox::WwwAtomStream;
 use PublicInbox::MID qw(mid2path mid_mime mid_clean mid_escape);
 use Email::MIME;
 require PublicInbox::Git;
@@ -46,7 +47,7 @@ sub sres_top_html {
                 $cb = *noop;
         } else {
                 my $x = $q->{x};
-                return sub { adump($_[0], $mset, $q, $ctx) } if ($x eq 'A');
+                return adump($_[0], $mset, $q, $ctx) if $x eq 'A';
 
                 $ctx->{-html_tip} = search_nav_top($mset, $q) . "\n\n";
                 if ($x eq 't') {
@@ -213,23 +214,17 @@ sub ctx_prepare {
 
 sub adump {
         my ($cb, $mset, $q, $ctx) = @_;
-        my $fh = $cb->([ 200, ['Content-Type' => 'application/atom+xml']]);
         my $ibx = $ctx->{-inbox};
-        my $feed_opts = PublicInbox::Feed::get_feedopts($ctx);
-        my $x = ascii_html($q->{'q'});
-        $x = qq{$x - search results};
-        $feed_opts->{atomurl} = $feed_opts->{url} . '?'. $q->qs_html;
-        $feed_opts->{url} .= '?'. $q->qs_html(x => undef);
-        $x = PublicInbox::Feed::atom_header($feed_opts, "<title>$x</title>");
-        $fh->write($x. PublicInbox::Feed::feed_updated());
-
-        for ($mset->items) {
-                $x = PublicInbox::SearchMsg->load_doc($_->get_document)->mid;
-                $x = mid2path($x);
-                my $s = PublicInbox::Feed::feed_entry($feed_opts, $x, $ibx);
-                $fh->write($s) if defined $s;
-        }
-        PublicInbox::Feed::end_feed($fh);
+        my @items = $mset->items;
+        $ctx->{search_query} = $q;
+        PublicInbox::WwwAtomStream->response($ctx, 200, sub {
+                while (my $x = shift @items) {
+                        $x = PublicInbox::SearchMsg->load_doc($x->get_document);
+                        $x = $ibx->msg_by_smsg($x) and
+                                        return Email::MIME->new($x);
+                }
+                return undef;
+        });
 }
 
 package PublicInbox::SearchQuery;
diff --git a/lib/PublicInbox/WwwAtomStream.pm b/lib/PublicInbox/WwwAtomStream.pm
new file mode 100644
index 00000000..5720384c
--- /dev/null
+++ b/lib/PublicInbox/WwwAtomStream.pm
@@ -0,0 +1,134 @@
+# Copyright (C) 2016 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+#
+# Atom body stream for which yields getline+close methods
+package PublicInbox::WwwAtomStream;
+use strict;
+use warnings;
+
+# FIXME: locale-independence:
+use POSIX qw(strftime);
+use Date::Parse qw(strptime);
+
+use PublicInbox::Address;
+use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::MID qw/mid_clean mid2path mid_escape/;
+
+# called by PSGI server after getline:
+sub close {}
+
+sub new {
+        my ($class, $ctx, $cb) = @_;
+        $ctx->{emit_header} = 1;
+        $ctx->{feed_base_url} = $ctx->{-inbox}->base_url($ctx->{env});
+        bless { cb => $cb || *close, ctx => $ctx }, $class;
+}
+
+sub response {
+        my ($class, $ctx, $code, $cb) = @_;
+        [ $code, [ 'Content-Type', 'application/atom+xml' ],
+          $class->new($ctx, $cb) ]
+}
+
+# called once for each message by PSGI server
+sub getline {
+        my ($self) = @_;
+        if (my $middle = $self->{cb}) {
+                my $mime = $middle->();
+                return feed_entry($self, $mime) if $mime;
+        }
+        delete $self->{cb} ? '</feed>' : undef;
+}
+
+# private
+
+sub title_tag {
+        my ($title) = @_;
+        $title =~ tr/\t\n / /s; # squeeze spaces
+        # try to avoid the type attribute in title:
+        $title = ascii_html($title);
+        my $type = index($title, '&') >= 0 ? "\ntype=\"html\"" : '';
+        "<title$type>$title</title>";
+}
+
+sub atom_header {
+        my ($ctx, $title) = @_;
+        my $ibx = $ctx->{-inbox};
+        my $base_url = $ctx->{feed_base_url};
+        my $search_q = $ctx->{search_query};
+        my $self_url = $base_url;
+        my $mid = $ctx->{mid};
+        if (defined $mid) { # per-thread
+                $self_url .= mid_escape($mid).'/t.atom';
+        } elsif (defined $search_q) {
+                my $query = $search_q->{'q'};
+                $title = title_tag("$query - search results");
+                $base_url .= '?' . $search_q->qs_html(x => undef);
+                $self_url .= '?' . $search_q->qs_html;
+        } else {
+                $title = title_tag($ibx->description);
+                $self_url .= 'new.atom';
+        }
+        my $mtime = (stat($ibx->{mainrepo}))[9] || time;
+
+        qq(<?xml version="1.0" encoding="us-ascii"?>\n) .
+        qq{<feed\nxmlns="http://www.w3.org/2005/Atom">} .
+        qq{$title} .
+        qq(<link\nrel="alternate"\ntype="text/html") .
+                qq(\nhref="$base_url"/>) .
+        qq(<link\nrel="self"\nhref="$self_url"/>) .
+        qq(<id>mailto:$ibx->{-primary_address}</id>) .
+        feed_updated(gmtime($mtime));
+}
+
+# returns undef or string
+sub feed_entry {
+        my ($self, $mime) = @_;
+        my $ctx = $self->{ctx};
+        my $hdr = $mime->header_obj;
+        my $mid = mid_clean($hdr->header_raw('Message-ID'));
+
+        my $uuid = mid2path($mid);
+        $uuid =~ tr!/!!d;
+        my $h = '[a-f0-9]';
+        my (@uuid5) = ($uuid =~ m!\A($h{8})($h{4})($h{4})($h{4})($h{12})!o);
+        $uuid = 'urn:uuid:' . join('-', @uuid5);
+
+        $mid = PublicInbox::Hval->new_msgid($mid);
+        my $href = $ctx->{feed_base_url} . $mid->{href}. '/';
+
+        my $date = $hdr->header('Date');
+        my @t = eval { strptime($date) } if defined $date;
+        @t = gmtime(time) unless scalar @t;
+        my $updated = feed_updated(@t);
+
+        my $title = $hdr->header('Subject');
+        $title = '(no subject)' unless defined $title && $title ne '';
+        $title = title_tag($title);
+
+        my $from = $hdr->header('From') or return;
+        my ($email) = PublicInbox::Address::emails($from);
+        my $name = join(', ',PublicInbox::Address::names($from));
+        $name = ascii_html($name);
+        $email = ascii_html($email);
+
+        my $s = '';
+        if (delete $ctx->{emit_header}) {
+                $s .= atom_header($ctx, $title);
+        }
+        $s .= "<entry><author><name>$name</name><email>$email</email>" .
+                "</author>$title$updated" .
+                qq{<content\ntype="xhtml">} .
+                qq{<div\nxmlns="http://www.w3.org/1999/xhtml">} .
+                qq(<pre\nstyle="white-space:pre-wrap">) .
+                PublicInbox::View::multipart_text_as_html($mime, $href) .
+                '</pre>' .
+                qq!</div></content><link\nhref="$href"/>!.
+                "<id>$uuid</id></entry>";
+}
+
+sub feed_updated {
+        '<updated>' . strftime('%Y-%m-%dT%H:%M:%SZ', @_) . '</updated>';
+}
+
+1;
diff --git a/t/common.perl b/t/common.perl
index bec57699..1251333d 100644
--- a/t/common.perl
+++ b/t/common.perl
@@ -1,18 +1,15 @@
 # Copyright (C) 2015 all contributors <meta@public-inbox.org>
 # License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt)
-require IO::File;
-use POSIX qw/dup/;
 
 sub stream_to_string {
-        my ($cb) = @_;
-        my $headers;
-        my $io = IO::File->new_tmpfile;
-        my $dup = dup($io->fileno);
-        my $response = sub { $headers = \@_, $io };
-        $cb->($response);
-        $io = IO::File->new;
-        $io->fdopen($dup, 'r+');
-        $io->seek(0, 0);
-        $io->read(my $str, ($io->stat)[7]);
+        my ($res) = @_;
+        my $body = $res->[2];
+        my $str = '';
+        while (defined(my $chunk = $body->getline)) {
+                $str .= $chunk;
+        }
+        $body->close;
         $str;
 }
+
+1;