From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id E61151FC96 for ; Sat, 3 Dec 2016 01:50:45 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] atom: switch to getline/close for response bodies Date: Sat, 3 Dec 2016 01:50:45 +0000 Message-Id: <20161203015045.9398-1-e@80x24.org> List-Id: This will let us stream larger Atom document bodies without wasting too much memory and reduce the number of round-trip requests needed to get necessary information. Hopefully clients are using streaming (SAX) parsers, too. This is the final transition in the core public-inbox code to allow migrating to a "pull"-based body streaming scheme which allows an HTTP server to respond appropriately to backpressure from slow clients. 
--- MANIFEST | 1 + lib/PublicInbox/Feed.pm | 188 +++++++-------------------------------- lib/PublicInbox/SearchView.pm | 29 +++--- lib/PublicInbox/WwwAtomStream.pm | 134 ++++++++++++++++++++++++++++ t/common.perl | 21 ++--- 5 files changed, 186 insertions(+), 187 deletions(-) create mode 100644 lib/PublicInbox/WwwAtomStream.pm diff --git a/MANIFEST b/MANIFEST index 3a4d7c4..3388b1a 100644 --- a/MANIFEST +++ b/MANIFEST @@ -88,6 +88,7 @@ lib/PublicInbox/View.pm lib/PublicInbox/WWW.pm lib/PublicInbox/WWW.pod lib/PublicInbox/WatchMaildir.pm +lib/PublicInbox/WwwAtomStream.pm lib/PublicInbox/WwwAttach.pm lib/PublicInbox/WwwStream.pm lib/PublicInbox/WwwText.pm diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index 25fec10..31d82ad 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -6,26 +6,45 @@ package PublicInbox::Feed; use strict; use warnings; use Email::MIME; -use Date::Parse qw(strptime); -use PublicInbox::Hval qw/ascii_html/; use PublicInbox::View; -use PublicInbox::MID qw/mid_clean mid2path/; -use PublicInbox::Address; -use POSIX qw/strftime/; +use PublicInbox::WwwAtomStream; use constant { - DATEFMT => '%Y-%m-%dT%H:%M:%SZ', # Atom standard MAX_PER_PAGE => 25, # this needs to be tunable }; # main function sub generate { my ($ctx) = @_; - sub { emit_atom($_[0], $ctx) }; + my @paths; + each_recent_blob($ctx, sub { push @paths, $_[0] }); + return _no_thread() unless @paths; + + my $ibx = $ctx->{-inbox}; + PublicInbox::WwwAtomStream->response($ctx, 200, sub { + while (my $path = shift @paths) { + my $mime = do_cat_mail($ibx, $path) or next; + return $mime; + } + }); } sub generate_thread_atom { my ($ctx) = @_; - sub { emit_atom_thread($_[0], $ctx) }; + my $mid = $ctx->{mid}; + my $res = $ctx->{srch}->get_thread($mid); + return _no_thread() unless $res->{total}; + + my $ibx = $ctx->{-inbox}; + my $html_url = $ibx->base_url($ctx->{env}); + $html_url .= PublicInbox::Hval->new_msgid($mid)->{href}; + $ctx->{-html_url} = $html_url; + my 
$msgs = $res->{msgs}; + PublicInbox::WwwAtomStream->response($ctx, 200, sub { + while (my $msg = shift @$msgs) { + $msg = $ibx->msg_by_smsg($msg) and + return Email::MIME->new($msg); + } + }); } sub generate_html_index { @@ -73,80 +92,8 @@ sub new_html { # private subs -sub title_tag { - my ($title) = @_; - $title =~ tr/\t\n / /s; # squeeze spaces - # try to avoid the type attribute in title: - $title = ascii_html($title); - my $type = index($title, '&') >= 0 ? "\ntype=\"html\"" : ''; - "$title"; -} - -sub atom_header { - my ($feed_opts, $title) = @_; - - $title = title_tag($feed_opts->{description}) unless (defined $title); - - qq(\n) . - qq{} . - qq{$title} . - qq() . - qq({atomurl}"/>) . - qq(mailto:$feed_opts->{id_addr}); -} - -sub emit_atom { - my ($cb, $ctx) = @_; - my $feed_opts = get_feedopts($ctx); - my $fh = $cb->([ 200, ['Content-Type' => 'application/atom+xml']]); - my $max = $ctx->{max} || MAX_PER_PAGE; - my $x = atom_header($feed_opts); - my $ibx = $ctx->{-inbox}; - each_recent_blob($ctx, sub { - my ($path, undef, $ts) = @_; - if (defined $x) { - $fh->write($x . 
feed_updated(undef, $ts)); - $x = undef; - } - my $s = feed_entry($feed_opts, $path, $ibx) or return 0; - $fh->write($s); - 1; - }); - end_feed($fh); -} - -sub _no_thread { - my ($cb) = @_; - $cb->([404, ['Content-Type', 'text/plain'], - ["No feed found for thread\n"]]); -} - -sub end_feed { - my ($fh) = @_; - $fh->write(''); - $fh->close; -} - -sub emit_atom_thread { - my ($cb, $ctx) = @_; - my $mid = $ctx->{mid}; - my $res = $ctx->{srch}->get_thread($mid); - return _no_thread($cb) unless $res->{total}; - my $feed_opts = get_feedopts($ctx); - my $fh = $cb->([200, ['Content-Type' => 'application/atom+xml']]); - my $ibx = $ctx->{-inbox}; - my $html_url = $ibx->base_url($ctx->{env}); - $html_url .= PublicInbox::Hval->new_msgid($mid)->{href}; - - $feed_opts->{url} = $html_url; - $feed_opts->{emit_header} = 1; - - foreach my $msg (@{$res->{msgs}}) { - my $s = feed_entry($feed_opts, mid2path($msg->mid), $ibx); - $fh->write($s) if defined $s; - } - end_feed($fh); +sub _no_thread () { + [404, ['Content-Type', 'text/plain'], ["No feed found for thread\n"]]; } sub new_html_footer { @@ -199,7 +146,7 @@ sub each_recent_blob { if ($line =~ /$addmsg/o) { my $add = $1; next if $deleted{$add}; # optimization-only - $nr += $cb->($add, $cur_commit, $ts, $u, $subj); + $cb->($add, $cur_commit, $ts, $u, $subj) and $nr++; if ($nr >= $max) { $last = 1; last; @@ -228,81 +175,6 @@ sub each_recent_blob { ($first_commit, $last_commit); } -# private functions below -sub get_feedopts { - my ($ctx) = @_; - my $inbox = $ctx->{inbox}; - my $obj = $ctx->{-inbox}; - my %rv = ( description => $obj->description ); - - $rv{address} = $obj->{address}; - $rv{id_addr} = $obj->{-primary_address}; - my $url_base = $obj->base_url($ctx->{env}); - if (my $mid = $ctx->{mid}) { # per-thread feed: - $rv{atomurl} = "$url_base$mid/t.atom"; - } else { - $rv{atomurl} = $url_base."new.atom"; - } - $rv{url} ||= $url_base; - $rv{midurl} = $url_base; - - \%rv; -} - -sub feed_updated { - my ($date, $ts) = @_; - my @t = 
eval { strptime($date) } if defined $date; - @t = gmtime($ts || time) unless scalar @t; - - '' . strftime(DATEFMT, @t) . ''; -} - -# returns undef or string -sub feed_entry { - my ($feed_opts, $add, $ibx) = @_; - - my $mime = do_cat_mail($ibx, $add) or return; - my $url = $feed_opts->{url}; - my $midurl = $feed_opts->{midurl}; - - my $header_obj = $mime->header_obj; - my $mid = mid_clean($header_obj->header_raw('Message-ID')); - $mid = PublicInbox::Hval->new_msgid($mid); - my $href = $midurl . $mid->{href}. '/'; - - my $date = $header_obj->header('Date'); - my $updated = feed_updated($date); - - my $title = $header_obj->header('Subject'); - defined $title or return; - $title = title_tag($title); - - my $from = $header_obj->header('From') or return; - my ($email) = PublicInbox::Address::emails($from); - my $name = join(', ',PublicInbox::Address::names($from)); - $name = ascii_html($name); - $email = ascii_html($email); - - my $s = ''; - if (delete $feed_opts->{emit_header}) { - $s .= atom_header($feed_opts, $title) . $updated; - } - $s .= "$name$email" . - "$title$updated" . - qq{} . - qq{} . - qq() . - PublicInbox::View::multipart_text_as_html($mime, $href) . - ''; - - $add =~ tr!/!!d; - my $h = '[a-f0-9]'; - my (@uuid5) = ($add =~ m!\A($h{8})($h{4})($h{4})($h{4})($h{12})!o); - my $id = 'urn:uuid:' . join('-', @uuid5); - $s .= qq!!. 
- "$id"; -} - sub do_cat_mail { my ($ibx, $path) = @_; my $mime = eval { $ibx->msg_by_path($path) } or return; diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index fbba9c4..6af151a 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -8,6 +8,7 @@ use warnings; use PublicInbox::SearchMsg; use PublicInbox::Hval qw/ascii_html/; use PublicInbox::View; +use PublicInbox::WwwAtomStream; use PublicInbox::MID qw(mid2path mid_mime mid_clean mid_escape); use Email::MIME; require PublicInbox::Git; @@ -46,7 +47,7 @@ sub sres_top_html { $cb = *noop; } else { my $x = $q->{x}; - return sub { adump($_[0], $mset, $q, $ctx) } if ($x eq 'A'); + return adump($_[0], $mset, $q, $ctx) if $x eq 'A'; $ctx->{-html_tip} = search_nav_top($mset, $q) . "\n\n"; if ($x eq 't') { @@ -213,23 +214,17 @@ sub ctx_prepare { sub adump { my ($cb, $mset, $q, $ctx) = @_; - my $fh = $cb->([ 200, ['Content-Type' => 'application/atom+xml']]); my $ibx = $ctx->{-inbox}; - my $feed_opts = PublicInbox::Feed::get_feedopts($ctx); - my $x = ascii_html($q->{'q'}); - $x = qq{$x - search results}; - $feed_opts->{atomurl} = $feed_opts->{url} . '?'. $q->qs_html; - $feed_opts->{url} .= '?'. $q->qs_html(x => undef); - $x = PublicInbox::Feed::atom_header($feed_opts, "$x"); - $fh->write($x. 
PublicInbox::Feed::feed_updated()); - - for ($mset->items) { - $x = PublicInbox::SearchMsg->load_doc($_->get_document)->mid; - $x = mid2path($x); - my $s = PublicInbox::Feed::feed_entry($feed_opts, $x, $ibx); - $fh->write($s) if defined $s; - } - PublicInbox::Feed::end_feed($fh); + my @items = $mset->items; + $ctx->{search_query} = $q; + PublicInbox::WwwAtomStream->response($ctx, 200, sub { + while (my $x = shift @items) { + $x = PublicInbox::SearchMsg->load_doc($x->get_document); + $x = $ibx->msg_by_smsg($x) and + return Email::MIME->new($x); + } + return undef; + }); } package PublicInbox::SearchQuery; diff --git a/lib/PublicInbox/WwwAtomStream.pm b/lib/PublicInbox/WwwAtomStream.pm new file mode 100644 index 0000000..5720384 --- /dev/null +++ b/lib/PublicInbox/WwwAtomStream.pm @@ -0,0 +1,134 @@ +# Copyright (C) 2016 all contributors +# License: AGPL-3.0+ +# +# Atom body stream for which yields getline+close methods +package PublicInbox::WwwAtomStream; +use strict; +use warnings; + +# FIXME: locale-independence: +use POSIX qw(strftime); +use Date::Parse qw(strptime); + +use PublicInbox::Address; +use PublicInbox::Hval qw(ascii_html); +use PublicInbox::MID qw/mid_clean mid2path mid_escape/; + +# called by PSGI server after getline: +sub close {} + +sub new { + my ($class, $ctx, $cb) = @_; + $ctx->{emit_header} = 1; + $ctx->{feed_base_url} = $ctx->{-inbox}->base_url($ctx->{env}); + bless { cb => $cb || *close, ctx => $ctx }, $class; +} + +sub response { + my ($class, $ctx, $code, $cb) = @_; + [ $code, [ 'Content-Type', 'application/atom+xml' ], + $class->new($ctx, $cb) ] +} + +# called once for each message by PSGI server +sub getline { + my ($self) = @_; + if (my $middle = $self->{cb}) { + my $mime = $middle->(); + return feed_entry($self, $mime) if $mime; + } + delete $self->{cb} ? 
'' : undef; +} + +# private + +sub title_tag { + my ($title) = @_; + $title =~ tr/\t\n / /s; # squeeze spaces + # try to avoid the type attribute in title: + $title = ascii_html($title); + my $type = index($title, '&') >= 0 ? "\ntype=\"html\"" : ''; + "$title"; +} + +sub atom_header { + my ($ctx, $title) = @_; + my $ibx = $ctx->{-inbox}; + my $base_url = $ctx->{feed_base_url}; + my $search_q = $ctx->{search_query}; + my $self_url = $base_url; + my $mid = $ctx->{mid}; + if (defined $mid) { # per-thread + $self_url .= mid_escape($mid).'/t.atom'; + } elsif (defined $search_q) { + my $query = $search_q->{'q'}; + $title = title_tag("$query - search results"); + $base_url .= '?' . $search_q->qs_html(x => undef); + $self_url .= '?' . $search_q->qs_html; + } else { + $title = title_tag($ibx->description); + $self_url .= 'new.atom'; + } + my $mtime = (stat($ibx->{mainrepo}))[9] || time; + + qq(\n) . + qq{} . + qq{$title} . + qq() . + qq() . + qq(mailto:$ibx->{-primary_address}) . + feed_updated(gmtime($mtime)); +} + +# returns undef or string +sub feed_entry { + my ($self, $mime) = @_; + my $ctx = $self->{ctx}; + my $hdr = $mime->header_obj; + my $mid = mid_clean($hdr->header_raw('Message-ID')); + + my $uuid = mid2path($mid); + $uuid =~ tr!/!!d; + my $h = '[a-f0-9]'; + my (@uuid5) = ($uuid =~ m!\A($h{8})($h{4})($h{4})($h{4})($h{12})!o); + $uuid = 'urn:uuid:' . join('-', @uuid5); + + $mid = PublicInbox::Hval->new_msgid($mid); + my $href = $ctx->{feed_base_url} . $mid->{href}. 
'/'; + + my $date = $hdr->header('Date'); + my @t = eval { strptime($date) } if defined $date; + @t = gmtime(time) unless scalar @t; + my $updated = feed_updated(@t); + + my $title = $hdr->header('Subject'); + $title = '(no subject)' unless defined $title && $title ne ''; + $title = title_tag($title); + + my $from = $hdr->header('From') or return; + my ($email) = PublicInbox::Address::emails($from); + my $name = join(', ',PublicInbox::Address::names($from)); + $name = ascii_html($name); + $email = ascii_html($email); + + my $s = ''; + if (delete $ctx->{emit_header}) { + $s .= atom_header($ctx, $title); + } + $s .= "$name$email" . + "$title$updated" . + qq{} . + qq{} . + qq() . + PublicInbox::View::multipart_text_as_html($mime, $href) . + '' . + qq!!. + "$uuid"; +} + +sub feed_updated { + '' . strftime('%Y-%m-%dT%H:%M:%SZ', @_) . ''; +} + +1; diff --git a/t/common.perl b/t/common.perl index bec5769..1251333 100644 --- a/t/common.perl +++ b/t/common.perl @@ -1,18 +1,15 @@ # Copyright (C) 2015 all contributors # License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt) -require IO::File; -use POSIX qw/dup/; sub stream_to_string { - my ($cb) = @_; - my $headers; - my $io = IO::File->new_tmpfile; - my $dup = dup($io->fileno); - my $response = sub { $headers = \@_, $io }; - $cb->($response); - $io = IO::File->new; - $io->fdopen($dup, 'r+'); - $io->seek(0, 0); - $io->read(my $str, ($io->stat)[7]); + my ($res) = @_; + my $body = $res->[2]; + my $str = ''; + while (defined(my $chunk = $body->getline)) { + $str .= $chunk; + } + $body->close; $str; } + +1; -- EW