user/dev discussion of public-inbox itself
 help / Atom feed
* [PATCH] atom: switch to getline/close for response bodies
@ 2016-12-03  1:50 Eric Wong
  0 siblings, 0 replies; 1+ messages in thread
From: Eric Wong @ 2016-12-03  1:50 UTC (permalink / raw)
  To: meta

This will let us stream larger Atom document bodies without
wasting too much memory and reduce the number of round-trip
requests needed to get necessary information.

Hopefully clients are using streaming (SAX) parsers, too.

This is the final transition in the core public-inbox
code to allow migrating to a "pull"-based body streaming
scheme which allows an HTTP server to respond appropriately
to backpressure from slow clients.
---
 MANIFEST                         |   1 +
 lib/PublicInbox/Feed.pm          | 188 +++++++--------------------------------
 lib/PublicInbox/SearchView.pm    |  29 +++---
 lib/PublicInbox/WwwAtomStream.pm | 134 ++++++++++++++++++++++++++++
 t/common.perl                    |  21 ++---
 5 files changed, 186 insertions(+), 187 deletions(-)
 create mode 100644 lib/PublicInbox/WwwAtomStream.pm

diff --git a/MANIFEST b/MANIFEST
index 3a4d7c4..3388b1a 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -88,6 +88,7 @@ lib/PublicInbox/View.pm
 lib/PublicInbox/WWW.pm
 lib/PublicInbox/WWW.pod
 lib/PublicInbox/WatchMaildir.pm
+lib/PublicInbox/WwwAtomStream.pm
 lib/PublicInbox/WwwAttach.pm
 lib/PublicInbox/WwwStream.pm
 lib/PublicInbox/WwwText.pm
diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm
index 25fec10..31d82ad 100644
--- a/lib/PublicInbox/Feed.pm
+++ b/lib/PublicInbox/Feed.pm
@@ -6,26 +6,45 @@ package PublicInbox::Feed;
 use strict;
 use warnings;
 use Email::MIME;
-use Date::Parse qw(strptime);
-use PublicInbox::Hval qw/ascii_html/;
 use PublicInbox::View;
-use PublicInbox::MID qw/mid_clean mid2path/;
-use PublicInbox::Address;
-use POSIX qw/strftime/;
+use PublicInbox::WwwAtomStream;
 use constant {
-	DATEFMT => '%Y-%m-%dT%H:%M:%SZ', # Atom standard
 	MAX_PER_PAGE => 25, # this needs to be tunable
 };
 
 # main function
 sub generate {
 	my ($ctx) = @_;
-	sub { emit_atom($_[0], $ctx) };
+	my @paths;
+	each_recent_blob($ctx, sub { push @paths, $_[0] });
+	return _no_thread() unless @paths;
+
+	my $ibx = $ctx->{-inbox};
+	PublicInbox::WwwAtomStream->response($ctx, 200, sub {
+		while (my $path = shift @paths) {
+			my $mime = do_cat_mail($ibx, $path) or next;
+			return $mime;
+		}
+	});
 }
 
 sub generate_thread_atom {
 	my ($ctx) = @_;
-	sub { emit_atom_thread($_[0], $ctx) };
+	my $mid = $ctx->{mid};
+	my $res = $ctx->{srch}->get_thread($mid);
+	return _no_thread() unless $res->{total};
+
+	my $ibx = $ctx->{-inbox};
+	my $html_url = $ibx->base_url($ctx->{env});
+	$html_url .= PublicInbox::Hval->new_msgid($mid)->{href};
+	$ctx->{-html_url} = $html_url;
+	my $msgs = $res->{msgs};
+	PublicInbox::WwwAtomStream->response($ctx, 200, sub {
+		while (my $msg = shift @$msgs) {
+			$msg = $ibx->msg_by_smsg($msg) and
+					return Email::MIME->new($msg);
+		}
+	});
 }
 
 sub generate_html_index {
@@ -73,80 +92,8 @@ sub new_html {
 
 # private subs
 
-sub title_tag {
-	my ($title) = @_;
-	$title =~ tr/\t\n / /s; # squeeze spaces
-	# try to avoid the type attribute in title:
-	$title = ascii_html($title);
-	my $type = index($title, '&') >= 0 ? "\ntype=\"html\"" : '';
-	"<title$type>$title</title>";
-}
-
-sub atom_header {
-	my ($feed_opts, $title) = @_;
-
-	$title = title_tag($feed_opts->{description}) unless (defined $title);
-
-	qq(<?xml version="1.0" encoding="us-ascii"?>\n) .
-	qq{<feed\nxmlns="http://www.w3.org/2005/Atom">} .
-	qq{$title} .
-	qq(<link\nrel="alternate"\ntype="text/html") .
-		qq(\nhref="$feed_opts->{url}"/>) .
-	qq(<link\nrel="self"\nhref="$feed_opts->{atomurl}"/>) .
-	qq(<id>mailto:$feed_opts->{id_addr}</id>);
-}
-
-sub emit_atom {
-	my ($cb, $ctx) = @_;
-	my $feed_opts = get_feedopts($ctx);
-	my $fh = $cb->([ 200, ['Content-Type' => 'application/atom+xml']]);
-	my $max = $ctx->{max} || MAX_PER_PAGE;
-	my $x = atom_header($feed_opts);
-	my $ibx = $ctx->{-inbox};
-	each_recent_blob($ctx, sub {
-		my ($path, undef, $ts) = @_;
-		if (defined $x) {
-			$fh->write($x . feed_updated(undef, $ts));
-			$x = undef;
-		}
-		my $s = feed_entry($feed_opts, $path, $ibx) or return 0;
-		$fh->write($s);
-		1;
-	});
-	end_feed($fh);
-}
-
-sub _no_thread {
-	my ($cb) = @_;
-	$cb->([404, ['Content-Type', 'text/plain'],
-		["No feed found for thread\n"]]);
-}
-
-sub end_feed {
-	my ($fh) = @_;
-	$fh->write('</feed>');
-	$fh->close;
-}
-
-sub emit_atom_thread {
-	my ($cb, $ctx) = @_;
-	my $mid = $ctx->{mid};
-	my $res = $ctx->{srch}->get_thread($mid);
-	return _no_thread($cb) unless $res->{total};
-	my $feed_opts = get_feedopts($ctx);
-	my $fh = $cb->([200, ['Content-Type' => 'application/atom+xml']]);
-	my $ibx = $ctx->{-inbox};
-	my $html_url = $ibx->base_url($ctx->{env});
-	$html_url .= PublicInbox::Hval->new_msgid($mid)->{href};
-
-	$feed_opts->{url} = $html_url;
-	$feed_opts->{emit_header} = 1;
-
-	foreach my $msg (@{$res->{msgs}}) {
-		my $s = feed_entry($feed_opts, mid2path($msg->mid), $ibx);
-		$fh->write($s) if defined $s;
-	}
-	end_feed($fh);
+sub _no_thread () {
+	[404, ['Content-Type', 'text/plain'], ["No feed found for thread\n"]];
 }
 
 sub new_html_footer {
@@ -199,7 +146,7 @@ sub each_recent_blob {
 		if ($line =~ /$addmsg/o) {
 			my $add = $1;
 			next if $deleted{$add}; # optimization-only
-			$nr += $cb->($add, $cur_commit, $ts, $u, $subj);
+			$cb->($add, $cur_commit, $ts, $u, $subj) and $nr++;
 			if ($nr >= $max) {
 				$last = 1;
 				last;
@@ -228,81 +175,6 @@ sub each_recent_blob {
 	($first_commit, $last_commit);
 }
 
-# private functions below
-sub get_feedopts {
-	my ($ctx) = @_;
-	my $inbox = $ctx->{inbox};
-	my $obj = $ctx->{-inbox};
-	my %rv = ( description => $obj->description );
-
-	$rv{address} = $obj->{address};
-	$rv{id_addr} = $obj->{-primary_address};
-	my $url_base = $obj->base_url($ctx->{env});
-	if (my $mid = $ctx->{mid}) { # per-thread feed:
-		$rv{atomurl} = "$url_base$mid/t.atom";
-	} else {
-		$rv{atomurl} = $url_base."new.atom";
-	}
-	$rv{url} ||= $url_base;
-	$rv{midurl} = $url_base;
-
-	\%rv;
-}
-
-sub feed_updated {
-	my ($date, $ts) = @_;
-	my @t = eval { strptime($date) } if defined $date;
-	@t = gmtime($ts || time) unless scalar @t;
-
-	'<updated>' . strftime(DATEFMT, @t) . '</updated>';
-}
-
-# returns undef or string
-sub feed_entry {
-	my ($feed_opts, $add, $ibx) = @_;
-
-	my $mime = do_cat_mail($ibx, $add) or return;
-	my $url = $feed_opts->{url};
-	my $midurl = $feed_opts->{midurl};
-
-	my $header_obj = $mime->header_obj;
-	my $mid = mid_clean($header_obj->header_raw('Message-ID'));
-	$mid = PublicInbox::Hval->new_msgid($mid);
-	my $href = $midurl . $mid->{href}. '/';
-
-	my $date = $header_obj->header('Date');
-	my $updated = feed_updated($date);
-
-	my $title = $header_obj->header('Subject');
-	defined $title or return;
-	$title = title_tag($title);
-
-	my $from = $header_obj->header('From') or return;
-	my ($email) = PublicInbox::Address::emails($from);
-	my $name = join(', ',PublicInbox::Address::names($from));
-	$name = ascii_html($name);
-	$email = ascii_html($email);
-
-	my $s = '';
-	if (delete $feed_opts->{emit_header}) {
-		$s .= atom_header($feed_opts, $title) . $updated;
-	}
-	$s .= "<entry><author><name>$name</name><email>$email</email>" .
-		"</author>$title$updated" .
-		qq{<content\ntype="xhtml">} .
-		qq{<div\nxmlns="http://www.w3.org/1999/xhtml">} .
-		qq(<pre\nstyle="white-space:pre-wrap">) .
-		PublicInbox::View::multipart_text_as_html($mime, $href) .
-		'</pre>';
-
-	$add =~ tr!/!!d;
-	my $h = '[a-f0-9]';
-	my (@uuid5) = ($add =~ m!\A($h{8})($h{4})($h{4})($h{4})($h{12})!o);
-	my $id = 'urn:uuid:' . join('-', @uuid5);
-	$s .= qq!</div></content><link\nhref="$href"/>!.
-		"<id>$id</id></entry>";
-}
-
 sub do_cat_mail {
 	my ($ibx, $path) = @_;
 	my $mime = eval { $ibx->msg_by_path($path) } or return;
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index fbba9c4..6af151a 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -8,6 +8,7 @@ use warnings;
 use PublicInbox::SearchMsg;
 use PublicInbox::Hval qw/ascii_html/;
 use PublicInbox::View;
+use PublicInbox::WwwAtomStream;
 use PublicInbox::MID qw(mid2path mid_mime mid_clean mid_escape);
 use Email::MIME;
 require PublicInbox::Git;
@@ -46,7 +47,7 @@ sub sres_top_html {
 		$cb = *noop;
 	} else {
 		my $x = $q->{x};
-		return sub { adump($_[0], $mset, $q, $ctx) } if ($x eq 'A');
+		return adump($_[0], $mset, $q, $ctx) if $x eq 'A';
 
 		$ctx->{-html_tip} = search_nav_top($mset, $q) . "\n\n";
 		if ($x eq 't') {
@@ -213,23 +214,17 @@ sub ctx_prepare {
 
 sub adump {
 	my ($cb, $mset, $q, $ctx) = @_;
-	my $fh = $cb->([ 200, ['Content-Type' => 'application/atom+xml']]);
 	my $ibx = $ctx->{-inbox};
-	my $feed_opts = PublicInbox::Feed::get_feedopts($ctx);
-	my $x = ascii_html($q->{'q'});
-	$x = qq{$x - search results};
-	$feed_opts->{atomurl} = $feed_opts->{url} . '?'. $q->qs_html;
-	$feed_opts->{url} .= '?'. $q->qs_html(x => undef);
-	$x = PublicInbox::Feed::atom_header($feed_opts, "<title>$x</title>");
-	$fh->write($x. PublicInbox::Feed::feed_updated());
-
-	for ($mset->items) {
-		$x = PublicInbox::SearchMsg->load_doc($_->get_document)->mid;
-		$x = mid2path($x);
-		my $s = PublicInbox::Feed::feed_entry($feed_opts, $x, $ibx);
-		$fh->write($s) if defined $s;
-	}
-	PublicInbox::Feed::end_feed($fh);
+	my @items = $mset->items;
+	$ctx->{search_query} = $q;
+	PublicInbox::WwwAtomStream->response($ctx, 200, sub {
+		while (my $x = shift @items) {
+			$x = PublicInbox::SearchMsg->load_doc($x->get_document);
+			$x = $ibx->msg_by_smsg($x) and
+					return Email::MIME->new($x);
+		}
+		return undef;
+	});
 }
 
 package PublicInbox::SearchQuery;
diff --git a/lib/PublicInbox/WwwAtomStream.pm b/lib/PublicInbox/WwwAtomStream.pm
new file mode 100644
index 0000000..5720384
--- /dev/null
+++ b/lib/PublicInbox/WwwAtomStream.pm
@@ -0,0 +1,134 @@
+# Copyright (C) 2016 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+#
+# Atom body stream which provides getline+close methods
+package PublicInbox::WwwAtomStream;
+use strict;
+use warnings;
+
+# FIXME: locale-independence:
+use POSIX qw(strftime);
+use Date::Parse qw(strptime);
+
+use PublicInbox::Address;
+use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::MID qw/mid_clean mid2path mid_escape/;
+
+# called by PSGI server after getline:
+sub close {}
+
+sub new {
+	my ($class, $ctx, $cb) = @_;
+	$ctx->{emit_header} = 1;
+	$ctx->{feed_base_url} = $ctx->{-inbox}->base_url($ctx->{env});
+	bless { cb => $cb || *close, ctx => $ctx }, $class;
+}
+
+sub response {
+	my ($class, $ctx, $code, $cb) = @_;
+	[ $code, [ 'Content-Type', 'application/atom+xml' ],
+	  $class->new($ctx, $cb) ]
+}
+
+# called once for each message by PSGI server
+sub getline {
+	my ($self) = @_;
+	if (my $middle = $self->{cb}) {
+		my $mime = $middle->();
+		return feed_entry($self, $mime) if $mime;
+	}
+	delete $self->{cb} ? '</feed>' : undef;
+}
+
+# private
+
+sub title_tag {
+	my ($title) = @_;
+	$title =~ tr/\t\n / /s; # squeeze spaces
+	# try to avoid the type attribute in title:
+	$title = ascii_html($title);
+	my $type = index($title, '&') >= 0 ? "\ntype=\"html\"" : '';
+	"<title$type>$title</title>";
+}
+
+sub atom_header {
+	my ($ctx, $title) = @_;
+	my $ibx = $ctx->{-inbox};
+	my $base_url = $ctx->{feed_base_url};
+	my $search_q = $ctx->{search_query};
+	my $self_url = $base_url;
+	my $mid = $ctx->{mid};
+	if (defined $mid) { # per-thread
+		$self_url .= mid_escape($mid).'/t.atom';
+	} elsif (defined $search_q) {
+		my $query = $search_q->{'q'};
+		$title = title_tag("$query - search results");
+		$base_url .= '?' . $search_q->qs_html(x => undef);
+		$self_url .= '?' . $search_q->qs_html;
+	} else {
+		$title = title_tag($ibx->description);
+		$self_url .= 'new.atom';
+	}
+	my $mtime = (stat($ibx->{mainrepo}))[9] || time;
+
+	qq(<?xml version="1.0" encoding="us-ascii"?>\n) .
+	qq{<feed\nxmlns="http://www.w3.org/2005/Atom">} .
+	qq{$title} .
+	qq(<link\nrel="alternate"\ntype="text/html") .
+		qq(\nhref="$base_url"/>) .
+	qq(<link\nrel="self"\nhref="$self_url"/>) .
+	qq(<id>mailto:$ibx->{-primary_address}</id>) .
+	feed_updated(gmtime($mtime));
+}
+
+# returns undef or string
+sub feed_entry {
+	my ($self, $mime) = @_;
+	my $ctx = $self->{ctx};
+	my $hdr = $mime->header_obj;
+	my $mid = mid_clean($hdr->header_raw('Message-ID'));
+
+	my $uuid = mid2path($mid);
+	$uuid =~ tr!/!!d;
+	my $h = '[a-f0-9]';
+	my (@uuid5) = ($uuid =~ m!\A($h{8})($h{4})($h{4})($h{4})($h{12})!o);
+	$uuid = 'urn:uuid:' . join('-', @uuid5);
+
+	$mid = PublicInbox::Hval->new_msgid($mid);
+	my $href = $ctx->{feed_base_url} . $mid->{href}. '/';
+
+	my $date = $hdr->header('Date');
+	my @t = eval { strptime($date) } if defined $date;
+	@t = gmtime(time) unless scalar @t;
+	my $updated = feed_updated(@t);
+
+	my $title = $hdr->header('Subject');
+	$title = '(no subject)' unless defined $title && $title ne '';
+	$title = title_tag($title);
+
+	my $from = $hdr->header('From') or return;
+	my ($email) = PublicInbox::Address::emails($from);
+	my $name = join(', ',PublicInbox::Address::names($from));
+	$name = ascii_html($name);
+	$email = ascii_html($email);
+
+	my $s = '';
+	if (delete $ctx->{emit_header}) {
+		$s .= atom_header($ctx, $title);
+	}
+	$s .= "<entry><author><name>$name</name><email>$email</email>" .
+		"</author>$title$updated" .
+		qq{<content\ntype="xhtml">} .
+		qq{<div\nxmlns="http://www.w3.org/1999/xhtml">} .
+		qq(<pre\nstyle="white-space:pre-wrap">) .
+		PublicInbox::View::multipart_text_as_html($mime, $href) .
+		'</pre>' .
+		qq!</div></content><link\nhref="$href"/>!.
+		"<id>$uuid</id></entry>";
+}
+
+sub feed_updated {
+	'<updated>' . strftime('%Y-%m-%dT%H:%M:%SZ', @_) . '</updated>';
+}
+
+1;
diff --git a/t/common.perl b/t/common.perl
index bec5769..1251333 100644
--- a/t/common.perl
+++ b/t/common.perl
@@ -1,18 +1,15 @@
 # Copyright (C) 2015 all contributors <meta@public-inbox.org>
 # License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt)
-require IO::File;
-use POSIX qw/dup/;
 
 sub stream_to_string {
-	my ($cb) = @_;
-	my $headers;
-	my $io = IO::File->new_tmpfile;
-	my $dup = dup($io->fileno);
-	my $response = sub { $headers = \@_, $io };
-	$cb->($response);
-	$io = IO::File->new;
-	$io->fdopen($dup, 'r+');
-	$io->seek(0, 0);
-	$io->read(my $str, ($io->stat)[7]);
+	my ($res) = @_;
+	my $body = $res->[2];
+	my $str = '';
+	while (defined(my $chunk = $body->getline)) {
+		$str .= $chunk;
+	}
+	$body->close;
 	$str;
 }
+
+1;
-- 
EW


^ permalink raw reply	[flat|threaded] 1+ messages in thread

only message in thread, back to index

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-03  1:50 [PATCH] atom: switch to getline/close for response bodies Eric Wong

user/dev discussion of public-inbox itself

Archives are clonable:
	git clone --mirror https://public-inbox.org/meta
	git clone --mirror http://czquwvybam4bgbro.onion/meta
	git clone --mirror http://hjrcffqmbrq6wope.onion/meta
	git clone --mirror http://ou63pmih66umazou.onion/meta

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.mail.public-inbox.meta
	nntp://ou63pmih66umazou.onion/inbox.comp.mail.public-inbox.meta
	nntp://czquwvybam4bgbro.onion/inbox.comp.mail.public-inbox.meta
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.mail.public-inbox.meta
	nntp://news.gmane.org/gmane.mail.public-inbox.general

 note: .onion URLs require Tor: https://www.torproject.org/
       or Tor2web: https://www.tor2web.org/

AGPL code for this site: git clone https://public-inbox.org/ public-inbox