about summary refs log tree commit homepage
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2015-08-21 21:42:23 +0000
committerEric Wong <e@80x24.org>2015-08-21 21:50:13 +0000
commited724ad94ae8eccb7924584b01ec4975abfef48d (patch)
tree1e454a69f1b9d842d0d59e8040395369ba7a413b
parentea926383498a4883140086aec3f58d9e468433b9 (diff)
downloadpublic-inbox-ed724ad94ae8eccb7924584b01ec4975abfef48d.tar.gz
Mboxes may be huge, so only support downloading gzipped mboxes
to save bandwidth and to get free checksumming.

Streaming output means we should not be wasting too much memory
on this unless the chosen server sucks.
-rw-r--r--lib/PublicInbox/Mbox.pm64
-rw-r--r--lib/PublicInbox/WWW.pm8
-rw-r--r--t/cgi.t11
3 files changed, 75 insertions, 8 deletions
diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index fc9df1ad..cb9d65d1 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -11,8 +11,7 @@ sub thread_mbox {
         my ($ctx, $srch) = @_;
         sub {
                 my ($response) = @_; # Plack callback
-                my $w = $response->([200, ['Content-Type' => 'text/plain']]);
-                emit_mbox($w, $ctx, $srch);
+                emit_mbox($response, $ctx, $srch);
         }
 }
 
@@ -40,7 +39,14 @@ sub emit_msg {
 }
 
 sub emit_mbox {
-        my ($fh, $ctx, $srch) = @_;
+        my ($response, $ctx, $srch) = @_;
+        eval { require IO::Compress::Gzip };
+        return need_gzip($response) if $@;
+
+        # http://www.iana.org/assignments/media-types/application/gzip
+        # http://www.iana.org/assignments/media-types/application/mbox
+        my $fh = $response->([200, ['Content-Type' => 'application/gzip']]);
+        $fh = PublicInbox::MboxGz->new($fh);
 
         require PublicInbox::GitCatFile;
         require Email::Simple;
@@ -62,6 +68,58 @@ sub emit_mbox {
 
                 $opts{offset} += $nr;
         } while ($nr > 0);
+
+        $fh->close;
+}
+
+sub need_gzip {
+        my $fh = $_[0]->([501, ['Content-Type' => 'text/html']]);
+        my $title = 'gzipped mbox not available';
+        $fh->write(<<EOF);
+<html><head><title>$title</title><body><pre>$title
+The administrator needs to install the IO::Compress::Gzip Perl module
+to support gzipped mboxes.
+<a href="../">Return to index</a></pre></body></html>
+EOF
+}
+
+1;
+
+# fh may not be a proper IO, so we wrap the write and close methods
+# to prevent IO::Compress::Gzip from complaining
+package PublicInbox::MboxGz;
+use strict;
+use warnings;
+use fields qw(gz fh buf);
+
+sub new {
+        my ($class, $fh) = @_;
+        my $self = fields::new($class);
+        my $buf;
+        $self->{buf} = \$buf;
+        $self->{gz} = IO::Compress::Gzip->new(\$buf);
+        $self->{fh} = $fh;
+        $self;
+}
+
+sub _flush_buf {
+        my ($self) = @_;
+        if (defined ${$self->{buf}}) {
+                $self->{fh}->write(${$self->{buf}});
+                ${$self->{buf}} = undef;
+        }
+}
+
+sub write {
+        $_[0]->{gz}->write($_[1]);
+        _flush_buf($_[0]);
+}
+
+sub close {
+        my ($self) = @_;
+        $self->{gz}->close;
+        _flush_buf($self);
+        # do not actually close $fh
 }
 
 1;
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index cd8a5705..68839d7c 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -53,7 +53,8 @@ sub run {
         } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.html\z!o) {
                 invalid_list_mid(\%ctx, $1, $2) || get_thread(\%ctx, $cgi);
 
-        } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.mbox\z!o) {
+        } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.mbox\.gz!o) {
+                my $sfx = $3;
                 invalid_list_mid(\%ctx, $1, $2) || get_thread_mbox(\%ctx, $cgi);
 
         } elsif ($path_info =~ m!$LISTNAME_RE/f/\S+\.txt\z!o) {
@@ -329,7 +330,10 @@ sub msg_pfx {
         "../f/$href.html";
 }
 
-# /$LISTNAME/t/$MESSAGE_ID.mbox                    -> search results as mbox
+# /$LISTNAME/t/$MESSAGE_ID.mbox.gz        -> search results as gzipped mbox
+# note: I'm not a big fan of other compression formats since they're
+# significantly more expensive on CPU than gzip and less-widely available,
+# especially on older systems.  Stick to zlib since that's what git uses.
 sub get_thread_mbox {
         my ($ctx, $cgi) = @_;
         my $srch = searcher($ctx) or return need_search($ctx);
diff --git a/t/cgi.t b/t/cgi.t
index 2747a159..e87f7dca 100644
--- a/t/cgi.t
+++ b/t/cgi.t
@@ -183,15 +183,20 @@ EOF
 {
         local $ENV{HOME} = $home;
         local $ENV{PATH} = $main_path;
-        my $path = "/test/t/blahblah%40example.com.mbox";
+        my $path = "/test/t/blahblah%40example.com.mbox.gz";
         my $res = cgi_run($path);
         like($res->{head}, qr/^Status: 501 /, "search not-yet-enabled");
         my $indexed = system($index, $maindir) == 0;
         if ($indexed) {
                 $res = cgi_run($path);
-                # use Data::Dumper; print STDERR Dumper($res);
                 like($res->{head}, qr/^Status: 200 /, "search returned mbox");
-                like($res->{body}, qr/^From /m, "From lines in mbox");
+                eval {
+                        require IO::Uncompress::Gunzip;
+                        my $in = $res->{body};
+                        my $out;
+                        IO::Uncompress::Gunzip::gunzip(\$in => \$out);
+                        like($out, qr/^From /m, "From lines in mbox");
+                };
         } else {
                 like($res->{head}, qr/^Status: 501 /, "search not available");
         }