about summary refs log tree commit homepage
diff options
context:
space:
mode:
-rw-r--r--Documentation/public-inbox-config.pod4
-rw-r--r--Documentation/public-inbox-index.pod23
-rw-r--r--lib/PublicInbox/Admin.pm11
-rw-r--r--lib/PublicInbox/SearchIdx.pm14
-rw-r--r--lib/PublicInbox/V2Writable.pm3
-rwxr-xr-xscript/public-inbox-index15
-rw-r--r--t/admin.t20
-rw-r--r--t/cgi.t30
-rw-r--r--t/v2mirror.t31
9 files changed, 134 insertions, 17 deletions
diff --git a/Documentation/public-inbox-config.pod b/Documentation/public-inbox-config.pod
index 708a785f..f3b6c8b7 100644
--- a/Documentation/public-inbox-config.pod
+++ b/Documentation/public-inbox-config.pod
@@ -288,6 +288,10 @@ or /usr/share/cgit/
 
 See L<public-inbox-edit(1)>
 
+=item publicinbox.indexMaxSize
+
+See L<public-inbox-index(1)>
+
 =item publicinbox.wwwlisting
 
 Enable a HTML listing style when the root path of the URL '/' is accessed.
diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod
index dede5d2e..398ac516 100644
--- a/Documentation/public-inbox-index.pod
+++ b/Documentation/public-inbox-index.pod
@@ -66,6 +66,12 @@ is detected.  This is intended to be used in mirrors after running
 L<public-inbox-edit(1)> or L<public-inbox-purge(1)> to ensure data
 is expunged from mirrors.
 
+=item --max-size SIZE
+
+Sets or overrides L</publicinbox.indexMaxSize> on a
+per-invocation basis.  See L</publicinbox.indexMaxSize>
+below.
+
 =back
 
 =head1 FILES
@@ -76,6 +82,23 @@ C<$GIT_DIR/public-inbox/> directory.
 
 v2 inboxes are described in L<public-inbox-v2-format>.
 
+=head1 CONFIGURATION
+
+=over 8
+
+=item publicinbox.indexMaxSize
+
+Prevents indexing of messages larger than the specified size
+value.  A single suffix modifier of C<k>, C<m> or C<g> is
+supported, thus the value of C<1m> to prevents indexing of
+messages larger than one megabyte.
+
+This is useful for avoiding memory exhaustion in mirrors.
+
+Default: none
+
+=back
+
 =head1 ENVIRONMENT
 
 =over 8
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index 60f4f40d..62ddbe82 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -234,4 +234,15 @@ sub progress_prepare ($) {
         }
 }
 
+# same unit factors as git:
+sub parse_unsigned ($) {
+        my ($max_size) = @_;
+
+        $$max_size =~ /\A([0-9]+)([kmg])?\z/i or return;
+        my ($n, $unit_factor) = ($1, $2 // '');
+        my %u = ( k => 1024, m => 1024**2, g => 1024**3 );
+        $$max_size = $n * ($u{lc($unit_factor)} // 1);
+        1;
+}
+
 1;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 579b85e3..25118f43 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -64,6 +64,7 @@ sub new {
                 $self->{lock_path} = "$inboxdir/ssoma.lock";
                 my $dir = $self->xdir;
                 $self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3");
+                $self->{index_max_size} = $ibx->{index_max_size};
         } elsif ($version == 2) {
                 defined $shard or die "shard is required for v2\n";
                 # shard is a number
@@ -572,6 +573,16 @@ sub batch_adjust ($$$$$) {
         }
 }
 
+sub too_big ($$$) {
+        my ($self, $git, $oid) = @_;
+        my $max_size = $self->{index_max_size} or return;
+        my (undef, undef, $size) = $git->check($oid);
+        die "E: bad $oid in $git->{git_dir}\n" if !defined($size);
+        return if $size <= $max_size;
+        warn "W: skipping $oid ($size > $max_size)\n";
+        1;
+}
+
 # only for v1
 sub read_log {
         my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_;
@@ -598,6 +609,7 @@ sub read_log {
                                 }
                                 next;
                         }
+                        next if too_big($self, $git, $blob);
                         my $mime = do_cat_mail($git, $blob, \$bytes);
                         my $smsg = bless {}, 'PublicInbox::Smsg';
                         batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr);
@@ -606,7 +618,7 @@ sub read_log {
                         $add_cb->($self, $mime, $smsg);
                 } elsif ($line =~ /$delmsg/o) {
                         my $blob = $1;
-                        $D{$blob} = 1;
+                        $D{$blob} = 1 unless too_big($self, $git, $blob);
                 } elsif ($line =~ /^commit ($h40)/o) {
                         $latest = $1;
                         $newest ||= $latest;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 12cc1f13..01b8bed6 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -120,6 +120,7 @@ sub new {
                 last_commit => [], # git repo -> commit
         };
         $self->{shards} = count_shards($self) || nproc_shards($creat);
+        $self->{index_max_size} = $v2ibx->{index_max_size};
         bless $self, $class;
 }
 
@@ -867,6 +868,7 @@ sub atfork_child {
 
 sub mark_deleted ($$$$) {
         my ($self, $sync, $git, $oid) = @_;
+        return if PublicInbox::SearchIdx::too_big($self, $git, $oid);
         my $msgref = $git->cat_file($oid);
         my $mime = PublicInbox::MIME->new($$msgref);
         my $mids = mids($mime->header_obj);
@@ -993,6 +995,7 @@ sub multi_mid_q_push ($$$) {
 
 sub reindex_oid ($$$$) {
         my ($self, $sync, $git, $oid) = @_;
+        return if PublicInbox::SearchIdx::too_big($self, $git, $oid);
         my ($num, $mid0, $len);
         my $msgref = $git->cat_file($oid, \$len);
         return if $len == 0; # purged
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 7def9964..2d0f0eca 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -14,8 +14,9 @@ PublicInbox::Admin::require_or_die('-index');
 use PublicInbox::Xapcmd;
 
 my $compact_opt;
-my $opt = { quiet => -1, compact => 0 };
-GetOptions($opt, qw(verbose|v+ reindex compact|c+ jobs|j=i prune indexlevel|L=s))
+my $opt = { quiet => -1, compact => 0, maxsize => undef };
+GetOptions($opt, qw(verbose|v+ reindex compact|c+ jobs|j=i prune
+                indexlevel|L=s maxsize|max-size=s))
         or die "bad command-line args\n$usage";
 die "--jobs must be positive\n" if defined $opt->{jobs} && $opt->{jobs} <= 0;
 
@@ -25,14 +26,22 @@ if ($opt->{compact}) {
         $compact_opt = { -coarse_lock => 1, compact => 1 };
 }
 
-my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV);
+my $cfg = PublicInbox::Config->new;
+my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, undef, $cfg);
 PublicInbox::Admin::require_or_die('-index');
 unless (@ibxs) { print STDERR "Usage: $usage\n"; exit 1 }
 my $mods = {};
+my $max_size = $opt->{maxsize} // $cfg->{lc('publicInbox.indexMaxSize')};
+if (defined $max_size) {
+        PublicInbox::Admin::parse_unsigned(\$max_size) or
+                die "`publicInbox.indexMaxSize=$max_size' not parsed\n";
+}
+
 foreach my $ibx (@ibxs) {
         # XXX: users can shoot themselves in the foot, with opt->{indexlevel}
         $ibx->{indexlevel} //= $opt->{indexlevel} //
                         PublicInbox::Admin::detect_indexlevel($ibx);
+        $ibx->{index_max_size} = $max_size;
         PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
 }
 
diff --git a/t/admin.t b/t/admin.t
index a9d67d25..c25667b2 100644
--- a/t/admin.t
+++ b/t/admin.t
@@ -78,4 +78,24 @@ SKIP: {
 }
 
 chdir '/';
+
+my @pairs = (
+        '1g' => 1024 ** 3,
+        666 => 666,
+        '1500K' => 1500 * 1024,
+        '15m' => 15 * (1024 ** 2),
+);
+
+while (@pairs) {
+        my ($in, $out) = splice(@pairs, 0, 2);
+        my $orig = $in;
+        ok(PublicInbox::Admin::parse_unsigned(\$in), "parse_unsigned $orig");
+        is($in, $out, "got $orig => ($in == $out)");
+}
+
+for my $v ('', 'bogus', '1p', '1gig') {
+        ok(!PublicInbox::Admin::parse_unsigned(\$v),
+                "parse_unsigned rejects $v");
+}
+
 done_testing();
diff --git a/t/cgi.t b/t/cgi.t
index 52f80e88..bceb83e5 100644
--- a/t/cgi.t
+++ b/t/cgi.t
@@ -55,10 +55,14 @@ Date: Thu, 01 Jan 1970 00:00:00 +0000
 
 zzzzzz
 EOF
-        $im->add($mime);
+        ok($im->add($mime), 'added initial message');
+
+        $mime->header_set('Message-ID', '<toobig@example.com>');
+        $mime->body_str_set("z\n" x 1024);
+        ok($im->add($mime), 'added big message');
 
         # deliver a reply, too
-        my $reply = Email::MIME->new(<<EOF);
+        $mime = Email::MIME->new(<<EOF);
 From: You <you\@example.com>
 To: Me <me\@example.com>
 Cc: $addr
@@ -72,7 +76,7 @@ Me wrote:
 
 what?
 EOF
-        $im->add($reply);
+        ok($im->add($mime), 'added reply');
 
         my $slashy_mid = 'slashy/asdf@example.com';
         my $slashy = Email::MIME->new(<<EOF);
@@ -85,7 +89,7 @@ Date: Thu, 01 Jan 1970 00:00:01 +0000
 
 slashy
 EOF
-        $im->add($slashy);
+        ok($im->add($slashy), 'added slash');
         $im->done;
 
         my $res = cgi_run("/test/slashy/asdf\@example.com/raw");
@@ -99,14 +103,9 @@ EOF
         my $path = "/test/blahblah\@example.com/t.mbox.gz";
         my $res = cgi_run($path);
         like($res->{head}, qr/^Status: 501 /, "search not-yet-enabled");
-        my $indexed;
-        eval {
-                require DBD::SQLite;
-                require PublicInbox::SearchIdx;
-                my $s = PublicInbox::SearchIdx->new($ibx, 1);
-                $s->index_sync;
-                $indexed = 1;
-        };
+        my $cmd = ['-index', $ibx->{inboxdir}, '--max-size=2k'];
+        my $opt = { 2 => \(my $err) };
+        my $indexed = run_script($cmd, undef, $opt);
         if ($indexed) {
                 $res = cgi_run($path);
                 like($res->{head}, qr/^Status: 200 /, "search returned mbox");
@@ -117,9 +116,14 @@ EOF
                         IO::Uncompress::Gunzip::gunzip(\$in => \$out);
                         like($out, qr/^From /m, "From lines in mbox");
                 };
+                $res = cgi_run('/test/toobig@example.com/');
+                like($res->{head}, qr/^Status: 300 /,
+                        'did not index or return >max-size message');
+                like($err, qr/skipping [a-f0-9]{40,}/,
+                        'warned about skipping large OID');
         } else {
                 like($res->{head}, qr/^Status: 501 /, "search not available");
-                SKIP: { skip 'DBD::SQLite not available', 2 };
+                SKIP: { skip 'DBD::SQLite not available', 4 };
         }
 
         my $have_xml_treepp = eval { require XML::TreePP; 1 } if $indexed;
diff --git a/t/v2mirror.t b/t/v2mirror.t
index 406bbd4f..ecf96891 100644
--- a/t/v2mirror.t
+++ b/t/v2mirror.t
@@ -187,6 +187,37 @@ is($mibx->git->check($to_purge), undef, 'unindex+prune successful in mirror');
         is(scalar($mset->items), 0, '1@example.com no longer visible in mirror');
 }
 
+if ('max size') {
+        $mime->header_set('Message-ID', '<2big@a>');
+        my $max = '2k';
+        $mime->body_str_set("z\n" x 1024);
+        ok($v2w->add($mime), "add big message");
+        $v2w->done;
+        $ibx->cleanup;
+        $fetch_each_epoch->();
+        PublicInbox::InboxWritable::cleanup($mibx);
+        my $cmd = ['-index', "$tmpdir/m", "--max-size=$max" ];
+        my $opt = { 2 => \(my $err) };
+        ok(run_script($cmd, undef, $opt), 'indexed with --max-size');
+        like($err, qr/skipping [a-f0-9]{40,}/, 'warned about skipping message');
+        $mset = $mibx->search->reopen->query('m:2big@a', {mset =>1});
+        is(scalar($mset->items), 0, 'large message not indexed');
+
+        {
+                open my $fh, '>>', $pi_config or die;
+                print $fh <<EOF or die;
+[publicinbox]
+        indexMaxSize = 2k
+EOF
+                close $fh or die;
+        }
+        $cmd = ['-index', "$tmpdir/m", "--reindex" ];
+        ok(run_script($cmd, undef, $opt), 'reindexed w/ indexMaxSize in file');
+        like($err, qr/skipping [a-f0-9]{40,}/, 'warned about skipping message');
+        $mset = $mibx->search->reopen->query('m:2big@a', {mset =>1});
+        is(scalar($mset->items), 0, 'large message not re-indexed');
+}
+
 ok($td->kill, 'killed httpd');
 $td->join;