From fb8e7dbd1b711d25d1033c3f5f540ce47f6c0849 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 20 Apr 2020 22:55:37 +0000 Subject: index: support --max-size / publicinbox.indexMaxSize In normal mail paths, we can rely on MTAs being configured with reasonable limits in the -watch and -mda mail injection paths. However, the MTA is bypassed in a git-only delivery path, a BOFH could inject a large message and DoS users attempting to mirror a public-inbox. This doesn't protect unindexed WWW interfaces from Email::MIME memory explosions on v1 inboxes. Probably nobody cares about unindexed WWW interfaces anymore, especially now that Xapian is optional for indexing. --- Documentation/public-inbox-config.pod | 4 ++++ Documentation/public-inbox-index.pod | 23 +++++++++++++++++++++++ lib/PublicInbox/Admin.pm | 11 +++++++++++ lib/PublicInbox/SearchIdx.pm | 14 +++++++++++++- lib/PublicInbox/V2Writable.pm | 3 +++ script/public-inbox-index | 15 ++++++++++++--- t/admin.t | 20 ++++++++++++++++++++ t/cgi.t | 30 +++++++++++++++++------------- t/v2mirror.t | 31 +++++++++++++++++++++++++++++++ 9 files changed, 134 insertions(+), 17 deletions(-) diff --git a/Documentation/public-inbox-config.pod b/Documentation/public-inbox-config.pod index 708a785f..f3b6c8b7 100644 --- a/Documentation/public-inbox-config.pod +++ b/Documentation/public-inbox-config.pod @@ -288,6 +288,10 @@ or /usr/share/cgit/ See L +=item publicinbox.indexMaxSize + +See L + =item publicinbox.wwwlisting Enable a HTML listing style when the root path of the URL '/' is accessed. diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod index dede5d2e..398ac516 100644 --- a/Documentation/public-inbox-index.pod +++ b/Documentation/public-inbox-index.pod @@ -66,6 +66,12 @@ is detected. This is intended to be used in mirrors after running L or L to ensure data is expunged from mirrors. +=item --max-size SIZE + +Sets or overrides L on a +per-invocation basis. See L +below. + =back =head1 FILES @@ -76,6 +82,23 @@ C<$GIT_DIR/public-inbox/> directory. v2 inboxes are described in L. +=head1 CONFIGURATION + +=over 8 + +=item publicinbox.indexMaxSize + +Prevents indexing of messages larger than the specified size +value. A single suffix modifier of C, C or C is +supported, thus the value of C<1m> to prevents indexing of +messages larger than one megabyte. + +This is useful for avoiding memory exhaustion in mirrors. + +Default: none + +=back + =head1 ENVIRONMENT =over 8 diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index 60f4f40d..62ddbe82 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -234,4 +234,15 @@ sub progress_prepare ($) { } } +# same unit factors as git: +sub parse_unsigned ($) { + my ($max_size) = @_; + + $$max_size =~ /\A([0-9]+)([kmg])?\z/i or return; + my ($n, $unit_factor) = ($1, $2 // ''); + my %u = ( k => 1024, m => 1024**2, g => 1024**3 ); + $$max_size = $n * ($u{lc($unit_factor)} // 1); + 1; +} + 1; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 579b85e3..25118f43 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -64,6 +64,7 @@ sub new { $self->{lock_path} = "$inboxdir/ssoma.lock"; my $dir = $self->xdir; $self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3"); + $self->{index_max_size} = $ibx->{index_max_size}; } elsif ($version == 2) { defined $shard or die "shard is required for v2\n"; # shard is a number @@ -572,6 +573,16 @@ sub batch_adjust ($$$$$) { } } +sub too_big ($$$) { + my ($self, $git, $oid) = @_; + my $max_size = $self->{index_max_size} or return; + my (undef, undef, $size) = $git->check($oid); + die "E: bad $oid in $git->{git_dir}\n" if !defined($size); + return if $size <= $max_size; + warn "W: skipping $oid ($size > $max_size)\n"; + 1; +} + # only for v1 sub read_log { my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_; @@ -598,6 +609,7 @@ sub read_log { } next; } + next if too_big($self, $git, $blob); my $mime = do_cat_mail($git, $blob, \$bytes); my $smsg = bless {}, 'PublicInbox::Smsg'; batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr); @@ -606,7 +618,7 @@ sub read_log { $add_cb->($self, $mime, $smsg); } elsif ($line =~ /$delmsg/o) { my $blob = $1; - $D{$blob} = 1; + $D{$blob} = 1 unless too_big($self, $git, $blob); } elsif ($line =~ /^commit ($h40)/o) { $latest = $1; $newest ||= $latest; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 12cc1f13..01b8bed6 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -120,6 +120,7 @@ sub new { last_commit => [], # git repo -> commit }; $self->{shards} = count_shards($self) || nproc_shards($creat); + $self->{index_max_size} = $v2ibx->{index_max_size}; bless $self, $class; } @@ -867,6 +868,7 @@ sub atfork_child { sub mark_deleted ($$$$) { my ($self, $sync, $git, $oid) = @_; + return if PublicInbox::SearchIdx::too_big($self, $git, $oid); my $msgref = $git->cat_file($oid); my $mime = PublicInbox::MIME->new($$msgref); my $mids = mids($mime->header_obj); @@ -993,6 +995,7 @@ sub multi_mid_q_push ($$$) { sub reindex_oid ($$$$) { my ($self, $sync, $git, $oid) = @_; + return if PublicInbox::SearchIdx::too_big($self, $git, $oid); my ($num, $mid0, $len); my $msgref = $git->cat_file($oid, \$len); return if $len == 0; # purged diff --git a/script/public-inbox-index b/script/public-inbox-index index 7def9964..2d0f0eca 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -14,8 +14,9 @@ PublicInbox::Admin::require_or_die('-index'); use PublicInbox::Xapcmd; my $compact_opt; -my $opt = { quiet => -1, compact => 0 }; -GetOptions($opt, qw(verbose|v+ reindex compact|c+ jobs|j=i prune indexlevel|L=s)) +my $opt = { quiet => -1, compact => 0, maxsize => undef }; +GetOptions($opt, qw(verbose|v+ reindex compact|c+ jobs|j=i prune + indexlevel|L=s maxsize|max-size=s)) or die "bad command-line args\n$usage"; die "--jobs must be positive\n" if defined $opt->{jobs} && $opt->{jobs} <= 0; @@ -25,14 +26,22 @@ if ($opt->{compact}) { $compact_opt = { -coarse_lock => 1, compact => 1 }; } -my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV); +my $cfg = PublicInbox::Config->new; +my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, undef, $cfg); PublicInbox::Admin::require_or_die('-index'); unless (@ibxs) { print STDERR "Usage: $usage\n"; exit 1 } my $mods = {}; +my $max_size = $opt->{maxsize} // $cfg->{lc('publicInbox.indexMaxSize')}; +if (defined $max_size) { + PublicInbox::Admin::parse_unsigned(\$max_size) or + die "`publicInbox.indexMaxSize=$max_size' not parsed\n"; +} + foreach my $ibx (@ibxs) { # XXX: users can shoot themselves in the foot, with opt->{indexlevel} $ibx->{indexlevel} //= $opt->{indexlevel} // PublicInbox::Admin::detect_indexlevel($ibx); + $ibx->{index_max_size} = $max_size; PublicInbox::Admin::scan_ibx_modules($mods, $ibx); } diff --git a/t/admin.t b/t/admin.t index a9d67d25..c25667b2 100644 --- a/t/admin.t +++ b/t/admin.t @@ -78,4 +78,24 @@ SKIP: { } chdir '/'; + +my @pairs = ( + '1g' => 1024 ** 3, + 666 => 666, + '1500K' => 1500 * 1024, + '15m' => 15 * (1024 ** 2), +); + +while (@pairs) { + my ($in, $out) = splice(@pairs, 0, 2); + my $orig = $in; + ok(PublicInbox::Admin::parse_unsigned(\$in), "parse_unsigned $orig"); + is($in, $out, "got $orig => ($in == $out)"); +} + +for my $v ('', 'bogus', '1p', '1gig') { + ok(!PublicInbox::Admin::parse_unsigned(\$v), + "parse_unsigned rejects $v"); +} + done_testing(); diff --git a/t/cgi.t b/t/cgi.t index 52f80e88..bceb83e5 100644 --- a/t/cgi.t +++ b/t/cgi.t @@ -55,10 +55,14 @@ Date: Thu, 01 Jan 1970 00:00:00 +0000 zzzzzz EOF - $im->add($mime); + ok($im->add($mime), 'added initial message'); + + $mime->header_set('Message-ID', ''); + $mime->body_str_set("z\n" x 1024); + ok($im->add($mime), 'added big message'); # deliver a reply, too - my $reply = Email::MIME->new(<new(< To: Me Cc: $addr @@ -72,7 +76,7 @@ Me wrote: what? EOF - $im->add($reply); + ok($im->add($mime), 'added reply'); my $slashy_mid = 'slashy/asdf@example.com'; my $slashy = Email::MIME->new(<add($slashy); + ok($im->add($slashy), 'added slash'); $im->done; my $res = cgi_run("/test/slashy/asdf\@example.com/raw"); @@ -99,14 +103,9 @@ EOF my $path = "/test/blahblah\@example.com/t.mbox.gz"; my $res = cgi_run($path); like($res->{head}, qr/^Status: 501 /, "search not-yet-enabled"); - my $indexed; - eval { - require DBD::SQLite; - require PublicInbox::SearchIdx; - my $s = PublicInbox::SearchIdx->new($ibx, 1); - $s->index_sync; - $indexed = 1; - }; + my $cmd = ['-index', $ibx->{inboxdir}, '--max-size=2k']; + my $opt = { 2 => \(my $err) }; + my $indexed = run_script($cmd, undef, $opt); if ($indexed) { $res = cgi_run($path); like($res->{head}, qr/^Status: 200 /, "search returned mbox"); @@ -117,9 +116,14 @@ EOF IO::Uncompress::Gunzip::gunzip(\$in => \$out); like($out, qr/^From /m, "From lines in mbox"); }; + $res = cgi_run('/test/toobig@example.com/'); + like($res->{head}, qr/^Status: 300 /, + 'did not index or return >max-size message'); + like($err, qr/skipping [a-f0-9]{40,}/, + 'warned about skipping large OID'); } else { like($res->{head}, qr/^Status: 501 /, "search not available"); - SKIP: { skip 'DBD::SQLite not available', 2 }; + SKIP: { skip 'DBD::SQLite not available', 4 }; } my $have_xml_treepp = eval { require XML::TreePP; 1 } if $indexed; diff --git a/t/v2mirror.t b/t/v2mirror.t index 406bbd4f..ecf96891 100644 --- a/t/v2mirror.t +++ b/t/v2mirror.t @@ -187,6 +187,37 @@ is($mibx->git->check($to_purge), undef, 'unindex+prune successful in mirror'); is(scalar($mset->items), 0, '1@example.com no longer visible in mirror'); } +if ('max size') { + $mime->header_set('Message-ID', '<2big@a>'); + my $max = '2k'; + $mime->body_str_set("z\n" x 1024); + ok($v2w->add($mime), "add big message"); + $v2w->done; + $ibx->cleanup; + $fetch_each_epoch->(); + PublicInbox::InboxWritable::cleanup($mibx); + my $cmd = ['-index', "$tmpdir/m", "--max-size=$max" ]; + my $opt = { 2 => \(my $err) }; + ok(run_script($cmd, undef, $opt), 'indexed with --max-size'); + like($err, qr/skipping [a-f0-9]{40,}/, 'warned about skipping message'); + $mset = $mibx->search->reopen->query('m:2big@a', {mset =>1}); + is(scalar($mset->items), 0, 'large message not indexed'); + + { + open my $fh, '>>', $pi_config or die; + print $fh <search->reopen->query('m:2big@a', {mset =>1}); + is(scalar($mset->items), 0, 'large message not re-indexed'); +} + ok($td->kill, 'killed httpd'); $td->join; -- cgit v1.2.3-24-ge0c7