From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH] index: support --max-size / publicinbox.indexMaxSize
Date: Mon, 20 Apr 2020 22:55:37 +0000 [thread overview]
Message-ID: <20200420225537.9860-1-e@yhbt.net> (raw)
In normal mail paths, we can rely on MTAs being configured with
reasonable limits in the -watch and -mda mail injection paths.
However, the MTA is bypassed in a git-only delivery path, a BOFH
could inject a large message and DoS users attempting to mirror
a public-inbox.
This doesn't protect unindexed WWW interfaces from Email::MIME
memory explosions on v1 inboxes. Probably nobody cares about
unindexed WWW interfaces anymore, especially now that Xapian is
optional for indexing.
---
Documentation/public-inbox-config.pod | 4 ++++
Documentation/public-inbox-index.pod | 23 ++++++++++++++++++++
lib/PublicInbox/Admin.pm | 11 ++++++++++
lib/PublicInbox/SearchIdx.pm | 14 +++++++++++-
lib/PublicInbox/V2Writable.pm | 3 +++
script/public-inbox-index | 15 ++++++++++---
t/admin.t | 20 +++++++++++++++++
t/cgi.t | 30 +++++++++++++++-----------
t/v2mirror.t | 31 +++++++++++++++++++++++++++
9 files changed, 134 insertions(+), 17 deletions(-)
diff --git a/Documentation/public-inbox-config.pod b/Documentation/public-inbox-config.pod
index 708a785f..f3b6c8b7 100644
--- a/Documentation/public-inbox-config.pod
+++ b/Documentation/public-inbox-config.pod
@@ -288,6 +288,10 @@ or /usr/share/cgit/
See L<public-inbox-edit(1)>
+=item publicinbox.indexMaxSize
+
+See L<public-inbox-index(1)>
+
=item publicinbox.wwwlisting
Enable a HTML listing style when the root path of the URL '/' is accessed.
diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod
index dede5d2e..398ac516 100644
--- a/Documentation/public-inbox-index.pod
+++ b/Documentation/public-inbox-index.pod
@@ -66,6 +66,12 @@ is detected. This is intended to be used in mirrors after running
L<public-inbox-edit(1)> or L<public-inbox-purge(1)> to ensure data
is expunged from mirrors.
+=item --max-size SIZE
+
+Sets or overrides L</publicinbox.indexMaxSize> on a
+per-invocation basis. See L</publicinbox.indexMaxSize>
+below.
+
=back
=head1 FILES
@@ -76,6 +82,23 @@ C<$GIT_DIR/public-inbox/> directory.
v2 inboxes are described in L<public-inbox-v2-format>.
+=head1 CONFIGURATION
+
+=over 8
+
+=item publicinbox.indexMaxSize
+
+Prevents indexing of messages larger than the specified size
+value. A single suffix modifier of C<k>, C<m> or C<g> is
+supported, thus the value of C<1m> to prevents indexing of
+messages larger than one megabyte.
+
+This is useful for avoiding memory exhaustion in mirrors.
+
+Default: none
+
+=back
+
=head1 ENVIRONMENT
=over 8
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index 60f4f40d..62ddbe82 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -234,4 +234,15 @@ sub progress_prepare ($) {
}
}
+# same unit factors as git:
+sub parse_unsigned ($) {
+ my ($max_size) = @_;
+
+ $$max_size =~ /\A([0-9]+)([kmg])?\z/i or return;
+ my ($n, $unit_factor) = ($1, $2 // '');
+ my %u = ( k => 1024, m => 1024**2, g => 1024**3 );
+ $$max_size = $n * ($u{lc($unit_factor)} // 1);
+ 1;
+}
+
1;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 579b85e3..25118f43 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -64,6 +64,7 @@ sub new {
$self->{lock_path} = "$inboxdir/ssoma.lock";
my $dir = $self->xdir;
$self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3");
+ $self->{index_max_size} = $ibx->{index_max_size};
} elsif ($version == 2) {
defined $shard or die "shard is required for v2\n";
# shard is a number
@@ -572,6 +573,16 @@ sub batch_adjust ($$$$$) {
}
}
+sub too_big ($$$) {
+ my ($self, $git, $oid) = @_;
+ my $max_size = $self->{index_max_size} or return;
+ my (undef, undef, $size) = $git->check($oid);
+ die "E: bad $oid in $git->{git_dir}\n" if !defined($size);
+ return if $size <= $max_size;
+ warn "W: skipping $oid ($size > $max_size)\n";
+ 1;
+}
+
# only for v1
sub read_log {
my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_;
@@ -598,6 +609,7 @@ sub read_log {
}
next;
}
+ next if too_big($self, $git, $blob);
my $mime = do_cat_mail($git, $blob, \$bytes);
my $smsg = bless {}, 'PublicInbox::Smsg';
batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr);
@@ -606,7 +618,7 @@ sub read_log {
$add_cb->($self, $mime, $smsg);
} elsif ($line =~ /$delmsg/o) {
my $blob = $1;
- $D{$blob} = 1;
+ $D{$blob} = 1 unless too_big($self, $git, $blob);
} elsif ($line =~ /^commit ($h40)/o) {
$latest = $1;
$newest ||= $latest;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 12cc1f13..01b8bed6 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -120,6 +120,7 @@ sub new {
last_commit => [], # git repo -> commit
};
$self->{shards} = count_shards($self) || nproc_shards($creat);
+ $self->{index_max_size} = $v2ibx->{index_max_size};
bless $self, $class;
}
@@ -867,6 +868,7 @@ sub atfork_child {
sub mark_deleted ($$$$) {
my ($self, $sync, $git, $oid) = @_;
+ return if PublicInbox::SearchIdx::too_big($self, $git, $oid);
my $msgref = $git->cat_file($oid);
my $mime = PublicInbox::MIME->new($$msgref);
my $mids = mids($mime->header_obj);
@@ -993,6 +995,7 @@ sub multi_mid_q_push ($$$) {
sub reindex_oid ($$$$) {
my ($self, $sync, $git, $oid) = @_;
+ return if PublicInbox::SearchIdx::too_big($self, $git, $oid);
my ($num, $mid0, $len);
my $msgref = $git->cat_file($oid, \$len);
return if $len == 0; # purged
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 7def9964..2d0f0eca 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -14,8 +14,9 @@ PublicInbox::Admin::require_or_die('-index');
use PublicInbox::Xapcmd;
my $compact_opt;
-my $opt = { quiet => -1, compact => 0 };
-GetOptions($opt, qw(verbose|v+ reindex compact|c+ jobs|j=i prune indexlevel|L=s))
+my $opt = { quiet => -1, compact => 0, maxsize => undef };
+GetOptions($opt, qw(verbose|v+ reindex compact|c+ jobs|j=i prune
+ indexlevel|L=s maxsize|max-size=s))
or die "bad command-line args\n$usage";
die "--jobs must be positive\n" if defined $opt->{jobs} && $opt->{jobs} <= 0;
@@ -25,14 +26,22 @@ if ($opt->{compact}) {
$compact_opt = { -coarse_lock => 1, compact => 1 };
}
-my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV);
+my $cfg = PublicInbox::Config->new;
+my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, undef, $cfg);
PublicInbox::Admin::require_or_die('-index');
unless (@ibxs) { print STDERR "Usage: $usage\n"; exit 1 }
my $mods = {};
+my $max_size = $opt->{maxsize} // $cfg->{lc('publicInbox.indexMaxSize')};
+if (defined $max_size) {
+ PublicInbox::Admin::parse_unsigned(\$max_size) or
+ die "`publicInbox.indexMaxSize=$max_size' not parsed\n";
+}
+
foreach my $ibx (@ibxs) {
# XXX: users can shoot themselves in the foot, with opt->{indexlevel}
$ibx->{indexlevel} //= $opt->{indexlevel} //
PublicInbox::Admin::detect_indexlevel($ibx);
+ $ibx->{index_max_size} = $max_size;
PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
}
diff --git a/t/admin.t b/t/admin.t
index a9d67d25..c25667b2 100644
--- a/t/admin.t
+++ b/t/admin.t
@@ -78,4 +78,24 @@ SKIP: {
}
chdir '/';
+
+my @pairs = (
+ '1g' => 1024 ** 3,
+ 666 => 666,
+ '1500K' => 1500 * 1024,
+ '15m' => 15 * (1024 ** 2),
+);
+
+while (@pairs) {
+ my ($in, $out) = splice(@pairs, 0, 2);
+ my $orig = $in;
+ ok(PublicInbox::Admin::parse_unsigned(\$in), "parse_unsigned $orig");
+ is($in, $out, "got $orig => ($in == $out)");
+}
+
+for my $v ('', 'bogus', '1p', '1gig') {
+ ok(!PublicInbox::Admin::parse_unsigned(\$v),
+ "parse_unsigned rejects $v");
+}
+
done_testing();
diff --git a/t/cgi.t b/t/cgi.t
index 52f80e88..bceb83e5 100644
--- a/t/cgi.t
+++ b/t/cgi.t
@@ -55,10 +55,14 @@ Date: Thu, 01 Jan 1970 00:00:00 +0000
zzzzzz
EOF
- $im->add($mime);
+ ok($im->add($mime), 'added initial message');
+
+ $mime->header_set('Message-ID', '<toobig@example.com>');
+ $mime->body_str_set("z\n" x 1024);
+ ok($im->add($mime), 'added big message');
# deliver a reply, too
- my $reply = Email::MIME->new(<<EOF);
+ $mime = Email::MIME->new(<<EOF);
From: You <you\@example.com>
To: Me <me\@example.com>
Cc: $addr
@@ -72,7 +76,7 @@ Me wrote:
what?
EOF
- $im->add($reply);
+ ok($im->add($mime), 'added reply');
my $slashy_mid = 'slashy/asdf@example.com';
my $slashy = Email::MIME->new(<<EOF);
@@ -85,7 +89,7 @@ Date: Thu, 01 Jan 1970 00:00:01 +0000
slashy
EOF
- $im->add($slashy);
+ ok($im->add($slashy), 'added slash');
$im->done;
my $res = cgi_run("/test/slashy/asdf\@example.com/raw");
@@ -99,14 +103,9 @@ EOF
my $path = "/test/blahblah\@example.com/t.mbox.gz";
my $res = cgi_run($path);
like($res->{head}, qr/^Status: 501 /, "search not-yet-enabled");
- my $indexed;
- eval {
- require DBD::SQLite;
- require PublicInbox::SearchIdx;
- my $s = PublicInbox::SearchIdx->new($ibx, 1);
- $s->index_sync;
- $indexed = 1;
- };
+ my $cmd = ['-index', $ibx->{inboxdir}, '--max-size=2k'];
+ my $opt = { 2 => \(my $err) };
+ my $indexed = run_script($cmd, undef, $opt);
if ($indexed) {
$res = cgi_run($path);
like($res->{head}, qr/^Status: 200 /, "search returned mbox");
@@ -117,9 +116,14 @@ EOF
IO::Uncompress::Gunzip::gunzip(\$in => \$out);
like($out, qr/^From /m, "From lines in mbox");
};
+ $res = cgi_run('/test/toobig@example.com/');
+ like($res->{head}, qr/^Status: 300 /,
+ 'did not index or return >max-size message');
+ like($err, qr/skipping [a-f0-9]{40,}/,
+ 'warned about skipping large OID');
} else {
like($res->{head}, qr/^Status: 501 /, "search not available");
- SKIP: { skip 'DBD::SQLite not available', 2 };
+ SKIP: { skip 'DBD::SQLite not available', 4 };
}
my $have_xml_treepp = eval { require XML::TreePP; 1 } if $indexed;
diff --git a/t/v2mirror.t b/t/v2mirror.t
index 406bbd4f..ecf96891 100644
--- a/t/v2mirror.t
+++ b/t/v2mirror.t
@@ -187,6 +187,37 @@ is($mibx->git->check($to_purge), undef, 'unindex+prune successful in mirror');
is(scalar($mset->items), 0, '1@example.com no longer visible in mirror');
}
+if ('max size') {
+ $mime->header_set('Message-ID', '<2big@a>');
+ my $max = '2k';
+ $mime->body_str_set("z\n" x 1024);
+ ok($v2w->add($mime), "add big message");
+ $v2w->done;
+ $ibx->cleanup;
+ $fetch_each_epoch->();
+ PublicInbox::InboxWritable::cleanup($mibx);
+ my $cmd = ['-index', "$tmpdir/m", "--max-size=$max" ];
+ my $opt = { 2 => \(my $err) };
+ ok(run_script($cmd, undef, $opt), 'indexed with --max-size');
+ like($err, qr/skipping [a-f0-9]{40,}/, 'warned about skipping message');
+ $mset = $mibx->search->reopen->query('m:2big@a', {mset =>1});
+ is(scalar($mset->items), 0, 'large message not indexed');
+
+ {
+ open my $fh, '>>', $pi_config or die;
+ print $fh <<EOF or die;
+[publicinbox]
+ indexMaxSize = 2k
+EOF
+ close $fh or die;
+ }
+ $cmd = ['-index', "$tmpdir/m", "--reindex" ];
+ ok(run_script($cmd, undef, $opt), 'reindexed w/ indexMaxSize in file');
+ like($err, qr/skipping [a-f0-9]{40,}/, 'warned about skipping message');
+ $mset = $mibx->search->reopen->query('m:2big@a', {mset =>1});
+ is(scalar($mset->items), 0, 'large message not re-indexed');
+}
+
ok($td->kill, 'killed httpd');
$td->join;
reply other threads:[~2020-04-20 22:55 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200420225537.9860-1-e@yhbt.net \
--to=e@yhbt.net \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/public-inbox.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).