user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
* [PATCH] index: support --max-size / publicinbox.indexMaxSize
@ 2020-04-20 22:55 Eric Wong
  0 siblings, 0 replies; only message in thread
From: Eric Wong @ 2020-04-20 22:55 UTC (permalink / raw)
  To: meta

In normal mail paths, we can rely on MTAs being configured with
reasonable limits in the -watch and -mda mail injection paths.

However, the MTA is bypassed in a git-only delivery path, a BOFH
could inject a large message and DoS users attempting to mirror
a public-inbox.

This doesn't protect unindexed WWW interfaces from Email::MIME
memory explosions on v1 inboxes.  Probably nobody cares about
unindexed WWW interfaces anymore, especially now that Xapian is
optional for indexing.
---
 Documentation/public-inbox-config.pod |  4 ++++
 Documentation/public-inbox-index.pod  | 23 ++++++++++++++++++++
 lib/PublicInbox/Admin.pm              | 11 ++++++++++
 lib/PublicInbox/SearchIdx.pm          | 14 +++++++++++-
 lib/PublicInbox/V2Writable.pm         |  3 +++
 script/public-inbox-index             | 15 ++++++++++---
 t/admin.t                             | 20 +++++++++++++++++
 t/cgi.t                               | 30 +++++++++++++++-----------
 t/v2mirror.t                          | 31 +++++++++++++++++++++++++++
 9 files changed, 134 insertions(+), 17 deletions(-)

diff --git a/Documentation/public-inbox-config.pod b/Documentation/public-inbox-config.pod
index 708a785f..f3b6c8b7 100644
--- a/Documentation/public-inbox-config.pod
+++ b/Documentation/public-inbox-config.pod
@@ -288,6 +288,10 @@ or /usr/share/cgit/
 
 See L<public-inbox-edit(1)>
 
+=item publicinbox.indexMaxSize
+
+See L<public-inbox-index(1)>
+
 =item publicinbox.wwwlisting
 
 Enable a HTML listing style when the root path of the URL '/' is accessed.
diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod
index dede5d2e..398ac516 100644
--- a/Documentation/public-inbox-index.pod
+++ b/Documentation/public-inbox-index.pod
@@ -66,6 +66,12 @@ is detected.  This is intended to be used in mirrors after running
 L<public-inbox-edit(1)> or L<public-inbox-purge(1)> to ensure data
 is expunged from mirrors.
 
+=item --max-size SIZE
+
+Sets or overrides L</publicinbox.indexMaxSize> on a
+per-invocation basis.  See L</publicinbox.indexMaxSize>
+below.
+
 =back
 
 =head1 FILES
@@ -76,6 +82,23 @@ C<$GIT_DIR/public-inbox/> directory.
 
 v2 inboxes are described in L<public-inbox-v2-format>.
 
+=head1 CONFIGURATION
+
+=over 8
+
+=item publicinbox.indexMaxSize
+
+Prevents indexing of messages larger than the specified size
+value.  A single suffix modifier of C<k>, C<m> or C<g> is
+supported, thus the value of C<1m> to prevents indexing of
+messages larger than one megabyte.
+
+This is useful for avoiding memory exhaustion in mirrors.
+
+Default: none
+
+=back
+
 =head1 ENVIRONMENT
 
 =over 8
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index 60f4f40d..62ddbe82 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -234,4 +234,15 @@ sub progress_prepare ($) {
 	}
 }
 
+# same unit factors as git:
+sub parse_unsigned ($) {
+	my ($max_size) = @_;
+
+	$$max_size =~ /\A([0-9]+)([kmg])?\z/i or return;
+	my ($n, $unit_factor) = ($1, $2 // '');
+	my %u = ( k => 1024, m => 1024**2, g => 1024**3 );
+	$$max_size = $n * ($u{lc($unit_factor)} // 1);
+	1;
+}
+
 1;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 579b85e3..25118f43 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -64,6 +64,7 @@ sub new {
 		$self->{lock_path} = "$inboxdir/ssoma.lock";
 		my $dir = $self->xdir;
 		$self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3");
+		$self->{index_max_size} = $ibx->{index_max_size};
 	} elsif ($version == 2) {
 		defined $shard or die "shard is required for v2\n";
 		# shard is a number
@@ -572,6 +573,16 @@ sub batch_adjust ($$$$$) {
 	}
 }
 
+sub too_big ($$$) {
+	my ($self, $git, $oid) = @_;
+	my $max_size = $self->{index_max_size} or return;
+	my (undef, undef, $size) = $git->check($oid);
+	die "E: bad $oid in $git->{git_dir}\n" if !defined($size);
+	return if $size <= $max_size;
+	warn "W: skipping $oid ($size > $max_size)\n";
+	1;
+}
+
 # only for v1
 sub read_log {
 	my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_;
@@ -598,6 +609,7 @@ sub read_log {
 				}
 				next;
 			}
+			next if too_big($self, $git, $blob);
 			my $mime = do_cat_mail($git, $blob, \$bytes);
 			my $smsg = bless {}, 'PublicInbox::Smsg';
 			batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr);
@@ -606,7 +618,7 @@ sub read_log {
 			$add_cb->($self, $mime, $smsg);
 		} elsif ($line =~ /$delmsg/o) {
 			my $blob = $1;
-			$D{$blob} = 1;
+			$D{$blob} = 1 unless too_big($self, $git, $blob);
 		} elsif ($line =~ /^commit ($h40)/o) {
 			$latest = $1;
 			$newest ||= $latest;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 12cc1f13..01b8bed6 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -120,6 +120,7 @@ sub new {
 		last_commit => [], # git repo -> commit
 	};
 	$self->{shards} = count_shards($self) || nproc_shards($creat);
+	$self->{index_max_size} = $v2ibx->{index_max_size};
 	bless $self, $class;
 }
 
@@ -867,6 +868,7 @@ sub atfork_child {
 
 sub mark_deleted ($$$$) {
 	my ($self, $sync, $git, $oid) = @_;
+	return if PublicInbox::SearchIdx::too_big($self, $git, $oid);
 	my $msgref = $git->cat_file($oid);
 	my $mime = PublicInbox::MIME->new($$msgref);
 	my $mids = mids($mime->header_obj);
@@ -993,6 +995,7 @@ sub multi_mid_q_push ($$$) {
 
 sub reindex_oid ($$$$) {
 	my ($self, $sync, $git, $oid) = @_;
+	return if PublicInbox::SearchIdx::too_big($self, $git, $oid);
 	my ($num, $mid0, $len);
 	my $msgref = $git->cat_file($oid, \$len);
 	return if $len == 0; # purged
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 7def9964..2d0f0eca 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -14,8 +14,9 @@ PublicInbox::Admin::require_or_die('-index');
 use PublicInbox::Xapcmd;
 
 my $compact_opt;
-my $opt = { quiet => -1, compact => 0 };
-GetOptions($opt, qw(verbose|v+ reindex compact|c+ jobs|j=i prune indexlevel|L=s))
+my $opt = { quiet => -1, compact => 0, maxsize => undef };
+GetOptions($opt, qw(verbose|v+ reindex compact|c+ jobs|j=i prune
+		indexlevel|L=s maxsize|max-size=s))
 	or die "bad command-line args\n$usage";
 die "--jobs must be positive\n" if defined $opt->{jobs} && $opt->{jobs} <= 0;
 
@@ -25,14 +26,22 @@ if ($opt->{compact}) {
 	$compact_opt = { -coarse_lock => 1, compact => 1 };
 }
 
-my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV);
+my $cfg = PublicInbox::Config->new;
+my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, undef, $cfg);
 PublicInbox::Admin::require_or_die('-index');
 unless (@ibxs) { print STDERR "Usage: $usage\n"; exit 1 }
 my $mods = {};
+my $max_size = $opt->{maxsize} // $cfg->{lc('publicInbox.indexMaxSize')};
+if (defined $max_size) {
+	PublicInbox::Admin::parse_unsigned(\$max_size) or
+		die "`publicInbox.indexMaxSize=$max_size' not parsed\n";
+}
+
 foreach my $ibx (@ibxs) {
 	# XXX: users can shoot themselves in the foot, with opt->{indexlevel}
 	$ibx->{indexlevel} //= $opt->{indexlevel} //
 			PublicInbox::Admin::detect_indexlevel($ibx);
+	$ibx->{index_max_size} = $max_size;
 	PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
 }
 
diff --git a/t/admin.t b/t/admin.t
index a9d67d25..c25667b2 100644
--- a/t/admin.t
+++ b/t/admin.t
@@ -78,4 +78,24 @@ SKIP: {
 }
 
 chdir '/';
+
+my @pairs = (
+	'1g' => 1024 ** 3,
+	666 => 666,
+	'1500K' => 1500 * 1024,
+	'15m' => 15 * (1024 ** 2),
+);
+
+while (@pairs) {
+	my ($in, $out) = splice(@pairs, 0, 2);
+	my $orig = $in;
+	ok(PublicInbox::Admin::parse_unsigned(\$in), "parse_unsigned $orig");
+	is($in, $out, "got $orig => ($in == $out)");
+}
+
+for my $v ('', 'bogus', '1p', '1gig') {
+	ok(!PublicInbox::Admin::parse_unsigned(\$v),
+		"parse_unsigned rejects $v");
+}
+
 done_testing();
diff --git a/t/cgi.t b/t/cgi.t
index 52f80e88..bceb83e5 100644
--- a/t/cgi.t
+++ b/t/cgi.t
@@ -55,10 +55,14 @@ Date: Thu, 01 Jan 1970 00:00:00 +0000
 
 zzzzzz
 EOF
-	$im->add($mime);
+	ok($im->add($mime), 'added initial message');
+
+	$mime->header_set('Message-ID', '<toobig@example.com>');
+	$mime->body_str_set("z\n" x 1024);
+	ok($im->add($mime), 'added big message');
 
 	# deliver a reply, too
-	my $reply = Email::MIME->new(<<EOF);
+	$mime = Email::MIME->new(<<EOF);
 From: You <you\@example.com>
 To: Me <me\@example.com>
 Cc: $addr
@@ -72,7 +76,7 @@ Me wrote:
 
 what?
 EOF
-	$im->add($reply);
+	ok($im->add($mime), 'added reply');
 
 	my $slashy_mid = 'slashy/asdf@example.com';
 	my $slashy = Email::MIME->new(<<EOF);
@@ -85,7 +89,7 @@ Date: Thu, 01 Jan 1970 00:00:01 +0000
 
 slashy
 EOF
-	$im->add($slashy);
+	ok($im->add($slashy), 'added slash');
 	$im->done;
 
 	my $res = cgi_run("/test/slashy/asdf\@example.com/raw");
@@ -99,14 +103,9 @@ EOF
 	my $path = "/test/blahblah\@example.com/t.mbox.gz";
 	my $res = cgi_run($path);
 	like($res->{head}, qr/^Status: 501 /, "search not-yet-enabled");
-	my $indexed;
-	eval {
-		require DBD::SQLite;
-		require PublicInbox::SearchIdx;
-		my $s = PublicInbox::SearchIdx->new($ibx, 1);
-		$s->index_sync;
-		$indexed = 1;
-	};
+	my $cmd = ['-index', $ibx->{inboxdir}, '--max-size=2k'];
+	my $opt = { 2 => \(my $err) };
+	my $indexed = run_script($cmd, undef, $opt);
 	if ($indexed) {
 		$res = cgi_run($path);
 		like($res->{head}, qr/^Status: 200 /, "search returned mbox");
@@ -117,9 +116,14 @@ EOF
 			IO::Uncompress::Gunzip::gunzip(\$in => \$out);
 			like($out, qr/^From /m, "From lines in mbox");
 		};
+		$res = cgi_run('/test/toobig@example.com/');
+		like($res->{head}, qr/^Status: 300 /,
+			'did not index or return >max-size message');
+		like($err, qr/skipping [a-f0-9]{40,}/,
+			'warned about skipping large OID');
 	} else {
 		like($res->{head}, qr/^Status: 501 /, "search not available");
-		SKIP: { skip 'DBD::SQLite not available', 2 };
+		SKIP: { skip 'DBD::SQLite not available', 4 };
 	}
 
 	my $have_xml_treepp = eval { require XML::TreePP; 1 } if $indexed;
diff --git a/t/v2mirror.t b/t/v2mirror.t
index 406bbd4f..ecf96891 100644
--- a/t/v2mirror.t
+++ b/t/v2mirror.t
@@ -187,6 +187,37 @@ is($mibx->git->check($to_purge), undef, 'unindex+prune successful in mirror');
 	is(scalar($mset->items), 0, '1@example.com no longer visible in mirror');
 }
 
+if ('max size') {
+	$mime->header_set('Message-ID', '<2big@a>');
+	my $max = '2k';
+	$mime->body_str_set("z\n" x 1024);
+	ok($v2w->add($mime), "add big message");
+	$v2w->done;
+	$ibx->cleanup;
+	$fetch_each_epoch->();
+	PublicInbox::InboxWritable::cleanup($mibx);
+	my $cmd = ['-index', "$tmpdir/m", "--max-size=$max" ];
+	my $opt = { 2 => \(my $err) };
+	ok(run_script($cmd, undef, $opt), 'indexed with --max-size');
+	like($err, qr/skipping [a-f0-9]{40,}/, 'warned about skipping message');
+	$mset = $mibx->search->reopen->query('m:2big@a', {mset =>1});
+	is(scalar($mset->items), 0, 'large message not indexed');
+
+	{
+		open my $fh, '>>', $pi_config or die;
+		print $fh <<EOF or die;
+[publicinbox]
+	indexMaxSize = 2k
+EOF
+		close $fh or die;
+	}
+	$cmd = ['-index', "$tmpdir/m", "--reindex" ];
+	ok(run_script($cmd, undef, $opt), 'reindexed w/ indexMaxSize in file');
+	like($err, qr/skipping [a-f0-9]{40,}/, 'warned about skipping message');
+	$mset = $mibx->search->reopen->query('m:2big@a', {mset =>1});
+	is(scalar($mset->items), 0, 'large message not re-indexed');
+}
+
 ok($td->kill, 'killed httpd');
 $td->join;
 

^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2020-04-20 22:55 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-20 22:55 [PATCH] index: support --max-size / publicinbox.indexMaxSize Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).