From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 5E9AF20D11 for ; Wed, 14 Jun 2017 00:14:50 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 2/3] searchidx: switch to accounting by message bytes Date: Wed, 14 Jun 2017 00:14:47 +0000 Message-Id: <20170614001448.27098-3-e@80x24.org> In-Reply-To: <20170614001448.27098-1-e@80x24.org> References: <20170614001448.27098-1-e@80x24.org> List-Id: Xapian memory usage is tied to the size of the indexed text, so take the raw message size into account when deciding when to flush Xapian data. More importantly, we now flush Xapian before we have it buffer beyond our maximum; and we do it unconditionally to prevent even high priority processes from OOM-ing. --- lib/PublicInbox/SearchIdx.pm | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 316111b..30d3fe9 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -20,13 +20,14 @@ use Carp qw(croak); use POSIX qw(strftime); require PublicInbox::Git; -use constant MAX_MID_SIZE => 244; # max term size - 1 in Xapian use constant { + MAX_MID_SIZE => 244, # max term size - 1 in Xapian PERM_UMASK => 0, OLD_PERM_GROUP => 1, OLD_PERM_EVERYBODY => 2, PERM_GROUP => 0660, PERM_EVERYBODY => 0664, + BATCH_BYTES => 1_000_000, }; sub new { @@ -71,7 +72,6 @@ sub _xdb_acquire { require File::Path; _lock_acquire($self); File::Path::mkpath($dir); - $self->{batch_size} = 100; $flag = Search::Xapian::DB_CREATE_OR_OPEN; } $self->{xdb} = Search::Xapian::WritableDatabase->new($dir, $flag); @@ -395,6 +395,15 @@ sub index_sync { with_umask($self, sub { $self->_index_sync($opts) }); } +sub batch_adjust ($$$$) { + my ($max, $bytes, $batch_cb, $latest) = @_; + $$max -= $bytes; + if ($$max <= 0) { + $$max = BATCH_BYTES; + $batch_cb->($latest, 1); + } +} + sub rlog { my ($self, $log, $add_cb, $del_cb, $batch_cb) = @_; my $hex = '[a-f0-9]'; @@ -404,23 +413,21 @@ sub rlog { my $git = $self->{git}; my $latest; my $bytes; - my $max = $self->{batch_size}; # may be undef + my $max = BATCH_BYTES; local $/ = "\n"; my $line; while (defined($line = <$log>)) { if ($line =~ /$addmsg/o) { my $blob = $1; my $mime = do_cat_mail($git, $blob, \$bytes) or next; + batch_adjust(\$max, $bytes, $batch_cb, $latest); $add_cb->($self, $mime, $bytes, $blob); } elsif ($line =~ /$delmsg/o) { my $blob = $1; - my $mime = do_cat_mail($git, $blob) or next; + my $mime = do_cat_mail($git, $blob, \$bytes) or next; + batch_adjust(\$max, $bytes, $batch_cb, $latest); $del_cb->($self, $mime); } elsif ($line =~ /^commit ($h40)/o) { - if (defined $max && --$max <= 0) { - $max = $self->{batch_size}; - $batch_cb->($latest, 1); - } $latest = $1; } } -- EW