about summary refs log tree commit homepage
path: root/lib
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2020-10-27 07:54:52 +0000
committerEric Wong <e@80x24.org>2020-11-07 10:22:14 +0000
commit7ce1a0d97294c5603cf8c6cf89cab491cc5840d3 (patch)
tree39deb59c56e7e1a0e52f768f9ee0c2a87913629e /lib
parent01888d60a8f958f6ae23fffd8541011b5ef74a19 (diff)
downloadpublic-inbox-7ce1a0d97294c5603cf8c6cf89cab491cc5840d3.tar.gz
This is needed to limit the RSS of processes and ensure the
stored data in over.sqlite3 and Xapian DBs are consistent if
interrupted.  Without checkpoints, indexing lore causes shard
workers to take several GB of memory and thrash/OOM smaller
systems.
Diffstat (limited to 'lib')
-rw-r--r--lib/PublicInbox/ExtSearchIdx.pm20
1 files changed, 19 insertions, 1 deletions
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 050c4252..9d576adb 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -106,6 +106,18 @@ sub is_bad_blob ($$$$) {
         $size == 0 ? 1 : 0; # size == 0 means purged
 }
 
+sub check_batch_limit ($) {
+        my ($req) = @_;
+        my $self = $req->{self};
+        my $new_smsg = $req->{new_smsg};
+
+        # {raw_bytes} may be unset, so just use {bytes}
+        my $n = $self->{transact_bytes} += $new_smsg->{bytes};
+
+        # set flag for PublicInbox::V2Writable::index_todo:
+        ${$req->{need_checkpoint}} = 1 if $n >= $self->{batch_bytes};
+}
+
 sub do_xpost ($$) {
         my ($req, $smsg) = @_;
         my $self = $req->{self};
@@ -118,6 +130,7 @@ sub do_xpost ($$) {
                 my $xnum = $req->{xnum};
                 $self->{oidx}->add_xref3($docid, $xnum, $oid, $xibx->eidx_key);
                 $idx->shard_add_eidx_info($docid, $oid, $xibx, $eml);
+                check_batch_limit($req);
         } else { # 'd'
                 $self->{oidx}->remove_xref3($docid, $oid, $xibx->eidx_key);
                 $idx->shard_remove_eidx_info($docid, $oid, $xibx, $eml);
@@ -141,6 +154,7 @@ sub index_unseen ($) {
         my $ibx = delete $req->{ibx} or die 'BUG: {ibx} unset';
         $self->{oidx}->add_xref3($docid, $req->{xnum}, $oid, $ibx->eidx_key);
         $idx->index_raw(undef, $eml, $new_smsg, $ibx);
+        check_batch_limit($req);
 }
 
 sub do_finalize ($) {
@@ -301,7 +315,11 @@ sub eidx_sync { # main entry point
         local $SIG{__WARN__} = sub {
                 $warn_cb->($self->{current_info}, ': ', @_);
         };
-        _sync_inbox($self, $opt, $_) for (@{$self->{ibx_list}});
+
+        # don't use $_ here, it'll get clobbered by reindex_checkpoint
+        for my $ibx (@{$self->{ibx_list}}) {
+                _sync_inbox($self, $opt, $ibx);
+        }
 
         $self->{oidx}->rethread_done($opt);