From 356439a571c536eaa487031802b436d087113f4f Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Wed, 22 Sep 2021 09:45:17 +0000 Subject: gcf2 + extsearch: check for unlinked files on Linux Check for unlinked mmap-ed files via /proc/$PID/maps every 60s or so. ExtSearch (extindex) is compatible-enough with Inbox objects to be wired into the old per-inbox code, but the startup cost is projected to be much higher down the line when there's >30K inboxes, so we scan /proc/$PID/maps for deleted files before unlinking. With old Inbox objects, it was (and is) simpler to just kill processes w/o checking due to the low startup cost (and non-portability of checking). Reported-by: Konstantin Ryabitsev Link: https://public-inbox.org/meta/20210921144754.gulkneuulzo27qbw@meerkat.local/ --- lib/PublicInbox/ExtSearch.pm | 10 ++++++++-- lib/PublicInbox/Gcf2.pm | 26 +++++++++++++++++++++++--- lib/PublicInbox/Git.pm | 16 +++++++++++++++- lib/PublicInbox/Inbox.pm | 11 +++++++++-- 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm index bd301158..cd7cba2e 100644 --- a/lib/PublicInbox/ExtSearch.pm +++ b/lib/PublicInbox/ExtSearch.pm @@ -33,12 +33,18 @@ sub misc { # same as per-inbox ->over, for now... sub over { my ($self) = @_; - $self->{over} //= PublicInbox::Over->new("$self->{xpfx}/over.sqlite3"); + $self->{over} //= do { + PublicInbox::Inbox::_cleanup_later($self); + PublicInbox::Over->new("$self->{xpfx}/over.sqlite3"); + }; } sub git { my ($self) = @_; - $self->{git} //= PublicInbox::Git->new("$self->{topdir}/ALL.git"); + $self->{git} //= do { + PublicInbox::Inbox::_cleanup_later($self); + PublicInbox::Git->new("$self->{topdir}/ALL.git"); + }; } # returns a hashref of { $NEWSGROUP_NAME => $ART_NO } using the `xref3' table diff --git a/lib/PublicInbox/Gcf2.pm b/lib/PublicInbox/Gcf2.pm index 64945ca6..f546208f 100644 --- a/lib/PublicInbox/Gcf2.pm +++ b/lib/PublicInbox/Gcf2.pm @@ -8,6 +8,7 @@ use strict; use v5.10.1; use PublicInbox::Spawn qw(which popen_rd); # may set PERL_INLINE_DIRECTORY use Fcntl qw(LOCK_EX SEEK_SET); +use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); use IO::Handle; # autoflush BEGIN { my (%CFG, $c_src); @@ -96,11 +97,21 @@ sub add_alt ($$) { 1; } -# Usage: $^X -MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop +sub have_unlinked_files () { + # FIXME: port gcf2-like over to git.git so we won't need to + # deal with libgit2 + return 1 if $^O ne 'linux'; + open my $fh, '<', "/proc/$$/maps" or return; + while (<$fh>) { return 1 if /\.(?:idx|pack) \(deleted\)$/ } + undef; +} + +# Usage: $^X -MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop [EXPIRE-TIMEOUT] # (see lib/PublicInbox/Gcf2Client.pm) -sub loop () { +sub loop (;$) { + my $exp = $_[0] || $ARGV[0] || 60; # seconds my $gcf2 = new(); - my %seen; + my (%seen, $check_at); STDERR->autoflush(1); STDOUT->autoflush(1); @@ -116,6 +127,7 @@ sub loop () { $gcf2 = new(); %seen = ($git_dir => add_alt($gcf2,"$git_dir/objects")); + $check_at = clock_gettime(CLOCK_MONOTONIC) + $exp; if ($gcf2->cat_oid(1, $oid)) { warn "I: $$ $oid found after retry\n"; @@ -123,6 +135,14 @@ sub loop () { warn "W: $$ $oid missing after retry\n"; print "$oid missing\n"; # mimic git-cat-file } + } else { # check expiry to deal with deleted pack files + my $now = clock_gettime(CLOCK_MONOTONIC); + $check_at //= $now + $exp; + if ($now > $check_at && have_unlinked_files()) { + undef $check_at; + $gcf2 = new(); + %seen = (); + } } } } diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index 5ef1db2f..cf51239f 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -12,7 +12,7 @@ use v5.10.1; use parent qw(Exporter); use POSIX (); use IO::Handle; # ->autoflush -use Errno qw(EINTR EAGAIN); +use Errno qw(EINTR EAGAIN ENOENT); use File::Glob qw(bsd_glob GLOB_NOSORT); use File::Spec (); use Time::HiRes qw(stat); @@ -523,6 +523,20 @@ sub manifest_entry { $ent; } +sub cleanup_if_unlinked { + my ($self) = @_; + return cleanup($self) if $^O ne 'linux'; + # Linux-specific /proc/$PID/maps access + # TODO: support this inside git.git + for my $fld (qw(pid pid_c)) { + my $pid = $self->{$fld} // next; + open my $fh, '<', "/proc/$pid/maps" or next; + while (<$fh>) { + return cleanup($self) if /\.(?:idx|pack) \(deleted\)$/; + } + } +} + 1; __END__ =pod diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 20f8c884..02ffc7be 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -22,8 +22,15 @@ my $CLEANUP = {}; # string(inbox) -> inbox sub git_cleanup ($) { my ($self) = @_; - my $git = $self->{git} or return; - $git->cleanup; + if ($self->isa(__PACKAGE__)) { + # normal Inbox; low startup cost, and likely to have many + # many so this keeps process/pipe counts in check + $self->{git}->cleanup if $self->{git}; + } else { + # ExtSearch, high startup cost if ->ALL, and probably + # only one per-daemon, so teardown only if required: + $self->git->cleanup_if_unlinked; + } } sub cleanup_task () { -- cgit v1.2.3-24-ge0c7