From b3cf37096874c6c80ef554e5153e5d995c72ab95 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 24 Dec 2020 10:09:18 +0000 Subject: inboxwritable: delay umask_prepare calls This simplifies all ->with_umask callers and opens the door for further optimizations to delay/elide process spawning. --- lib/PublicInbox/V2Writable.pm | 3 --- 1 file changed, 3 deletions(-) (limited to 'lib/PublicInbox/V2Writable.pm') diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 3e3b275f..531a72b2 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -97,8 +97,6 @@ sub new { die "$dir does not exist\n"; } } - $v2ibx->umask_prepare; - my $xpfx = "$dir/xap" . PublicInbox::Search::SCHEMA_VERSION; my $self = { ibx => $v2ibx, @@ -320,7 +318,6 @@ sub idx_init { $ibx->git->cleanup; parallel_init($self, $ibx->{indexlevel}); - $ibx->umask_prepare; $ibx->with_umask(\&_idx_init, $self, $opt); } -- cgit v1.2.3-24-ge0c7 From e8d6c34c749d1b0fd1dc1278cd4a2a310b31a9ac Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 24 Dec 2020 10:09:19 +0000 Subject: index: support --fast-noop / -F switch Note: I'm not sure if it's worth documenting and supporting this long-term. We can avoid taking locks for invocations of "index --all" and rely on high-resolution ctime (struct timespec st_ctim) comparisons of msgmap.sqlite3 and the packed-refs + refs/heads directory of the newest epoch. This cuts public-inbox-index invocations with "--all --no-update-extindex -L basic" down from 0.92s to 0.31s. 
The change with "-L medium" or "-L full" and (default) non-zero jobs is even more drastic, reducing a 12-13s no-op invocation down to the same 0.31s --- lib/PublicInbox/V2Writable.pm | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'lib/PublicInbox/V2Writable.pm') diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 531a72b2..2b849ddf 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -1351,11 +1351,19 @@ sub index_sync { $opt //= {}; return xapian_only($self, $opt) if $opt->{xapian_only}; - my $pr = $opt->{-progress}; my $epoch_max; - my $latest = $self->{ibx}->git_dir_latest(\$epoch_max); - return unless defined $latest; + my $latest = $self->{ibx}->git_dir_latest(\$epoch_max) // return; + if ($opt->{'fast-noop'}) { # nanosecond (st_ctim) comparison + use Time::HiRes qw(stat); + if (my @mm = stat("$self->{ibx}->{inboxdir}/msgmap.sqlite3")) { + my $c = $mm[10]; # 10 = ctime (nsec NV) + my @hd = stat("$latest/refs/heads"); + my @pr = stat("$latest/packed-refs"); + return if $c > ($hd[10] // 0) && $c > ($pr[10] // 0); + } + } + my $pr = $opt->{-progress}; my $seq = $opt->{sequential_shard}; my $art_beg; # the NNTP article number we start xapian_only at my $idxlevel = $self->{ibx}->{indexlevel}; -- cgit v1.2.3-24-ge0c7 From fb4dd7fdeeed8478cda9b7e63e56564da8cbdacf Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 25 Dec 2020 10:21:12 +0000 Subject: index: do not attach inbox to extindex unless updated We'll count the number of log changes (regardless of index or unindex) and only attach inboxes to ExtSearchIdx objects when they get new work. We'll also reduce lock bouncing and only update external indices after all per-inbox indexing is done. This also updates existing v2 indexing/unindexing callers to be more consistent and ensures unindex log entries update per-inbox last commit information. 
--- lib/PublicInbox/V2Writable.pm | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'lib/PublicInbox/V2Writable.pm') diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 2b849ddf..ca52874b 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -891,12 +891,22 @@ sub reindex_checkpoint ($$) { $mm_tmp->atfork_parent if $mm_tmp; } +sub index_finalize ($$) { + my ($arg, $index) = @_; + ++$arg->{self}->{nidx}; + if (defined(my $cur = $arg->{cur_cmt})) { + ${$arg->{latest_cmt}} = $cur; + } elsif ($index) { + die 'BUG: {cur_cmt} missing'; + } # else { unindexing @leftovers doesn't set {cur_cmt} +} + sub index_oid { # cat_async callback my ($bref, $oid, $type, $size, $arg) = @_; - return if is_bad_blob($oid, $type, $size, $arg->{oid}); + is_bad_blob($oid, $type, $size, $arg->{oid}) and + return index_finalize($arg, 1); # size == 0 purged returns here my $self = $arg->{self}; local $self->{current_info} = "$self->{current_info} $oid"; - return if $size == 0; # purged my ($num, $mid0); my $eml = PublicInbox::Eml->new($$bref); my $mids = mids($eml); @@ -967,7 +977,7 @@ sub index_oid { # cat_async callback if (do_idx($self, $bref, $eml, $smsg)) { ${$arg->{need_checkpoint}} = 1; } - ${$arg->{latest_cmt}} = $arg->{cur_cmt} // die 'BUG: {cur_cmt} missing'; + index_finalize($arg, 1); } # only update last_commit for $i on reindex iff newer than current @@ -1157,11 +1167,12 @@ sub unindex_oid_aux ($$$) { } sub unindex_oid ($$;$) { # git->cat_async callback - my ($bref, $oid, $type, $size, $sync) = @_; - return if is_bad_blob($oid, $type, $size, $sync->{oid}); - my $self = $sync->{self}; + my ($bref, $oid, $type, $size, $arg) = @_; + is_bad_blob($oid, $type, $size, $arg->{oid}) and + return index_finalize($arg, 0); + my $self = $arg->{self}; local $self->{current_info} = "$self->{current_info} $oid"; - my $unindexed = $sync->{in_unindex} ? 
$sync->{unindexed} : undef; + my $unindexed = $arg->{in_unindex} ? $arg->{unindexed} : undef; my $mm = $self->{mm}; my $mids = mids(PublicInbox::Eml->new($bref)); undef $$bref; @@ -1186,6 +1197,7 @@ sub unindex_oid ($$;$) { # git->cat_async callback } unindex_oid_aux($self, $oid, $mid); } + index_finalize($arg, 0); } sub git { $_[0]->{ibx}->git } -- cgit v1.2.3-24-ge0c7 From 9a17bc230973405b0656bf05fb76902d46627ffa Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 25 Dec 2020 10:21:14 +0000 Subject: v2writable: don't verify tip if reindexing We only rely on git-rev-parse to resolve symbolic names ("HEAD") to a SHA-* git commit ID. We'll assume any git commit IDs we get from SQLite DBs are valid and let "git-log" fail if it isn't. --- lib/PublicInbox/V2Writable.pm | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'lib/PublicInbox/V2Writable.pm') diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index ca52874b..f20b5c7f 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -1104,12 +1104,14 @@ sub sync_prepare ($$) { -d $git_dir or next; # missing epochs are fine my $git = PublicInbox::Git->new($git_dir); my $unit = { git => $git, epoch => $i }; + my $tip; if ($reindex_heads) { - $head = $reindex_heads->[$i] or next; + $tip = $head = $reindex_heads->[$i] or next; + } else { + $tip = $git->qx(qw(rev-parse -q --verify), $head); + next if $?; # new repo + chomp $tip; } - chomp(my $tip = $git->qx(qw(rev-parse -q --verify), $head)); - next if $?; # new repo - my $range = log_range($sync, $unit, $tip) or next; # can't use 'rev-list --count' if we use --diff-filter $pr->("$pfx $i.git counting $range ... 
") if $pr; -- cgit v1.2.3-24-ge0c7 From 1d96509a3f59c38394d2f3ac4323dc54c74dc202 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 26 Dec 2020 01:44:37 +0000 Subject: extindex: --watch for inotify-based updates This reuses existing InboxIdle infrastructure to update external indices based on per-inbox updates. This is an alternative to auto-updating external indices via the -index command and also works with existing uses of -mda and public-inbox-watch. Using inotify (or EVFILT_VNODE) allows watching thousands of inboxes without having to scan every single one at every invocation. This is especially beneficial in cases where an external index is not writable to the users writing to per-inbox indices. --- lib/PublicInbox/V2Writable.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/PublicInbox/V2Writable.pm') diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index f20b5c7f..567582c5 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -879,7 +879,7 @@ sub reindex_checkpoint ($$) { $self->done; # release lock } - if (my $pr = $sync->{-opt}->{-progress}) { + if (my $pr = $sync->{-regen_fmt} ? $sync->{-opt}->{-progress} : undef) { $pr->(sprintf($sync->{-regen_fmt}, ${$sync->{nr}})); } -- cgit v1.2.3-24-ge0c7