user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@yhbt.net>
To: meta@public-inbox.org
Subject: [PATCH 03/20] v2writable: introduce idx_stack
Date: Fri, 24 Jul 2020 05:55:49 +0000	[thread overview]
Message-ID: <20200724055606.27332-4-e@yhbt.net> (raw)
In-Reply-To: <20200724055606.27332-1-e@yhbt.net>

This avoids pinning a potentially large chunk of memory from
`git-log --reverse' into RAM (or triggering less predictable
swap behavior).  Instead it uses a contiguous temporary file
with a fixed-size record for every blob we'll need to index.
---
 MANIFEST                      |   2 +
 lib/PublicInbox/IdxStack.pm   |  52 ++++++++++++++++
 lib/PublicInbox/V2Writable.pm | 114 ++++++++++++++++++++--------------
 t/idx_stack.t                 |  56 +++++++++++++++++
 4 files changed, 176 insertions(+), 48 deletions(-)
 create mode 100644 lib/PublicInbox/IdxStack.pm
 create mode 100644 t/idx_stack.t

diff --git a/MANIFEST b/MANIFEST
index 9d90c8c23..f46a0776d 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -138,6 +138,7 @@ lib/PublicInbox/IMAPD.pm
 lib/PublicInbox/IMAPTracker.pm
 lib/PublicInbox/IMAPdeflate.pm
 lib/PublicInbox/IMAPsearchqp.pm
+lib/PublicInbox/IdxStack.pm
 lib/PublicInbox/Import.pm
 lib/PublicInbox/In2Tie.pm
 lib/PublicInbox/Inbox.pm
@@ -277,6 +278,7 @@ t/httpd-https.t
 t/httpd-unix.t
 t/httpd.t
 t/hval.t
+t/idx_stack.t
 t/imap.t
 t/imap_searchqp.t
 t/imap_tracker.t
diff --git a/lib/PublicInbox/IdxStack.pm b/lib/PublicInbox/IdxStack.pm
new file mode 100644
index 000000000..b43b8064e
--- /dev/null
+++ b/lib/PublicInbox/IdxStack.pm
@@ -0,0 +1,52 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# temporary stack for public-inbox-index
+package PublicInbox::IdxStack;
+use v5.10.1;
+use strict;
+use Fcntl qw(:seek);
+use constant FMT => eval { pack('Q', 1) } ? 'A1QQH*' : 'A1IIH*';
+
+# start off in write-only mode
+sub new {
+	open(my $io, '+>', undef) or die "open: $!";
+	bless { wr => $io, latest_cmt => $_[1] }, __PACKAGE__
+}
+
+# file_char = [a|m]
+sub push_rec {
+	my ($self, $file_char, $at, $ct, $blob_oid) = @_;
+	my $rec = pack(FMT, $file_char, $at, $ct, $blob_oid);
+	$self->{rec_size} //= length($rec);
+	print { $self->{wr} } $rec or die "print: $!";
+	$self->{tot_size} += length($rec);
+}
+
+sub num_records {
+	my ($self) = @_;
+	$self->{rec_size} ? $self->{tot_size} / $self->{rec_size} : 0;
+}
+
+# switch into read-only mode and returns self
+sub read_prepare {
+	my ($self) = @_;
+	my $io = $self->{rd} = delete($self->{wr});
+	$io->flush or die "flush: $!";
+	$self;
+}
+
+sub pop_rec {
+	my ($self) = @_;
+	my $sz = $self->{rec_size} or return;
+	my $rec_pos = $self->{tot_size} -= $sz;
+	return if $rec_pos < 0;
+	my $io = $self->{rd};
+	seek($io, $rec_pos, SEEK_SET) or die "seek: $!";
+	my $r = read($io, my $buf, $sz);
+	defined($r) or die "read: $!";
+	$r == $sz or die "read($r != $sz)";
+	unpack(FMT, $buf);
+}
+
+1;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index c04ea5d77..04c91e5dd 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -8,6 +8,7 @@ use strict;
 use v5.10.1;
 use parent qw(PublicInbox::Lock);
 use PublicInbox::SearchIdxShard;
+use PublicInbox::IdxStack;
 use PublicInbox::Eml;
 use PublicInbox::Git;
 use PublicInbox::Import;
@@ -881,9 +882,6 @@ sub reindex_checkpoint ($$$) {
 
 sub reindex_oid ($$$$) {
 	my ($self, $sync, $git, $oid) = @_;
-	if (my $D = $sync->{D}) { # don't waste I/O on deletes
-		return if $D->{pack('H*', $oid)};
-	}
 	return if PublicInbox::SearchIdx::too_big($self, $git, $oid);
 	my ($num, $mid0, $len);
 	my $msgref = $git->cat_file($oid, \$len);
@@ -1035,20 +1033,45 @@ $range
 	$range;
 }
 
-# don't bump num_highwater on --reindex
-sub mark_deleted ($$$) {
+sub prepare_range_stack {
 	my ($git, $sync, $range) = @_;
-	my $D = $sync->{D} //= {}; # pack("H*", $oid) => NR
-	my $fh = $git->popen(qw(log --raw --no-abbrev
-			--pretty=tformat:%H
-			--no-notes --no-color --no-renames
-			--diff-filter=AM), $range, '--', 'd');
+	# Don't bump num_highwater on --reindex by using {D}.
+	# We intentionally do NOT use {D} in the non-reindex case because
+	# we want NNTP article number gaps from unindexed messages to
+	# show up in mirrors, too.
+	my $D = $sync->{D} //= $sync->{reindex} ? {} : undef; # OID_BIN => NR
+
+	my $fh = $git->popen(qw(log --raw -r --pretty=tformat:%at-%ct-%H
+				--no-notes --no-color --no-renames --no-abbrev),
+				$range);
+	my ($at, $ct, $stk);
 	while (<$fh>) {
-		if (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) {
-			$D->{pack('H*', $1)}++;
+		if (/\A([0-9]+)-([0-9]+)-($x40)$/o) {
+			($at, $ct) = ($1 + 0, $2 + 0);
+			$stk //= PublicInbox::IdxStack->new($3);
+		} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) {
+			my $oid = $1;
+			if ($D) { # reindex case
+				$D->{pack('H*', $oid)}++;
+			} else { # non-reindex case:
+				$stk->push_rec('d', $at, $ct, $oid);
+			}
+		} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) {
+			my $oid = $1;
+			if ($D) {
+				my $oid_bin = pack('H*', $oid);
+				my $nr = --$D->{$oid_bin};
+				delete($D->{$oid_bin}) if $nr <= 0;
+
+				# nr < 0 (-1) means it never existed
+				$stk->push_rec('m', $at, $ct, $oid) if $nr < 0;
+			} else {
+				$stk->push_rec('m', $at, $ct, $oid);
+			}
 		}
 	}
 	close $fh or die "git log failed: \$?=$?";
+	$stk ? $stk->read_prepare : undef;
 }
 
 sub sync_prepare ($$$) {
@@ -1061,7 +1084,7 @@ sub sync_prepare ($$$) {
 	# without {reindex}
 	my $reindex_heads = last_commits($self, $epoch_max) if $sync->{reindex};
 
-	for my $i (0..$epoch_max) {
+	for (my $i = $epoch_max; $i >= 0; $i--) {
 		die 'BUG: already indexing!' if $self->{reindex_pipe};
 		my $git_dir = git_dir_n($self, $i);
 		-d $git_dir or next; # missing epochs are fine
@@ -1077,15 +1100,24 @@ sub sync_prepare ($$$) {
 
 		# can't use 'rev-list --count' if we use --diff-filter
 		$pr->("$i.git counting $range ... ") if $pr;
-		my $n = 0;
-		my $fh = $git->popen(qw(log --pretty=tformat:%H
-				--no-notes --no-color --no-renames
-				--diff-filter=AM), $range, '--', 'm');
-		++$n while <$fh>;
-		close $fh or die "git log failed: \$?=$?";
-		$pr->("$n\n") if $pr;
-		$regen_max += $n;
-		mark_deleted($git, $sync, $range) if $sync->{reindex};
+		my $stk = prepare_range_stack($git, $sync, $range);
+		my $nr = $stk ? $stk->num_records : 0;
+		$pr->("$nr\n") if $pr;
+		$sync->{stacks}->[$i] = $stk if $stk;
+		$regen_max += $nr;
+	}
+
+	# XXX this should not happen unless somebody bypasses checks in
+	# our code and blindly injects "d" file history into git repos
+	if (my @leftovers = keys %{delete($sync->{D}) // {}}) {
+		warn('W: unindexing '.scalar(@leftovers)." leftovers\n");
+		my $git = $self->{-inbox}->git;
+		for my $oid (@leftovers) {
+			$oid = unpack('H*', $oid);
+			$self->{current_info} = "leftover $oid";
+			unindex_oid($self, $git, $oid);
+		}
+		$git->cleanup;
 	}
 	return 0 if (!$regen_max && !keys(%{$self->{unindex_range}}));
 
@@ -1186,38 +1218,24 @@ sub index_epoch ($$$) {
 	if (my $unindex_range = delete $sync->{unindex_range}->{$i}) {
 		unindex($self, $sync, $git, $unindex_range);
 	}
-	defined(my $range = $sync->{ranges}->[$i]) or return;
+	defined(my $stk = $sync->{stacks}->[$i]) or return;
+	$sync->{stacks}->[$i] = undef;
+	my $range = $sync->{ranges}->[$i];
 	if (my $pr = $sync->{-opt}->{-progress}) {
 		$pr->("$i.git indexing $range\n");
 	}
-	my @cmd = qw(log --reverse --raw -r --pretty=tformat:%H.%at.%ct
-			--no-notes --no-color --no-abbrev --no-renames);
-	my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $range);
-	my $cmt;
-	my $D = $sync->{D};
-	while (<$fh>) {
-		chomp;
-		$self->{current_info} = "$i.git $_";
-		if (/\A($x40)\.([0-9]+)\.([0-9]+)$/o) {
-			$cmt = $1;
-			$self->{autime} = $2;
-			$self->{cotime} = $3;
-		} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) {
-			reindex_oid($self, $sync, $git, $1);
-		} elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) {
-			# allow re-add if there was user error
-			my $oid = $1;
-			if ($D) {
-				my $oid_bin = pack('H*', $oid);
-				my $nr = --$D->{$oid_bin};
-				delete($D->{$oid_bin}) if $nr <= 0;
-			}
+	while (my ($f, $at, $ct, $oid) = $stk->pop_rec) {
+		$self->{current_info} = "$i.git $oid";
+		if ($f eq 'm') {
+			$self->{autime} = $at;
+			$self->{cotime} = $ct;
+			reindex_oid($self, $sync, $git, $oid);
+		} elsif ($f eq 'd') {
 			unindex_oid($self, $git, $oid);
 		}
 	}
-	close $fh or die "git log failed: \$?=$?";
-	delete @$self{qw(reindex_pipe autime cotime)};
-	update_last_commit($self, $git, $i, $cmt) if defined $cmt;
+	delete @$self{qw(autime cotime)};
+	update_last_commit($self, $git, $i, $stk->{latest_cmt});
 }
 
 # public, called by public-inbox-index
diff --git a/t/idx_stack.t b/t/idx_stack.t
new file mode 100644
index 000000000..35aff37b7
--- /dev/null
+++ b/t/idx_stack.t
@@ -0,0 +1,56 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use_ok 'PublicInbox::IdxStack';
+my $oid_a = '03c21563cf15c241687966b5b2a3f37cdc193316';
+my $oid_b = '963caad026055ab9bcbe3ee9550247f9d8840feb';
+
+my $stk = PublicInbox::IdxStack->new;
+is($stk->read_prepare, $stk, 'nothing');
+is($stk->num_records, 0, 'no records');
+is($stk->pop_rec, undef, 'undef on empty');
+
+$stk = PublicInbox::IdxStack->new;
+$stk->push_rec('m', 1234, 5678, $oid_a);
+is($stk->read_prepare, $stk, 'read_prepare');
+is($stk->num_records, 1, 'num_records');
+is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a], 'pop once');
+is($stk->pop_rec, undef, 'undef on empty');
+
+$stk = PublicInbox::IdxStack->new;
+$stk->push_rec('m', 1234, 5678, $oid_a);
+$stk->push_rec('d', 1234, 5678, $oid_b);
+is($stk->read_prepare, $stk, 'read_prepare');
+is($stk->num_records, 2, 'num_records');
+is_deeply([$stk->pop_rec], ['d', 1234, 5678, $oid_b], 'pop');
+is_deeply([$stk->pop_rec], ['m', 1234, 5678, $oid_a], 'pop-pop');
+is($stk->pop_rec, undef, 'empty');
+
+SKIP: {
+	$stk = undef;
+	my $nr = $ENV{TEST_GIT_LOG} or skip 'TEST_GIT_LOG unset', 3;
+	open my $fh, '-|', qw(git log --pretty=tformat:%at.%ct.%H), "-$nr" or
+		die "git log: $!";
+	my @expect;
+	while (<$fh>) {
+		chomp;
+		my ($at, $ct, $H) = split(/\./);
+		$stk //= PublicInbox::IdxStack->new($H);
+		# not bothering to parse blobs here, just using commit OID
+		# as a blob OID since they're the same size + format
+		$stk->push_rec('m', $at + 0, $ct + 0, $H);
+		push(@expect, [ 'm', $at, $ct, $H ]);
+	}
+	$stk or skip('nothing from git log', 3);
+	is($stk->read_prepare, $stk, 'read_prepare');
+	is($stk->num_records, scalar(@expect), 'num_records matches expected');
+	my @result;
+	while (my @tmp = $stk->pop_rec) {
+		unshift @result, \@tmp;
+	}
+	is_deeply(\@result, \@expect, 'results match expected');
+}
+
+done_testing;

  parent reply	other threads:[~2020-07-24  5:56 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-24  5:55 [PATCH 00/20] indexing changes and new features Eric Wong
2020-07-24  5:55 ` [PATCH 01/20] index: support --rethread switch to fix old indices Eric Wong
2020-07-24  5:55 ` [PATCH 02/20] v2: index forwards (via `git log --reverse') Eric Wong
2020-07-24  5:55 ` Eric Wong [this message]
2020-07-24  5:55 ` [PATCH 04/20] v2writable: index_sync: reduce fill_alternates calls Eric Wong
2020-07-24  5:55 ` [PATCH 05/20] v2writable: move {autime} and {cotime} into $sync state Eric Wong
2020-07-24  5:55 ` [PATCH 06/20] v2writable: allow >= 40 byte git object IDs Eric Wong
2020-07-24  5:55 ` [PATCH 07/20] v2writable: drop "EPOCH.git indexing $RANGE" progress Eric Wong
2020-07-24  5:55 ` [PATCH 08/20] use consistent {ibx} field for writable code paths Eric Wong
2020-07-24  5:55 ` [PATCH 09/20] search: avoid copying {inboxdir} Eric Wong
2020-07-24  5:55 ` [PATCH 10/20] v2writable: use read-only PublicInbox::Git for cat_file Eric Wong
2020-07-24  5:55 ` [PATCH 11/20] v2writable: get rid of {reindex_pipe} field Eric Wong
2020-07-24  5:55 ` [PATCH 12/20] v2writable: clarify "epoch" comment Eric Wong
2020-07-24  5:55 ` [PATCH 13/20] xapcmd: set {from} properly for v1 inboxes Eric Wong
2020-07-24  5:56 ` [PATCH 14/20] searchidx: rename _xdb_{acquire,release} => idx_ Eric Wong
2020-07-24  5:56 ` [PATCH 15/20] searchidx: make v1 indexing closer to v2 Eric Wong
2020-07-24  5:56 ` [PATCH 16/20] index+xcpdb: support --no-sync flag Eric Wong
2020-07-24  5:56 ` [PATCH 17/20] v2writable: share log2stack code with v1 Eric Wong
2020-07-24  5:56 ` [PATCH 18/20] searchidx: support async git check Eric Wong
2020-07-24  5:56 ` [PATCH 19/20] searchidx: $batch_cb => v1_checkpoint Eric Wong
2020-07-24  5:56 ` [PATCH 20/20] v2writable: {unindexed} belongs in $sync state Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200724055606.27332-4-e@yhbt.net \
    --to=e@yhbt.net \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).