user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 08/10] v2writable: more accurate {current_info} warnings/progress
Date: Sat,  7 Nov 2020 10:56:58 +0000	[thread overview]
Message-ID: <20201107105700.12586-9-e@80x24.org> (raw)
In-Reply-To: <20201107105700.12586-1-e@80x24.org>

With async git blob retrievals, the OID being enqueued and the
OID being processed can be totally unrelated and misleading.

We'll also prefix $INBOX_DIR for v2, and not just the epoch
since we could be indexing multiple inboxes via both -index
and -extindex.
---
 lib/PublicInbox/ExtSearchIdx.pm |  7 +++++++
 lib/PublicInbox/V2Writable.pm   | 24 ++++++++++++++++++------
 script/public-inbox-extindex    |  1 +
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 3e7f5604..50342802 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -250,17 +250,22 @@ sub cur_ibx_xnum ($$) {
 
 sub index_oid { # git->cat_async callback for 'm'
 	my ($bref, $oid, $type, $size, $req) = @_;
+	my $self = $req->{self};
+	local $self->{current_info} = "$self->{current_info} $oid";
 	return if is_bad_blob($oid, $type, $size, $req->{oid});
 	my $new_smsg = $req->{new_smsg} = bless {
 		blob => $oid,
 	}, 'PublicInbox::Smsg';
 	$new_smsg->{bytes} = $size + crlf_adjust($$bref);
 	defined($req->{xnum} = cur_ibx_xnum($req, $bref)) or return;
+	++${$req->{nr}};
 	do_step($req);
 }
 
 sub unindex_oid { # git->cat_async callback for 'd'
 	my ($bref, $oid, $type, $size, $req) = @_;
+	my $self = $req->{self};
+	local $self->{current_info} = "$self->{current_info} $oid";
 	return if is_bad_blob($oid, $type, $size, $req->{oid});
 	return if defined(cur_ibx_xnum($req, $bref)); # was re-added
 	do_step($req);
@@ -286,6 +291,8 @@ sub _sync_inbox ($$$) {
 		-opt => $opt,
 		self => $self,
 		ibx => $ibx,
+		nr => \(my $nr = 0),
+		-regen_fmt => "%u/?\n",
 	};
 	my $v = $ibx->version;
 	my $ekey = $ibx->eidx_key;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 224675ab..18f33655 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -882,12 +882,13 @@ sub reindex_checkpoint ($$) {
 
 sub index_oid { # cat_async callback
 	my ($bref, $oid, $type, $size, $arg) = @_;
+	my $self = $arg->{self};
+	local $self->{current_info} = "$self->{current_info} $oid";
 	return if $size == 0; # purged
 	my ($num, $mid0);
 	my $eml = PublicInbox::Eml->new($$bref);
 	my $mids = mids($eml);
 	my $chash = content_hash($eml);
-	my $self = $arg->{self};
 
 	if (scalar(@$mids) == 0) {
 		warn "E: $oid has no Message-ID, skipping\n";
@@ -1047,6 +1048,11 @@ sub sync_prepare ($$) {
 	my $pr = $sync->{-opt}->{-progress};
 	my $regen_max = 0;
 	my $head = $sync->{ibx}->{ref_head} || 'HEAD';
+	my $pfx;
+	if ($pr) {
+		($pfx) = ($sync->{ibx}->{inboxdir} =~ m!([^/]+)\z!g);
+		$pfx //= $sync->{ibx}->{inboxdir};
+	}
 
 	# reindex stops at the current heads and we later rerun index_sync
 	# without {reindex}
@@ -1068,7 +1074,7 @@ sub sync_prepare ($$) {
 
 		my $range = log_range($sync, $unit, $tip) or next;
 		# can't use 'rev-list --count' if we use --diff-filter
-		$pr->("$i.git counting $range ... ") if $pr;
+		$pr->("$pfx $i.git counting $range ... ") if $pr;
 		# Don't bump num_highwater on --reindex by using {D}.
 		# We intentionally do NOT use {D} in the non-reindex case
 		# because we want NNTP article number gaps from unindexed
@@ -1086,10 +1092,10 @@ sub sync_prepare ($$) {
 	# our code and blindly injects "d" file history into git repos
 	if (my @leftovers = keys %{delete($sync->{D}) // {}}) {
 		warn('W: unindexing '.scalar(@leftovers)." leftovers\n");
+		local $self->{current_info} = 'leftover ';
 		my $unindex_oid = $self->can('unindex_oid');
 		for my $oid (@leftovers) {
 			$oid = unpack('H*', $oid);
-			$self->{current_info} = "leftover $oid";
 			my $req = { %$sync, oid => $oid };
 			$self->git->cat_async($oid, $unindex_oid, $req);
 		}
@@ -1121,6 +1127,7 @@ sub unindex_oid_aux ($$$) {
 sub unindex_oid ($$;$) { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $sync) = @_;
 	my $self = $sync->{self};
+	local $self->{current_info} = "$self->{current_info} $oid";
 	my $unindexed = $sync->{in_unindex} ? $sync->{unindexed} : undef;
 	my $mm = $self->{mm};
 	my $mids = mids(PublicInbox::Eml->new($bref));
@@ -1230,10 +1237,15 @@ sub index_todo ($$$) {
 	my $all = $self->git;
 	my $index_oid = $self->can('index_oid');
 	my $unindex_oid = $self->can('unindex_oid');
-	my ($pfx) = ($unit->{git}->{git_dir} =~ m!/([^/]+)\z!g);
-	$pfx //= $unit->{git}->{git_dir};
+	my $pfx;
+	if ($unit->{git}->{git_dir} =~ m!/([^/]+)/git/([0-9]+\.git)\z!) {
+		$pfx = "$1 $2"; # v2
+	} else { # v1
+		($pfx) = ($unit->{git}->{git_dir} =~ m!/([^/]+)\z!g);
+		$pfx //= $unit->{git}->{git_dir};
+	}
+	local $self->{current_info} = "$pfx ";
 	while (my ($f, $at, $ct, $oid) = $stk->pop_rec) {
-		$self->{current_info} = "$pfx $oid";
 		my $req = { %$sync, autime => $at, cotime => $ct, oid => $oid };
 		if ($f eq 'm') {
 			if ($sync->{max_size}) {
diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex
index a58f35ca..bb1e174a 100644
--- a/script/public-inbox-extindex
+++ b/script/public-inbox-extindex
@@ -37,6 +37,7 @@ require PublicInbox::Admin;
 my $cfg = PublicInbox::Config->new;
 my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
 PublicInbox::Admin::require_or_die(qw(-search));
+PublicInbox::Admin::progress_prepare($opt);
 require PublicInbox::ExtSearchIdx;
 my $eidx = PublicInbox::ExtSearchIdx->new($eidx_dir, $opt);
 $eidx->attach_inbox($_) for @ibxs;

  parent reply	other threads:[~2020-11-07 10:57 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-11-07 10:56 [PATCH 00/10] extindex: another round of updates Eric Wong
2020-11-07 10:56 ` [PATCH 01/10] extsearch: rename -eindex to -extindex Eric Wong
2020-11-07 10:56 ` [PATCH 02/10] extsearchidx: avoid needless alternates rewrite in ALL.git Eric Wong
2020-11-07 10:56 ` [PATCH 03/10] searchidxshard: reduce syscalls when writing ->eidx_key Eric Wong
2020-11-07 10:56 ` [PATCH 04/10] searchidxshard: further improve {current_info} readability Eric Wong
2020-11-07 10:56 ` [PATCH 05/10] v2writable: less expensive checkpoint for extindex Eric Wong
2020-11-07 10:56 ` [PATCH 06/10] extsearchidx: quiet warning for unindexed `d' messages Eric Wong
2020-11-07 10:56 ` [PATCH 07/10] extsearch: canonicalize topdir Eric Wong
2020-11-07 10:56 ` Eric Wong [this message]
2020-11-07 10:56 ` [PATCH 09/10] extindex: SIGUSR1 supports checkpoint Eric Wong
2020-11-07 10:57 ` [PATCH 10/10] extindex: fix --batch-size support Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201107105700.12586-9-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).