git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: Jeff King <peff@peff.net>
To: "Ævar Arnfjörð Bjarmason" <avarab@gmail.com>
Cc: Ulrich Windl <Ulrich.Windl@rz.uni-regensburg.de>, git@vger.kernel.org
Subject: Re: non-smooth progress  indication for git fsck and git gc
Date: Sun, 2 Sep 2018 03:55:29 -0400	[thread overview]
Message-ID: <20180902075528.GC18787@sigill.intra.peff.net> (raw)
In-Reply-To: <20180902074656.GB18787@sigill.intra.peff.net>

On Sun, Sep 02, 2018 at 03:46:57AM -0400, Jeff King wrote:

> Something like this, which chunks it there, uses a per-packfile meter
> (though still does not give any clue how many packfiles there are), and
> shows a throughput meter.

Actually, in typical cases it would not matter how many packfiles there
are, because there's generally one big one, and then none of the small
ones would take long enough to trigger the delayed meter. So you'd only
see one such meter usually, and it would cover the majority of the time.
In theory, anyway.

I still think the more interesting long-term thing here is to reuse the
pack verification from index-pack, which actually hashes as it does the
per-object countup.

That code isn't lib-ified enough to be run in process, but I think the
patch below should give similar behavior to what fsck currently does.
We'd need to tell index-pack to use our fsck.* config for its checks, I
imagine. The progress here is still per-pack, but I think we could pass
in sufficient information to have it do one continuous meter across all
of the packs (see the in-code comment).

And it makes the result multi-threaded, and lets us drop a bunch of
duplicate code.

---
 builtin/fsck.c |  53 +++++++------
 pack-check.c   | 142 -----------------------------------
 pack.h         |   1 -
 3 files changed, 32 insertions(+), 164 deletions(-)

diff --git a/builtin/fsck.c b/builtin/fsck.c
index 250f5af118..643e16125b 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -386,25 +386,6 @@ static int fsck_obj(struct object *obj, void *buffer, unsigned long size)
 	return err;
 }
 
-static int fsck_obj_buffer(const struct object_id *oid, enum object_type type,
-			   unsigned long size, void *buffer, int *eaten)
-{
-	/*
-	 * Note, buffer may be NULL if type is OBJ_BLOB. See
-	 * verify_packfile(), data_valid variable for details.
-	 */
-	struct object *obj;
-	obj = parse_object_buffer(the_repository, oid, type, size, buffer,
-				  eaten);
-	if (!obj) {
-		errors_found |= ERROR_OBJECT;
-		return error("%s: object corrupt or missing", oid_to_hex(oid));
-	}
-	obj->flags &= ~(REACHABLE | SEEN);
-	obj->flags |= HAS_OBJ;
-	return fsck_obj(obj, buffer, size);
-}
-
 static int default_refs;
 
 static void fsck_handle_reflog_oid(const char *refname, struct object_id *oid,
@@ -662,6 +643,37 @@ static int mark_packed_for_connectivity(const struct object_id *oid,
 	return 0;
 }
 
+static int verify_pack(struct packed_git *p)
+{
+	struct child_process index_pack = CHILD_PROCESS_INIT;
+
+	for_each_object_in_pack(p, mark_packed_for_connectivity, NULL, 0);
+
+	/*
+	 * This should probably be replaced with a command-line option
+	 * that teaches "index-pack --verify" to show a more compact and
+	 * fsck-oriented progress (see also the "-v" below).
+	 */
+	if (show_progress)
+		fprintf(stderr, "Checking packfile %s...\n",
+			p->pack_name);
+
+	index_pack.git_cmd = 1;
+	argv_array_pushl(&index_pack.args,
+			 "index-pack",
+			 "--verify",
+			 "--strict",
+			 NULL);
+	if (show_progress)
+		argv_array_push(&index_pack.args, "-v");
+	argv_array_push(&index_pack.args, p->pack_name);
+
+	if (run_command(&index_pack))
+		return -1;
+
+	return 0;
+}
+
 static char const * const fsck_usage[] = {
 	N_("git fsck [<options>] [<object>...]"),
 	NULL
@@ -752,8 +764,7 @@ int cmd_fsck(int argc, const char **argv, const char *prefix)
 			for (p = get_packed_git(the_repository); p;
 			     p = p->next) {
 				/* verify gives error messages itself */
-				if (verify_pack(p, fsck_obj_buffer,
-						progress, count))
+				if (verify_pack(p))
 					errors_found |= ERROR_PACK;
 				count += p->num_objects;
 			}
diff --git a/pack-check.c b/pack-check.c
index d3a57df34f..ea1457ce53 100644
--- a/pack-check.c
+++ b/pack-check.c
@@ -15,17 +15,6 @@ struct idx_entry {
 	unsigned int nr;
 };
 
-static int compare_entries(const void *e1, const void *e2)
-{
-	const struct idx_entry *entry1 = e1;
-	const struct idx_entry *entry2 = e2;
-	if (entry1->offset < entry2->offset)
-		return -1;
-	if (entry1->offset > entry2->offset)
-		return 1;
-	return 0;
-}
-
 int check_pack_crc(struct packed_git *p, struct pack_window **w_curs,
 		   off_t offset, off_t len, unsigned int nr)
 {
@@ -48,121 +37,6 @@ int check_pack_crc(struct packed_git *p, struct pack_window **w_curs,
 	return data_crc != ntohl(*index_crc);
 }
 
-static int verify_packfile(struct packed_git *p,
-			   struct pack_window **w_curs,
-			   verify_fn fn,
-			   struct progress *progress, uint32_t base_count)
-
-{
-	off_t index_size = p->index_size;
-	const unsigned char *index_base = p->index_data;
-	git_hash_ctx ctx;
-	unsigned char hash[GIT_MAX_RAWSZ], *pack_sig;
-	off_t offset = 0, pack_sig_ofs = 0;
-	uint32_t nr_objects, i;
-	int err = 0;
-	struct idx_entry *entries;
-
-	if (!is_pack_valid(p))
-		return error("packfile %s cannot be accessed", p->pack_name);
-
-	the_hash_algo->init_fn(&ctx);
-	do {
-		unsigned long remaining;
-		unsigned char *in = use_pack(p, w_curs, offset, &remaining);
-		offset += remaining;
-		if (!pack_sig_ofs)
-			pack_sig_ofs = p->pack_size - the_hash_algo->rawsz;
-		if (offset > pack_sig_ofs)
-			remaining -= (unsigned int)(offset - pack_sig_ofs);
-		the_hash_algo->update_fn(&ctx, in, remaining);
-	} while (offset < pack_sig_ofs);
-	the_hash_algo->final_fn(hash, &ctx);
-	pack_sig = use_pack(p, w_curs, pack_sig_ofs, NULL);
-	if (hashcmp(hash, pack_sig))
-		err = error("%s pack checksum mismatch",
-			    p->pack_name);
-	if (hashcmp(index_base + index_size - the_hash_algo->hexsz, pack_sig))
-		err = error("%s pack checksum does not match its index",
-			    p->pack_name);
-	unuse_pack(w_curs);
-
-	/* Make sure everything reachable from idx is valid.  Since we
-	 * have verified that nr_objects matches between idx and pack,
-	 * we do not do scan-streaming check on the pack file.
-	 */
-	nr_objects = p->num_objects;
-	ALLOC_ARRAY(entries, nr_objects + 1);
-	entries[nr_objects].offset = pack_sig_ofs;
-	/* first sort entries by pack offset, since unpacking them is more efficient that way */
-	for (i = 0; i < nr_objects; i++) {
-		entries[i].oid.hash = nth_packed_object_sha1(p, i);
-		if (!entries[i].oid.hash)
-			die("internal error pack-check nth-packed-object");
-		entries[i].offset = nth_packed_object_offset(p, i);
-		entries[i].nr = i;
-	}
-	QSORT(entries, nr_objects, compare_entries);
-
-	for (i = 0; i < nr_objects; i++) {
-		void *data;
-		enum object_type type;
-		unsigned long size;
-		off_t curpos;
-		int data_valid;
-
-		if (p->index_version > 1) {
-			off_t offset = entries[i].offset;
-			off_t len = entries[i+1].offset - offset;
-			unsigned int nr = entries[i].nr;
-			if (check_pack_crc(p, w_curs, offset, len, nr))
-				err = error("index CRC mismatch for object %s "
-					    "from %s at offset %"PRIuMAX"",
-					    oid_to_hex(entries[i].oid.oid),
-					    p->pack_name, (uintmax_t)offset);
-		}
-
-		curpos = entries[i].offset;
-		type = unpack_object_header(p, w_curs, &curpos, &size);
-		unuse_pack(w_curs);
-
-		if (type == OBJ_BLOB && big_file_threshold <= size) {
-			/*
-			 * Let check_object_signature() check it with
-			 * the streaming interface; no point slurping
-			 * the data in-core only to discard.
-			 */
-			data = NULL;
-			data_valid = 0;
-		} else {
-			data = unpack_entry(the_repository, p, entries[i].offset, &type, &size);
-			data_valid = 1;
-		}
-
-		if (data_valid && !data)
-			err = error("cannot unpack %s from %s at offset %"PRIuMAX"",
-				    oid_to_hex(entries[i].oid.oid), p->pack_name,
-				    (uintmax_t)entries[i].offset);
-		else if (check_object_signature(entries[i].oid.oid, data, size, type_name(type)))
-			err = error("packed %s from %s is corrupt",
-				    oid_to_hex(entries[i].oid.oid), p->pack_name);
-		else if (fn) {
-			int eaten = 0;
-			err |= fn(entries[i].oid.oid, type, size, data, &eaten);
-			if (eaten)
-				data = NULL;
-		}
-		if (((base_count + i) & 1023) == 0)
-			display_progress(progress, base_count + i);
-		free(data);
-
-	}
-	display_progress(progress, base_count + i);
-	free(entries);
-
-	return err;
-}
-
 int verify_pack_index(struct packed_git *p)
 {
 	off_t index_size;
@@ -185,19 +59,3 @@ int verify_pack_index(struct packed_git *p)
 			    p->pack_name);
 	return err;
 }
-
-int verify_pack(struct packed_git *p, verify_fn fn,
-		struct progress *progress, uint32_t base_count)
-{
-	int err = 0;
-	struct pack_window *w_curs = NULL;
-
-	err |= verify_pack_index(p);
-	if (!p->index_data)
-		return -1;
-
-	err |= verify_packfile(p, &w_curs, fn, progress, base_count);
-	unuse_pack(&w_curs);
-
-	return err;
-}
diff --git a/pack.h b/pack.h
index 34a9d458b4..0d346c5e31 100644
--- a/pack.h
+++ b/pack.h
@@ -80,7 +80,6 @@ typedef int (*verify_fn)(const struct object_id *, enum object_type, unsigned lo
 extern const char *write_idx_file(const char *index_name, struct pack_idx_entry **objects, int nr_objects, const struct pack_idx_option *, const unsigned char *sha1);
 extern int check_pack_crc(struct packed_git *p, struct pack_window **w_curs, off_t offset, off_t len, unsigned int nr);
 extern int verify_pack_index(struct packed_git *);
-extern int verify_pack(struct packed_git *, verify_fn fn, struct progress *, uint32_t);
 extern off_t write_pack_header(struct hashfile *f, uint32_t);
 extern void fixup_pack_header_footer(int, unsigned char *, const char *, uint32_t, unsigned char *, off_t);
 extern char *index_pack_lockfile(int fd);

  reply	other threads:[~2018-09-02  7:55 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-08-16  6:54 non-smooth progress indication for git fsck and git gc Ulrich Windl
2018-08-16 15:18 ` Duy Nguyen
2018-08-16 16:05   ` Jeff King
2018-08-20  8:27   ` Antw: " Ulrich Windl
2018-08-16 15:57 ` Jeff King
2018-08-16 20:02   ` Jeff King
2018-08-16 22:10     ` Junio C Hamano
2018-08-16 20:35   ` Ævar Arnfjörð Bjarmason
2018-08-16 20:55     ` Jeff King
2018-08-16 21:06       ` Jeff King
2018-08-17 14:39         ` Duy Nguyen
2018-08-20  8:33       ` Antw: " Ulrich Windl
2018-08-20  8:57         ` Ævar Arnfjörð Bjarmason
2018-08-20  9:37           ` Ulrich Windl
2018-08-21  1:07           ` Jeff King
2018-08-21  6:20             ` Ulrich Windl
2018-08-21 15:21             ` Duy Nguyen
2018-09-01 12:53     ` Ævar Arnfjörð Bjarmason
2018-09-01 13:52       ` Ævar Arnfjörð Bjarmason
2018-09-02  7:46       ` Jeff King
2018-09-02  7:55         ` Jeff King [this message]
2018-09-02  8:55           ` Jeff King
2018-09-03 16:48             ` Ævar Arnfjörð Bjarmason
2018-09-07  3:30               ` Jeff King
2018-09-04 15:53           ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180902075528.GC18787@sigill.intra.peff.net \
    --to=peff@peff.net \
    --cc=Ulrich.Windl@rz.uni-regensburg.de \
    --cc=avarab@gmail.com \
    --cc=git@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).