git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: Jonathan Tan <jonathantanmy@google.com>
To: git@vger.kernel.org
Cc: Jonathan Tan <jonathantanmy@google.com>
Subject: [RFC PATCH 1/3] promised-blob, fsck: introduce promised blobs
Date: Tue, 11 Jul 2017 12:48:30 -0700	[thread overview]
Message-ID: <f9c7d4b3f800ea31e85e4897ee7048fec1e3c2f0.1499800530.git.jonathantanmy@google.com> (raw)
In-Reply-To: <cover.1499800530.git.jonathantanmy@google.com>
In-Reply-To: <cover.1499800530.git.jonathantanmy@google.com>

Currently, Git does not work well with repos that contain a very large
number of blobs, or with repos that wish to minimize manipulation of
certain blobs (for example, because they are very large), even if the
user operates mostly on only part of the repo. This is because Git is
designed on the assumption that every blob referenced by a tree object
is available somewhere in the repo storage.

As a first step to reducing this problem, introduce the concept of
promised blobs. Each Git repo can contain a list of promised blobs and
their sizes at $GIT_DIR/objects/promisedblob. This patch contains
functions to query them; functions for creating and modifying that file
will be introduced in later patches.

A repository that is missing a blob but has that blob promised is not
considered to be in error, so also teach fsck not to report such a blob
as missing.

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 Makefile                 |  1 +
 builtin/fsck.c           | 13 +++++++
 promised-blob.c          | 95 ++++++++++++++++++++++++++++++++++++++++++++++++
 promised-blob.h          | 14 +++++++
 t/t3907-promised-blob.sh | 29 +++++++++++++++
 t/test-lib-functions.sh  |  6 +++
 6 files changed, 158 insertions(+)
 create mode 100644 promised-blob.c
 create mode 100644 promised-blob.h
 create mode 100755 t/t3907-promised-blob.sh

diff --git a/Makefile b/Makefile
index 9c9c42f8f..e96163269 100644
--- a/Makefile
+++ b/Makefile
@@ -828,6 +828,7 @@ LIB_OBJS += preload-index.o
 LIB_OBJS += pretty.o
 LIB_OBJS += prio-queue.o
 LIB_OBJS += progress.o
+LIB_OBJS += promised-blob.o
 LIB_OBJS += prompt.o
 LIB_OBJS += quote.o
 LIB_OBJS += reachable.o
diff --git a/builtin/fsck.c b/builtin/fsck.c
index 99dea7adf..7454be7f1 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -15,6 +15,7 @@
 #include "progress.h"
 #include "streaming.h"
 #include "decorate.h"
+#include "promised-blob.h"
 
 #define REACHABLE 0x0001
 #define SEEN      0x0002
@@ -223,6 +224,9 @@ static void check_reachable_object(struct object *obj)
 	if (!(obj->flags & HAS_OBJ)) {
 		if (has_sha1_pack(obj->oid.hash))
 			return; /* it is in pack - forget about it */
+		if (obj->type == OBJ_BLOB &&
+		    is_promised_blob(&obj->oid, NULL))
+			return;
 		printf("missing %s %s\n", printable_type(obj),
 			describe_object(obj));
 		errors_found |= ERROR_REACHABLE;
@@ -642,6 +646,13 @@ static int mark_packed_for_connectivity(const struct object_id *oid,
 	return 0;
 }
 
+static int mark_promised_blob_for_connectivity(const struct object_id *oid,
+					       void *data)
+{
+	mark_object_for_connectivity(oid);
+	return 0;
+}
+
 static char const * const fsck_usage[] = {
 	N_("git fsck [<options>] [<object>...]"),
 	NULL
@@ -701,6 +712,8 @@ int cmd_fsck(int argc, const char **argv, const char *prefix)
 	if (connectivity_only) {
 		for_each_loose_object(mark_loose_for_connectivity, NULL, 0);
 		for_each_packed_object(mark_packed_for_connectivity, NULL, 0);
+		for_each_promised_blob(mark_promised_blob_for_connectivity,
+				       NULL);
 	} else {
 		fsck_object_dir(get_object_directory());
 
diff --git a/promised-blob.c b/promised-blob.c
new file mode 100644
index 000000000..493808ed2
--- /dev/null
+++ b/promised-blob.c
@@ -0,0 +1,95 @@
+#include "cache.h"
+#include "promised-blob.h"
+#include "sha1-lookup.h"
+#include "strbuf.h"
+
+#define ENTRY_SIZE (GIT_SHA1_RAWSZ + 8)
+/*
+ * A mmap-ed byte array of size (promised_blob_nr * ENTRY_SIZE). Each
+ * ENTRY_SIZE-sized entry consists of the SHA-1 of the promised blob and its
+ * 64-bit size in network byte order. The entries are sorted in ascending SHA-1
+ * order.
+ */
+static char *promised_blobs;
+static int64_t promised_blob_nr = -1;
+
+static void prepare_promised_blobs(void)
+{
+	char *filename;
+	int fd;
+	struct stat st;
+
+	if (promised_blob_nr >= 0)
+		return;
+
+	if (getenv("GIT_IGNORE_PROMISED_BLOBS")) {
+		promised_blob_nr = 0;
+		return;
+	}
+	
+	filename = xstrfmt("%s/promisedblob", get_object_directory());
+	fd = git_open(filename);
+	if (fd < 0) {
+		if (errno == ENOENT) {
+			promised_blob_nr = 0;
+			goto cleanup;
+		}
+		perror("prepare_promised_blobs");
+		die("Could not open %s", filename);
+	}
+	if (fstat(fd, &st)) {
+		perror("prepare_promised_blobs");
+		die("Could not stat %s", filename);
+	}
+	if (st.st_size == 0) {
+		promised_blob_nr = 0;
+		goto cleanup;
+	}
+	if (st.st_size % ENTRY_SIZE) {
+		die("Size of %s is not a multiple of %d", filename, ENTRY_SIZE);
+	}
+
+	promised_blobs = xmmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	promised_blob_nr = st.st_size / ENTRY_SIZE;
+
+cleanup:
+	free(filename);
+	if (fd >= 0)
+		close(fd);
+}
+
+int is_promised_blob(const struct object_id *oid, unsigned long *size)
+{
+	int result;
+
+	prepare_promised_blobs();
+	result = sha1_entry_pos(promised_blobs, ENTRY_SIZE, 0, 0,
+				promised_blob_nr, promised_blob_nr, oid->hash);
+	if (result >= 0) {
+		if (size) {
+			uint64_t size_nbo;
+			char *sizeptr = promised_blobs +
+					result * ENTRY_SIZE + GIT_SHA1_RAWSZ;
+			memcpy(&size_nbo, sizeptr, sizeof(size_nbo));
+			*size = ntohll(size_nbo);
+		}
+		return 1;
+	}
+	return 0;
+}
+
+int for_each_promised_blob(each_promised_blob_fn cb, void *data)
+{
+	struct object_id oid;
+	int i, r;
+
+	prepare_promised_blobs();
+	for (i = 0; i < promised_blob_nr; i++) {
+		memcpy(oid.hash, &promised_blobs[i * ENTRY_SIZE],
+		       GIT_SHA1_RAWSZ);
+		r = cb(&oid, data);
+		if (r)
+			return r;
+	}
+	return 0;
+}
diff --git a/promised-blob.h b/promised-blob.h
new file mode 100644
index 000000000..a303ea1ff
--- /dev/null
+++ b/promised-blob.h
@@ -0,0 +1,14 @@
+#ifndef PROMISED_BLOB_H
+#define PROMISED_BLOB_H
+
+/*
+ * Returns 1 if oid is the name of a promised blob, and 0 otherwise. If it is
+ * and size is not NULL, also stores the blob's size in *size.
+ */
+extern int is_promised_blob(const struct object_id *oid,
+			    unsigned long *size);
+
+typedef int each_promised_blob_fn(const struct object_id *oid, void *data);
+int for_each_promised_blob(each_promised_blob_fn, void *);
+
+#endif
diff --git a/t/t3907-promised-blob.sh b/t/t3907-promised-blob.sh
new file mode 100755
index 000000000..827072004
--- /dev/null
+++ b/t/t3907-promised-blob.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+test_description='promised blobs'
+
+. ./test-lib.sh
+
+test_expect_success 'fsck fails on missing blobs' '
+	rm -rf repo &&
+
+	git init repo &&
+	test_commit -C repo 1 &&
+	HASH=$(git hash-object repo/1.t) &&
+
+	rm repo/.git/objects/$(echo $HASH | cut -c1-2)/$(echo $HASH | cut -c3-40) &&
+	test_must_fail git -C repo fsck
+'
+
+test_expect_success '...but succeeds if it is a promised blob' '
+	printf "%s%016x" "$HASH" "$(wc -c <repo/1.t)" |
+		hex_pack >repo/.git/objects/promisedblob &&
+	git -C repo fsck
+'
+
+test_expect_success '...but fails again with GIT_IGNORE_PROMISED_BLOBS' '
+	GIT_IGNORE_PROMISED_BLOBS=1 test_must_fail git -C repo fsck &&
+	unset GIT_IGNORE_PROMISED_BLOBS
+'
+
+test_done
diff --git a/t/test-lib-functions.sh b/t/test-lib-functions.sh
index db622c355..1ebdd2d04 100644
--- a/t/test-lib-functions.sh
+++ b/t/test-lib-functions.sh
@@ -1018,3 +1018,9 @@ nongit () {
 		"$@"
 	)
 }
+
+# Converts big-endian pairs of hexadecimal digits into bytes. For example,
+# "printf 61620d0a | hex_pack" results in "ab\r\n".
+hex_pack () {
+	perl -e '$/ = undef; $input = <>; print pack("H*", $input)'
+}
-- 
2.13.2.932.g7449e964c-goog


  reply	other threads:[~2017-07-11 19:48 UTC|newest]

Thread overview: 45+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-07-11 19:48 [RFC PATCH 0/3] Partial clone: promised blobs (formerly "missing blobs") Jonathan Tan
2017-07-11 19:48 ` Jonathan Tan [this message]
2017-07-11 22:02   ` [RFC PATCH 1/3] promised-blob, fsck: introduce promised blobs Stefan Beller
2017-07-19 23:37     ` Jonathan Tan
2017-07-12 17:29   ` Jeff Hostetler
2017-07-12 19:28     ` Jonathan Nieder
2017-07-13 14:48       ` Jeff Hostetler
2017-07-13 15:05         ` Jeff Hostetler
2017-07-13 19:39     ` Jonathan Tan
2017-07-14 20:03       ` Jeff Hostetler
2017-07-14 21:30         ` Jonathan Nieder
2017-07-11 19:48 ` [RFC PATCH 2/3] sha1-array: support appending unsigned char hash Jonathan Tan
2017-07-11 22:06   ` Stefan Beller
2017-07-19 23:56     ` Jonathan Tan
2017-07-20  0:06       ` Stefan Beller
2017-07-11 19:48 ` [RFC PATCH 3/3] sha1_file: add promised blob hook support Jonathan Tan
2017-07-11 22:38   ` Stefan Beller
2017-07-12 17:40   ` Ben Peart
2017-07-12 20:38     ` Jonathan Nieder
2017-07-16 15:23 ` [RFC PATCH 0/3] Partial clone: promised blobs (formerly "missing blobs") Philip Oakley
2017-07-17 17:43   ` Ben Peart
2017-07-25 20:48     ` Philip Oakley
2017-07-17 18:03   ` Jonathan Nieder
2017-07-29 12:51     ` Philip Oakley
2017-07-20  0:21 ` [RFC PATCH v2 0/4] Partial clone: promised objects (not only blobs) Jonathan Tan
2017-07-20  0:21 ` [RFC PATCH v2 1/4] object: remove "used" field from struct object Jonathan Tan
2017-07-20  0:36   ` Stefan Beller
2017-07-20  0:55     ` Jonathan Tan
2017-07-20 17:44       ` Ben Peart
2017-07-20 21:20   ` Junio C Hamano
2017-07-20  0:21 ` [RFC PATCH v2 2/4] promised-object, fsck: introduce promised objects Jonathan Tan
2017-07-20 18:07   ` Stefan Beller
2017-07-20 19:17     ` Jonathan Tan
2017-07-20 19:58   ` Ben Peart
2017-07-20 21:13     ` Jonathan Tan
2017-07-21 16:24       ` Ben Peart
2017-07-21 20:33         ` Jonathan Tan
2017-07-25 15:10           ` Ben Peart
2017-07-29 13:26             ` Philip Oakley
2017-07-20  0:21 ` [RFC PATCH v2 3/4] sha1-array: support appending unsigned char hash Jonathan Tan
2017-07-20  0:21 ` [RFC PATCH v2 4/4] sha1_file: support promised object hook Jonathan Tan
2017-07-20 18:23   ` Stefan Beller
2017-07-20 20:58     ` Ben Peart
2017-07-20 21:18       ` Jonathan Tan
2017-07-21 16:27         ` Ben Peart

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=f9c7d4b3f800ea31e85e4897ee7048fec1e3c2f0.1499800530.git.jonathantanmy@google.com \
    --to=jonathantanmy@google.com \
    --cc=git@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).