git@vger.kernel.org mailing list mirror (one of many)
 help / Atom feed
From: Jonathan Tan <jonathantanmy@google.com>
To: git@vger.kernel.org
Cc: Jonathan Tan <jonathantanmy@google.com>,
	gitster@pobox.com, git@jeffhostetler.com, peartben@gmail.com,
	christian.couder@gmail.com
Subject: [PATCH 01/18] fsck: introduce partialclone extension
Date: Fri, 29 Sep 2017 13:11:37 -0700
Message-ID: <a8c2d9c11c8bfba214cbd69f8c82697d384c0b55.1506714999.git.jonathantanmy@google.com> (raw)
In-Reply-To: <cover.1506714999.git.jonathantanmy@google.com>

Currently, Git does not support repos with very large numbers of objects
or repos that wish to minimize manipulation of certain blobs (for
example, because they are very large) very well, even if the user
operates mostly on part of the repo, because Git is designed on the
assumption that every referenced object is available somewhere in the
repo storage. In such an arrangement, the full set of objects is usually
available in remote storage, ready to be lazily downloaded.

Introduce the ability to have missing objects in a repo.  This
functionality is guarded behind a new repository extension option
`extensions.partialClone`. See the update to
Documentation/technical/repository-version.txt in this patch for more
information.

Teach fsck about the new state of affairs. In this commit, teach fsck
that missing promisor objects referenced from the reflog are not an
error case; in future commits, fsck will be taught about other cases.

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 Documentation/technical/repository-version.txt | 12 ++++
 builtin/fsck.c                                 |  2 +-
 cache.h                                        |  5 +-
 environment.c                                  |  1 +
 packfile.c                                     | 77 +++++++++++++++++++++++-
 packfile.h                                     | 13 +++++
 setup.c                                        |  7 ++-
 t/t0410-partial-clone.sh                       | 81 ++++++++++++++++++++++++++
 8 files changed, 192 insertions(+), 6 deletions(-)
 create mode 100755 t/t0410-partial-clone.sh

diff --git a/Documentation/technical/repository-version.txt b/Documentation/technical/repository-version.txt
index 00ad37986..074ccb9e0 100644
--- a/Documentation/technical/repository-version.txt
+++ b/Documentation/technical/repository-version.txt
@@ -86,3 +86,15 @@ for testing format-1 compatibility.
 When the config key `extensions.preciousObjects` is set to `true`,
 objects in the repository MUST NOT be deleted (e.g., by `git-prune` or
 `git repack -d`).
+
+`partialClone`
+~~~~~~~~~~~~~~
+
+When the config key `extensions.partialClone` is set, a remote is
+designated as a "promisor remote". Objects referenced by packed objects
+obtained from that promisor remote do not need to be in the local repo.
+Instead, the promisor remote promises that all such objects can be
+fetched from it in the future, and Git will fetch those objects when
+needed.
+
+The value of this key is the name of the aforementioned promisor remote.
diff --git a/builtin/fsck.c b/builtin/fsck.c
index 1e4c471b4..97d1e621e 100644
--- a/builtin/fsck.c
+++ b/builtin/fsck.c
@@ -403,7 +403,7 @@ static void fsck_handle_reflog_oid(const char *refname, struct object_id *oid,
 					xstrfmt("%s@{%"PRItime"}", refname, timestamp));
 			obj->flags |= USED;
 			mark_object_reachable(obj);
-		} else {
+		} else if (!is_promisor_object(oid)) {
 			error("%s: invalid reflog entry %s", refname, oid_to_hex(oid));
 			errors_found |= ERROR_REACHABLE;
 		}
diff --git a/cache.h b/cache.h
index a916bc79e..437206d06 100644
--- a/cache.h
+++ b/cache.h
@@ -853,10 +853,12 @@ extern int grafts_replace_parents;
 #define GIT_REPO_VERSION 0
 #define GIT_REPO_VERSION_READ 1
 extern int repository_format_precious_objects;
+extern char *repository_format_partial_clone;
 
 struct repository_format {
 	int version;
 	int precious_objects;
+	char *partial_clone;
 	int is_bare;
 	char *work_tree;
 	struct string_list unknown_extensions;
@@ -1578,7 +1580,8 @@ extern struct packed_git {
 	unsigned pack_local:1,
 		 pack_keep:1,
 		 freshened:1,
-		 do_not_close:1;
+		 do_not_close:1,
+		 pack_promisor:1;
 	unsigned char sha1[20];
 	struct revindex_entry *revindex;
 	/* something like ".git/objects/pack/xxxxx.pack" */
diff --git a/environment.c b/environment.c
index f1f934b6f..624385fb2 100644
--- a/environment.c
+++ b/environment.c
@@ -27,6 +27,7 @@ int warn_ambiguous_refs = 1;
 int warn_on_object_refname_ambiguity = 1;
 int ref_paranoia = -1;
 int repository_format_precious_objects;
+char *repository_format_partial_clone;
 const char *git_commit_encoding;
 const char *git_log_output_encoding;
 const char *apply_default_whitespace;
diff --git a/packfile.c b/packfile.c
index f86fa051c..88f424320 100644
--- a/packfile.c
+++ b/packfile.c
@@ -8,6 +8,11 @@
 #include "list.h"
 #include "streaming.h"
 #include "sha1-lookup.h"
+#include "commit.h"
+#include "object.h"
+#include "tag.h"
+#include "tree-walk.h"
+#include "tree.h"
 
 char *odb_pack_name(struct strbuf *buf,
 		    const unsigned char *sha1,
@@ -638,10 +643,10 @@ struct packed_git *add_packed_git(const char *path, size_t path_len, int local)
 		return NULL;
 
 	/*
-	 * ".pack" is long enough to hold any suffix we're adding (and
+	 * ".promisor" is long enough to hold any suffix we're adding (and
 	 * the use xsnprintf double-checks that)
 	 */
-	alloc = st_add3(path_len, strlen(".pack"), 1);
+	alloc = st_add3(path_len, strlen(".promisor"), 1);
 	p = alloc_packed_git(alloc);
 	memcpy(p->pack_name, path, path_len);
 
@@ -649,6 +654,10 @@ struct packed_git *add_packed_git(const char *path, size_t path_len, int local)
 	if (!access(p->pack_name, F_OK))
 		p->pack_keep = 1;
 
+	xsnprintf(p->pack_name + path_len, alloc - path_len, ".promisor");
+	if (!access(p->pack_name, F_OK))
+		p->pack_promisor = 1;
+
 	xsnprintf(p->pack_name + path_len, alloc - path_len, ".pack");
 	if (stat(p->pack_name, &st) || !S_ISREG(st.st_mode)) {
 		free(p);
@@ -776,7 +785,8 @@ static void prepare_packed_git_one(char *objdir, int local)
 		if (ends_with(de->d_name, ".idx") ||
 		    ends_with(de->d_name, ".pack") ||
 		    ends_with(de->d_name, ".bitmap") ||
-		    ends_with(de->d_name, ".keep"))
+		    ends_with(de->d_name, ".keep") ||
+		    ends_with(de->d_name, ".promisor"))
 			string_list_append(&garbage, path.buf);
 		else
 			report_garbage(PACKDIR_FILE_GARBAGE, path.buf);
@@ -1884,6 +1894,9 @@ int for_each_packed_object(each_packed_object_fn cb, void *data, unsigned flags)
 	for (p = packed_git; p; p = p->next) {
 		if ((flags & FOR_EACH_OBJECT_LOCAL_ONLY) && !p->pack_local)
 			continue;
+		if ((flags & FOR_EACH_OBJECT_PROMISOR_ONLY) &&
+		    !p->pack_promisor)
+			continue;
 		if (open_pack_index(p)) {
 			pack_errors = 1;
 			continue;
@@ -1894,3 +1907,61 @@ int for_each_packed_object(each_packed_object_fn cb, void *data, unsigned flags)
 	}
 	return r ? r : pack_errors;
 }
+
+static int add_promisor_object(const struct object_id *oid,
+			       struct packed_git *pack,
+			       uint32_t pos,
+			       void *set_)
+{
+	struct oidset *set = set_;
+	struct object *obj = parse_object(oid);
+	if (!obj)
+		return 1;
+
+	oidset_insert(set, oid);
+
+	/*
+	 * If this is a tree, commit, or tag, the objects it refers
+	 * to are also promisor objects. (Blobs refer to no objects.)
+	 */
+	if (obj->type == OBJ_TREE) {
+		struct tree *tree = (struct tree *)obj;
+		struct tree_desc desc;
+		struct name_entry entry;
+		if (init_tree_desc_gently(&desc, tree->buffer, tree->size))
+			/*
+			 * Error messages are given when packs are
+			 * verified, so do not print any here.
+			 */
+			return 0;
+		while (tree_entry_gently(&desc, &entry))
+			oidset_insert(set, entry.oid);
+	} else if (obj->type == OBJ_COMMIT) {
+		struct commit *commit = (struct commit *) obj;
+		struct commit_list *parents = commit->parents;
+
+		oidset_insert(set, &commit->tree->object.oid);
+		for (; parents; parents = parents->next)
+			oidset_insert(set, &parents->item->object.oid);
+	} else if (obj->type == OBJ_TAG) {
+		struct tag *tag = (struct tag *) obj;
+		oidset_insert(set, &tag->tagged->oid);
+	}
+	return 0;
+}
+
+int is_promisor_object(const struct object_id *oid)
+{
+	static struct oidset promisor_objects;
+	static int promisor_objects_prepared;
+
+	if (!promisor_objects_prepared) {
+		if (repository_format_partial_clone) {
+			for_each_packed_object(add_promisor_object,
+					       &promisor_objects,
+					       FOR_EACH_OBJECT_PROMISOR_ONLY);
+		}
+		promisor_objects_prepared = 1;
+	}
+	return oidset_contains(&promisor_objects, oid);
+}
diff --git a/packfile.h b/packfile.h
index 0cdeb54dc..a7fca598d 100644
--- a/packfile.h
+++ b/packfile.h
@@ -1,6 +1,8 @@
 #ifndef PACKFILE_H
 #define PACKFILE_H
 
+#include "oidset.h"
+
 /*
  * Generate the filename to be used for a pack file with checksum "sha1" and
  * extension "ext". The result is written into the strbuf "buf", overwriting
@@ -124,6 +126,11 @@ extern int has_sha1_pack(const unsigned char *sha1);
 
 extern int has_pack_index(const unsigned char *sha1);
 
+/*
+ * Only iterate over packs obtained from the promisor remote.
+ */
+#define FOR_EACH_OBJECT_PROMISOR_ONLY 2
+
 /*
  * Iterate over packed objects in both the local
  * repository and any alternates repositories (unless the
@@ -135,4 +142,10 @@ typedef int each_packed_object_fn(const struct object_id *oid,
 				  void *data);
 extern int for_each_packed_object(each_packed_object_fn, void *, unsigned flags);
 
+/*
+ * Return 1 if an object in a promisor packfile is or refers to the given
+ * object, 0 otherwise.
+ */
+extern int is_promisor_object(const struct object_id *oid);
+
 #endif
diff --git a/setup.c b/setup.c
index 6d8380acd..35c9da806 100644
--- a/setup.c
+++ b/setup.c
@@ -420,7 +420,11 @@ static int check_repo_format(const char *var, const char *value, void *vdata)
 			;
 		else if (!strcmp(ext, "preciousobjects"))
 			data->precious_objects = git_config_bool(var, value);
-		else
+		else if (!strcmp(ext, "partialclone")) {
+			if (!value)
+				return config_error_nonbool(var);
+			data->partial_clone = xstrdup(value);
+		} else
 			string_list_append(&data->unknown_extensions, ext);
 	} else if (strcmp(var, "core.bare") == 0) {
 		data->is_bare = git_config_bool(var, value);
@@ -463,6 +467,7 @@ static int check_repository_format_gently(const char *gitdir, int *nongit_ok)
 	}
 
 	repository_format_precious_objects = candidate.precious_objects;
+	repository_format_partial_clone = candidate.partial_clone;
 	string_list_clear(&candidate.unknown_extensions, 0);
 	if (!has_common) {
 		if (candidate.is_bare != -1) {
diff --git a/t/t0410-partial-clone.sh b/t/t0410-partial-clone.sh
new file mode 100755
index 000000000..3ddb3b98f
--- /dev/null
+++ b/t/t0410-partial-clone.sh
@@ -0,0 +1,81 @@
+#!/bin/sh
+
+test_description='partial clone'
+
+. ./test-lib.sh
+
+delete_object () {
+	rm $1/.git/objects/$(echo $2 | sed -e 's|^..|&/|')
+}
+
+pack_as_from_promisor () {
+	HASH=$(git -C repo pack-objects .git/objects/pack/pack) &&
+	>repo/.git/objects/pack/pack-$HASH.promisor
+}
+
+test_expect_success 'missing reflog object, but promised by a commit, passes fsck' '
+	test_create_repo repo &&
+	test_commit -C repo my_commit &&
+
+	A=$(git -C repo commit-tree -m a HEAD^{tree}) &&
+	C=$(git -C repo commit-tree -m c -p $A HEAD^{tree}) &&
+
+	# Reference $A only from reflog, and delete it
+	git -C repo branch my_branch "$A" &&
+	git -C repo branch -f my_branch my_commit &&
+	delete_object repo "$A" &&
+
+	# State that we got $C, which refers to $A, from promisor
+	printf "$C\n" | pack_as_from_promisor &&
+
+	# Normally, it fails
+	test_must_fail git -C repo fsck &&
+
+	# But with the extension, it succeeds
+	git -C repo config core.repositoryformatversion 1 &&
+	git -C repo config extensions.partialclone "arbitrary string" &&
+	git -C repo fsck
+'
+
+test_expect_success 'missing reflog object, but promised by a tag, passes fsck' '
+	rm -rf repo &&
+	test_create_repo repo &&
+	test_commit -C repo my_commit &&
+
+	A=$(git -C repo commit-tree -m a HEAD^{tree}) &&
+	git -C repo tag -a -m d my_tag_name $A &&
+	T=$(git -C repo rev-parse my_tag_name) &&
+	git -C repo tag -d my_tag_name &&
+
+	# Reference $A only from reflog, and delete it
+	git -C repo branch my_branch "$A" &&
+	git -C repo branch -f my_branch my_commit &&
+	delete_object repo "$A" &&
+
+	# State that we got $T, which refers to $A, from promisor
+	printf "$T\n" | pack_as_from_promisor &&
+
+	git -C repo config core.repositoryformatversion 1 &&
+	git -C repo config extensions.partialclone "arbitrary string" &&
+	git -C repo fsck
+'
+
+test_expect_success 'missing reflog object alone fails fsck, even with extension set' '
+	rm -rf repo &&
+	test_create_repo repo &&
+	test_commit -C repo my_commit &&
+
+	A=$(git -C repo commit-tree -m a HEAD^{tree}) &&
+	B=$(git -C repo commit-tree -m b HEAD^{tree}) &&
+
+	# Reference $A only from reflog, and delete it
+	git -C repo branch my_branch "$A" &&
+	git -C repo branch -f my_branch my_commit &&
+	delete_object repo "$A" &&
+
+	git -C repo config core.repositoryformatversion 1 &&
+	git -C repo config extensions.partialclone "arbitrary string" &&
+	test_must_fail git -C repo fsck
+'
+
+test_done
-- 
2.14.2.822.g60be5d43e6-goog


  reply index

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-09-29 20:11 [PATCH 00/18] Partial clone (from clone to lazy fetch in 18 patches) Jonathan Tan
2017-09-29 20:11 ` Jonathan Tan [this message]
2017-09-29 20:11 ` [PATCH 02/18] fsck: support refs pointing to promisor objects Jonathan Tan
2017-09-29 20:11 ` [PATCH 03/18] fsck: support referenced " Jonathan Tan
2017-09-29 20:11 ` [PATCH 04/18] fsck: support promisor objects as CLI argument Jonathan Tan
2017-09-29 20:11 ` [PATCH 05/18] index-pack: refactor writing of .keep files Jonathan Tan
2017-09-29 20:11 ` [PATCH 06/18] introduce fetch-object: fetch one promisor object Jonathan Tan
2017-09-29 20:11 ` [PATCH 07/18] sha1_file: support lazily fetching missing objects Jonathan Tan
2017-10-12 14:42   ` Christian Couder
2017-10-12 15:45     ` Christian Couder
2017-09-29 20:11 ` [PATCH 08/18] rev-list: support termination at promisor objects Jonathan Tan
2017-09-29 20:11 ` [PATCH 09/18] gc: do not repack promisor packfiles Jonathan Tan
2017-09-29 20:11 ` [PATCH 10/18] pack-objects: rename want_.* to ignore_.* Jonathan Tan
2017-09-29 20:11 ` [PATCH 11/18] pack-objects: support --blob-max-bytes Jonathan Tan
2017-09-29 20:11 ` [PATCH 12/18] fetch-pack: support excluding large blobs Jonathan Tan
2017-09-29 20:11 ` [PATCH 13/18] fetch: refactor calculation of remote list Jonathan Tan
2017-09-29 20:11 ` [PATCH 14/18] fetch: support excluding large blobs Jonathan Tan
2017-09-29 20:11 ` [PATCH 15/18] clone: " Jonathan Tan
2017-09-29 20:11 ` [PATCH 16/18] clone: configure blobmaxbytes in created repos Jonathan Tan
2017-09-29 20:11 ` [PATCH 17/18] unpack-trees: batch fetching of missing blobs Jonathan Tan
2017-09-29 20:11 ` [PATCH 18/18] fetch-pack: restore save_commit_buffer after use Jonathan Tan
2017-09-29 21:08 ` [PATCH 00/18] Partial clone (from clone to lazy fetch in 18 patches) Johannes Schindelin
2017-10-02  4:23 ` Junio C Hamano
2017-10-03  6:15 ` Christian Couder
2017-10-03  8:50   ` Junio C Hamano
2017-10-03 14:39     ` Jeff Hostetler
2017-10-03 23:42       ` Jonathan Tan
2017-10-04 13:30         ` Jeff Hostetler

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=a8c2d9c11c8bfba214cbd69f8c82697d384c0b55.1506714999.git.jonathantanmy@google.com \
    --to=jonathantanmy@google.com \
    --cc=christian.couder@gmail.com \
    --cc=git@jeffhostetler.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=peartben@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

git@vger.kernel.org mailing list mirror (one of many)

Archives are clonable:
	git clone --mirror https://public-inbox.org/git
	git clone --mirror http://ou63pmih66umazou.onion/git
	git clone --mirror http://czquwvybam4bgbro.onion/git
	git clone --mirror http://hjrcffqmbrq6wope.onion/git

Newsgroups are available over NNTP:
	nntp://news.public-inbox.org/inbox.comp.version-control.git
	nntp://ou63pmih66umazou.onion/inbox.comp.version-control.git
	nntp://czquwvybam4bgbro.onion/inbox.comp.version-control.git
	nntp://hjrcffqmbrq6wope.onion/inbox.comp.version-control.git
	nntp://news.gmane.org/gmane.comp.version-control.git

 note: .onion URLs require Tor: https://www.torproject.org/
       or Tor2web: https://www.tor2web.org/

AGPL code for this site: git clone https://public-inbox.org/ public-inbox